diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index f67bfbf5596b..30785340ef12 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -145,7 +145,8 @@ private: // Optimise the base and offsets of the given address bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI); // Try to fold consecutive geps together into one - Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder); + Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, unsigned &Scale, + IRBuilder<> &Builder); // Check whether these offsets could be moved out of the loop they're in bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI); // Pushes the given add out of the loop @@ -1103,8 +1104,8 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, return true; } -static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, - IRBuilder<> &Builder) { +static Value *CheckAndCreateOffsetAdd(Value *X, unsigned ScaleX, Value *Y, + unsigned ScaleY, IRBuilder<> &Builder) { // Splat the non-vector value to a vector of the given type - if the value is // a constant (and its value isn't too big), we can even use this opportunity // to scale it to the size of the vector elements @@ -1156,40 +1157,49 @@ static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, ConstantInt *ConstYEl = dyn_cast(ConstY->getAggregateElement(i)); if (!ConstXEl || !ConstYEl || - ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >= + ConstXEl->getZExtValue() * ScaleX + + ConstYEl->getZExtValue() * ScaleY >= (unsigned)(1 << (TargetElemSize - 1))) return nullptr; } } - Value *Add = Builder.CreateAdd(X, Y); + Value *XScale = Builder.CreateVectorSplat( + XElType->getNumElements(), + Builder.getIntN(XElType->getScalarSizeInBits(), ScaleX)); + Value *YScale = Builder.CreateVectorSplat( + YElType->getNumElements(), + Builder.getIntN(YElType->getScalarSizeInBits(), ScaleY)); + Value *Add = Builder.CreateAdd(Builder.CreateMul(X, XScale), + Builder.CreateMul(Y, YScale)); - FixedVectorType *GEPType = cast(GEP->getType()); - if (checkOffsetSize(Add, GEPType->getNumElements())) + if (checkOffsetSize(Add, XElType->getNumElements())) return Add; else return nullptr; } Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP, - Value *&Offsets, + Value *&Offsets, unsigned &Scale, IRBuilder<> &Builder) { Value *GEPPtr = GEP->getPointerOperand(); Offsets = GEP->getOperand(1); + Scale = DL->getTypeAllocSize(GEP->getSourceElementType()); // We only merge geps with constant offsets, because only for those // we can make sure that we do not cause an overflow - if (!isa(Offsets)) + if (GEP->getNumIndices() != 1 || !isa(Offsets)) return nullptr; - GetElementPtrInst *BaseGEP; - if ((BaseGEP = dyn_cast(GEPPtr))) { + if (GetElementPtrInst *BaseGEP = dyn_cast(GEPPtr)) { // Merge the two geps into one - Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder); + Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Scale, Builder); if (!BaseBasePtr) return nullptr; - Offsets = - CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder); + Offsets = CheckAndCreateOffsetAdd( + Offsets, Scale, GEP->getOperand(1), + DL->getTypeAllocSize(GEP->getSourceElementType()), Builder); if (Offsets == nullptr) return nullptr; + Scale = 1; // Scale is always an i8 at this point. return BaseBasePtr; } return GEPPtr; @@ -1206,15 +1216,24 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB, Builder.SetInsertPoint(GEP); Builder.SetCurrentDebugLocation(GEP->getDebugLoc()); Value *Offsets; - Value *Base = foldGEP(GEP, Offsets, Builder); + unsigned Scale; + Value *Base = foldGEP(GEP, Offsets, Scale, Builder); // We only want to merge the geps if there is a real chance that they can be // used by an MVE gather; thus the offset has to have the correct size // (always i32 if it is not of vector type) and the base has to be a // pointer. if (Offsets && Base && Base != GEP) { + assert(Scale == 1 && "Expected to fold GEP to a scale of 1"); + Type *BaseTy = Builder.getInt8PtrTy(); + if (auto *VecTy = dyn_cast(Base->getType())) + BaseTy = FixedVectorType::get(BaseTy, VecTy); GetElementPtrInst *NewAddress = GetElementPtrInst::Create( - GEP->getSourceElementType(), Base, Offsets, "gep.merged", GEP); - GEP->replaceAllUsesWith(NewAddress); + Builder.getInt8Ty(), Builder.CreateBitCast(Base, BaseTy), Offsets, + "gep.merged", GEP); + LLVM_DEBUG(dbgs() << "Folded GEP: " << *GEP + << "\n new : " << *NewAddress << "\n"); + GEP->replaceAllUsesWith( + Builder.CreateBitCast(NewAddress, GEP->getType())); GEP = NewAddress; Changed = true; } diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll index a40e7b8af3e1..d6dddf756895 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -294,19 +294,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r1, .LCPI14_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI14_0: -; CHECK-NEXT: .short 20 @ 0x14 -; CHECK-NEXT: .short 23 @ 0x17 -; CHECK-NEXT: .short 26 @ 0x1a -; CHECK-NEXT: .short 29 @ 0x1d -; CHECK-NEXT: .short 32 @ 0x20 -; CHECK-NEXT: .short 35 @ 0x23 -; CHECK-NEXT: .short 38 @ 0x26 -; CHECK-NEXT: .short 41 @ 0x29 +; CHECK-NEXT: .short 40 @ 0x28 +; CHECK-NEXT: .short 46 @ 0x2e +; CHECK-NEXT: .short 52 @ 0x34 +; CHECK-NEXT: .short 58 @ 0x3a +; CHECK-NEXT: .short 64 @ 0x40 +; CHECK-NEXT: .short 70 @ 0x46 +; CHECK-NEXT: .short 76 @ 0x4c +; CHECK-NEXT: .short 82 @ 0x52 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20 @@ -319,19 +319,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r1, .LCPI15_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI15_0: -; CHECK-NEXT: .short 20 @ 0x14 -; CHECK-NEXT: .short 23 @ 0x17 -; CHECK-NEXT: .short 26 @ 0x1a -; CHECK-NEXT: .short 29 @ 0x1d -; CHECK-NEXT: .short 32 @ 0x20 -; CHECK-NEXT: .short 35 @ 0x23 -; CHECK-NEXT: .short 38 @ 0x26 -; CHECK-NEXT: .short 41 @ 0x29 +; CHECK-NEXT: .short 40 @ 0x28 +; CHECK-NEXT: .short 46 @ 0x2e +; CHECK-NEXT: .short 52 @ 0x34 +; CHECK-NEXT: .short 58 @ 0x3a +; CHECK-NEXT: .short 64 @ 0x40 +; CHECK-NEXT: .short 70 @ 0x46 +; CHECK-NEXT: .short 76 @ 0x4c +; CHECK-NEXT: .short 82 @ 0x52 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll index 9372a98b047a..a7261265552f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll @@ -318,15 +318,15 @@ define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r1, .LCPI21_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI21_0: -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .long 32 @ 0x20 +; CHECK-NEXT: .long 44 @ 0x2c +; CHECK-NEXT: .long 56 @ 0x38 entry: %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll index 81119a1a1f19..344cfd415037 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -856,15 +856,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i8(i8* %base) { ; CHECK-OPAQ: @ %bb.0: ; CHECK-OPAQ-NEXT: adr r1, .LCPI31_0 ; CHECK-OPAQ-NEXT: vldrw.u32 q1, [r1] -; CHECK-OPAQ-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] +; CHECK-OPAQ-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-OPAQ-NEXT: bx lr ; CHECK-OPAQ-NEXT: .p2align 4 ; CHECK-OPAQ-NEXT: @ %bb.1: ; CHECK-OPAQ-NEXT: .LCPI31_0: -; CHECK-OPAQ-NEXT: .long 4294967295 @ 0xffffffff -; CHECK-OPAQ-NEXT: .long 15 @ 0xf -; CHECK-OPAQ-NEXT: .long 31 @ 0x1f -; CHECK-OPAQ-NEXT: .long 47 @ 0x2f +; CHECK-OPAQ-NEXT: .long 4294967292 @ 0xfffffffc +; CHECK-OPAQ-NEXT: .long 12 @ 0xc +; CHECK-OPAQ-NEXT: .long 28 @ 0x1c +; CHECK-OPAQ-NEXT: .long 44 @ 0x2c %a = getelementptr i8, i8* %base, <4 x i32> %b = bitcast <4 x i8*> %a to <4 x i32*> %c = getelementptr inbounds i32, <4 x i32*> %b, i32 -1 @@ -892,15 +892,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff3_i16(i16* %base) { ; CHECK-OPAQ: @ %bb.0: ; CHECK-OPAQ-NEXT: adr r1, .LCPI32_0 ; CHECK-OPAQ-NEXT: vldrw.u32 q1, [r1] -; CHECK-OPAQ-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] +; CHECK-OPAQ-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-OPAQ-NEXT: bx lr ; CHECK-OPAQ-NEXT: .p2align 4 ; CHECK-OPAQ-NEXT: @ %bb.1: ; CHECK-OPAQ-NEXT: .LCPI32_0: -; CHECK-OPAQ-NEXT: .long 15 @ 0xf -; CHECK-OPAQ-NEXT: .long 5 @ 0x5 -; CHECK-OPAQ-NEXT: .long 29 @ 0x1d -; CHECK-OPAQ-NEXT: .long 235 @ 0xeb +; CHECK-OPAQ-NEXT: .long 12 @ 0xc +; CHECK-OPAQ-NEXT: .long 18 @ 0x12 +; CHECK-OPAQ-NEXT: .long 58 @ 0x3a +; CHECK-OPAQ-NEXT: .long 280 @ 0x118 %a = getelementptr i16, i16* %base, <4 x i32> %b = bitcast <4 x i16*> %a to <4 x i8*> %c = getelementptr i8, <4 x i8*> %b, <4 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll index a4007a1077ff..96a0b5332211 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -62,9 +62,9 @@ define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vldrw.u32 q2, [r0, q0] ; CHECK-NEXT: vadd.i32 q2, q2, r2 -; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: vstrw.32 q2, [r0, q1] ; CHECK-NEXT: adds r0, #64 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -72,15 +72,15 @@ define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI1_0: -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .long 13 @ 0xd -; CHECK-NEXT: .long 17 @ 0x11 +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 52 @ 0x34 +; CHECK-NEXT: .long 68 @ 0x44 ; CHECK-NEXT: .LCPI1_1: -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .long 15 @ 0xf +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 44 @ 0x2c +; CHECK-NEXT: .long 60 @ 0x3c vector.ph: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer @@ -173,10 +173,10 @@ define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias ; CHECK-NEXT: vldrw.u32 q1, [r12] ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] +; CHECK-NEXT: vldrh.u16 q2, [r0, q0] ; CHECK-NEXT: adds r0, #64 ; CHECK-NEXT: vadd.i16 q2, q2, r2 -; CHECK-NEXT: vstrh.16 q2, [r1, q1, uxtw #1] +; CHECK-NEXT: vstrh.16 q2, [r1, q1] ; CHECK-NEXT: adds r1, #64 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -184,23 +184,23 @@ define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .short 5 @ 0x5 -; CHECK-NEXT: .short 9 @ 0x9 -; CHECK-NEXT: .short 13 @ 0xd -; CHECK-NEXT: .short 17 @ 0x11 -; CHECK-NEXT: .short 21 @ 0x15 -; CHECK-NEXT: .short 25 @ 0x19 -; CHECK-NEXT: .short 29 @ 0x1d -; CHECK-NEXT: .short 33 @ 0x21 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 34 @ 0x22 +; CHECK-NEXT: .short 42 @ 0x2a +; CHECK-NEXT: .short 50 @ 0x32 +; CHECK-NEXT: .short 58 @ 0x3a +; CHECK-NEXT: .short 66 @ 0x42 ; CHECK-NEXT: .LCPI3_1: -; CHECK-NEXT: .short 3 @ 0x3 -; CHECK-NEXT: .short 7 @ 0x7 -; CHECK-NEXT: .short 11 @ 0xb -; CHECK-NEXT: .short 15 @ 0xf -; CHECK-NEXT: .short 19 @ 0x13 -; CHECK-NEXT: .short 23 @ 0x17 -; CHECK-NEXT: .short 27 @ 0x1b -; CHECK-NEXT: .short 31 @ 0x1f +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 14 @ 0xe +; CHECK-NEXT: .short 22 @ 0x16 +; CHECK-NEXT: .short 30 @ 0x1e +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 46 @ 0x2e +; CHECK-NEXT: .short 54 @ 0x36 +; CHECK-NEXT: .short 62 @ 0x3e vector.ph: %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer @@ -432,9 +432,9 @@ define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noal ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vldrw.u32 q2, [r0, q0] ; CHECK-NEXT: vadd.f32 q2, q2, r2 -; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: vstrw.32 q2, [r0, q1] ; CHECK-NEXT: adds r0, #64 ; CHECK-NEXT: le lr, .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -442,15 +442,15 @@ define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noal ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI7_0: -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .long 13 @ 0xd -; CHECK-NEXT: .long 17 @ 0x11 +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 52 @ 0x34 +; CHECK-NEXT: .long 68 @ 0x44 ; CHECK-NEXT: .LCPI7_1: -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .long 15 @ 0xf +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 44 @ 0x2c +; CHECK-NEXT: .long 60 @ 0x3c vector.ph: ; preds = %entry %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0 %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer @@ -549,9 +549,9 @@ define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalia ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] +; CHECK-NEXT: vldrh.u16 q2, [r0, q0] ; CHECK-NEXT: vadd.f16 q2, q2, r1 -; CHECK-NEXT: vstrh.16 q2, [r0, q1, uxtw #1] +; CHECK-NEXT: vstrh.16 q2, [r0, q1] ; CHECK-NEXT: adds r0, #64 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -559,23 +559,23 @@ define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalia ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .short 3 @ 0x3 -; CHECK-NEXT: .short 7 @ 0x7 -; CHECK-NEXT: .short 11 @ 0xb -; CHECK-NEXT: .short 15 @ 0xf -; CHECK-NEXT: .short 19 @ 0x13 -; CHECK-NEXT: .short 23 @ 0x17 -; CHECK-NEXT: .short 27 @ 0x1b -; CHECK-NEXT: .short 31 @ 0x1f +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 14 @ 0xe +; CHECK-NEXT: .short 22 @ 0x16 +; CHECK-NEXT: .short 30 @ 0x1e +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 46 @ 0x2e +; CHECK-NEXT: .short 54 @ 0x36 +; CHECK-NEXT: .short 62 @ 0x3e ; CHECK-NEXT: .LCPI9_1: -; CHECK-NEXT: .short 5 @ 0x5 -; CHECK-NEXT: .short 9 @ 0x9 -; CHECK-NEXT: .short 13 @ 0xd -; CHECK-NEXT: .short 17 @ 0x11 -; CHECK-NEXT: .short 21 @ 0x15 -; CHECK-NEXT: .short 25 @ 0x19 -; CHECK-NEXT: .short 29 @ 0x1d -; CHECK-NEXT: .short 33 @ 0x21 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 34 @ 0x22 +; CHECK-NEXT: .short 42 @ 0x2a +; CHECK-NEXT: .short 50 @ 0x32 +; CHECK-NEXT: .short 58 @ 0x3a +; CHECK-NEXT: .short 66 @ 0x42 vector.ph: %y.trunc = fptrunc float %y to half %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0 @@ -620,17 +620,17 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x, ; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r0, q0, uxtw #2] +; CHECK-NEXT: vldrw.u32 q3, [r0, q0] ; CHECK-NEXT: vldrw.u32 q4, [r0, q1, uxtw #2] -; CHECK-NEXT: vldrw.u32 q5, [r0, q2, uxtw #2] +; CHECK-NEXT: vldrw.u32 q5, [r0, q2] ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i32 q3, q4, q3 ; CHECK-NEXT: add.w r0, r0, #48 ; CHECK-NEXT: vmul.i32 q5, q4, q5 ; CHECK-NEXT: vmul.i32 q4, q4, r3 ; CHECK-NEXT: vstrw.32 q4, [r1, q1, uxtw #2] -; CHECK-NEXT: vstrw.32 q5, [r1, q2, uxtw #2] -; CHECK-NEXT: vstrw.32 q3, [r1, q0, uxtw #2] +; CHECK-NEXT: vstrw.32 q5, [r1, q2] +; CHECK-NEXT: vstrw.32 q3, [r1, q0] ; CHECK-NEXT: add.w r1, r1, #48 ; CHECK-NEXT: bne .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -639,20 +639,20 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x, ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI10_0: -; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .long 16 @ 0x10 +; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 40 @ 0x28 ; CHECK-NEXT: .LCPI10_1: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 3 @ 0x3 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 9 @ 0x9 ; CHECK-NEXT: .LCPI10_2: -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .long 32 @ 0x20 +; CHECK-NEXT: .long 44 @ 0x2c vector.ph: br label %vector.body @@ -790,17 +790,17 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, ; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: .LBB12_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q3, [r0, q0, uxtw #1] +; CHECK-NEXT: vldrh.u16 q3, [r0, q0] ; CHECK-NEXT: vldrh.u16 q4, [r0, q1, uxtw #1] -; CHECK-NEXT: vldrh.u16 q5, [r0, q2, uxtw #1] +; CHECK-NEXT: vldrh.u16 q5, [r0, q2] ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i16 q3, q4, q3 ; CHECK-NEXT: add.w r0, r0, #48 ; CHECK-NEXT: vmul.i16 q5, q4, q5 ; CHECK-NEXT: vmul.i16 q4, q4, r3 ; CHECK-NEXT: vstrh.16 q4, [r1, q1, uxtw #1] -; CHECK-NEXT: vstrh.16 q5, [r1, q2, uxtw #1] -; CHECK-NEXT: vstrh.16 q3, [r1, q0, uxtw #1] +; CHECK-NEXT: vstrh.16 q5, [r1, q2] +; CHECK-NEXT: vstrh.16 q3, [r1, q0] ; CHECK-NEXT: add.w r1, r1, #48 ; CHECK-NEXT: bne .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -809,14 +809,14 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI12_0: -; CHECK-NEXT: .short 1 @ 0x1 -; CHECK-NEXT: .short 4 @ 0x4 -; CHECK-NEXT: .short 7 @ 0x7 -; CHECK-NEXT: .short 10 @ 0xa -; CHECK-NEXT: .short 13 @ 0xd -; CHECK-NEXT: .short 16 @ 0x10 -; CHECK-NEXT: .short 19 @ 0x13 -; CHECK-NEXT: .short 22 @ 0x16 +; CHECK-NEXT: .short 2 @ 0x2 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 14 @ 0xe +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 44 @ 0x2c ; CHECK-NEXT: .LCPI12_1: ; CHECK-NEXT: .short 0 @ 0x0 ; CHECK-NEXT: .short 3 @ 0x3 @@ -827,14 +827,14 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, ; CHECK-NEXT: .short 18 @ 0x12 ; CHECK-NEXT: .short 21 @ 0x15 ; CHECK-NEXT: .LCPI12_2: -; CHECK-NEXT: .short 2 @ 0x2 -; CHECK-NEXT: .short 5 @ 0x5 -; CHECK-NEXT: .short 8 @ 0x8 -; CHECK-NEXT: .short 11 @ 0xb -; CHECK-NEXT: .short 14 @ 0xe -; CHECK-NEXT: .short 17 @ 0x11 -; CHECK-NEXT: .short 20 @ 0x14 -; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 22 @ 0x16 +; CHECK-NEXT: .short 28 @ 0x1c +; CHECK-NEXT: .short 34 @ 0x22 +; CHECK-NEXT: .short 40 @ 0x28 +; CHECK-NEXT: .short 46 @ 0x2e vector.ph: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll index c62a9fb64ef1..b2c7b22fb829 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -219,19 +219,19 @@ define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offp ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r1, .LCPI9_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .short 20 @ 0x14 -; CHECK-NEXT: .short 23 @ 0x17 -; CHECK-NEXT: .short 26 @ 0x1a -; CHECK-NEXT: .short 29 @ 0x1d -; CHECK-NEXT: .short 32 @ 0x20 -; CHECK-NEXT: .short 35 @ 0x23 -; CHECK-NEXT: .short 38 @ 0x26 -; CHECK-NEXT: .short 41 @ 0x29 +; CHECK-NEXT: .short 40 @ 0x28 +; CHECK-NEXT: .short 46 @ 0x2e +; CHECK-NEXT: .short 52 @ 0x34 +; CHECK-NEXT: .short 58 @ 0x3a +; CHECK-NEXT: .short 64 @ 0x40 +; CHECK-NEXT: .short 70 @ 0x46 +; CHECK-NEXT: .short 76 @ 0x4c +; CHECK-NEXT: .short 82 @ 0x52 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll index 1fbe8e594626..760dff52bff4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll @@ -258,15 +258,15 @@ define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep2(i16* %base, <4 x i32>* %of ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r1, .LCPI16_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI16_0: -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .long 16 @ 0x10 +; CHECK-NEXT: .long 22 @ 0x16 +; CHECK-NEXT: .long 28 @ 0x1c entry: %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16> %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5