diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6f17d18488d5..c343664b44af 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -7809,6 +7809,92 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } +static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, + ArrayRef ShuffleMask, + SelectionDAG &DAG) { + // Attempt to lower the vector shuffle using as many whole register movs as + // possible. This is useful for types smaller than 32bits, which would + // often otherwise become a series for grp movs. + SDLoc dl(Op); + EVT VT = Op.getValueType(); + if (VT.getScalarSizeInBits() >= 32) + return SDValue(); + + assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && + "Unexpected vector type"); + int NumElts = VT.getVectorNumElements(); + int QuarterSize = NumElts / 4; + // The four final parts of the vector, as i32's + SDValue Parts[4]; + + // Look for full lane vmovs like <0,1,2,3> or etc, (but not + // ), returning the vmov lane index + auto getMovIdx = [](ArrayRef ShuffleMask, int Start, int Length) { + // Detect which mov lane this would be from the first non-undef element. + int MovIdx = -1; + for (int i = 0; i < Length; i++) { + if (ShuffleMask[Start + i] >= 0) { + if (ShuffleMask[Start + i] % Length != i) + return -1; + MovIdx = ShuffleMask[Start + i] / Length; + break; + } + } + // If all items are undef, leave this for other combines + if (MovIdx == -1) + return -1; + // Check the remaining values are the correct part of the same mov + for (int i = 1; i < Length; i++) { + if (ShuffleMask[Start + i] >= 0 && + (ShuffleMask[Start + i] / Length != MovIdx || + ShuffleMask[Start + i] % Length != i)) + return -1; + } + return MovIdx; + }; + + for (int Part = 0; Part < 4; ++Part) { + // Does this part look like a mov + int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize); + if (Elt != -1) { + SDValue Input = Op->getOperand(0); + if (Elt >= 4) { + Input = Op->getOperand(1); + Elt -= 4; + } + SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input); + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, + DAG.getConstant(Elt, dl, MVT::i32)); + } + } + + // Nothing interesting found, just return + if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) + return SDValue(); + + // The other parts need to be built with the old shuffle vector, cast to a + // v4i32 and extract_vector_elts + if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { + SmallVector NewShuffleMask; + for (int Part = 0; Part < 4; ++Part) + for (int i = 0; i < QuarterSize; i++) + NewShuffleMask.push_back( + Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]); + SDValue NewShuffle = DAG.getVectorShuffle( + VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); + SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle); + + for (int Part = 0; Part < 4; ++Part) + if (!Parts[Part]) + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + BitCast, DAG.getConstant(Part, dl, MVT::i32)); + } + // Build a vector out of the various parts and bitcast it back to the original + // type. + SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts); + return DAG.getBitcast(VT, NewVec); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); @@ -8003,6 +8089,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; + if (ST->hasMVEIntegerOps()) + if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) + return NewOp; + return SDValue(); } diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll index 895c91e3087f..1b00b708db3e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -260,9 +260,8 @@ define <8 x i16> @shuffle4_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.u16 r0, q0[0] ; CHECK-NEXT: vdup.16 q1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vcmp.i16 ne, q1, zr diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index a01b99726919..2f6541b7be11 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -98,11 +98,7 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { ; CHECK-LABEL: shuffle3_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.16 q0[3], r0 @@ -114,6 +110,7 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { ; CHECK-NEXT: vmov.16 q0[6], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -387,16 +384,11 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { ; CHECK-LABEL: shuffle3_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmovx.f16 s8, s3 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov.16 q1[2], r1 ; CHECK-NEXT: vmov.16 q1[3], r0 ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmovx.f16 s8, s0 @@ -407,6 +399,7 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { ; CHECK-NEXT: vmov.16 q1[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll index 53945fce90bc..164d42f15152 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll @@ -7,23 +7,11 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_45670123(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_45670123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> @@ -33,23 +21,11 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_67452301(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_67452301: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> @@ -94,23 +70,8 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_0123cdef(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_0123cdef: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> @@ -120,14 +81,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_u7u5u3u1(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_u7u5u3u1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -138,14 +95,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_6u4u2u0u(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_6u4u2u0u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -179,39 +132,11 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45670123(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cdef89ab45670123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -314,39 +239,11 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_0123ghij4567klmn(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_0123ghij4567klmn: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -356,31 +253,11 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdeu89ub4u67u123(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cdeu89ub4u67u123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -390,23 +267,11 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cduu8uubuu67u12u(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cduu8uubuu67u12u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -416,14 +281,10 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cuuuuuubuu6uuu2u(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cuuuuuubuu6uuu2u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.8 q1[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.8 q1[10], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.8 q1[14], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -434,36 +295,16 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45u700123(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cdef89ab45u700123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q1[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.8 q1[9], r0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.8 q1[11], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -477,26 +318,10 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_45670123(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_45670123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmovx.f16 s8, s3 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -507,26 +332,10 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_67452301(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_67452301: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -576,27 +385,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_0123cdef(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_0123cdef: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32> @@ -606,18 +396,10 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_u7u5u3u1(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_u7u5u3u1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.16 q1[3], r1 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -628,14 +410,10 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_6u4u2u0u(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_6u4u2u0u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.16 q1[2], r1 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index f23f73fd3cd9..3e7ba03d4849 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -343,107 +343,78 @@ define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vmov.16 q4[1], r2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q6[1] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q6[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q6[6] ; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q6[7] +; CHECK-NEXT: vmov.u16 r0, q3[7] ; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.f32 s22, s12 +; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmovnb.i32 q6, q4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.16 q4[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov.16 q4[1], r0 ; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vadd.i16 q3, q3, q5 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q4[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.16 q4[3], r0 ; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vmov.16 q5[6], r0 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.16 q5[7], r0 ; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vadd.i16 q0, q3, q0 +; CHECK-NEXT: vmov.16 q5[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.16 q5[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u16 r0, q3[6] +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmovnb.i32 q2, q5 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vadd.i16 q1, q4, q3 +; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr @@ -461,213 +432,155 @@ entry: define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vld3_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vmov.u16 r2, q4[6] -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] ; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q6[3], r2 +; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.u16 r2, q6[0] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q6[1] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q6[2] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q6[3] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q6[4] -; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.16 q4[1], r2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov.16 q4[2], r2 ; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q6[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: vmov.u16 r2, q6[5] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u16 r2, q6[6] +; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[4] ; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q6[7] +; CHECK-NEXT: vmov.u16 r2, q3[7] ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q4[7] -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.f32 s22, s12 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmovnb.i32 q6, q4 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[1] ; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.u16 r2, q2[4] ; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vadd.i16 q3, q3, q5 -; CHECK-NEXT: vmov.u16 r2, q4[0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q4[1] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q4[2] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q4[3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q4[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vadd.i16 q0, q3, q0 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q1[2], r2 ; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.16 q4[5], r2 ; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q5[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q5[3], r0 +; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.16 q5[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.u16 r2, q3[0] +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.u16 r2, q3[6] +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmovnb.i32 q2, q5 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vadd.i16 q1, q4, q3 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q5[6] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.u16 r2, q1[2] ; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vadd.i16 q4, q6, q4 -; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[0] ; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vmov.16 q5[2], r2 +; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.f32 s26, s12 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmovnb.i32 q7, q5 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.16 q5[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov.16 q5[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.f32 s23, s27 +; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.16 q6[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.16 q6[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q3[6] ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vadd.i16 q1, q4, q2 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmovnb.i32 q3, q6 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vadd.i16 q1, q5, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q4 ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x i16>, <48 x i16>* %src, align 4 @@ -837,207 +750,135 @@ entry: define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld3_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: vmov.8 q2[0], r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: vmov.8 q2[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov.8 q2[2], r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: vmov.8 q2[3], r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: vmov.8 q2[4], r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: vmov.8 q2[5], r2 -; CHECK-NEXT: vmov.u8 r2, q1[2] -; CHECK-NEXT: vmov.8 q2[6], r2 -; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov.8 q2[7], r2 -; CHECK-NEXT: vmov.u8 r2, q1[8] -; CHECK-NEXT: vmov.8 q2[8], r2 -; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: vmov.8 q2[9], r2 -; CHECK-NEXT: vmov.u8 r2, q1[14] -; CHECK-NEXT: vmov.8 q2[10], r2 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vmov.u8 r2, q2[0] -; CHECK-NEXT: vmov.8 q3[0], r2 -; CHECK-NEXT: vmov.u8 r2, q2[1] -; CHECK-NEXT: vmov.8 q3[1], r2 -; CHECK-NEXT: vmov.u8 r2, q2[2] -; CHECK-NEXT: vmov.8 q3[2], r2 +; CHECK-NEXT: vmov.8 q1[0], r2 ; CHECK-NEXT: vmov.u8 r2, q2[3] -; CHECK-NEXT: vmov.8 q3[3], r2 -; CHECK-NEXT: vmov.u8 r2, q2[4] -; CHECK-NEXT: vmov.8 q3[4], r2 -; CHECK-NEXT: vmov.u8 r2, q2[5] -; CHECK-NEXT: vmov.8 q3[5], r2 +; CHECK-NEXT: vmov.8 q1[1], r2 ; CHECK-NEXT: vmov.u8 r2, q2[6] -; CHECK-NEXT: vmov.8 q3[6], r2 -; CHECK-NEXT: vmov.u8 r2, q2[7] -; CHECK-NEXT: vmov.8 q3[7], r2 -; CHECK-NEXT: vmov.u8 r2, q2[8] -; CHECK-NEXT: vmov.8 q3[8], r2 +; CHECK-NEXT: vmov.8 q1[2], r2 ; CHECK-NEXT: vmov.u8 r2, q2[9] -; CHECK-NEXT: vmov.8 q3[9], r2 -; CHECK-NEXT: vmov.u8 r2, q2[10] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.8 q3[10], r2 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q4[11], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q4[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.8 q4[13], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q4[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q4[11] -; CHECK-NEXT: vmov.8 q3[11], r0 -; CHECK-NEXT: vmov.u8 r0, q4[12] -; CHECK-NEXT: vmov.8 q3[12], r0 -; CHECK-NEXT: vmov.u8 r0, q4[13] -; CHECK-NEXT: vmov.8 q3[13], r0 -; CHECK-NEXT: vmov.u8 r0, q4[14] -; CHECK-NEXT: vmov.8 q3[14], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.8 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.8 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.8 q6[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.8 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.8 q6[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q6[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q6[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q6[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q6[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q6[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q6[10], r0 -; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vmov.8 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[1] -; CHECK-NEXT: vmov.8 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q6[2] -; CHECK-NEXT: vmov.8 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q6[3] -; CHECK-NEXT: vmov.8 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[4] -; CHECK-NEXT: vmov.8 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q6[5] -; CHECK-NEXT: vmov.8 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q6[6] -; CHECK-NEXT: vmov.8 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q6[7] -; CHECK-NEXT: vmov.8 q5[7], r0 -; CHECK-NEXT: vmov.u8 r0, q6[8] -; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q6[9] -; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q6[10] -; CHECK-NEXT: vmov.8 q5[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q6[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.8 q6[13], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.8 q6[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q6[15], r0 -; CHECK-NEXT: vmov.u8 r0, q6[11] -; CHECK-NEXT: vmov.8 q5[11], r0 -; CHECK-NEXT: vmov.u8 r0, q6[12] -; CHECK-NEXT: vmov.8 q5[12], r0 -; CHECK-NEXT: vmov.u8 r0, q6[13] -; CHECK-NEXT: vmov.8 q5[13], r0 -; CHECK-NEXT: vmov.u8 r0, q6[14] -; CHECK-NEXT: vmov.8 q5[14], r0 -; CHECK-NEXT: vmov.u8 r0, q6[15] -; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.u8 r0, q4[15] -; CHECK-NEXT: vmov.8 q3[15], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.8 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.8 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.8 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.8 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.8 q4[4], r0 +; CHECK-NEXT: vmov.8 q1[3], r2 +; CHECK-NEXT: vmov.u8 r2, q2[12] +; CHECK-NEXT: vmov.8 q1[4], r2 +; CHECK-NEXT: vmov.u8 r2, q2[15] +; CHECK-NEXT: vmov.8 q1[5], r2 +; CHECK-NEXT: vmov.u8 r2, q0[2] +; CHECK-NEXT: vmov.8 q1[6], r2 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.8 q1[7], r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.8 q4[8], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vmov.8 q4[9], r2 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov.8 q4[10], r2 ; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q4[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q4[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q4[9], r0 -; CHECK-NEXT: vadd.i8 q3, q3, q5 -; CHECK-NEXT: vmov.u8 r0, q4[0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q4[1] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q4[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q4[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q4[4] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q4[5] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q4[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q4[7] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q4[8] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q4[9] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q1[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q1[11], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q1[12], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov.8 q1[13], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.8 q1[14], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.8 q1[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[10], r0 +; CHECK-NEXT: vmov.8 q4[11], r0 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[1] +; CHECK-NEXT: vmov.8 q5[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.8 q5[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[7] +; CHECK-NEXT: vmov.8 q5[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.8 q5[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.8 q5[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.8 q5[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.8 q5[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.8 q5[7], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.8 q5[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.8 q5[9], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.8 q5[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.8 q5[11], r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.8 q5[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[13], r0 +; CHECK-NEXT: vmov.8 q5[14], r0 ; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.8 q5[15], r0 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.8 q5[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.8 q5[13], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.8 q5[14], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.8 q5[15], r0 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vadd.i8 q3, q3, q4 +; CHECK-NEXT: vmov.8 q4[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[5] +; CHECK-NEXT: vmov.8 q4[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[8] +; CHECK-NEXT: vmov.8 q4[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.8 q4[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.8 q4[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.8 q4[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q4[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.8 q4[7], r0 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.8 q4[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.8 q4[9], r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.8 q4[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.8 q4[11], r0 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.8 q0[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.8 q0[13], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] ; CHECK-NEXT: vmov.8 q0[14], r0 ; CHECK-NEXT: vmov.u8 r0, q1[15] ; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vadd.i8 q0, q3, q0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vadd.i8 q0, q3, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x i8>, <48 x i8>* %src, align 4 @@ -1552,76 +1393,97 @@ entry: define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmov.16 q3[0], r3 -; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r3, s5 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmovx.f16 s16, s9 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s19 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmovx.f16 s20, s16 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q3[6], r3 ; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov lr, s22 +; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.16 q5[6], r3 +; CHECK-NEXT: vmov.16 q5[7], r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmovx.f16 s20, s7 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s2 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov r12, s23 +; CHECK-NEXT: vmovx.f16 s20, s10 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.16 q5[4], r3 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: vmovx.f16 s20, s18 ; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.16 q4[6], r2 ; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmovx.f16 s8, s10 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vadd.f16 q3, q4, q3 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmovx.f16 s20, s5 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.16 q5[1], r2 ; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov.16 q5[2], r0 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov.16 q5[3], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmovx.f16 s24, s11 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmovx.f16 s24, s4 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vmov.16 q6[1], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vadd.f16 q0, q3, q2 +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: vmov.16 q6[2], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q1[2], lr +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov r4, s15 +; CHECK-NEXT: vmov.f32 s23, s19 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vmov.32 q1[3], r4 +; CHECK-NEXT: vadd.f16 q0, q5, q0 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> @@ -1636,142 +1498,182 @@ entry: define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld3_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmovx.f16 s20, s13 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov.16 q3[1], r3 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmovx.f16 s8, s3 -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov r12, s4 +; CHECK-NEXT: vmovx.f16 s4, s10 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmovx.f16 s20, s7 -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov.16 q4[1], r3 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmovx.f16 s20, s2 -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vadd.f16 q3, q3, q4 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vadd.f16 q3, q3, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q0[7], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q1[1], r3 ; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmovx.f16 s16, s4 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmovx.f16 s16, s8 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s20, s11 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s1 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmovx.f16 s20, s2 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmovx.f16 s24, s12 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.16 q5[5], r12 +; CHECK-NEXT: vmov lr, s22 +; CHECK-NEXT: vmovx.f16 s20, s14 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov.16 q5[5], r3 +; CHECK-NEXT: vmov r12, s22 +; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.16 q5[6], r3 +; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.16 q4[7], r4 +; CHECK-NEXT: vmovx.f16 s20, s9 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov.16 q5[1], r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov.16 q5[2], r3 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov.16 q5[3], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmovx.f16 s24, s15 +; CHECK-NEXT: vmov.16 q5[4], r3 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmov.16 q5[5], r3 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.16 q6[0], r3 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmov.16 q6[1], r4 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: vmov.16 q6[2], r3 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: vmov.16 q6[3], r3 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov.32 q2[1], r5 +; CHECK-NEXT: vmov r5, s3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov.32 q2[2], lr +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.f32 s23, s19 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vmov.32 q2[3], r5 +; CHECK-NEXT: vadd.f16 q0, q5, q0 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vadd.f16 q0, q0, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmovx.f16 s12, s16 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s10 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vadd.f16 q3, q3, q4 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vadd.f16 q0, q3, q2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.16 q3[1], r3 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmovx.f16 s20, s5 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmovx.f16 s24, s4 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmovx.f16 s20, s6 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q5[5], r3 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r12, s22 +; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: vmov r5, s20 +; CHECK-NEXT: vmov.16 q5[6], r5 +; CHECK-NEXT: vmov r5, s17 +; CHECK-NEXT: vmov.16 q5[7], r3 +; CHECK-NEXT: vmov.16 q4[6], r5 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov.16 q4[7], r4 +; CHECK-NEXT: vmovx.f16 s20, s9 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r5, s20 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov.16 q5[1], r5 +; CHECK-NEXT: vmov r5, s11 +; CHECK-NEXT: vmov.16 q5[2], r5 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmov.16 q5[3], r5 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmovx.f16 s24, s7 +; CHECK-NEXT: vmov.16 q5[4], r5 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmov.16 q5[5], r5 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.16 q6[0], r5 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmov.16 q6[1], r4 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov.16 q6[2], r5 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: vmov.16 q6[3], r5 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.f32 s23, s19 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: vadd.f16 q0, q5, q0 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll index db0dc3614dc1..4b5f8b762204 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -8,8 +8,8 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: .pad #104 +; CHECK-NEXT: sub sp, #104 ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: cmp.w r3, r2, lsr #2 @@ -17,7 +17,7 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 ; CHECK-NEXT: and.w r2, r3, r2, lsr #2 -; CHECK-NEXT: vldr.16 s0, [sp, #112] +; CHECK-NEXT: vldr.16 s0, [sp, #176] ; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w r12, r0, #64 @@ -25,192 +25,214 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: subs r1, #64 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r12, #64]! -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vldrh.u16 q7, [r12, #16] -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s28 ; CHECK-NEXT: vldrh.u16 q6, [r12, #32] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s30 -; CHECK-NEXT: vldrh.u16 q5, [r12, #48] -; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vldrh.u16 q4, [r12, #48] +; CHECK-NEXT: vldrh.u16 q7, [r12, #16] ; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmovx.f16 s12, s16 ; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmul.f16 q1, q1, q2 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s12, s28 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov.16 q2[1], r3 -; CHECK-NEXT: vmovx.f16 s12, s30 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s24 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s26 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s20 +; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmul.f16 q5, q2, q1 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s8, s28 +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q1[1], r3 +; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s24 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vmov.16 q2[4], r3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s22 +; CHECK-NEXT: vmovx.f16 s12, s18 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s30 ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmul.f16 q2, q2, q4 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q4 -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s31 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmul.f16 q3, q3, q1 -; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vstrw.32 q5, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmul.f16 q1, q1, q3 +; CHECK-NEXT: vmov r2, s27 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s4, s22 +; CHECK-NEXT: vmov.16 q2[1], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov r0, s29 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[0], r3 ; CHECK-NEXT: vmovx.f16 s4, s29 -; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov.16 q0[1], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s31 +; CHECK-NEXT: vmovx.f16 s4, s27 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vmul.f16 q5, q2, q3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov.16 q1[4], r3 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s19 +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s31 +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s27 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s21 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s23 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vmul.f16 q5, q0, q1 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s0, s24 -; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmul.f16 q6, q0, q3 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.16 q1[3], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s0, s27 +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmovx.f16 s0, s11 ; CHECK-NEXT: vmov.16 q4[4], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s12 ; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s4, s26 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vstrb.8 q4, [r1, #64]! -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s10 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s22 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov q4, q6 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vstrh.16 q0, [r1, #32] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmovx.f16 s4, s19 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s11 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s15 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s23 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s17 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vstrh.16 q0, [r1, #48] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r2, s25 +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s25 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov.16 q7[0], r0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s24 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s20, s22 +; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s9 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s8 ; CHECK-NEXT: vmov.16 q0[4], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s13 ; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s21 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vstrh.16 q0, [r1, #16] +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmovx.f16 s20, s26 +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmovx.f16 s20, s10 +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmov.f32 s29, s9 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.f32 s31, s11 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s3, s15 +; CHECK-NEXT: vstrb.8 q0, [r1, #64]! +; CHECK-NEXT: vmov.f32 s19, s27 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vstrh.16 q4, [r1, #48] +; CHECK-NEXT: vstrh.16 q2, [r1, #32] +; CHECK-NEXT: vstrh.16 q7, [r1, #16] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #104 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index 1942be9313ec..97c595c01448 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -389,108 +389,82 @@ entry: define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) { ; CHECK-LABEL: vst3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vrev32.16 q5, q3 -; CHECK-NEXT: vmov.u16 r2, q4[0] -; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: vmov.u16 r2, q1[2] ; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q4[1] +; CHECK-NEXT: vmov.u16 r2, q2[3] ; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov.u16 r2, q2[0] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.32 r2, q1[0] +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vdup.32 q5, r2 +; CHECK-NEXT: vmov.f32 s13, s8 +; CHECK-NEXT: vmov.u16 r2, q5[2] +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.32 r2, q2[3] +; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] ; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q6[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q5[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.u16 r0, q4[5] ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.u16 r0, q1[5] ; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] +; CHECK-NEXT: vmov.u16 r0, q4[6] ; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vmov.u16 r0, q4[7] +; CHECK-NEXT: vmov.f32 s13, s25 ; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q3[2], r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vdup.32 q6, r2 +; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmov.u16 r2, q6[2] +; CHECK-NEXT: vmov.f32 s22, s7 +; CHECK-NEXT: vrev32.16 q4, q4 +; CHECK-NEXT: vmov.16 q7[2], r2 ; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.u16 r2, q4[2] +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.16 q7[3], r0 ; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q5[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov.u16 r0, q6[5] +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov.u16 r0, q4[5] +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.f32 s21, s29 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmov.f32 s22, s30 +; CHECK-NEXT: vmov.f32 s2, s14 +; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -511,217 +485,185 @@ define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vmov.u16 r2, q5[5] -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.u16 r2, q4[5] -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.u16 r2, q4[6] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.u16 r2, q5[7] -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov.u16 r2, q4[7] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov.u16 r2, q7[0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: .pad #160 +; CHECK-NEXT: sub sp, #160 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov.u16 r2, q7[1] -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov.u16 r2, q7[2] -; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov.16 q2[4], r2 ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.u16 r2, q6[0] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r2, q6[1] -; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.32 r3, q0[0] +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: vmov.16 q2[2], r3 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vmov.u16 r2, q3[5] +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[6] +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.u16 r2, q3[7] +; CHECK-NEXT: vmov.16 q5[6], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s22, s3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.32 r0, q0[3] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.u16 r0, q5[4] +; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov.16 q4[4], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q6[6] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q6[7] -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.16 q4[5], r0 ; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vstrw.32 q3, [r1] ; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r0, q3[0] ; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.u16 r0, q3[1] +; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.32 r2, q0[0] ; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov.u16 r0, q6[3] +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.u16 r0, q6[4] +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.u16 r0, q7[5] ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.u16 r0, q1[5] ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.u16 r0, q7[6] ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vmov.u16 r0, q7[7] ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vrev32.16 q1, q5 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.32 r2, q1[3] +; CHECK-NEXT: vmov.f32 s2, s19 +; CHECK-NEXT: vdup.32 q7, r2 ; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r2, q7[2] +; CHECK-NEXT: vrev32.16 q3, q3 +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q7[3] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q7[4] -; CHECK-NEXT: vmov q1, q7 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov q7, q4 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vstrw.32 q6, [r1, #48] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vrev32.16 q1, q4 -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vstrw.32 q2, [r1, #64] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q0[0], r0 ; CHECK-NEXT: vmov.u16 r0, q7[5] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q7[6] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q7[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vstrw.32 q5, [r1, #16] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q7[6] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q7[7] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q3[6], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vrev32.16 q7, q7 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q7[0], r0 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.16 q7[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s5 +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov.f32 s26, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.u16 r0, q4[5] ; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vmov.f32 s29, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s30, s18 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.u16 r2, q7[3] +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.u16 r2, q7[4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vstrw.32 q6, [r1] +; CHECK-NEXT: vmov.f32 s29, s1 +; CHECK-NEXT: vmov.f32 s30, s2 +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s14, s6 +; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.f32 s13, s1 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vstrw.32 q3, [r1, #64] +; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -846,74 +788,65 @@ define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u16 q0, [r0, #8] -; CHECK-NEXT: vldrb.u16 q1, [r0, #16] +; CHECK-NEXT: vldrb.u16 q1, [r0, #8] +; CHECK-NEXT: vldrb.u16 q2, [r0, #16] ; CHECK-NEXT: vldrb.u16 q3, [r0] -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q4[6], r2 +; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.u16 r2, q4[0] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.32 r0, q3[3] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vdup.32 q4, r0 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.u16 r0, q4[5] ; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r2, q4[1] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vmov.u16 r0, q3[0] ; CHECK-NEXT: vmov.8 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.8 q4[1], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.8 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[0] ; CHECK-NEXT: vmov.8 q4[2], r0 ; CHECK-NEXT: vmov.u16 r0, q3[1] ; CHECK-NEXT: vmov.8 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.8 q4[4], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.8 q4[4], r0 +; CHECK-NEXT: vmov.u16 r0, q2[1] ; CHECK-NEXT: vmov.8 q4[5], r0 ; CHECK-NEXT: vmov.u16 r0, q3[2] ; CHECK-NEXT: vmov.8 q4[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.8 q4[7], r0 ; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.8 q4[7], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] ; CHECK-NEXT: vmov.8 q4[8], r0 ; CHECK-NEXT: vmov.u16 r0, q3[3] ; CHECK-NEXT: vmov.8 q4[9], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.8 q4[10], r0 ; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.8 q4[10], r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.8 q4[11], r0 ; CHECK-NEXT: vmov.u16 r0, q3[4] ; CHECK-NEXT: vmov.8 q4[12], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.8 q4[13], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.8 q4[13], r0 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.f32 s1, s21 ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.f32 s2, s22 ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vstrb.16 q2, [r1, #16] +; CHECK-NEXT: vstrb.16 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -936,110 +869,107 @@ define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov.8 q4[0], r2 -; CHECK-NEXT: vmov.u8 r2, q2[5] -; CHECK-NEXT: vmov.8 q4[1], r2 -; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov.8 q4[3], r2 -; CHECK-NEXT: vmov.u8 r2, q2[6] -; CHECK-NEXT: vmov.8 q4[4], r2 -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.8 q4[6], r2 -; CHECK-NEXT: vmov.u8 r2, q2[7] -; CHECK-NEXT: vmov.8 q4[7], r2 -; CHECK-NEXT: vmov.u8 r2, q1[8] -; CHECK-NEXT: vmov.8 q4[9], r2 -; CHECK-NEXT: vmov.u8 r2, q2[8] -; CHECK-NEXT: vmov.8 q4[10], r2 -; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: vmov.u8 r0, q3[6] -; CHECK-NEXT: vmov.8 q4[12], r2 -; CHECK-NEXT: vmov.u8 r2, q2[9] -; CHECK-NEXT: vmov.8 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[7] -; CHECK-NEXT: vmov.8 q4[13], r2 -; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov.8 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vmov.8 q4[15], r2 -; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q3[9] -; CHECK-NEXT: vmov.u8 r2, q4[0] -; CHECK-NEXT: vmov.8 q5[11], r0 -; CHECK-NEXT: vmov.u8 r0, q3[10] -; CHECK-NEXT: vmov.8 q0[0], r2 -; CHECK-NEXT: vmov.8 q5[14], r0 -; CHECK-NEXT: vmov.u8 r2, q4[1] -; CHECK-NEXT: vmov.8 q0[1], r2 -; CHECK-NEXT: vmov.u8 r0, q5[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q4[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q4[4] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q5[5] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q4[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q4[7] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q5[8] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q4[9] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q4[10] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q5[11] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q4[12] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q4[13] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q5[14] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q4[15] -; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vmov.u8 r0, q3[0] -; CHECK-NEXT: vmov.8 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.u8 r3, q3[0] +; CHECK-NEXT: vmov.u8 r0, q2[0] +; CHECK-NEXT: vmov.8 q5[0], r3 +; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: vmov.8 q5[1], r0 ; CHECK-NEXT: vmov.u8 r0, q3[1] ; CHECK-NEXT: vmov.8 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.u8 r0, q2[1] ; CHECK-NEXT: vmov.8 q5[4], r0 ; CHECK-NEXT: vmov.u8 r0, q3[2] ; CHECK-NEXT: vmov.8 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.u8 r0, q2[2] ; CHECK-NEXT: vmov.8 q5[7], r0 ; CHECK-NEXT: vmov.u8 r0, q3[3] ; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.u8 r0, q2[3] ; CHECK-NEXT: vmov.8 q5[10], r0 ; CHECK-NEXT: vmov.u8 r0, q3[4] +; CHECK-NEXT: vmov.8 q4[2], r2 +; CHECK-NEXT: vmov.u8 r2, q1[2] ; CHECK-NEXT: vmov.8 q5[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.8 q4[8], r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q3[5] ; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.8 q4[11], r2 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: vmov.u8 r0, q5[0] +; CHECK-NEXT: vmov.8 q4[14], r2 +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.u8 r0, q5[1] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov.u8 r2, q4[2] +; CHECK-NEXT: vmov.8 q0[2], r2 +; CHECK-NEXT: vmov.u8 r0, q5[3] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q5[4] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q4[5] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q5[6] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q5[7] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov.u8 r0, q4[8] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.u8 r0, q5[9] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.u8 r0, q5[10] +; CHECK-NEXT: vmov.8 q0[10], r0 +; CHECK-NEXT: vmov.u8 r0, q4[11] +; CHECK-NEXT: vmov.8 q0[11], r0 +; CHECK-NEXT: vmov.u8 r0, q5[12] +; CHECK-NEXT: vmov.8 q0[12], r0 +; CHECK-NEXT: vmov.u8 r0, q5[13] +; CHECK-NEXT: vmov.8 q0[13], r0 +; CHECK-NEXT: vmov.u8 r0, q4[14] +; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.u8 r0, q5[15] +; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.u8 r0, q2[5] +; CHECK-NEXT: vmov.8 q5[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.8 q5[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[6] +; CHECK-NEXT: vmov.8 q5[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.8 q5[4], r0 +; CHECK-NEXT: vmov.u8 r0, q2[7] +; CHECK-NEXT: vmov.8 q5[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.8 q5[7], r0 +; CHECK-NEXT: vmov.u8 r0, q2[8] +; CHECK-NEXT: vmov.8 q5[9], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.8 q5[10], r0 +; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.8 q5[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.8 q5[13], r0 +; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.8 q5[15], r0 +; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vmov.u8 r0, q5[0] ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q5[1] ; CHECK-NEXT: vmov.8 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q6[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] +; CHECK-NEXT: vmov.u8 r0, q3[7] ; CHECK-NEXT: vmov.8 q6[5], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.u8 r0, q3[8] ; CHECK-NEXT: vmov.8 q6[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] +; CHECK-NEXT: vmov.u8 r0, q3[9] ; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q6[14], r0 +; CHECK-NEXT: vmov.f32 s24, s13 +; CHECK-NEXT: vmov.f32 s27, s14 ; CHECK-NEXT: vmov.u8 r0, q6[2] ; CHECK-NEXT: vmov.8 q4[2], r0 ; CHECK-NEXT: vmov.u8 r0, q5[3] @@ -1068,72 +998,72 @@ define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) { ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u8 r0, q5[15] ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.u8 r0, q1[10] ; CHECK-NEXT: vmov.8 q5[0], r0 ; CHECK-NEXT: vmov.u8 r0, q3[11] ; CHECK-NEXT: vmov.8 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.u8 r0, q1[11] ; CHECK-NEXT: vmov.8 q5[3], r0 ; CHECK-NEXT: vmov.u8 r0, q3[12] ; CHECK-NEXT: vmov.8 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.u8 r0, q1[12] ; CHECK-NEXT: vmov.8 q5[6], r0 ; CHECK-NEXT: vmov.u8 r0, q3[13] ; CHECK-NEXT: vmov.8 q5[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.u8 r0, q1[13] ; CHECK-NEXT: vmov.8 q5[9], r0 ; CHECK-NEXT: vmov.u8 r0, q3[14] ; CHECK-NEXT: vmov.8 q5[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q3[15] ; CHECK-NEXT: vmov.8 q5[13], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vmov.8 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q5[1] -; CHECK-NEXT: vmov.8 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q3[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q3[11], r0 ; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.8 q5[15], r0 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.u8 r0, q5[0] +; CHECK-NEXT: vmov.8 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q5[1] +; CHECK-NEXT: vmov.8 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.8 q3[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.8 q3[5], r0 +; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.8 q3[8], r0 +; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.8 q3[11], r0 +; CHECK-NEXT: vmov.u8 r0, q2[15] ; CHECK-NEXT: vmov.8 q3[14], r0 ; CHECK-NEXT: vmov.u8 r0, q3[2] -; CHECK-NEXT: vmov.8 q2[2], r0 +; CHECK-NEXT: vmov.8 q1[2], r0 ; CHECK-NEXT: vmov.u8 r0, q5[3] -; CHECK-NEXT: vmov.8 q2[3], r0 +; CHECK-NEXT: vmov.8 q1[3], r0 ; CHECK-NEXT: vmov.u8 r0, q5[4] -; CHECK-NEXT: vmov.8 q2[4], r0 +; CHECK-NEXT: vmov.8 q1[4], r0 ; CHECK-NEXT: vmov.u8 r0, q3[5] -; CHECK-NEXT: vmov.8 q2[5], r0 +; CHECK-NEXT: vmov.8 q1[5], r0 ; CHECK-NEXT: vmov.u8 r0, q5[6] -; CHECK-NEXT: vmov.8 q2[6], r0 +; CHECK-NEXT: vmov.8 q1[6], r0 ; CHECK-NEXT: vmov.u8 r0, q5[7] -; CHECK-NEXT: vmov.8 q2[7], r0 +; CHECK-NEXT: vmov.8 q1[7], r0 ; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vmov.8 q2[8], r0 +; CHECK-NEXT: vmov.8 q1[8], r0 ; CHECK-NEXT: vmov.u8 r0, q5[9] -; CHECK-NEXT: vmov.8 q2[9], r0 +; CHECK-NEXT: vmov.8 q1[9], r0 ; CHECK-NEXT: vmov.u8 r0, q5[10] -; CHECK-NEXT: vmov.8 q2[10], r0 +; CHECK-NEXT: vmov.8 q1[10], r0 ; CHECK-NEXT: vmov.u8 r0, q3[11] -; CHECK-NEXT: vmov.8 q2[11], r0 +; CHECK-NEXT: vmov.8 q1[11], r0 ; CHECK-NEXT: vmov.u8 r0, q5[12] -; CHECK-NEXT: vmov.8 q2[12], r0 +; CHECK-NEXT: vmov.8 q1[12], r0 ; CHECK-NEXT: vmov.u8 r0, q5[13] -; CHECK-NEXT: vmov.8 q2[13], r0 +; CHECK-NEXT: vmov.8 q1[13], r0 ; CHECK-NEXT: vmov.u8 r0, q3[14] -; CHECK-NEXT: vmov.8 q2[14], r0 +; CHECK-NEXT: vmov.8 q1[14], r0 ; CHECK-NEXT: vmov.u8 r0, q5[15] -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] +; CHECK-NEXT: vmov.8 q1[15], r0 +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: @@ -1530,22 +1460,18 @@ entry: define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) { ; CHECK-LABEL: vst3_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldm.w r0, {r2, r3, r12} -; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vldmia r0, {s4, s5} +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: ldr r0, [r0, #8] ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.32 q2[0], r12 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s4, s5 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmovx.f16 s4, s8 @@ -1574,62 +1500,51 @@ entry: define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK-LABEL: vst3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.16 q0[3], r4 -; CHECK-NEXT: vmov.32 q1[0], r12 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.32 q1[1], lr -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q3[0], r3 -; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov.16 q2[1], r2 ; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s2 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vmovx.f16 s8, s1 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s4 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s8 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[1], r4 -; CHECK-NEXT: vmovx.f16 s4, s9 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: strd r2, r0, [r1, #16] -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x half>* %s1, align 4 @@ -1647,76 +1562,100 @@ entry: define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) { ; CHECK-LABEL: vst3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.16 q3[1], r3 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmovx.f16 s16, s11 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s3 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vrev32.16 q4, q0 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.32 r0, q5[0] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vdup.32 q4, r0 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmovx.f16 s12, s8 ; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmovx.f16 s16, s18 ; CHECK-NEXT: vmov.16 q3[4], r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s10 +; CHECK-NEXT: vmovx.f16 s16, s6 ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q3[6], r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmovx.f16 s16, s22 +; CHECK-NEXT: vmovx.f16 s24, s7 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmovx.f16 s24, s23 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov.32 r2, q2[3] +; CHECK-NEXT: vmov.f32 s18, s23 +; CHECK-NEXT: vdup.32 q7, r2 +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmov r2, s29 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmovx.f16 s28, s30 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmov.f32 s2, s14 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: vmov.16 q6[5], r0 +; CHECK-NEXT: vmovx.f16 s28, s9 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: vmov.16 q7[0], r0 +; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vmov.f32 s29, s21 +; CHECK-NEXT: vmov.f32 s30, s10 +; CHECK-NEXT: vmovx.f16 s4, s29 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vrev32.16 q2, q1 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov.16 q1[4], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s16, s8 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vpop {d8, d9, d10} +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.f32 s29, s5 +; CHECK-NEXT: vmov.f32 s30, s6 +; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -1737,145 +1676,203 @@ define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #144 +; CHECK-NEXT: sub sp, #144 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vrev32.16 q3, q2 -; CHECK-NEXT: vmov.16 q6[1], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov.32 r3, q0[0] +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q3[2], r3 +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmov.16 q3[5], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q1[1], r3 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.32 r3, q5[3] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: vldrw.u32 q5, [r0, #16] +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q2[2], r3 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov.16 q7[0], r3 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vmov.32 r0, q3[0] +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov.16 q7[6], r2 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vmov.16 q7[7], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.f32 s29, s20 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov.32 r2, q5[3] +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q6[5], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov.f32 s9, s25 +; CHECK-NEXT: vmov.f32 s17, s13 +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vrev32.16 q0, q1 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.f32 s10, s26 +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [r1, #80] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.16 q5[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s22, s14 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vrev32.16 q3, q0 ; CHECK-NEXT: vmov r2, s13 ; CHECK-NEXT: vmovx.f16 s12, s14 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vmov.16 q6[6], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vstrw.32 q6, [r1, #16] -; CHECK-NEXT: vmovx.f16 s24, s10 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s6 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmovx.f16 s28, s3 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s11 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s7 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s20 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vstrw.32 q6, [r1, #32] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s16 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmovx.f16 s28, s12 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vstrw.32 q6, [r1, #48] -; CHECK-NEXT: vmovx.f16 s24, s21 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vrev32.16 q7, q4 -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmovx.f16 s8, s30 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s22 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s18 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s14 -; CHECK-NEXT: vstrw.32 q6, [r1, #64] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmovx.f16 s8, s23 -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s19 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s16 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vldrw.u32 q3, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.f32 s13, s25 +; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s5 ; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vmov.f32 s26, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s22, s2 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s29, s5 +; CHECK-NEXT: vstrw.32 q6, [r1, #32] +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vstrw.32 q5, [r1, #16] +; CHECK-NEXT: vmov.f32 s30, s6 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vstrw.32 q7, [r1, #48] +; CHECK-NEXT: vstrw.32 q4, [r1, #64] +; CHECK-NEXT: add sp, #144 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index 4ac204ac9d80..177046a747b7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -858,38 +858,31 @@ entry: define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vst4_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd r3, r12, [r0] +; CHECK-NEXT: vldmia r0, {s4, s5} +; CHECK-NEXT: vmov r2, s5 ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.32 q2[0], r12 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov lr, s0 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q0[0], lr -; CHECK-NEXT: vmov.16 q0[1], r4 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vdup.32 q2, r0 +; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[3], lr -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.16 q1[6], r4 -; CHECK-NEXT: vmov.16 q1[7], r4 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s8 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s9 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 %l1 = load <2 x half>, <2 x half>* %s1, align 4 @@ -909,73 +902,61 @@ entry: define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst4_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: ldrd r2, r3, [r0, #16] -; CHECK-NEXT: ldm.w r0, {r4, r5, r6} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: ldrd r2, r0, [r0, #16] ; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: vmov.32 q2[0], r4 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s5 +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov lr, s4 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.32 q2[1], r5 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov.16 q1[0], r4 -; CHECK-NEXT: vmov.16 q1[1], r5 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vmov.16 q0[0], lr -; CHECK-NEXT: vmov.16 q1[2], r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov.16 q0[1], r12 -; CHECK-NEXT: vmov.32 q2[0], r6 -; CHECK-NEXT: vmov.16 q1[3], r4 -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q3[0], r5 -; CHECK-NEXT: vmovx.f16 s16, s9 -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[3], r3 -; CHECK-NEXT: vmov.16 q3[4], r4 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q3[1], r2 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q3[3], lr -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.16 q3[7], r12 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x half>* %s1, align 4