From 4ea1b43527c9a845942dbbc27e022d74d72728e6 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 6 Jun 2022 11:39:51 +0100 Subject: [PATCH] [AArch64] Generate ADDP from shuffled add This adds a fold of add(x, shuffle(x, <1,0,3,2,5,4,...>)) into shuffle(addp(x), <0,0,1,1,2,2,...>). The ADDP instruction takes two vectors and returns one, adding adjacent pairs, so we match x in a custom lowering while it is still a 256-bit vector such as v8i32 and use its two halves as the ADDP operands. The original code would be 2 rev64 and 2 add, with the new code being a single addp followed by a zip1;zip2 shuffle, producing smaller code. Differential Revision: https://reviews.llvm.org/D126686 --- .../Target/AArch64/AArch64ISelLowering.cpp | 51 +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 +- llvm/test/CodeGen/AArch64/arm64-addp.ll | 42 +-- llvm/test/CodeGen/AArch64/insert-extend.ll | 181 +++++----- llvm/test/CodeGen/AArch64/reduce-shuffle.ll | 330 +++++++++--------- 6 files changed, 321 insertions(+), 291 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 11e8e4adc6f0..f0a8b742d471 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1067,6 +1067,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + + // ADDP custom lowering + for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) + setOperationAction(ISD::ADD, VT, Custom); } if (Subtarget->hasSVE()) { @@ -2233,6 +2237,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::INDEX_VECTOR) + MAKE_CASE(AArch64ISD::ADDP) 
MAKE_CASE(AArch64ISD::SADDLP) MAKE_CASE(AArch64ISD::UADDLP) MAKE_CASE(AArch64ISD::CALL_RVMARKER)
@@ -19294,6 +19299,68 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
 }
 
+/// Try to replace a 256-bit vector add of the form
+///   add(X, shuffle(X, undef, <1,0,3,2,5,4,...>))
+/// with
+///   shuffle(concat(ADDP(lo(X), hi(X)), undef), undef, <0,0,1,1,2,2,...>).
+///
+/// The input shuffle swaps each adjacent pair of lanes, so both lanes of a
+/// pair in the add result hold that pair's sum. ADDP over the two halves of X
+/// produces each pair sum once, and the output shuffle copies lane I of it
+/// into lanes 2*I and 2*I+1.
+///
+/// \param N       The ISD::ADD node; the shuffle may be either operand.
+/// \param Results Receives the replacement value when the pattern matches and
+///                is left untouched otherwise.
+/// \param DAG     The DAG in which the new nodes are created.
+static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector())
+    return;
+
+  // The add is commutative: take the shuffle from whichever operand has one
+  // and treat the other operand as X.
+  SDValue X = N->getOperand(0);
+  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
+  if (!Shuf) {
+    Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
+    X = N->getOperand(1);
+    if (!Shuf)
+      return;
+  }
+
+  // The shuffle has to permute X itself, with an undef second input.
+  if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
+    return;
+
+  // Check the mask is 1,0,3,2,5,4,...
+  ArrayRef<int> Mask = Shuf->getMask();
+  for (int I = 0, E = Mask.size(); I < E; I++)
+    if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
+      return;
+
+  // Pairwise-add the low and high halves of X into one half-width vector.
+  SDLoc DL(N);
+  auto LoHi = DAG.SplitVector(X, DL);
+  assert(LoHi.first.getValueType() == LoHi.second.getValueType());
+  SDValue Addp = DAG.getNode(AArch64ISD::ADDP, DL, LoHi.first.getValueType(),
+                             LoHi.first, LoHi.second);
+
+  // Shuffle the elements back into order: widen the ADDP result to VT with an
+  // undef upper half, then duplicate each lane with the mask 0,0,1,1,2,2,...
+  SmallVector<int> NMask;
+  for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
+    NMask.push_back(I);
+    NMask.push_back(I);
+  }
+  Results.push_back(
+      DAG.getVectorShuffle(VT, DL,
+                           DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
+                                       DAG.getUNDEF(LoHi.first.getValueType())),
+                           DAG.getUNDEF(VT), NMask));
+}
+
 static void ReplaceReductionResults(SDNode *N,
                                     SmallVectorImpl<SDValue> &Results,
                                     SelectionDAG &DAG, unsigned InterOp,
@@ -19471,6 +19538,9 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::VECREDUCE_UMIN: Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; + case ISD::ADD: + ReplaceAddWithADDP(N, Results, DAG); + return; case ISD::CTPOP: if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f3f11bb43e1f..b00cf9548360 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -232,6 +232,8 @@ enum NodeType : unsigned { SADDV, UADDV, + // Add Pairwise of two vectors + ADDP, // Add Long Pairwise SADDLP, UADDLP, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3d42ac84a626..6edac048c853 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -694,8 +694,12 @@ def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), [(abds node:$lhs, node:$rhs), (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; +def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>; def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>; def AArch64saddlp_n : SDNode<"AArch64ISD::SADDLP", SDT_AArch64uaddlp>; +def AArch64addp : PatFrags<(ops node:$Rn, node:$Rm), + [(AArch64addp_n node:$Rn, node:$Rm), + (int_aarch64_neon_addp node:$Rn, node:$Rm)]>; def AArch64uaddlp : PatFrags<(ops node:$src), [(AArch64uaddlp_n node:$src), (int_aarch64_neon_uaddlp node:$src)]>; @@ -4506,7 +4510,7 @@ 
def : Pat<(v8i16 (concat_vectors //===----------------------------------------------------------------------===// defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>; -defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>; +defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", AArch64addp>; defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>; defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>; defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; diff --git a/llvm/test/CodeGen/AArch64/arm64-addp.ll b/llvm/test/CodeGen/AArch64/arm64-addp.ll index cd7794ff0ccc..47ffba9a509a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addp.ll @@ -52,10 +52,9 @@ entry: define <4 x i64> @addp_v4i64(<4 x i64> %a) { ; CHECK-LABEL: addp_v4i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext.16b v2, v1, v1, #8 -; CHECK-NEXT: ext.16b v3, v0, v0, #8 -; CHECK-NEXT: add.2d v0, v3, v0 -; CHECK-NEXT: add.2d v1, v2, v1 +; CHECK-NEXT: addp.2d v1, v0, v1 +; CHECK-NEXT: dup.2d v0, v1[0] +; CHECK-NEXT: dup.2d v1, v1[1] ; CHECK-NEXT: ret entry: %s = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> @@ -78,10 +77,9 @@ entry: define <8 x i32> @addp_v8i32(<8 x i32> %a) { ; CHECK-LABEL: addp_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64.4s v2, v1 -; CHECK-NEXT: rev64.4s v3, v0 -; CHECK-NEXT: add.4s v0, v3, v0 -; CHECK-NEXT: add.4s v1, v2, v1 +; CHECK-NEXT: addp.4s v1, v0, v1 +; CHECK-NEXT: zip1.4s v0, v1, v1 +; CHECK-NEXT: zip2.4s v1, v1, v1 ; CHECK-NEXT: ret entry: %s = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> @@ -92,14 +90,12 @@ entry: define <16 x i32> @addp_v16i32(<16 x i32> %a) { ; CHECK-LABEL: addp_v16i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64.4s v4, v3 -; CHECK-NEXT: rev64.4s v5, v2 -; CHECK-NEXT: rev64.4s v6, v1 -; CHECK-NEXT: rev64.4s v7, v0 -; CHECK-NEXT: add.4s v0, v7, v0 -; CHECK-NEXT: add.4s v1, v6, v1 -; CHECK-NEXT: add.4s v2, v5, v2 -; 
CHECK-NEXT: add.4s v3, v4, v3 +; CHECK-NEXT: addp.4s v1, v0, v1 +; CHECK-NEXT: zip1.4s v0, v1, v1 +; CHECK-NEXT: zip2.4s v1, v1, v1 +; CHECK-NEXT: addp.4s v3, v2, v3 +; CHECK-NEXT: zip1.4s v2, v3, v3 +; CHECK-NEXT: zip2.4s v3, v3, v3 ; CHECK-NEXT: ret entry: %s = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> @@ -122,10 +118,9 @@ entry: define <16 x i16> @addp_v16i16(<16 x i16> %a) { ; CHECK-LABEL: addp_v16i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev32.8h v2, v1 -; CHECK-NEXT: rev32.8h v3, v0 -; CHECK-NEXT: add.8h v0, v3, v0 -; CHECK-NEXT: add.8h v1, v2, v1 +; CHECK-NEXT: addp.8h v1, v0, v1 +; CHECK-NEXT: zip1.8h v0, v1, v1 +; CHECK-NEXT: zip2.8h v1, v1, v1 ; CHECK-NEXT: ret entry: %s = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> @@ -148,10 +143,9 @@ entry: define <32 x i8> @addp_v32i8(<32 x i8> %a) { ; CHECK-LABEL: addp_v32i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev16.16b v2, v1 -; CHECK-NEXT: rev16.16b v3, v0 -; CHECK-NEXT: add.16b v0, v3, v0 -; CHECK-NEXT: add.16b v1, v2, v1 +; CHECK-NEXT: addp.16b v1, v0, v1 +; CHECK-NEXT: zip1.16b v0, v1, v1 +; CHECK-NEXT: zip2.16b v1, v1, v1 ; CHECK-NEXT: ret entry: %s = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll index b0fb04f1fb78..9bda047b8784 100644 --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -91,111 +91,100 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca ; CHECK-NEXT: add v1.4s, v5.4s, v1.4s ; CHECK-NEXT: add v2.4s, v4.4s, v2.4s ; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: rev64 v6.4s, v2.4s +; CHECK-NEXT: rev64 v17.4s, v1.4s ; CHECK-NEXT: add v3.4s, v3.4s, v16.4s -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: add v18.4s, v1.4s, v6.4s -; CHECK-NEXT: add v19.4s, 
v2.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s -; CHECK-NEXT: add v16.4s, v0.4s, v4.4s -; CHECK-NEXT: zip1 v7.4s, v2.4s, v1.4s -; CHECK-NEXT: add v17.4s, v3.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s -; CHECK-NEXT: uzp2 v6.4s, v17.4s, v16.4s -; CHECK-NEXT: zip2 v5.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v20.16b, v17.16b, v17.16b, #12 -; CHECK-NEXT: mov v0.s[1], v3.s[0] -; CHECK-NEXT: ext v3.16b, v2.16b, v7.16b, #8 -; CHECK-NEXT: mov v2.s[3], v1.s[2] -; CHECK-NEXT: zip1 v4.4s, v19.4s, v18.4s -; CHECK-NEXT: trn2 v21.4s, v17.4s, v16.4s -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v17.4s -; CHECK-NEXT: mov v17.s[0], v16.s[1] -; CHECK-NEXT: zip2 v7.4s, v19.4s, v18.4s -; CHECK-NEXT: mov v0.d[1], v3.d[1] -; CHECK-NEXT: ext v1.16b, v16.16b, v20.16b, #12 -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: mov v17.d[1], v4.d[1] -; CHECK-NEXT: mov v6.d[1], v7.d[1] -; CHECK-NEXT: mov v1.d[1], v7.d[1] -; CHECK-NEXT: add v3.4s, v5.4s, v0.4s -; CHECK-NEXT: mov v21.d[1], v4.d[1] +; CHECK-NEXT: rev64 v5.4s, v0.4s ; CHECK-NEXT: rev64 v4.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v17.4s, v1.4s +; CHECK-NEXT: addp v16.4s, v2.4s, v1.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s +; CHECK-NEXT: addp v7.4s, v3.4s, v0.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v2.4s, v6.4s, v21.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: add v7.4s, v3.4s, v4.4s +; CHECK-NEXT: zip1 v5.4s, v2.4s, v1.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: add v17.4s, v1.4s, v6.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: add v19.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v7.16b, v3.16b, #4 -; CHECK-NEXT: add v18.4s, v2.4s, v5.4s -; CHECK-NEXT: ext v6.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: ext v5.16b, v19.16b, v0.16b, #4 -; 
CHECK-NEXT: rev64 v16.4s, v16.4s -; CHECK-NEXT: rev64 v6.4s, v6.4s -; CHECK-NEXT: ext v20.16b, v18.16b, v18.16b, #4 -; CHECK-NEXT: rev64 v5.4s, v5.4s -; CHECK-NEXT: mov v7.s[3], v3.s[3] -; CHECK-NEXT: ext v4.16b, v3.16b, v16.16b, #12 -; CHECK-NEXT: mov v19.s[3], v0.s[3] -; CHECK-NEXT: mov v17.s[3], v1.s[3] -; CHECK-NEXT: ext v6.16b, v1.16b, v6.16b, #12 +; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8 +; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s +; CHECK-NEXT: mov v0.s[1], v3.s[0] +; CHECK-NEXT: ext v3.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: mov v2.s[3], v1.s[2] +; CHECK-NEXT: uzp2 v19.4s, v7.4s, v16.4s +; CHECK-NEXT: uzp1 v6.4s, v7.4s, v16.4s +; CHECK-NEXT: uzp1 v7.4s, v18.4s, v16.4s +; CHECK-NEXT: uzp2 v1.4s, v18.4s, v16.4s +; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: mov v4.d[1], v2.d[1] +; CHECK-NEXT: add v5.4s, v19.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v7.4s, v1.4s +; CHECK-NEXT: rev64 v2.4s, v5.4s +; CHECK-NEXT: sub v6.4s, v0.4s, v4.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-NEXT: rev64 v3.4s, v1.4s +; CHECK-NEXT: rev64 v4.4s, v6.4s +; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: addp v16.4s, v1.4s, v6.4s +; CHECK-NEXT: addp v17.4s, v5.4s, v0.4s +; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s +; CHECK-NEXT: ext v3.16b, v1.16b, v16.16b, #8 +; CHECK-NEXT: ext v5.16b, v17.16b, v0.16b, #4 +; CHECK-NEXT: ext v6.16b, v16.16b, v4.16b, #4 +; CHECK-NEXT: zip1 v7.4s, v17.4s, v17.4s +; CHECK-NEXT: ext v18.16b, v3.16b, v1.16b, #4 +; CHECK-NEXT: zip2 v5.4s, v5.4s, v17.4s +; CHECK-NEXT: zip2 v6.4s, v6.4s, v16.4s +; CHECK-NEXT: trn2 v7.4s, v7.4s, v2.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #4 +; CHECK-NEXT: mov v1.s[2], v16.s[1] ; CHECK-NEXT: ext v5.16b, v0.16b, v5.16b, #12 -; CHECK-NEXT: rev64 v18.4s, v18.4s -; CHECK-NEXT: trn2 v20.4s, v2.4s, v20.4s -; CHECK-NEXT: sub v16.4s, v7.4s, v4.4s -; CHECK-NEXT: sub v21.4s, v17.4s, v6.4s -; CHECK-NEXT: sub v22.4s, 
v19.4s, v5.4s -; CHECK-NEXT: trn2 v2.4s, v18.4s, v2.4s -; CHECK-NEXT: mov v17.s[0], v1.s[0] -; CHECK-NEXT: ext v1.16b, v20.16b, v20.16b, #4 -; CHECK-NEXT: mov v19.s[0], v0.s[0] -; CHECK-NEXT: mov v7.s[0], v3.s[0] -; CHECK-NEXT: add v0.4s, v17.4s, v6.4s -; CHECK-NEXT: add v3.4s, v2.4s, v1.4s -; CHECK-NEXT: add v5.4s, v19.4s, v5.4s -; CHECK-NEXT: add v4.4s, v7.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v5.d[1], v22.d[1] -; CHECK-NEXT: mov v0.d[1], v21.d[1] -; CHECK-NEXT: mov v3.d[1], v1.d[1] -; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: ext v6.16b, v4.16b, v6.16b, #12 +; CHECK-NEXT: uzp2 v3.4s, v3.4s, v18.4s +; CHECK-NEXT: mov v4.s[2], v16.s[3] +; CHECK-NEXT: mov v0.s[2], v17.s[3] +; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: sub v18.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v19.4s, v4.4s, v6.4s +; CHECK-NEXT: sub v20.4s, v0.4s, v5.4s +; CHECK-NEXT: sub v21.4s, v7.4s, v2.4s +; CHECK-NEXT: mov v4.s[1], v16.s[2] +; CHECK-NEXT: mov v0.s[1], v17.s[2] +; CHECK-NEXT: mov v2.s[0], v17.s[1] +; CHECK-NEXT: mov v1.s[1], v16.s[0] +; CHECK-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: add v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v2.d[1], v21.d[1] +; CHECK-NEXT: mov v1.d[1], v18.d[1] +; CHECK-NEXT: mov v4.d[1], v19.d[1] +; CHECK-NEXT: mov v0.d[1], v20.d[1] +; CHECK-NEXT: movi v3.8h, #1 ; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v2.4s, v0.4s, #15 -; CHECK-NEXT: ushr v6.4s, v4.4s, #15 -; CHECK-NEXT: ushr v7.4s, v3.4s, #15 -; CHECK-NEXT: ushr v16.4s, v5.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v1.16b -; CHECK-NEXT: and v16.16b, v16.16b, v1.16b -; CHECK-NEXT: and v7.16b, v7.16b, v1.16b -; CHECK-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-NEXT: mul v2.4s, v6.4s, v17.4s +; CHECK-NEXT: ushr v5.4s, v4.4s, #15 +; CHECK-NEXT: ushr v6.4s, v2.4s, #15 +; CHECK-NEXT: ushr v7.4s, v0.4s, #15 +; CHECK-NEXT: ushr v16.4s, v1.4s, #15 
+; CHECK-NEXT: and v6.16b, v6.16b, v3.16b +; CHECK-NEXT: and v16.16b, v16.16b, v3.16b +; CHECK-NEXT: and v7.16b, v7.16b, v3.16b +; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s ; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v1.4s, v1.4s, v17.4s +; CHECK-NEXT: mul v3.4s, v3.4s, v17.4s ; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v4.4s, v2.4s, v4.4s -; CHECK-NEXT: add v5.4s, v6.4s, v5.4s +; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v4.4s, v3.4s, v4.4s +; CHECK-NEXT: add v0.4s, v7.4s, v0.4s +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v6.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v3.4s, v7.4s, v3.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v7.16b -; CHECK-NEXT: eor v3.16b, v5.16b, v6.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index 0d514f05cc29..42f8817ede89 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -35,122 +35,120 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ushll v4.8h, v4.8b, #0 ; CHECK-NEXT: ld1 { v6.s }[1], [x2] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: usubl v7.4s, v3.4h, v5.4h -; CHECK-NEXT: usubl2 v3.4s, v3.8h, v5.8h -; CHECK-NEXT: usubl2 v5.4s, v2.8h, v4.8h +; CHECK-NEXT: usubl v17.4s, v3.4h, v5.4h +; CHECK-NEXT: usubl2 v5.4s, v3.8h, v5.8h +; CHECK-NEXT: usubl2 v3.4s, v2.8h, 
v4.8h ; CHECK-NEXT: usubl v2.4s, v2.4h, v4.4h ; CHECK-NEXT: ushll v4.8h, v6.8b, #0 -; CHECK-NEXT: shl v5.4s, v5.4s, #16 +; CHECK-NEXT: shl v3.4s, v3.4s, #16 ; CHECK-NEXT: usubl2 v6.4s, v0.8h, v4.8h ; CHECK-NEXT: shl v2.4s, v2.4s, #16 ; CHECK-NEXT: usubl v0.4s, v0.4h, v4.4h -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: shl v4.4s, v6.4s, #16 -; CHECK-NEXT: shl v0.4s, v0.4s, #16 -; CHECK-NEXT: add v2.4s, v2.4s, v16.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v7.4s -; CHECK-NEXT: uzp2 v6.4s, v2.4s, v1.4s -; CHECK-NEXT: ext v17.16b, v2.16b, v2.16b, #12 -; CHECK-NEXT: zip1 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: mov v16.16b, v2.16b -; CHECK-NEXT: mov v19.16b, v1.16b -; CHECK-NEXT: zip2 v5.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v18.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v16.s[0], v1.s[1] -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v2.4s -; CHECK-NEXT: zip2 v7.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v17.16b, #12 -; CHECK-NEXT: ext v17.16b, v0.16b, v4.16b, #8 -; CHECK-NEXT: mov v19.s[1], v2.s[0] -; CHECK-NEXT: mov v0.s[3], v3.s[2] -; CHECK-NEXT: mov v6.d[1], v7.d[1] -; CHECK-NEXT: mov v16.d[1], v4.d[1] -; CHECK-NEXT: mov v19.d[1], v17.d[1] -; CHECK-NEXT: mov v18.d[1], v0.d[1] -; CHECK-NEXT: mov v1.d[1], v7.d[1] -; CHECK-NEXT: mov v5.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v16.4s, v19.4s -; CHECK-NEXT: add v4.4s, v6.4s, v18.4s -; CHECK-NEXT: rev64 v3.4s, v0.4s -; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s -; CHECK-NEXT: rev64 v5.4s, v4.4s -; CHECK-NEXT: sub v2.4s, v19.4s, v16.4s -; CHECK-NEXT: mov v3.d[1], v0.d[1] -; CHECK-NEXT: add v6.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v5.d[1], v4.d[1] -; CHECK-NEXT: rev64 v2.4s, v1.4s -; CHECK-NEXT: rev64 v7.4s, v6.4s +; CHECK-NEXT: add v19.4s, v3.4s, v1.4s +; CHECK-NEXT: shl v6.4s, v6.4s, #16 +; CHECK-NEXT: shl v4.4s, v0.4s, #16 +; CHECK-NEXT: add v1.4s, v2.4s, v16.4s +; CHECK-NEXT: add v7.4s, v6.4s, v5.4s +; CHECK-NEXT: add v18.4s, v4.4s, v17.4s +; CHECK-NEXT: ext v20.16b, 
v1.16b, v1.16b, #12 +; CHECK-NEXT: zip1 v17.4s, v18.4s, v7.4s +; CHECK-NEXT: uzp2 v16.4s, v1.4s, v19.4s +; CHECK-NEXT: mov v2.16b, v1.16b +; CHECK-NEXT: mov v6.16b, v19.16b +; CHECK-NEXT: mov v2.s[0], v19.s[1] +; CHECK-NEXT: ext v5.16b, v19.16b, v20.16b, #12 +; CHECK-NEXT: ext v20.16b, v18.16b, v17.16b, #8 +; CHECK-NEXT: mov v6.s[1], v1.s[0] +; CHECK-NEXT: zip2 v3.4s, v1.4s, v19.4s +; CHECK-NEXT: uzp2 v4.4s, v16.4s, v1.4s +; CHECK-NEXT: zip2 v16.4s, v18.4s, v7.4s +; CHECK-NEXT: mov v18.s[3], v7.s[2] +; CHECK-NEXT: mov v2.d[1], v17.d[1] +; CHECK-NEXT: mov v6.d[1], v20.d[1] +; CHECK-NEXT: zip2 v0.4s, v19.4s, v1.4s +; CHECK-NEXT: mov v4.d[1], v16.d[1] +; CHECK-NEXT: mov v3.d[1], v18.d[1] +; CHECK-NEXT: add v1.4s, v2.4s, v6.4s +; CHECK-NEXT: mov v5.d[1], v16.d[1] +; CHECK-NEXT: mov v0.d[1], v18.d[1] ; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: rev64 v4.4s, v3.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v4.4s, v1.4s, v2.4s -; CHECK-NEXT: add v16.4s, v6.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v6.4s, v7.4s -; CHECK-NEXT: rev64 v6.4s, v3.4s -; CHECK-NEXT: rev64 v17.4s, v0.4s -; CHECK-NEXT: ext v7.16b, v4.16b, v1.16b, #4 -; CHECK-NEXT: ext v5.16b, v16.16b, v2.16b, #4 -; CHECK-NEXT: add v18.4s, v3.4s, v6.4s -; CHECK-NEXT: add v19.4s, v0.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v17.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s -; CHECK-NEXT: rev64 v6.4s, v7.4s -; CHECK-NEXT: rev64 v7.4s, v18.4s +; CHECK-NEXT: movi v19.8h, #1 +; CHECK-NEXT: mov v6.d[1], v1.d[1] +; CHECK-NEXT: add v5.4s, v0.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s +; CHECK-NEXT: mov v4.d[1], v3.d[1] +; CHECK-NEXT: rev64 v2.4s, v5.4s +; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s +; CHECK-NEXT: addp v4.4s, v3.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s +; CHECK-NEXT: sub v6.4s, v0.4s, v7.4s +; CHECK-NEXT: 
rev64 v7.4s, v3.4s +; CHECK-NEXT: rev64 v5.4s, v1.4s +; CHECK-NEXT: zip1 v16.4s, v4.4s, v4.4s +; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ext v17.16b, v4.16b, v2.16b, #4 +; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: ext v7.16b, v0.16b, v6.16b, #4 +; CHECK-NEXT: ext v18.16b, v3.16b, v4.16b, #4 +; CHECK-NEXT: trn2 v3.4s, v16.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v1.16b, v0.16b, #8 +; CHECK-NEXT: zip2 v5.4s, v17.4s, v4.4s +; CHECK-NEXT: zip2 v7.4s, v7.4s, v0.4s ; CHECK-NEXT: ext v17.16b, v18.16b, v18.16b, #4 -; CHECK-NEXT: ext v18.16b, v19.16b, v0.16b, #4 -; CHECK-NEXT: rev64 v5.4s, v5.4s -; CHECK-NEXT: mov v16.s[3], v2.s[3] -; CHECK-NEXT: mov v4.s[3], v1.s[3] -; CHECK-NEXT: rev64 v18.4s, v18.4s -; CHECK-NEXT: mov v19.s[3], v0.s[3] +; CHECK-NEXT: ext v18.16b, v16.16b, v1.16b, #4 ; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #12 -; CHECK-NEXT: ext v6.16b, v1.16b, v6.16b, #12 -; CHECK-NEXT: trn2 v7.4s, v7.4s, v3.4s -; CHECK-NEXT: trn2 v3.4s, v3.4s, v17.4s -; CHECK-NEXT: ext v18.16b, v0.16b, v18.16b, #12 -; CHECK-NEXT: sub v17.4s, v16.4s, v5.4s -; CHECK-NEXT: sub v20.4s, v4.4s, v6.4s -; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #4 -; CHECK-NEXT: mov v16.s[0], v2.s[0] -; CHECK-NEXT: sub v2.4s, v19.4s, v18.4s -; CHECK-NEXT: mov v4.s[0], v1.s[0] -; CHECK-NEXT: mov v19.s[0], v0.s[0] -; CHECK-NEXT: add v1.4s, v7.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v7.4s, v3.4s -; CHECK-NEXT: add v3.4s, v4.4s, v6.4s -; CHECK-NEXT: add v4.4s, v16.4s, v5.4s -; CHECK-NEXT: add v5.4s, v19.4s, v18.4s -; CHECK-NEXT: mov v4.d[1], v17.d[1] +; CHECK-NEXT: mov v2.s[2], v4.s[3] +; CHECK-NEXT: ext v7.16b, v6.16b, v7.16b, #12 +; CHECK-NEXT: mov v1.s[2], v0.s[1] +; CHECK-NEXT: mov v6.s[2], v0.s[3] +; CHECK-NEXT: uzp2 v16.4s, v16.4s, v18.4s +; CHECK-NEXT: sub v18.4s, v2.4s, v5.4s +; CHECK-NEXT: mov v2.s[1], v4.s[2] +; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s +; CHECK-NEXT: mov v17.s[0], v4.s[1] +; CHECK-NEXT: sub v21.4s, v6.4s, v7.4s +; CHECK-NEXT: sub v4.4s, 
v1.4s, v16.4s +; CHECK-NEXT: mov v6.s[1], v0.s[2] +; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: add v0.4s, v2.4s, v5.4s +; CHECK-NEXT: add v3.4s, v3.4s, v17.4s +; CHECK-NEXT: mov v0.d[1], v18.d[1] +; CHECK-NEXT: add v2.4s, v6.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v16.4s ; CHECK-NEXT: mov v3.d[1], v20.d[1] -; CHECK-NEXT: mov v1.d[1], v0.d[1] -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: movi v0.8h, #1 -; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v2.4s, v1.4s, #15 -; CHECK-NEXT: ushr v6.4s, v4.4s, #15 -; CHECK-NEXT: ushr v7.4s, v5.4s, #15 -; CHECK-NEXT: ushr v16.4s, v3.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v0.16b -; CHECK-NEXT: and v16.16b, v16.16b, v0.16b -; CHECK-NEXT: and v7.16b, v7.16b, v0.16b -; CHECK-NEXT: and v0.16b, v2.16b, v0.16b -; CHECK-NEXT: mul v2.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v4.4s, v2.4s, v4.4s -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: add v5.4s, v7.4s, v5.4s -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: eor v1.16b, v5.16b, v7.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v2.d[1], v21.d[1] +; CHECK-NEXT: mov v1.d[1], v4.d[1] +; CHECK-NEXT: ushr v5.4s, v0.4s, #15 +; CHECK-NEXT: ushr v7.4s, v3.4s, #15 +; CHECK-NEXT: and v4.16b, v5.16b, v19.16b +; CHECK-NEXT: ushr v6.4s, v2.4s, #15 +; CHECK-NEXT: movi v5.2d, #0x00ffff0000ffff +; CHECK-NEXT: ushr v16.4s, v1.4s, #15 +; CHECK-NEXT: and v6.16b, v6.16b, v19.16b +; CHECK-NEXT: and v16.16b, v16.16b, v19.16b +; CHECK-NEXT: and v7.16b, v7.16b, v19.16b +; CHECK-NEXT: mul v6.4s, v6.4s, v5.4s +; CHECK-NEXT: mul v4.4s, v4.4s, v5.4s +; CHECK-NEXT: mul v7.4s, v7.4s, v5.4s +; CHECK-NEXT: mul v5.4s, v16.4s, v5.4s +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; 
CHECK-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b +; CHECK-NEXT: eor v1.16b, v1.16b, v5.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v4.16b +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 @@ -541,76 +539,68 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: add v3.4s, v3.4s, v16.4s ; CHECK-NEXT: add v1.4s, v5.4s, v1.4s ; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: rev64 v7.4s, v2.4s -; CHECK-NEXT: add v16.4s, v0.4s, v4.4s -; CHECK-NEXT: add v17.4s, v3.4s, v5.4s -; CHECK-NEXT: add v18.4s, v1.4s, v6.4s -; CHECK-NEXT: add v19.4s, v2.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s -; CHECK-NEXT: ext v4.16b, v17.16b, v17.16b, #12 -; CHECK-NEXT: zip1 v5.4s, v18.4s, v19.4s -; CHECK-NEXT: ext v7.16b, v2.16b, v1.16b, #4 -; CHECK-NEXT: mov v19.s[2], v18.s[3] -; CHECK-NEXT: uzp2 v18.4s, v17.4s, v16.4s +; CHECK-NEXT: rev64 v4.4s, v3.4s +; CHECK-NEXT: rev64 v5.4s, v0.4s +; CHECK-NEXT: rev64 v6.4s, v2.4s +; CHECK-NEXT: rev64 v17.4s, v1.4s +; CHECK-NEXT: addp v7.4s, v3.4s, v0.4s +; CHECK-NEXT: addp v16.4s, v2.4s, v1.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s ; CHECK-NEXT: zip2 v6.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v4.16b, v16.16b, v4.16b, #12 -; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: trn2 v3.4s, v17.4s, v16.4s -; CHECK-NEXT: uzp2 v18.4s, v18.4s, v17.4s +; CHECK-NEXT: ext v17.16b, v2.16b, 
v1.16b, #4 ; CHECK-NEXT: mov v1.s[3], v2.s[2] -; CHECK-NEXT: ext v2.16b, v7.16b, v2.16b, #4 -; CHECK-NEXT: mov v17.s[0], v16.s[1] -; CHECK-NEXT: mov v4.d[1], v19.d[1] +; CHECK-NEXT: uzp2 v5.4s, v7.4s, v16.4s +; CHECK-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; CHECK-NEXT: uzp1 v16.4s, v7.4s, v16.4s +; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s ; CHECK-NEXT: mov v6.d[1], v1.d[1] -; CHECK-NEXT: mov v0.d[1], v2.d[1] -; CHECK-NEXT: mov v17.d[1], v5.d[1] -; CHECK-NEXT: mov v3.d[1], v5.d[1] -; CHECK-NEXT: mov v18.d[1], v19.d[1] -; CHECK-NEXT: add v1.4s, v6.4s, v0.4s -; CHECK-NEXT: add v2.4s, v4.4s, v17.4s +; CHECK-NEXT: ext v1.16b, v17.16b, v2.16b, #4 +; CHECK-NEXT: rev64 v3.4s, v5.4s +; CHECK-NEXT: uzp1 v2.4s, v7.4s, v4.4s +; CHECK-NEXT: rev64 v5.4s, v16.4s +; CHECK-NEXT: uzp2 v4.4s, v7.4s, v4.4s +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: add v1.4s, v3.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v0.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v18.4s -; CHECK-NEXT: zip1 v4.4s, v2.4s, v1.4s -; CHECK-NEXT: zip1 v5.4s, v3.4s, v0.4s -; CHECK-NEXT: uzp2 v6.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: trn2 v4.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v3.16b, v5.16b, #8 -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v2.4s -; CHECK-NEXT: mov v7.s[1], v1.s[1] -; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v2.4s, v3.4s, v0.4s -; CHECK-NEXT: mov v3.s[3], v0.s[2] +; CHECK-NEXT: zip1 v4.4s, v1.4s, v3.4s +; CHECK-NEXT: uzp2 v5.4s, v1.4s, v3.4s +; CHECK-NEXT: zip1 v6.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s +; CHECK-NEXT: trn2 v4.4s, v1.4s, v4.4s +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v1.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v6.16b, #8 +; CHECK-NEXT: zip2 v17.4s, v2.4s, v0.4s +; CHECK-NEXT: mov v1.s[1], v3.s[1] +; CHECK-NEXT: mov v2.s[3], v0.s[2] ; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v7.d[1], v5.d[1] -; CHECK-NEXT: mov v6.d[1], v2.d[1] -; CHECK-NEXT: mov v1.d[1], 
v3.d[1] -; CHECK-NEXT: add v0.4s, v4.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v7.4s, v4.4s -; CHECK-NEXT: add v4.4s, v6.4s, v1.4s -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: ext v7.16b, v4.16b, v4.16b, #4 -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s -; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #8 -; CHECK-NEXT: ext v6.16b, v7.16b, v1.16b, #8 -; CHECK-NEXT: zip1 v16.4s, v4.4s, v1.4s -; CHECK-NEXT: zip2 v17.4s, v4.4s, v1.4s -; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #4 -; CHECK-NEXT: zip2 v5.4s, v3.4s, v0.4s -; CHECK-NEXT: zip2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ext v4.16b, v6.16b, v7.16b, #4 -; CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v17.4s, v6.4s -; CHECK-NEXT: sub v0.4s, v16.4s, v0.4s +; CHECK-NEXT: mov v5.d[1], v17.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] +; CHECK-NEXT: mov v7.d[1], v2.d[1] +; CHECK-NEXT: add v0.4s, v4.4s, v1.4s +; CHECK-NEXT: add v2.4s, v5.4s, v7.4s +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s +; CHECK-NEXT: ext v6.16b, v3.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v7.4s, v2.4s, v5.4s +; CHECK-NEXT: ext v17.16b, v4.16b, v5.16b, #8 +; CHECK-NEXT: zip2 v16.4s, v2.4s, v5.4s +; CHECK-NEXT: ext v3.16b, v6.16b, v3.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s +; CHECK-NEXT: zip2 v2.4s, v5.4s, v2.4s +; CHECK-NEXT: ext v4.16b, v17.16b, v4.16b, #4 +; CHECK-NEXT: zip2 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v1.4s, v6.4s, v2.4s +; CHECK-NEXT: add v2.4s, v3.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v16.4s, v5.4s +; CHECK-NEXT: sub v0.4s, v7.4s, v0.4s ; CHECK-NEXT: movi v4.8h, #1 ; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff ; CHECK-NEXT: ushr v5.4s, v0.4s, #15