From 9dd83f5ee8697fdb41ba73bc70d845085715b01d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 1 Mar 2021 10:41:57 +0000
Subject: [PATCH] [DAG] visitVECTOR_SHUFFLE - attempt to match commuted
 shuffles with MergeInnerShuffle.

Try to match "shuffle(C, shuffle(A, B, M0), M1) -> shuffle(A, B, M2)" etc.
by using MergeInnerShuffle's commuted inner shuffle mode.

A hand-written IR sketch of the commuted pattern is appended after the
diff for reference.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  35 +-
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll       |  52 +-
 llvm/test/CodeGen/Thumb2/mve-vld3.ll          | 452 +++++++++---------
 llvm/test/CodeGen/X86/oddshuffles.ll          |  32 +-
 llvm/test/CodeGen/X86/pr34592.ll              |  39 +-
 5 files changed, 307 insertions(+), 303 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d896b8c0cdef..06b07f32fb39 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21196,23 +21196,28 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
   // Don't try to fold shuffles with illegal type.
   // Only fold if this shuffle is the only user of the other shuffle.
-  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE &&
-      N->isOnlyUserOf(N0.getNode())) {
-    // The incoming shuffle must be of the same type as the result of the
-    // current shuffle.
-    auto *OtherSV = cast<ShuffleVectorSDNode>(N0);
-    assert(OtherSV->getOperand(0).getValueType() == VT &&
-           "Shuffle types don't match");
+  // Try matching shuffle(C,shuffle(A,B)) commuted patterns as well.
+  for (int i = 0; i != 2; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::VECTOR_SHUFFLE &&
+        N->isOnlyUserOf(N->getOperand(i).getNode())) {
+      // The incoming shuffle must be of the same type as the result of the
+      // current shuffle.
+      auto *OtherSV = cast<ShuffleVectorSDNode>(N->getOperand(i));
+      assert(OtherSV->getOperand(0).getValueType() == VT &&
+             "Shuffle types don't match");

-    SDValue SV0, SV1;
-    SmallVector<int, 4> Mask;
-    if (MergeInnerShuffle(false, SVN, OtherSV, N1, TLI, SV0, SV1, Mask)) {
-      // Check if all indices in Mask are Undef. In case, propagate Undef.
-      if (llvm::all_of(Mask, [](int M) { return M < 0; }))
-        return DAG.getUNDEF(VT);
+      SDValue SV0, SV1;
+      SmallVector<int, 4> Mask;
+      if (MergeInnerShuffle(i != 0, SVN, OtherSV, N->getOperand(1 - i), TLI,
+                            SV0, SV1, Mask)) {
+        // Check if all indices in Mask are Undef. In case, propagate Undef.
+        if (llvm::all_of(Mask, [](int M) { return M < 0; }))
+          return DAG.getUNDEF(VT);

-      return DAG.getVectorShuffle(VT, SDLoc(N), SV0 ? SV0 : DAG.getUNDEF(VT),
-                                  SV1 ? SV1 : DAG.getUNDEF(VT), Mask);
+        return DAG.getVectorShuffle(VT, SDLoc(N),
+                                    SV0 ? SV0 : DAG.getUNDEF(VT),
+                                    SV1 ? 
SV1 : DAG.getUNDEF(VT), Mask); + } } } diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index 128cfd0c2e2b..db8f7018ba55 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -135,19 +135,19 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s14, s8 -; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vmov.f32 s16, s1 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vadd.i32 q0, q0, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q3 +; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmov.f32 s17, s3 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vmov.f32 s19, s9 +; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vadd.i32 q0, q3, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -1164,19 +1164,19 @@ define arm_aapcs_vfpcc <4 x float> @shuffle3step_f32(<16 x float> %src) { ; CHECKFP: @ %bb.0: @ %entry ; CHECKFP-NEXT: .vsave {d8, d9} ; CHECKFP-NEXT: vpush {d8, d9} -; CHECKFP-NEXT: vmov.f32 s14, s8 -; CHECKFP-NEXT: vmov.f32 s15, s11 -; CHECKFP-NEXT: vmov.f32 s16, s1 -; CHECKFP-NEXT: vmov.f32 s12, s2 -; CHECKFP-NEXT: vmov.f32 s17, s4 -; CHECKFP-NEXT: vmov.f32 s1, s3 -; CHECKFP-NEXT: vmov.f32 s18, s7 -; CHECKFP-NEXT: vmov.f32 s2, s6 -; CHECKFP-NEXT: vmov.f32 s19, s10 -; CHECKFP-NEXT: vmov.f32 s3, s9 -; CHECKFP-NEXT: vmov.f32 s13, s5 -; CHECKFP-NEXT: vadd.f32 q0, q0, q4 -; CHECKFP-NEXT: vadd.f32 q0, q0, q3 +; CHECKFP-NEXT: vmov.f32 s12, s1 +; CHECKFP-NEXT: vmov.f32 s16, s0 +; CHECKFP-NEXT: vmov.f32 s13, s4 +; CHECKFP-NEXT: vmov.f32 s17, s3 +; CHECKFP-NEXT: vmov.f32 s14, s7 +; CHECKFP-NEXT: vmov.f32 s18, s6 +; CHECKFP-NEXT: vmov.f32 s4, s2 +; CHECKFP-NEXT: vmov.f32 s6, s8 +; CHECKFP-NEXT: vmov.f32 s15, s10 +; CHECKFP-NEXT: vmov.f32 s19, s9 +; CHECKFP-NEXT: vadd.f32 q3, q4, q3 +; CHECKFP-NEXT: vmov.f32 s7, s11 +; CHECKFP-NEXT: vadd.f32 q0, q3, q1 ; CHECKFP-NEXT: vpop {d8, d9} ; CHECKFP-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 4cab1a4668af..06dbfe8debbb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -36,22 +36,22 @@ define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.i32 q0, q2, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f64 d6, d2 +; 
CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.i32 q0, q3, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -71,39 +71,39 @@ define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.i32 q0, q2, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vadd.i32 q0, q3, q0 ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.i32 q1, q3, q5 -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vadd.i32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -123,73 +123,73 @@ define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: 
vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.i32 q0, q2, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.f64 d5, d2 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.i32 q1, q3, q5 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #176] -; CHECK-NEXT: vldrw.u32 q4, [r0, #144] -; CHECK-NEXT: vldrw.u32 q5, [r0, #160] -; CHECK-NEXT: vmov.f64 d7, d4 -; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s25, s20 -; CHECK-NEXT: vmov.f32 s17, s19 -; CHECK-NEXT: vmov.f32 s26, s23 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s27, s10 -; CHECK-NEXT: vmov.f32 s19, s9 -; CHECK-NEXT: vmov.f32 s13, s21 -; CHECK-NEXT: vadd.i32 q2, q4, q6 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vldrw.u32 q3, [r0, #128] -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vldrw.u32 q6, [r0, #112] -; CHECK-NEXT: vmov.f64 d9, d6 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s19, s15 -; CHECK-NEXT: vmov.f32 s28, s21 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s29, s24 -; CHECK-NEXT: vmov.f32 s21, s23 -; CHECK-NEXT: vmov.f32 s30, s27 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vmov.f32 s31, s14 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.f32 s17, s25 -; CHECK-NEXT: vadd.i32 q3, q5, q7 +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vadd.i32 q0, q3, q0 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #176] +; CHECK-NEXT: vmov.f32 s24, s13 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.f32 s26, s11 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s27, s18 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmov.f32 s11, s19 +; CHECK-NEXT: vadd.i32 q5, q5, q6 +; CHECK-NEXT: vadd.i32 q2, q5, q2 +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: 
vldrw.u32 q4, [r0, #128] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s24, s21 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f64 d14, d10 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vmov.f32 s29, s23 +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmov.f32 s27, s18 +; CHECK-NEXT: vmov.f32 s31, s17 +; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vadd.i32 q3, q6, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -929,22 +929,22 @@ define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.f32 q0, q2, q4 -; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.f32 q0, q3, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -964,39 +964,39 @@ define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.f32 q0, q2, q4 -; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vadd.f32 q0, q3, q0 ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 
s11, s7 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.f32 q1, q3, q5 -; CHECK-NEXT: vadd.f32 q1, q1, q2 +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vadd.f32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -1016,73 +1016,73 @@ define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.f32 q0, q2, q4 -; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.f64 d5, d2 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.f32 q1, q3, q5 -; CHECK-NEXT: vadd.f32 q1, q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #176] -; CHECK-NEXT: vldrw.u32 q4, [r0, #144] -; CHECK-NEXT: vldrw.u32 q5, [r0, #160] -; CHECK-NEXT: vmov.f64 d7, d4 -; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s25, s20 -; CHECK-NEXT: vmov.f32 s17, s19 -; CHECK-NEXT: vmov.f32 s26, s23 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s27, s10 -; CHECK-NEXT: vmov.f32 s19, s9 -; CHECK-NEXT: vmov.f32 s13, s21 -; CHECK-NEXT: vadd.f32 q2, q4, q6 -; CHECK-NEXT: vadd.f32 q2, q2, q3 -; CHECK-NEXT: vldrw.u32 q3, [r0, #128] -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vldrw.u32 q6, [r0, #112] -; CHECK-NEXT: vmov.f64 d9, d6 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s19, s15 -; CHECK-NEXT: vmov.f32 s28, s21 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s29, s24 -; CHECK-NEXT: vmov.f32 s21, s23 -; CHECK-NEXT: vmov.f32 s30, s27 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vmov.f32 s31, s14 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.f32 s17, s25 -; CHECK-NEXT: vadd.f32 q3, q5, q7 +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 
+; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vadd.f32 q0, q3, q0 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vadd.f32 q1, q4, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #176] +; CHECK-NEXT: vmov.f32 s24, s13 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.f32 s26, s11 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s27, s18 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmov.f32 s11, s19 +; CHECK-NEXT: vadd.f32 q5, q5, q6 +; CHECK-NEXT: vadd.f32 q2, q5, q2 +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vldrw.u32 q4, [r0, #128] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s24, s21 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f64 d14, d10 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vmov.f32 s29, s23 +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmov.f32 s27, s18 +; CHECK-NEXT: vmov.f32 s31, s17 +; CHECK-NEXT: vadd.f32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vadd.f32 q3, q6, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 6bbfe5c699c4..dad3251db0e7 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1247,38 +1247,36 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE42: # %bb.0: ; SSE42-NEXT: movups 80(%rdi), %xmm8 ; SSE42-NEXT: movdqu 64(%rdi), %xmm9 -; SSE42-NEXT: movdqu (%rdi), %xmm4 +; SSE42-NEXT: movdqu (%rdi), %xmm3 ; SSE42-NEXT: movdqu 16(%rdi), %xmm2 ; SSE42-NEXT: movups 32(%rdi), %xmm10 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5 ; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] -; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[1] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[1] ; SSE42-NEXT: movdqa %xmm9, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3] ; 
SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm4[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,3] ; SSE42-NEXT: movups %xmm5, 16(%rsi) -; SSE42-NEXT: movups %xmm4, (%rsi) -; SSE42-NEXT: movdqu %xmm3, 16(%rdx) +; SSE42-NEXT: movups %xmm3, (%rsi) +; SSE42-NEXT: movdqu %xmm4, 16(%rdx) ; SSE42-NEXT: movdqu %xmm6, (%rdx) -; SSE42-NEXT: movdqu %xmm2, 16(%rcx) -; SSE42-NEXT: movdqu %xmm1, (%rcx) +; SSE42-NEXT: movups %xmm0, 16(%rcx) +; SSE42-NEXT: movups %xmm7, (%rcx) ; SSE42-NEXT: retq ; ; AVX1-LABEL: interleave_24i32_out: diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll index 3c5345bf3411..224a3dd5d207 100644 --- a/llvm/test/CodeGen/X86/pr34592.ll +++ b/llvm/test/CodeGen/X86/pr34592.ll @@ -14,38 +14,39 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-NEXT: vmovaps %ymm4, %ymm10 ; CHECK-NEXT: vmovaps %ymm3, %ymm9 ; CHECK-NEXT: vmovaps %ymm1, %ymm8 -; CHECK-NEXT: vmovaps %ymm0, %ymm4 +; CHECK-NEXT: vmovaps %ymm0, %ymm3 ; CHECK-NEXT: vmovaps 240(%rbp), %ymm1 -; CHECK-NEXT: vmovaps 208(%rbp), %ymm3 +; CHECK-NEXT: vmovaps 208(%rbp), %ymm4 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm0 ; CHECK-NEXT: vmovaps 144(%rbp), %ymm0 ; CHECK-NEXT: vmovaps 112(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 80(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm11 -; CHECK-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; CHECK-NEXT: vmovaps %xmm3, %xmm8 +; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; CHECK-NEXT: vmovaps %xmm4, %xmm6 ; CHECK-NEXT: # implicit-def: $ymm2 -; CHECK-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 -; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 +; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0] ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; CHECK-NEXT: vmovaps %xmm7, %xmm2 -; CHECK-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2 +; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm2[0],zero ; CHECK-NEXT: # implicit-def: $ymm2 -; CHECK-NEXT: vmovaps %xmm4, %xmm2 -; CHECK-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; CHECK-NEXT: vmovaps %xmm6, %xmm2 +; CHECK-NEXT: # kill: def $xmm3 killed $xmm3 killed $ymm3 +; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: vmovaps %xmm7, %xmm3 +; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] +; CHECK-NEXT: # implicit-def: $ymm3 +; CHECK-NEXT: vmovaps %xmm6, %xmm3 +; CHECK-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] +; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7] ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3] -; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,0,1,4,5,4,5] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2 -; CHECK-NEXT: vmovq {{.*#+}} xmm4 = xmm2[0],zero -; CHECK-NEXT: # implicit-def: $ymm2 -; CHECK-NEXT: vmovaps %xmm4, %xmm2 -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm6[0,1] +; CHECK-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5] +; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8
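
For reference, a minimal hand-written LLVM IR sketch of the commuted pattern
this patch targets. It is illustrative only: the function and value names are
invented, it is not taken from the test changes above, and the combine itself
runs on the SelectionDAG shuffle nodes that IR like this lowers to.

; The inner shuffle is the *second* operand of the outer shuffle, i.e.
; shuffle(C, shuffle(A, B, M0), M1), so only the new commuted iteration
; (i == 1, passing i != 0 into MergeInnerShuffle) can merge it.
define <4 x i32> @commuted_inner(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
  ; %inner = <a0, a1, b0, b1>
  %inner = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ; %outer = <inner0, inner1, c0, c1> = <a0, a1, c0, c1>
  %outer = shufflevector <4 x i32> %c, <4 x i32> %inner, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i32> %outer
}

Because %outer draws its elements from only two distinct source vectors
(%a and %c), MergeInnerShuffle can collapse the pair into a single shuffle,
conceptually shufflevector %a, %c, <i32 0, i32 1, i32 4, i32 5>, eliminating
the inner node. The boolean passed as "i != 0" tells MergeInnerShuffle to
treat the outer shuffle's operands as swapped when remapping the mask.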