[DAG] visitVECTOR_SHUFFLE - attempt to match commuted shuffles with MergeInnerShuffle.

Try to match commuted patterns such as "shuffle(C, shuffle(A, B, M0), M1) -> shuffle(A, B, M2)" by using MergeInnerShuffle's commuted inner shuffle mode.
This commit is contained in:
Simon Pilgrim 2021-03-01 10:41:57 +00:00
parent 00e6513374
commit 9dd83f5ee8
5 changed files with 307 additions and 303 deletions

View File

@ -21196,23 +21196,28 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
// Don't try to fold shuffles with illegal type. // Don't try to fold shuffles with illegal type.
// Only fold if this shuffle is the only user of the other shuffle. // Only fold if this shuffle is the only user of the other shuffle.
if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && // Try matching shuffle(C,shuffle(A,B)) commuted patterns as well.
N->isOnlyUserOf(N0.getNode())) { for (int i = 0; i != 2; ++i) {
// The incoming shuffle must be of the same type as the result of the if (N->getOperand(i).getOpcode() == ISD::VECTOR_SHUFFLE &&
// current shuffle. N->isOnlyUserOf(N->getOperand(i).getNode())) {
auto *OtherSV = cast<ShuffleVectorSDNode>(N0); // The incoming shuffle must be of the same type as the result of the
assert(OtherSV->getOperand(0).getValueType() == VT && // current shuffle.
"Shuffle types don't match"); auto *OtherSV = cast<ShuffleVectorSDNode>(N->getOperand(i));
assert(OtherSV->getOperand(0).getValueType() == VT &&
"Shuffle types don't match");
SDValue SV0, SV1; SDValue SV0, SV1;
SmallVector<int, 4> Mask; SmallVector<int, 4> Mask;
if (MergeInnerShuffle(false, SVN, OtherSV, N1, TLI, SV0, SV1, Mask)) { if (MergeInnerShuffle(i != 0, SVN, OtherSV, N->getOperand(1 - i), TLI,
// Check if all indices in Mask are Undef. In case, propagate Undef. SV0, SV1, Mask)) {
if (llvm::all_of(Mask, [](int M) { return M < 0; })) // Check if all indices in Mask are Undef. In case, propagate Undef.
return DAG.getUNDEF(VT); if (llvm::all_of(Mask, [](int M) { return M < 0; }))
return DAG.getUNDEF(VT);
return DAG.getVectorShuffle(VT, SDLoc(N), SV0 ? SV0 : DAG.getUNDEF(VT), return DAG.getVectorShuffle(VT, SDLoc(N),
SV1 ? SV1 : DAG.getUNDEF(VT), Mask); SV0 ? SV0 : DAG.getUNDEF(VT),
SV1 ? SV1 : DAG.getUNDEF(VT), Mask);
}
} }
} }

View File

@ -135,19 +135,19 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) {
; CHECK: @ %bb.0: @ %entry ; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s14, s8 ; CHECK-NEXT: vmov.f32 s12, s1
; CHECK-NEXT: vmov.f32 s15, s11 ; CHECK-NEXT: vmov.f32 s16, s0
; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov.f32 s13, s4
; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s17, s3
; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s18, s6
; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vmov.f32 s19, s10 ; CHECK-NEXT: vmov.f32 s15, s10
; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vmov.f32 s19, s9
; CHECK-NEXT: vmov.f32 s13, s5 ; CHECK-NEXT: vadd.i32 q3, q4, q3
; CHECK-NEXT: vadd.i32 q0, q0, q4 ; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: vadd.i32 q0, q0, q3 ; CHECK-NEXT: vadd.i32 q0, q3, q1
; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr ; CHECK-NEXT: bx lr
entry: entry:
@ -1164,19 +1164,19 @@ define arm_aapcs_vfpcc <4 x float> @shuffle3step_f32(<16 x float> %src) {
; CHECKFP: @ %bb.0: @ %entry ; CHECKFP: @ %bb.0: @ %entry
; CHECKFP-NEXT: .vsave {d8, d9} ; CHECKFP-NEXT: .vsave {d8, d9}
; CHECKFP-NEXT: vpush {d8, d9} ; CHECKFP-NEXT: vpush {d8, d9}
; CHECKFP-NEXT: vmov.f32 s14, s8 ; CHECKFP-NEXT: vmov.f32 s12, s1
; CHECKFP-NEXT: vmov.f32 s15, s11 ; CHECKFP-NEXT: vmov.f32 s16, s0
; CHECKFP-NEXT: vmov.f32 s16, s1 ; CHECKFP-NEXT: vmov.f32 s13, s4
; CHECKFP-NEXT: vmov.f32 s12, s2 ; CHECKFP-NEXT: vmov.f32 s17, s3
; CHECKFP-NEXT: vmov.f32 s17, s4 ; CHECKFP-NEXT: vmov.f32 s14, s7
; CHECKFP-NEXT: vmov.f32 s1, s3 ; CHECKFP-NEXT: vmov.f32 s18, s6
; CHECKFP-NEXT: vmov.f32 s18, s7 ; CHECKFP-NEXT: vmov.f32 s4, s2
; CHECKFP-NEXT: vmov.f32 s2, s6 ; CHECKFP-NEXT: vmov.f32 s6, s8
; CHECKFP-NEXT: vmov.f32 s19, s10 ; CHECKFP-NEXT: vmov.f32 s15, s10
; CHECKFP-NEXT: vmov.f32 s3, s9 ; CHECKFP-NEXT: vmov.f32 s19, s9
; CHECKFP-NEXT: vmov.f32 s13, s5 ; CHECKFP-NEXT: vadd.f32 q3, q4, q3
; CHECKFP-NEXT: vadd.f32 q0, q0, q4 ; CHECKFP-NEXT: vmov.f32 s7, s11
; CHECKFP-NEXT: vadd.f32 q0, q0, q3 ; CHECKFP-NEXT: vadd.f32 q0, q3, q1
; CHECKFP-NEXT: vpop {d8, d9} ; CHECKFP-NEXT: vpop {d8, d9}
; CHECKFP-NEXT: bx lr ; CHECKFP-NEXT: bx lr
entry: entry:

View File

@ -36,22 +36,22 @@ define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
; CHECK: @ %bb.0: @ %entry ; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d6, d2
; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov.f32 s16, s5
; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s17, s0
; CHECK-NEXT: vmov.f32 s17, s12 ; CHECK-NEXT: vmov.f32 s14, s2
; CHECK-NEXT: vmov.f32 s9, s11 ; CHECK-NEXT: vmov.f32 s18, s3
; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s10, s14 ; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s19, s2 ; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s15, s9
; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vadd.i32 q3, q3, q4
; CHECK-NEXT: vadd.i32 q0, q2, q4 ; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr ; CHECK-NEXT: bx lr
@ -71,39 +71,39 @@ define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
; CHECK: @ %bb.0: @ %entry ; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q3, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d6, d2
; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov.f32 s16, s5
; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s17, s0
; CHECK-NEXT: vmov.f32 s17, s12 ; CHECK-NEXT: vmov.f32 s14, s2
; CHECK-NEXT: vmov.f32 s9, s11 ; CHECK-NEXT: vmov.f32 s18, s3
; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s10, s14 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s19, s2 ; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vmov.f32 s15, s9
; CHECK-NEXT: vadd.i32 q0, q2, q4 ; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vadd.i32 q3, q3, q4
; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f64 d5, d2
; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s16, s13
; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f64 d10, d6
; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.f32 s21, s16 ; CHECK-NEXT: vmov.f32 s21, s15
; CHECK-NEXT: vmov.f32 s13, s15 ; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s22, s19 ; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s14, s18 ; CHECK-NEXT: vmov.f32 s4, s14
; CHECK-NEXT: vmov.f32 s23, s6 ; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vmov.f32 s15, s5 ; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s9, s17 ; CHECK-NEXT: vmov.f32 s23, s9
; CHECK-NEXT: vadd.i32 q1, q3, q5 ; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vadd.i32 q1, q1, q2 ; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: vadd.i32 q1, q4, q1
; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr ; CHECK-NEXT: bx lr
@ -123,73 +123,73 @@ define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) {
; CHECK: @ %bb.0: @ %entry ; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q3, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d6, d2
; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov.f32 s16, s5
; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s17, s0
; CHECK-NEXT: vmov.f32 s17, s12 ; CHECK-NEXT: vmov.f32 s14, s2
; CHECK-NEXT: vmov.f32 s9, s11 ; CHECK-NEXT: vmov.f32 s18, s3
; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s10, s14 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s19, s2 ; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vmov.f32 s15, s9
; CHECK-NEXT: vadd.i32 q0, q2, q4 ; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
; CHECK-NEXT: vmov.f64 d5, d2
; CHECK-NEXT: vmov.f32 s11, s7
; CHECK-NEXT: vmov.f32 s20, s13
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vmov.f32 s21, s16
; CHECK-NEXT: vmov.f32 s13, s15
; CHECK-NEXT: vmov.f32 s22, s19
; CHECK-NEXT: vmov.f32 s14, s18
; CHECK-NEXT: vmov.f32 s23, s6
; CHECK-NEXT: vmov.f32 s15, s5
; CHECK-NEXT: vmov.f32 s9, s17
; CHECK-NEXT: vadd.i32 q1, q3, q5
; CHECK-NEXT: vadd.i32 q1, q1, q2
; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
; CHECK-NEXT: vmov.f64 d7, d4
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s24, s17
; CHECK-NEXT: vmov.f32 s12, s18
; CHECK-NEXT: vmov.f32 s25, s20
; CHECK-NEXT: vmov.f32 s17, s19
; CHECK-NEXT: vmov.f32 s26, s23
; CHECK-NEXT: vmov.f32 s18, s22
; CHECK-NEXT: vmov.f32 s27, s10
; CHECK-NEXT: vmov.f32 s19, s9
; CHECK-NEXT: vmov.f32 s13, s21
; CHECK-NEXT: vadd.i32 q2, q4, q6
; CHECK-NEXT: vadd.i32 q2, q2, q3
; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
; CHECK-NEXT: vmov.f64 d9, d6
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vmov.f32 s19, s15
; CHECK-NEXT: vmov.f32 s28, s21
; CHECK-NEXT: vmov.f32 s16, s22
; CHECK-NEXT: vmov.f32 s29, s24
; CHECK-NEXT: vmov.f32 s21, s23
; CHECK-NEXT: vmov.f32 s30, s27
; CHECK-NEXT: vmov.f32 s22, s26
; CHECK-NEXT: vmov.f32 s31, s14
; CHECK-NEXT: vmov.f32 s23, s13
; CHECK-NEXT: vmov.f32 s17, s25
; CHECK-NEXT: vadd.i32 q3, q5, q7
; CHECK-NEXT: vadd.i32 q3, q3, q4 ; CHECK-NEXT: vadd.i32 q3, q3, q4
; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f32 s16, s13
; CHECK-NEXT: vmov.f64 d10, d6
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.f32 s21, s15
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s4, s14
; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s23, s9
; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vmov.f64 d10, d6
; CHECK-NEXT: vadd.i32 q1, q4, q1
; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
; CHECK-NEXT: vmov.f32 s24, s13
; CHECK-NEXT: vmov.f32 s21, s15
; CHECK-NEXT: vmov.f32 s25, s8
; CHECK-NEXT: vmov.f32 s22, s10
; CHECK-NEXT: vmov.f32 s26, s11
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
; CHECK-NEXT: vmov.f32 s10, s16
; CHECK-NEXT: vmov.f32 s27, s18
; CHECK-NEXT: vmov.f32 s23, s17
; CHECK-NEXT: vmov.f32 s11, s19
; CHECK-NEXT: vadd.i32 q5, q5, q6
; CHECK-NEXT: vadd.i32 q2, q5, q2
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
; CHECK-NEXT: vldrw.u32 q4, [r0, #128]
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vmov.f32 s24, s21
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f64 d14, d10
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vmov.f32 s25, s12
; CHECK-NEXT: vmov.f32 s29, s23
; CHECK-NEXT: vmov.f32 s26, s15
; CHECK-NEXT: vmov.f32 s30, s14
; CHECK-NEXT: vmov.f32 s12, s22
; CHECK-NEXT: vmov.f32 s14, s16
; CHECK-NEXT: vmov.f32 s27, s18
; CHECK-NEXT: vmov.f32 s31, s17
; CHECK-NEXT: vadd.i32 q6, q7, q6
; CHECK-NEXT: vmov.f32 s15, s19
; CHECK-NEXT: vadd.i32 q3, q6, q3
; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr ; CHECK-NEXT: bx lr
@ -929,22 +929,22 @@ define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) {
; CHECK: @ %bb.0: @ %entry ; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d6, d2
; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov.f32 s16, s5
; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s17, s0
; CHECK-NEXT: vmov.f32 s17, s12 ; CHECK-NEXT: vmov.f32 s14, s2
; CHECK-NEXT: vmov.f32 s9, s11 ; CHECK-NEXT: vmov.f32 s18, s3
; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s10, s14 ; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s19, s2 ; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s15, s9
; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vadd.f32 q3, q3, q4
; CHECK-NEXT: vadd.f32 q0, q2, q4 ; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr ; CHECK-NEXT: bx lr
@ -964,39 +964,39 @@ define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) {
; CHECK: @ %bb.0: @ %entry ; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q3, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d6, d2
; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov.f32 s16, s5
; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s17, s0
; CHECK-NEXT: vmov.f32 s17, s12 ; CHECK-NEXT: vmov.f32 s14, s2
; CHECK-NEXT: vmov.f32 s9, s11 ; CHECK-NEXT: vmov.f32 s18, s3
; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s10, s14 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s19, s2 ; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vmov.f32 s15, s9
; CHECK-NEXT: vadd.f32 q0, q2, q4 ; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vadd.f32 q3, q3, q4
; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f64 d5, d2
; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s16, s13
; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f64 d10, d6
; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.f32 s21, s16 ; CHECK-NEXT: vmov.f32 s21, s15
; CHECK-NEXT: vmov.f32 s13, s15 ; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s22, s19 ; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s14, s18 ; CHECK-NEXT: vmov.f32 s4, s14
; CHECK-NEXT: vmov.f32 s23, s6 ; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vmov.f32 s15, s5 ; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s9, s17 ; CHECK-NEXT: vmov.f32 s23, s9
; CHECK-NEXT: vadd.f32 q1, q3, q5 ; CHECK-NEXT: vadd.f32 q4, q5, q4
; CHECK-NEXT: vadd.f32 q1, q1, q2 ; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: vadd.f32 q1, q4, q1
; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr ; CHECK-NEXT: bx lr
@ -1016,73 +1016,73 @@ define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) {
; CHECK: @ %bb.0: @ %entry ; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q3, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d6, d2
; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov.f32 s16, s5
; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s17, s0
; CHECK-NEXT: vmov.f32 s17, s12 ; CHECK-NEXT: vmov.f32 s14, s2
; CHECK-NEXT: vmov.f32 s9, s11 ; CHECK-NEXT: vmov.f32 s18, s3
; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.f32 s10, s14 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f32 s19, s2 ; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vmov.f32 s15, s9
; CHECK-NEXT: vadd.f32 q0, q2, q4 ; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
; CHECK-NEXT: vmov.f64 d5, d2
; CHECK-NEXT: vmov.f32 s11, s7
; CHECK-NEXT: vmov.f32 s20, s13
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vmov.f32 s21, s16
; CHECK-NEXT: vmov.f32 s13, s15
; CHECK-NEXT: vmov.f32 s22, s19
; CHECK-NEXT: vmov.f32 s14, s18
; CHECK-NEXT: vmov.f32 s23, s6
; CHECK-NEXT: vmov.f32 s15, s5
; CHECK-NEXT: vmov.f32 s9, s17
; CHECK-NEXT: vadd.f32 q1, q3, q5
; CHECK-NEXT: vadd.f32 q1, q1, q2
; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
; CHECK-NEXT: vmov.f64 d7, d4
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s24, s17
; CHECK-NEXT: vmov.f32 s12, s18
; CHECK-NEXT: vmov.f32 s25, s20
; CHECK-NEXT: vmov.f32 s17, s19
; CHECK-NEXT: vmov.f32 s26, s23
; CHECK-NEXT: vmov.f32 s18, s22
; CHECK-NEXT: vmov.f32 s27, s10
; CHECK-NEXT: vmov.f32 s19, s9
; CHECK-NEXT: vmov.f32 s13, s21
; CHECK-NEXT: vadd.f32 q2, q4, q6
; CHECK-NEXT: vadd.f32 q2, q2, q3
; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
; CHECK-NEXT: vmov.f64 d9, d6
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vmov.f32 s19, s15
; CHECK-NEXT: vmov.f32 s28, s21
; CHECK-NEXT: vmov.f32 s16, s22
; CHECK-NEXT: vmov.f32 s29, s24
; CHECK-NEXT: vmov.f32 s21, s23
; CHECK-NEXT: vmov.f32 s30, s27
; CHECK-NEXT: vmov.f32 s22, s26
; CHECK-NEXT: vmov.f32 s31, s14
; CHECK-NEXT: vmov.f32 s23, s13
; CHECK-NEXT: vmov.f32 s17, s25
; CHECK-NEXT: vadd.f32 q3, q5, q7
; CHECK-NEXT: vadd.f32 q3, q3, q4 ; CHECK-NEXT: vadd.f32 q3, q3, q4
; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f32 s16, s13
; CHECK-NEXT: vmov.f64 d10, d6
; CHECK-NEXT: vmov.f32 s17, s4
; CHECK-NEXT: vmov.f32 s21, s15
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s4, s14
; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vmov.f32 s23, s9
; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
; CHECK-NEXT: vadd.f32 q4, q5, q4
; CHECK-NEXT: vmov.f64 d10, d6
; CHECK-NEXT: vadd.f32 q1, q4, q1
; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
; CHECK-NEXT: vmov.f32 s24, s13
; CHECK-NEXT: vmov.f32 s21, s15
; CHECK-NEXT: vmov.f32 s25, s8
; CHECK-NEXT: vmov.f32 s22, s10
; CHECK-NEXT: vmov.f32 s26, s11
; CHECK-NEXT: vmov.f32 s8, s14
; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
; CHECK-NEXT: vmov.f32 s10, s16
; CHECK-NEXT: vmov.f32 s27, s18
; CHECK-NEXT: vmov.f32 s23, s17
; CHECK-NEXT: vmov.f32 s11, s19
; CHECK-NEXT: vadd.f32 q5, q5, q6
; CHECK-NEXT: vadd.f32 q2, q5, q2
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
; CHECK-NEXT: vldrw.u32 q4, [r0, #128]
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vmov.f32 s24, s21
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vmov.f64 d14, d10
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vmov.f32 s25, s12
; CHECK-NEXT: vmov.f32 s29, s23
; CHECK-NEXT: vmov.f32 s26, s15
; CHECK-NEXT: vmov.f32 s30, s14
; CHECK-NEXT: vmov.f32 s12, s22
; CHECK-NEXT: vmov.f32 s14, s16
; CHECK-NEXT: vmov.f32 s27, s18
; CHECK-NEXT: vmov.f32 s31, s17
; CHECK-NEXT: vadd.f32 q6, q7, q6
; CHECK-NEXT: vmov.f32 s15, s19
; CHECK-NEXT: vadd.f32 q3, q6, q3
; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr ; CHECK-NEXT: bx lr

View File

@ -1247,38 +1247,36 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; SSE42: # %bb.0: ; SSE42: # %bb.0:
; SSE42-NEXT: movups 80(%rdi), %xmm8 ; SSE42-NEXT: movups 80(%rdi), %xmm8
; SSE42-NEXT: movdqu 64(%rdi), %xmm9 ; SSE42-NEXT: movdqu 64(%rdi), %xmm9
; SSE42-NEXT: movdqu (%rdi), %xmm4 ; SSE42-NEXT: movdqu (%rdi), %xmm3
; SSE42-NEXT: movdqu 16(%rdi), %xmm2 ; SSE42-NEXT: movdqu 16(%rdi), %xmm2
; SSE42-NEXT: movups 32(%rdi), %xmm10 ; SSE42-NEXT: movups 32(%rdi), %xmm10
; SSE42-NEXT: movdqu 48(%rdi), %xmm5 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5
; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: movdqa %xmm2, %xmm6
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[1] ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[1]
; SSE42-NEXT: movdqa %xmm9, %xmm1 ; SSE42-NEXT: movdqa %xmm9, %xmm1
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1] ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2]
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm4[6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm3[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3] ; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] ; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE42-NEXT: movups %xmm5, 16(%rsi) ; SSE42-NEXT: movups %xmm5, 16(%rsi)
; SSE42-NEXT: movups %xmm4, (%rsi) ; SSE42-NEXT: movups %xmm3, (%rsi)
; SSE42-NEXT: movdqu %xmm3, 16(%rdx) ; SSE42-NEXT: movdqu %xmm4, 16(%rdx)
; SSE42-NEXT: movdqu %xmm6, (%rdx) ; SSE42-NEXT: movdqu %xmm6, (%rdx)
; SSE42-NEXT: movdqu %xmm2, 16(%rcx) ; SSE42-NEXT: movups %xmm0, 16(%rcx)
; SSE42-NEXT: movdqu %xmm1, (%rcx) ; SSE42-NEXT: movups %xmm7, (%rcx)
; SSE42-NEXT: retq ; SSE42-NEXT: retq
; ;
; AVX1-LABEL: interleave_24i32_out: ; AVX1-LABEL: interleave_24i32_out:

View File

@ -14,38 +14,39 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-NEXT: vmovaps %ymm4, %ymm10 ; CHECK-NEXT: vmovaps %ymm4, %ymm10
; CHECK-NEXT: vmovaps %ymm3, %ymm9 ; CHECK-NEXT: vmovaps %ymm3, %ymm9
; CHECK-NEXT: vmovaps %ymm1, %ymm8 ; CHECK-NEXT: vmovaps %ymm1, %ymm8
; CHECK-NEXT: vmovaps %ymm0, %ymm4 ; CHECK-NEXT: vmovaps %ymm0, %ymm3
; CHECK-NEXT: vmovaps 240(%rbp), %ymm1 ; CHECK-NEXT: vmovaps 240(%rbp), %ymm1
; CHECK-NEXT: vmovaps 208(%rbp), %ymm3 ; CHECK-NEXT: vmovaps 208(%rbp), %ymm4
; CHECK-NEXT: vmovaps 176(%rbp), %ymm0 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm0
; CHECK-NEXT: vmovaps 144(%rbp), %ymm0 ; CHECK-NEXT: vmovaps 144(%rbp), %ymm0
; CHECK-NEXT: vmovaps 112(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 112(%rbp), %ymm11
; CHECK-NEXT: vmovaps 80(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 80(%rbp), %ymm11
; CHECK-NEXT: vmovaps 48(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm11
; CHECK-NEXT: vmovaps 16(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm11
; CHECK-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7]
; CHECK-NEXT: vmovaps %xmm3, %xmm8 ; CHECK-NEXT: vmovaps %xmm4, %xmm6
; CHECK-NEXT: # implicit-def: $ymm2 ; CHECK-NEXT: # implicit-def: $ymm2
; CHECK-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 ; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0] ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; CHECK-NEXT: vmovaps %xmm7, %xmm2 ; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
; CHECK-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] ; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm2[0],zero
; CHECK-NEXT: # implicit-def: $ymm2 ; CHECK-NEXT: # implicit-def: $ymm2
; CHECK-NEXT: vmovaps %xmm4, %xmm2 ; CHECK-NEXT: vmovaps %xmm6, %xmm2
; CHECK-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; CHECK-NEXT: # kill: def $xmm3 killed $xmm3 killed $ymm3
; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] ; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; CHECK-NEXT: vmovaps %xmm7, %xmm3
; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
; CHECK-NEXT: # implicit-def: $ymm3
; CHECK-NEXT: vmovaps %xmm6, %xmm3
; CHECK-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3] ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,0,1,4,5,4,5] ; CHECK-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
; CHECK-NEXT: vmovq {{.*#+}} xmm4 = xmm2[0],zero
; CHECK-NEXT: # implicit-def: $ymm2
; CHECK-NEXT: vmovaps %xmm4, %xmm2
; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm6[0,1]
; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp ; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: .cfi_def_cfa %rsp, 8