forked from OSchip/llvm-project
[DAG] visitVECTOR_SHUFFLE - attempt to match commuted shuffles with MergeInnerShuffle.
Try to match "shuffle(C, shuffle(A, B, M0), M1) -> shuffle(A, B, M2)" etc. by using MergeInnerShuffle's commuted inner shuffle mode.
This commit is contained in:
parent
00e6513374
commit
9dd83f5ee8
|
@ -21196,23 +21196,28 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
|
|||
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
|
||||
// Don't try to fold shuffles with illegal type.
|
||||
// Only fold if this shuffle is the only user of the other shuffle.
|
||||
if (N0.getOpcode() == ISD::VECTOR_SHUFFLE &&
|
||||
N->isOnlyUserOf(N0.getNode())) {
|
||||
// The incoming shuffle must be of the same type as the result of the
|
||||
// current shuffle.
|
||||
auto *OtherSV = cast<ShuffleVectorSDNode>(N0);
|
||||
assert(OtherSV->getOperand(0).getValueType() == VT &&
|
||||
"Shuffle types don't match");
|
||||
// Try matching shuffle(C,shuffle(A,B)) commuted patterns as well.
|
||||
for (int i = 0; i != 2; ++i) {
|
||||
if (N->getOperand(i).getOpcode() == ISD::VECTOR_SHUFFLE &&
|
||||
N->isOnlyUserOf(N->getOperand(i).getNode())) {
|
||||
// The incoming shuffle must be of the same type as the result of the
|
||||
// current shuffle.
|
||||
auto *OtherSV = cast<ShuffleVectorSDNode>(N->getOperand(i));
|
||||
assert(OtherSV->getOperand(0).getValueType() == VT &&
|
||||
"Shuffle types don't match");
|
||||
|
||||
SDValue SV0, SV1;
|
||||
SmallVector<int, 4> Mask;
|
||||
if (MergeInnerShuffle(false, SVN, OtherSV, N1, TLI, SV0, SV1, Mask)) {
|
||||
// Check if all indices in Mask are Undef. In case, propagate Undef.
|
||||
if (llvm::all_of(Mask, [](int M) { return M < 0; }))
|
||||
return DAG.getUNDEF(VT);
|
||||
SDValue SV0, SV1;
|
||||
SmallVector<int, 4> Mask;
|
||||
if (MergeInnerShuffle(i != 0, SVN, OtherSV, N->getOperand(1 - i), TLI,
|
||||
SV0, SV1, Mask)) {
|
||||
// Check if all indices in Mask are Undef. If so, propagate Undef.
|
||||
if (llvm::all_of(Mask, [](int M) { return M < 0; }))
|
||||
return DAG.getUNDEF(VT);
|
||||
|
||||
return DAG.getVectorShuffle(VT, SDLoc(N), SV0 ? SV0 : DAG.getUNDEF(VT),
|
||||
SV1 ? SV1 : DAG.getUNDEF(VT), Mask);
|
||||
return DAG.getVectorShuffle(VT, SDLoc(N),
|
||||
SV0 ? SV0 : DAG.getUNDEF(VT),
|
||||
SV1 ? SV1 : DAG.getUNDEF(VT), Mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -135,19 +135,19 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: vmov.f32 s14, s8
|
||||
; CHECK-NEXT: vmov.f32 s15, s11
|
||||
; CHECK-NEXT: vmov.f32 s16, s1
|
||||
; CHECK-NEXT: vmov.f32 s12, s2
|
||||
; CHECK-NEXT: vmov.f32 s17, s4
|
||||
; CHECK-NEXT: vmov.f32 s1, s3
|
||||
; CHECK-NEXT: vmov.f32 s18, s7
|
||||
; CHECK-NEXT: vmov.f32 s2, s6
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s3, s9
|
||||
; CHECK-NEXT: vmov.f32 s13, s5
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, q4
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, q3
|
||||
; CHECK-NEXT: vmov.f32 s12, s1
|
||||
; CHECK-NEXT: vmov.f32 s16, s0
|
||||
; CHECK-NEXT: vmov.f32 s13, s4
|
||||
; CHECK-NEXT: vmov.f32 s17, s3
|
||||
; CHECK-NEXT: vmov.f32 s14, s7
|
||||
; CHECK-NEXT: vmov.f32 s18, s6
|
||||
; CHECK-NEXT: vmov.f32 s4, s2
|
||||
; CHECK-NEXT: vmov.f32 s6, s8
|
||||
; CHECK-NEXT: vmov.f32 s15, s10
|
||||
; CHECK-NEXT: vmov.f32 s19, s9
|
||||
; CHECK-NEXT: vadd.i32 q3, q4, q3
|
||||
; CHECK-NEXT: vmov.f32 s7, s11
|
||||
; CHECK-NEXT: vadd.i32 q0, q3, q1
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
|
@ -1164,19 +1164,19 @@ define arm_aapcs_vfpcc <4 x float> @shuffle3step_f32(<16 x float> %src) {
|
|||
; CHECKFP: @ %bb.0: @ %entry
|
||||
; CHECKFP-NEXT: .vsave {d8, d9}
|
||||
; CHECKFP-NEXT: vpush {d8, d9}
|
||||
; CHECKFP-NEXT: vmov.f32 s14, s8
|
||||
; CHECKFP-NEXT: vmov.f32 s15, s11
|
||||
; CHECKFP-NEXT: vmov.f32 s16, s1
|
||||
; CHECKFP-NEXT: vmov.f32 s12, s2
|
||||
; CHECKFP-NEXT: vmov.f32 s17, s4
|
||||
; CHECKFP-NEXT: vmov.f32 s1, s3
|
||||
; CHECKFP-NEXT: vmov.f32 s18, s7
|
||||
; CHECKFP-NEXT: vmov.f32 s2, s6
|
||||
; CHECKFP-NEXT: vmov.f32 s19, s10
|
||||
; CHECKFP-NEXT: vmov.f32 s3, s9
|
||||
; CHECKFP-NEXT: vmov.f32 s13, s5
|
||||
; CHECKFP-NEXT: vadd.f32 q0, q0, q4
|
||||
; CHECKFP-NEXT: vadd.f32 q0, q0, q3
|
||||
; CHECKFP-NEXT: vmov.f32 s12, s1
|
||||
; CHECKFP-NEXT: vmov.f32 s16, s0
|
||||
; CHECKFP-NEXT: vmov.f32 s13, s4
|
||||
; CHECKFP-NEXT: vmov.f32 s17, s3
|
||||
; CHECKFP-NEXT: vmov.f32 s14, s7
|
||||
; CHECKFP-NEXT: vmov.f32 s18, s6
|
||||
; CHECKFP-NEXT: vmov.f32 s4, s2
|
||||
; CHECKFP-NEXT: vmov.f32 s6, s8
|
||||
; CHECKFP-NEXT: vmov.f32 s15, s10
|
||||
; CHECKFP-NEXT: vmov.f32 s19, s9
|
||||
; CHECKFP-NEXT: vadd.f32 q3, q4, q3
|
||||
; CHECKFP-NEXT: vmov.f32 s7, s11
|
||||
; CHECKFP-NEXT: vadd.f32 q0, q3, q1
|
||||
; CHECKFP-NEXT: vpop {d8, d9}
|
||||
; CHECKFP-NEXT: bx lr
|
||||
entry:
|
||||
|
|
|
@ -36,22 +36,22 @@ define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f64 d3, d0
|
||||
; CHECK-NEXT: vmov.f32 s7, s3
|
||||
; CHECK-NEXT: vmov.f32 s16, s9
|
||||
; CHECK-NEXT: vmov.f32 s4, s10
|
||||
; CHECK-NEXT: vmov.f32 s17, s12
|
||||
; CHECK-NEXT: vmov.f32 s9, s11
|
||||
; CHECK-NEXT: vmov.f32 s18, s15
|
||||
; CHECK-NEXT: vmov.f32 s10, s14
|
||||
; CHECK-NEXT: vmov.f32 s19, s2
|
||||
; CHECK-NEXT: vmov.f32 s11, s1
|
||||
; CHECK-NEXT: vmov.f32 s5, s13
|
||||
; CHECK-NEXT: vadd.i32 q0, q2, q4
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
|
||||
; CHECK-NEXT: vmov.f64 d6, d2
|
||||
; CHECK-NEXT: vmov.f32 s16, s5
|
||||
; CHECK-NEXT: vmov.f32 s13, s7
|
||||
; CHECK-NEXT: vmov.f32 s17, s0
|
||||
; CHECK-NEXT: vmov.f32 s14, s2
|
||||
; CHECK-NEXT: vmov.f32 s18, s3
|
||||
; CHECK-NEXT: vmov.f32 s0, s6
|
||||
; CHECK-NEXT: vmov.f32 s2, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s15, s9
|
||||
; CHECK-NEXT: vadd.i32 q3, q3, q4
|
||||
; CHECK-NEXT: vmov.f32 s3, s11
|
||||
; CHECK-NEXT: vadd.i32 q0, q3, q0
|
||||
; CHECK-NEXT: vstrw.32 q0, [r1]
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -71,39 +71,39 @@ define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
|
||||
; CHECK-NEXT: vmov.f64 d3, d0
|
||||
; CHECK-NEXT: vmov.f32 s7, s3
|
||||
; CHECK-NEXT: vmov.f32 s16, s9
|
||||
; CHECK-NEXT: vmov.f32 s4, s10
|
||||
; CHECK-NEXT: vmov.f32 s17, s12
|
||||
; CHECK-NEXT: vmov.f32 s9, s11
|
||||
; CHECK-NEXT: vmov.f32 s18, s15
|
||||
; CHECK-NEXT: vmov.f32 s10, s14
|
||||
; CHECK-NEXT: vmov.f32 s19, s2
|
||||
; CHECK-NEXT: vmov.f32 s11, s1
|
||||
; CHECK-NEXT: vmov.f32 s5, s13
|
||||
; CHECK-NEXT: vadd.i32 q0, q2, q4
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
|
||||
; CHECK-NEXT: vmov.f64 d6, d2
|
||||
; CHECK-NEXT: vmov.f32 s16, s5
|
||||
; CHECK-NEXT: vmov.f32 s13, s7
|
||||
; CHECK-NEXT: vmov.f32 s17, s0
|
||||
; CHECK-NEXT: vmov.f32 s14, s2
|
||||
; CHECK-NEXT: vmov.f32 s18, s3
|
||||
; CHECK-NEXT: vmov.f32 s0, s6
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f32 s2, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s15, s9
|
||||
; CHECK-NEXT: vmov.f32 s3, s11
|
||||
; CHECK-NEXT: vadd.i32 q3, q3, q4
|
||||
; CHECK-NEXT: vadd.i32 q0, q3, q0
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f64 d5, d2
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
|
||||
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
|
||||
; CHECK-NEXT: vmov.f32 s11, s7
|
||||
; CHECK-NEXT: vmov.f32 s20, s13
|
||||
; CHECK-NEXT: vmov.f32 s8, s14
|
||||
; CHECK-NEXT: vmov.f32 s21, s16
|
||||
; CHECK-NEXT: vmov.f32 s13, s15
|
||||
; CHECK-NEXT: vmov.f32 s22, s19
|
||||
; CHECK-NEXT: vmov.f32 s14, s18
|
||||
; CHECK-NEXT: vmov.f32 s23, s6
|
||||
; CHECK-NEXT: vmov.f32 s15, s5
|
||||
; CHECK-NEXT: vmov.f32 s9, s17
|
||||
; CHECK-NEXT: vadd.i32 q1, q3, q5
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, q2
|
||||
; CHECK-NEXT: vmov.f32 s16, s13
|
||||
; CHECK-NEXT: vmov.f64 d10, d6
|
||||
; CHECK-NEXT: vmov.f32 s17, s4
|
||||
; CHECK-NEXT: vmov.f32 s21, s15
|
||||
; CHECK-NEXT: vmov.f32 s18, s7
|
||||
; CHECK-NEXT: vmov.f32 s22, s6
|
||||
; CHECK-NEXT: vmov.f32 s4, s14
|
||||
; CHECK-NEXT: vmov.f32 s6, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s23, s9
|
||||
; CHECK-NEXT: vadd.i32 q4, q5, q4
|
||||
; CHECK-NEXT: vmov.f32 s7, s11
|
||||
; CHECK-NEXT: vadd.i32 q1, q4, q1
|
||||
; CHECK-NEXT: vstrw.32 q1, [r1]
|
||||
; CHECK-NEXT: vpop {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -123,73 +123,73 @@ define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
|
||||
; CHECK-NEXT: vmov.f64 d3, d0
|
||||
; CHECK-NEXT: vmov.f32 s7, s3
|
||||
; CHECK-NEXT: vmov.f32 s16, s9
|
||||
; CHECK-NEXT: vmov.f32 s4, s10
|
||||
; CHECK-NEXT: vmov.f32 s17, s12
|
||||
; CHECK-NEXT: vmov.f32 s9, s11
|
||||
; CHECK-NEXT: vmov.f32 s18, s15
|
||||
; CHECK-NEXT: vmov.f32 s10, s14
|
||||
; CHECK-NEXT: vmov.f32 s19, s2
|
||||
; CHECK-NEXT: vmov.f32 s11, s1
|
||||
; CHECK-NEXT: vmov.f32 s5, s13
|
||||
; CHECK-NEXT: vadd.i32 q0, q2, q4
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f64 d5, d2
|
||||
; CHECK-NEXT: vmov.f32 s11, s7
|
||||
; CHECK-NEXT: vmov.f32 s20, s13
|
||||
; CHECK-NEXT: vmov.f32 s8, s14
|
||||
; CHECK-NEXT: vmov.f32 s21, s16
|
||||
; CHECK-NEXT: vmov.f32 s13, s15
|
||||
; CHECK-NEXT: vmov.f32 s22, s19
|
||||
; CHECK-NEXT: vmov.f32 s14, s18
|
||||
; CHECK-NEXT: vmov.f32 s23, s6
|
||||
; CHECK-NEXT: vmov.f32 s15, s5
|
||||
; CHECK-NEXT: vmov.f32 s9, s17
|
||||
; CHECK-NEXT: vadd.i32 q1, q3, q5
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, q2
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
|
||||
; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
|
||||
; CHECK-NEXT: vmov.f64 d7, d4
|
||||
; CHECK-NEXT: vmov.f32 s15, s11
|
||||
; CHECK-NEXT: vmov.f32 s24, s17
|
||||
; CHECK-NEXT: vmov.f32 s12, s18
|
||||
; CHECK-NEXT: vmov.f32 s25, s20
|
||||
; CHECK-NEXT: vmov.f32 s17, s19
|
||||
; CHECK-NEXT: vmov.f32 s26, s23
|
||||
; CHECK-NEXT: vmov.f32 s18, s22
|
||||
; CHECK-NEXT: vmov.f32 s27, s10
|
||||
; CHECK-NEXT: vmov.f32 s19, s9
|
||||
; CHECK-NEXT: vmov.f32 s13, s21
|
||||
; CHECK-NEXT: vadd.i32 q2, q4, q6
|
||||
; CHECK-NEXT: vadd.i32 q2, q2, q3
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
|
||||
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
|
||||
; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
|
||||
; CHECK-NEXT: vmov.f64 d9, d6
|
||||
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
|
||||
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
|
||||
; CHECK-NEXT: vstrw.32 q1, [r1]
|
||||
; CHECK-NEXT: vmov.f32 s19, s15
|
||||
; CHECK-NEXT: vmov.f32 s28, s21
|
||||
; CHECK-NEXT: vmov.f32 s16, s22
|
||||
; CHECK-NEXT: vmov.f32 s29, s24
|
||||
; CHECK-NEXT: vmov.f32 s21, s23
|
||||
; CHECK-NEXT: vmov.f32 s30, s27
|
||||
; CHECK-NEXT: vmov.f32 s22, s26
|
||||
; CHECK-NEXT: vmov.f32 s31, s14
|
||||
; CHECK-NEXT: vmov.f32 s23, s13
|
||||
; CHECK-NEXT: vmov.f32 s17, s25
|
||||
; CHECK-NEXT: vadd.i32 q3, q5, q7
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
|
||||
; CHECK-NEXT: vmov.f64 d6, d2
|
||||
; CHECK-NEXT: vmov.f32 s16, s5
|
||||
; CHECK-NEXT: vmov.f32 s13, s7
|
||||
; CHECK-NEXT: vmov.f32 s17, s0
|
||||
; CHECK-NEXT: vmov.f32 s14, s2
|
||||
; CHECK-NEXT: vmov.f32 s18, s3
|
||||
; CHECK-NEXT: vmov.f32 s0, s6
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f32 s2, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s15, s9
|
||||
; CHECK-NEXT: vmov.f32 s3, s11
|
||||
; CHECK-NEXT: vadd.i32 q3, q3, q4
|
||||
; CHECK-NEXT: vadd.i32 q0, q3, q0
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
|
||||
; CHECK-NEXT: vmov.f32 s16, s13
|
||||
; CHECK-NEXT: vmov.f64 d10, d6
|
||||
; CHECK-NEXT: vmov.f32 s17, s4
|
||||
; CHECK-NEXT: vmov.f32 s21, s15
|
||||
; CHECK-NEXT: vmov.f32 s18, s7
|
||||
; CHECK-NEXT: vmov.f32 s22, s6
|
||||
; CHECK-NEXT: vmov.f32 s4, s14
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
|
||||
; CHECK-NEXT: vmov.f32 s6, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s23, s9
|
||||
; CHECK-NEXT: vmov.f32 s7, s11
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
|
||||
; CHECK-NEXT: vadd.i32 q4, q5, q4
|
||||
; CHECK-NEXT: vmov.f64 d10, d6
|
||||
; CHECK-NEXT: vadd.i32 q1, q4, q1
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
|
||||
; CHECK-NEXT: vmov.f32 s24, s13
|
||||
; CHECK-NEXT: vmov.f32 s21, s15
|
||||
; CHECK-NEXT: vmov.f32 s25, s8
|
||||
; CHECK-NEXT: vmov.f32 s22, s10
|
||||
; CHECK-NEXT: vmov.f32 s26, s11
|
||||
; CHECK-NEXT: vmov.f32 s8, s14
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
|
||||
; CHECK-NEXT: vmov.f32 s10, s16
|
||||
; CHECK-NEXT: vmov.f32 s27, s18
|
||||
; CHECK-NEXT: vmov.f32 s23, s17
|
||||
; CHECK-NEXT: vmov.f32 s11, s19
|
||||
; CHECK-NEXT: vadd.i32 q5, q5, q6
|
||||
; CHECK-NEXT: vadd.i32 q2, q5, q2
|
||||
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #128]
|
||||
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
|
||||
; CHECK-NEXT: vmov.f32 s24, s21
|
||||
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
|
||||
; CHECK-NEXT: vmov.f64 d14, d10
|
||||
; CHECK-NEXT: vstrw.32 q1, [r1]
|
||||
; CHECK-NEXT: vmov.f32 s25, s12
|
||||
; CHECK-NEXT: vmov.f32 s29, s23
|
||||
; CHECK-NEXT: vmov.f32 s26, s15
|
||||
; CHECK-NEXT: vmov.f32 s30, s14
|
||||
; CHECK-NEXT: vmov.f32 s12, s22
|
||||
; CHECK-NEXT: vmov.f32 s14, s16
|
||||
; CHECK-NEXT: vmov.f32 s27, s18
|
||||
; CHECK-NEXT: vmov.f32 s31, s17
|
||||
; CHECK-NEXT: vadd.i32 q6, q7, q6
|
||||
; CHECK-NEXT: vmov.f32 s15, s19
|
||||
; CHECK-NEXT: vadd.i32 q3, q6, q3
|
||||
; CHECK-NEXT: vstrw.32 q3, [r1, #32]
|
||||
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -929,22 +929,22 @@ define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f64 d3, d0
|
||||
; CHECK-NEXT: vmov.f32 s7, s3
|
||||
; CHECK-NEXT: vmov.f32 s16, s9
|
||||
; CHECK-NEXT: vmov.f32 s4, s10
|
||||
; CHECK-NEXT: vmov.f32 s17, s12
|
||||
; CHECK-NEXT: vmov.f32 s9, s11
|
||||
; CHECK-NEXT: vmov.f32 s18, s15
|
||||
; CHECK-NEXT: vmov.f32 s10, s14
|
||||
; CHECK-NEXT: vmov.f32 s19, s2
|
||||
; CHECK-NEXT: vmov.f32 s11, s1
|
||||
; CHECK-NEXT: vmov.f32 s5, s13
|
||||
; CHECK-NEXT: vadd.f32 q0, q2, q4
|
||||
; CHECK-NEXT: vadd.f32 q0, q0, q1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
|
||||
; CHECK-NEXT: vmov.f64 d6, d2
|
||||
; CHECK-NEXT: vmov.f32 s16, s5
|
||||
; CHECK-NEXT: vmov.f32 s13, s7
|
||||
; CHECK-NEXT: vmov.f32 s17, s0
|
||||
; CHECK-NEXT: vmov.f32 s14, s2
|
||||
; CHECK-NEXT: vmov.f32 s18, s3
|
||||
; CHECK-NEXT: vmov.f32 s0, s6
|
||||
; CHECK-NEXT: vmov.f32 s2, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s15, s9
|
||||
; CHECK-NEXT: vadd.f32 q3, q3, q4
|
||||
; CHECK-NEXT: vmov.f32 s3, s11
|
||||
; CHECK-NEXT: vadd.f32 q0, q3, q0
|
||||
; CHECK-NEXT: vstrw.32 q0, [r1]
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -964,39 +964,39 @@ define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
|
||||
; CHECK-NEXT: vmov.f64 d3, d0
|
||||
; CHECK-NEXT: vmov.f32 s7, s3
|
||||
; CHECK-NEXT: vmov.f32 s16, s9
|
||||
; CHECK-NEXT: vmov.f32 s4, s10
|
||||
; CHECK-NEXT: vmov.f32 s17, s12
|
||||
; CHECK-NEXT: vmov.f32 s9, s11
|
||||
; CHECK-NEXT: vmov.f32 s18, s15
|
||||
; CHECK-NEXT: vmov.f32 s10, s14
|
||||
; CHECK-NEXT: vmov.f32 s19, s2
|
||||
; CHECK-NEXT: vmov.f32 s11, s1
|
||||
; CHECK-NEXT: vmov.f32 s5, s13
|
||||
; CHECK-NEXT: vadd.f32 q0, q2, q4
|
||||
; CHECK-NEXT: vadd.f32 q0, q0, q1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
|
||||
; CHECK-NEXT: vmov.f64 d6, d2
|
||||
; CHECK-NEXT: vmov.f32 s16, s5
|
||||
; CHECK-NEXT: vmov.f32 s13, s7
|
||||
; CHECK-NEXT: vmov.f32 s17, s0
|
||||
; CHECK-NEXT: vmov.f32 s14, s2
|
||||
; CHECK-NEXT: vmov.f32 s18, s3
|
||||
; CHECK-NEXT: vmov.f32 s0, s6
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f32 s2, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s15, s9
|
||||
; CHECK-NEXT: vmov.f32 s3, s11
|
||||
; CHECK-NEXT: vadd.f32 q3, q3, q4
|
||||
; CHECK-NEXT: vadd.f32 q0, q3, q0
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f64 d5, d2
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
|
||||
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
|
||||
; CHECK-NEXT: vmov.f32 s11, s7
|
||||
; CHECK-NEXT: vmov.f32 s20, s13
|
||||
; CHECK-NEXT: vmov.f32 s8, s14
|
||||
; CHECK-NEXT: vmov.f32 s21, s16
|
||||
; CHECK-NEXT: vmov.f32 s13, s15
|
||||
; CHECK-NEXT: vmov.f32 s22, s19
|
||||
; CHECK-NEXT: vmov.f32 s14, s18
|
||||
; CHECK-NEXT: vmov.f32 s23, s6
|
||||
; CHECK-NEXT: vmov.f32 s15, s5
|
||||
; CHECK-NEXT: vmov.f32 s9, s17
|
||||
; CHECK-NEXT: vadd.f32 q1, q3, q5
|
||||
; CHECK-NEXT: vadd.f32 q1, q1, q2
|
||||
; CHECK-NEXT: vmov.f32 s16, s13
|
||||
; CHECK-NEXT: vmov.f64 d10, d6
|
||||
; CHECK-NEXT: vmov.f32 s17, s4
|
||||
; CHECK-NEXT: vmov.f32 s21, s15
|
||||
; CHECK-NEXT: vmov.f32 s18, s7
|
||||
; CHECK-NEXT: vmov.f32 s22, s6
|
||||
; CHECK-NEXT: vmov.f32 s4, s14
|
||||
; CHECK-NEXT: vmov.f32 s6, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s23, s9
|
||||
; CHECK-NEXT: vadd.f32 q4, q5, q4
|
||||
; CHECK-NEXT: vmov.f32 s7, s11
|
||||
; CHECK-NEXT: vadd.f32 q1, q4, q1
|
||||
; CHECK-NEXT: vstrw.32 q1, [r1]
|
||||
; CHECK-NEXT: vpop {d8, d9, d10, d11}
|
||||
; CHECK-NEXT: bx lr
|
||||
|
@ -1016,73 +1016,73 @@ define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
|
||||
; CHECK-NEXT: vmov.f64 d3, d0
|
||||
; CHECK-NEXT: vmov.f32 s7, s3
|
||||
; CHECK-NEXT: vmov.f32 s16, s9
|
||||
; CHECK-NEXT: vmov.f32 s4, s10
|
||||
; CHECK-NEXT: vmov.f32 s17, s12
|
||||
; CHECK-NEXT: vmov.f32 s9, s11
|
||||
; CHECK-NEXT: vmov.f32 s18, s15
|
||||
; CHECK-NEXT: vmov.f32 s10, s14
|
||||
; CHECK-NEXT: vmov.f32 s19, s2
|
||||
; CHECK-NEXT: vmov.f32 s11, s1
|
||||
; CHECK-NEXT: vmov.f32 s5, s13
|
||||
; CHECK-NEXT: vadd.f32 q0, q2, q4
|
||||
; CHECK-NEXT: vadd.f32 q0, q0, q1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f64 d5, d2
|
||||
; CHECK-NEXT: vmov.f32 s11, s7
|
||||
; CHECK-NEXT: vmov.f32 s20, s13
|
||||
; CHECK-NEXT: vmov.f32 s8, s14
|
||||
; CHECK-NEXT: vmov.f32 s21, s16
|
||||
; CHECK-NEXT: vmov.f32 s13, s15
|
||||
; CHECK-NEXT: vmov.f32 s22, s19
|
||||
; CHECK-NEXT: vmov.f32 s14, s18
|
||||
; CHECK-NEXT: vmov.f32 s23, s6
|
||||
; CHECK-NEXT: vmov.f32 s15, s5
|
||||
; CHECK-NEXT: vmov.f32 s9, s17
|
||||
; CHECK-NEXT: vadd.f32 q1, q3, q5
|
||||
; CHECK-NEXT: vadd.f32 q1, q1, q2
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
|
||||
; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
|
||||
; CHECK-NEXT: vmov.f64 d7, d4
|
||||
; CHECK-NEXT: vmov.f32 s15, s11
|
||||
; CHECK-NEXT: vmov.f32 s24, s17
|
||||
; CHECK-NEXT: vmov.f32 s12, s18
|
||||
; CHECK-NEXT: vmov.f32 s25, s20
|
||||
; CHECK-NEXT: vmov.f32 s17, s19
|
||||
; CHECK-NEXT: vmov.f32 s26, s23
|
||||
; CHECK-NEXT: vmov.f32 s18, s22
|
||||
; CHECK-NEXT: vmov.f32 s27, s10
|
||||
; CHECK-NEXT: vmov.f32 s19, s9
|
||||
; CHECK-NEXT: vmov.f32 s13, s21
|
||||
; CHECK-NEXT: vadd.f32 q2, q4, q6
|
||||
; CHECK-NEXT: vadd.f32 q2, q2, q3
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
|
||||
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
|
||||
; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
|
||||
; CHECK-NEXT: vmov.f64 d9, d6
|
||||
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
|
||||
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
|
||||
; CHECK-NEXT: vstrw.32 q1, [r1]
|
||||
; CHECK-NEXT: vmov.f32 s19, s15
|
||||
; CHECK-NEXT: vmov.f32 s28, s21
|
||||
; CHECK-NEXT: vmov.f32 s16, s22
|
||||
; CHECK-NEXT: vmov.f32 s29, s24
|
||||
; CHECK-NEXT: vmov.f32 s21, s23
|
||||
; CHECK-NEXT: vmov.f32 s30, s27
|
||||
; CHECK-NEXT: vmov.f32 s22, s26
|
||||
; CHECK-NEXT: vmov.f32 s31, s14
|
||||
; CHECK-NEXT: vmov.f32 s23, s13
|
||||
; CHECK-NEXT: vmov.f32 s17, s25
|
||||
; CHECK-NEXT: vadd.f32 q3, q5, q7
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
|
||||
; CHECK-NEXT: vmov.f64 d6, d2
|
||||
; CHECK-NEXT: vmov.f32 s16, s5
|
||||
; CHECK-NEXT: vmov.f32 s13, s7
|
||||
; CHECK-NEXT: vmov.f32 s17, s0
|
||||
; CHECK-NEXT: vmov.f32 s14, s2
|
||||
; CHECK-NEXT: vmov.f32 s18, s3
|
||||
; CHECK-NEXT: vmov.f32 s0, s6
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
|
||||
; CHECK-NEXT: vmov.f32 s2, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s15, s9
|
||||
; CHECK-NEXT: vmov.f32 s3, s11
|
||||
; CHECK-NEXT: vadd.f32 q3, q3, q4
|
||||
; CHECK-NEXT: vadd.f32 q0, q3, q0
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
|
||||
; CHECK-NEXT: vmov.f32 s16, s13
|
||||
; CHECK-NEXT: vmov.f64 d10, d6
|
||||
; CHECK-NEXT: vmov.f32 s17, s4
|
||||
; CHECK-NEXT: vmov.f32 s21, s15
|
||||
; CHECK-NEXT: vmov.f32 s18, s7
|
||||
; CHECK-NEXT: vmov.f32 s22, s6
|
||||
; CHECK-NEXT: vmov.f32 s4, s14
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
|
||||
; CHECK-NEXT: vmov.f32 s6, s8
|
||||
; CHECK-NEXT: vmov.f32 s19, s10
|
||||
; CHECK-NEXT: vmov.f32 s23, s9
|
||||
; CHECK-NEXT: vmov.f32 s7, s11
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
|
||||
; CHECK-NEXT: vadd.f32 q4, q5, q4
|
||||
; CHECK-NEXT: vmov.f64 d10, d6
|
||||
; CHECK-NEXT: vadd.f32 q1, q4, q1
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
|
||||
; CHECK-NEXT: vmov.f32 s24, s13
|
||||
; CHECK-NEXT: vmov.f32 s21, s15
|
||||
; CHECK-NEXT: vmov.f32 s25, s8
|
||||
; CHECK-NEXT: vmov.f32 s22, s10
|
||||
; CHECK-NEXT: vmov.f32 s26, s11
|
||||
; CHECK-NEXT: vmov.f32 s8, s14
|
||||
; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
|
||||
; CHECK-NEXT: vmov.f32 s10, s16
|
||||
; CHECK-NEXT: vmov.f32 s27, s18
|
||||
; CHECK-NEXT: vmov.f32 s23, s17
|
||||
; CHECK-NEXT: vmov.f32 s11, s19
|
||||
; CHECK-NEXT: vadd.f32 q5, q5, q6
|
||||
; CHECK-NEXT: vadd.f32 q2, q5, q2
|
||||
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r0, #128]
|
||||
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
|
||||
; CHECK-NEXT: vmov.f32 s24, s21
|
||||
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
|
||||
; CHECK-NEXT: vmov.f64 d14, d10
|
||||
; CHECK-NEXT: vstrw.32 q1, [r1]
|
||||
; CHECK-NEXT: vmov.f32 s25, s12
|
||||
; CHECK-NEXT: vmov.f32 s29, s23
|
||||
; CHECK-NEXT: vmov.f32 s26, s15
|
||||
; CHECK-NEXT: vmov.f32 s30, s14
|
||||
; CHECK-NEXT: vmov.f32 s12, s22
|
||||
; CHECK-NEXT: vmov.f32 s14, s16
|
||||
; CHECK-NEXT: vmov.f32 s27, s18
|
||||
; CHECK-NEXT: vmov.f32 s31, s17
|
||||
; CHECK-NEXT: vadd.f32 q6, q7, q6
|
||||
; CHECK-NEXT: vmov.f32 s15, s19
|
||||
; CHECK-NEXT: vadd.f32 q3, q6, q3
|
||||
; CHECK-NEXT: vstrw.32 q3, [r1, #32]
|
||||
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
|
||||
; CHECK-NEXT: bx lr
|
||||
|
|
|
@ -1247,38 +1247,36 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
|
|||
; SSE42: # %bb.0:
|
||||
; SSE42-NEXT: movups 80(%rdi), %xmm8
|
||||
; SSE42-NEXT: movdqu 64(%rdi), %xmm9
|
||||
; SSE42-NEXT: movdqu (%rdi), %xmm4
|
||||
; SSE42-NEXT: movdqu (%rdi), %xmm3
|
||||
; SSE42-NEXT: movdqu 16(%rdi), %xmm2
|
||||
; SSE42-NEXT: movups 32(%rdi), %xmm10
|
||||
; SSE42-NEXT: movdqu 48(%rdi), %xmm5
|
||||
; SSE42-NEXT: movdqa %xmm2, %xmm6
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
|
||||
; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
|
||||
; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[1]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
|
||||
; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3]
|
||||
; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[1]
|
||||
; SSE42-NEXT: movdqa %xmm9, %xmm1
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
|
||||
; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3]
|
||||
; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6,7]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm4[6,7]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,2,2]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm3[6,7]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7]
|
||||
; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7]
|
||||
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3]
|
||||
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
|
||||
; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,3]
|
||||
; SSE42-NEXT: movups %xmm5, 16(%rsi)
|
||||
; SSE42-NEXT: movups %xmm4, (%rsi)
|
||||
; SSE42-NEXT: movdqu %xmm3, 16(%rdx)
|
||||
; SSE42-NEXT: movups %xmm3, (%rsi)
|
||||
; SSE42-NEXT: movdqu %xmm4, 16(%rdx)
|
||||
; SSE42-NEXT: movdqu %xmm6, (%rdx)
|
||||
; SSE42-NEXT: movdqu %xmm2, 16(%rcx)
|
||||
; SSE42-NEXT: movdqu %xmm1, (%rcx)
|
||||
; SSE42-NEXT: movups %xmm0, 16(%rcx)
|
||||
; SSE42-NEXT: movups %xmm7, (%rcx)
|
||||
; SSE42-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: interleave_24i32_out:
|
||||
|
|
|
@ -14,38 +14,39 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
|
|||
; CHECK-NEXT: vmovaps %ymm4, %ymm10
|
||||
; CHECK-NEXT: vmovaps %ymm3, %ymm9
|
||||
; CHECK-NEXT: vmovaps %ymm1, %ymm8
|
||||
; CHECK-NEXT: vmovaps %ymm0, %ymm4
|
||||
; CHECK-NEXT: vmovaps %ymm0, %ymm3
|
||||
; CHECK-NEXT: vmovaps 240(%rbp), %ymm1
|
||||
; CHECK-NEXT: vmovaps 208(%rbp), %ymm3
|
||||
; CHECK-NEXT: vmovaps 208(%rbp), %ymm4
|
||||
; CHECK-NEXT: vmovaps 176(%rbp), %ymm0
|
||||
; CHECK-NEXT: vmovaps 144(%rbp), %ymm0
|
||||
; CHECK-NEXT: vmovaps 112(%rbp), %ymm11
|
||||
; CHECK-NEXT: vmovaps 80(%rbp), %ymm11
|
||||
; CHECK-NEXT: vmovaps 48(%rbp), %ymm11
|
||||
; CHECK-NEXT: vmovaps 16(%rbp), %ymm11
|
||||
; CHECK-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7]
|
||||
; CHECK-NEXT: vmovaps %xmm3, %xmm8
|
||||
; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7]
|
||||
; CHECK-NEXT: vmovaps %xmm4, %xmm6
|
||||
; CHECK-NEXT: # implicit-def: $ymm2
|
||||
; CHECK-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2
|
||||
; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
|
||||
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
|
||||
; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
|
||||
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
|
||||
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
|
||||
; CHECK-NEXT: vmovaps %xmm7, %xmm2
|
||||
; CHECK-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
|
||||
; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm2[0],zero
|
||||
; CHECK-NEXT: # implicit-def: $ymm2
|
||||
; CHECK-NEXT: vmovaps %xmm4, %xmm2
|
||||
; CHECK-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
|
||||
; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
|
||||
; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
|
||||
; CHECK-NEXT: vmovaps %xmm6, %xmm2
|
||||
; CHECK-NEXT: # kill: def $xmm3 killed $xmm3 killed $ymm3
|
||||
; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
|
||||
; CHECK-NEXT: vmovaps %xmm7, %xmm3
|
||||
; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
|
||||
; CHECK-NEXT: # implicit-def: $ymm3
|
||||
; CHECK-NEXT: vmovaps %xmm6, %xmm3
|
||||
; CHECK-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
|
||||
; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
|
||||
; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
|
||||
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
|
||||
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
|
||||
; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,0,1,4,5,4,5]
|
||||
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
|
||||
; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm4 = xmm2[0],zero
|
||||
; CHECK-NEXT: # implicit-def: $ymm2
|
||||
; CHECK-NEXT: vmovaps %xmm4, %xmm2
|
||||
; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm6[0,1]
|
||||
; CHECK-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]
|
||||
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
|
||||
; CHECK-NEXT: movq %rbp, %rsp
|
||||
; CHECK-NEXT: popq %rbp
|
||||
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
|
||||
|
|
Loading…
Reference in New Issue