[DAG] Fold shuffle(bop(shuffle(x,y),shuffle(z,w)),undef) -> bop(shuffle'(x,y),shuffle'(z,w))

Follow-up to D96345: handle unary shuffles of binops (as well as binary shuffles) when we can merge the outer shuffle with the inner operand shuffles.

Differential Revision: https://reviews.llvm.org/D98646
Simon Pilgrim 2021-03-19 13:34:47 +00:00
parent 2ec9239a7b
commit ffb2887103
4 changed files with 25 additions and 35 deletions
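
As a rough illustration of the fold in the commit title (a sketch only, not code from this commit; mergeMasks and its parameters are hypothetical): with an undef second operand, each lane of the outer shuffle mask either picks a lane of an inner shuffle's result or is undef, so the two masks compose into a single mask that applies directly to the inner shuffle's sources.

#include <vector>

// Compose OuterMask (from shuffle(inner, undef, OuterMask)) with InnerMask
// (from inner = shuffle(x, y, InnerMask)) into one mask over x and y.
std::vector<int> mergeMasks(const std::vector<int> &OuterMask,
                            const std::vector<int> &InnerMask) {
  std::vector<int> Merged;
  Merged.reserve(OuterMask.size());
  for (int M : OuterMask) {
    // Undef lanes, and lanes taken from the undef second operand of the
    // outer shuffle, stay undef (-1) in the merged mask.
    if (M < 0 || M >= static_cast<int>(InnerMask.size()))
      Merged.push_back(-1);
    else
      Merged.push_back(InnerMask[M]); // look through the inner shuffle
  }
  return Merged;
}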


@@ -21255,14 +21255,17 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// Merge shuffles through binops if we are able to merge it with at least
// one other shuffle.
// shuffle(bop(shuffle(x,y),shuffle(z,w)),undef)
// shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
unsigned SrcOpcode = N0.getOpcode();
if (SrcOpcode == N1.getOpcode() && TLI.isBinOp(SrcOpcode) &&
N->isOnlyUserOf(N0.getNode()) && N->isOnlyUserOf(N1.getNode())) {
if (TLI.isBinOp(SrcOpcode) && N->isOnlyUserOf(N0.getNode()) &&
(N1.isUndef() ||
(SrcOpcode == N1.getOpcode() && N->isOnlyUserOf(N1.getNode())))) {
// Get binop source ops, or just pass on the undef.
SDValue Op00 = N0.getOperand(0);
SDValue Op10 = N1.getOperand(0);
SDValue Op01 = N0.getOperand(1);
SDValue Op11 = N1.getOperand(1);
SDValue Op10 = N1.isUndef() ? N1 : N1.getOperand(0);
SDValue Op11 = N1.isUndef() ? N1 : N1.getOperand(1);
// TODO: We might be able to relax the VT check but we don't currently
// have any isBinOp() that has different result/ops VTs so play safe until
// we have test coverage.
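
The relaxed guard above can be restated as a standalone predicate (a hypothetical helper for illustration, not the DAGCombiner API): the second shuffle operand may now be undef instead of having to be a matching single-use binop.

// Returns true when the shuffle may be merged through the binop operands.
// Previously both operands had to be the same single-use binop opcode.
bool canMergeShuffleThroughBinOp(bool IsBinOp, bool N0HasOneUse,
                                 bool N1IsUndef, bool OpcodesMatch,
                                 bool N1HasOneUse) {
  return IsBinOp && N0HasOneUse &&
         (N1IsUndef || (OpcodesMatch && N1HasOneUse));
}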


@@ -123,26 +123,25 @@ define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) {
define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse3_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: haddps %xmm2, %xmm4
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm4, %xmm1
; SSE-NEXT: haddps %xmm1, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT: haddps %xmm0, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse3_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse3_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
%shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>


@@ -525,7 +525,6 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_v8i32b:
@@ -615,7 +614,6 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_v8i32b:
@@ -705,7 +703,6 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_v16i16b:
@@ -795,7 +792,6 @@ define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_v16i16b:


@@ -513,9 +513,8 @@ define <4 x i32> @signbits_mask_ashr_smax(<4 x i32> %a0, <4 x i32> %a1) {
; X86: # %bb.0:
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
@@ -523,9 +522,8 @@ define <4 x i32> @signbits_mask_ashr_smax(<4 x i32> %a0, <4 x i32> %a1) {
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -553,9 +551,8 @@ define <4 x i32> @signbits_mask_ashr_smin(<4 x i32> %a0, <4 x i32> %a1) {
; X86: # %bb.0:
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
@@ -563,9 +560,8 @@ define <4 x i32> @signbits_mask_ashr_smin(<4 x i32> %a0, <4 x i32> %a1) {
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -593,9 +589,8 @@ define <4 x i32> @signbits_mask_ashr_umax(<4 x i32> %a0, <4 x i32> %a1) {
; X86: # %bb.0:
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
@@ -603,9 +598,8 @@ define <4 x i32> @signbits_mask_ashr_umax(<4 x i32> %a0, <4 x i32> %a1) {
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -633,9 +627,8 @@ define <4 x i32> @signbits_mask_ashr_umin(<4 x i32> %a0, <4 x i32> %a1) {
; X86: # %bb.0:
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
@@ -643,9 +636,8 @@ define <4 x i32> @signbits_mask_ashr_umin(<4 x i32> %a0, <4 x i32> %a1) {
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;