From 4cf1da0e021bbd05665e8dbc7eebb94a5f1ea461 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 18:05:34 +0000 Subject: [PATCH] [SelectionDAG] allow FP binops in SimplifyDemandedVectorElts This is intended to make the backend on par with functionality that was added to the IR version of SimplifyDemandedVectorElts in: rL343727 ...and the original motivation is that we need to improve demanded-vector-elements in several ways to avoid problems that would be exposed in D51553. Differential Revision: https://reviews.llvm.org/D52912 llvm-svn: 344541 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 7 ++++- llvm/test/CodeGen/X86/avx512-hadd-hsub.ll | 30 +++++++++---------- .../X86/avx512-intrinsics-fast-isel.ll | 16 +++++----- .../CodeGen/X86/vector-shuffle-combining.ll | 4 +-- 4 files changed, 30 insertions(+), 27 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index d3a50788f79c..150d22cffa75 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1712,7 +1712,12 @@ bool TargetLowering::SimplifyDemandedVectorElts( break; } case ISD::ADD: - case ISD::SUB: { + case ISD::SUB: + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: { APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef, SrcZero, TLO, Depth + 1)) diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll index 510553b56d46..aed182179cfe 100644 --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -178,16 +178,16 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) { define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fadd_noundef_low: ; KNL: # %bb.0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fadd_noundef_low: ; SKX: # %bb.0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; SKX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; SKX-NEXT: retq @@ -252,17 +252,15 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) { define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fadd_noundef_eel: ; KNL: # %bb.0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; KNL-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fadd_noundef_eel: ; SKX: # %bb.0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -278,18 +276,18 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) { define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fsub_noundef_ee: ; KNL: # %bb.0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; KNL-NEXT: vsubpd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm0 +; KNL-NEXT: vbroadcastsd %xmm0, %zmm0 +; KNL-NEXT: vsubpd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 ; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; KNL-NEXT: retq ; ; SKX-LABEL: fsub_noundef_ee: ; SKX: # %bb.0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; SKX-NEXT: vsubpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0 +; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 +; SKX-NEXT: vsubpd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0 ; SKX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; SKX-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index fa37d2148f26..aa89ee7c3906 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -7304,7 +7304,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) { ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -7321,7 +7321,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) { ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7354,7 +7354,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) { ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -7371,7 +7371,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) { ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7516,7 +7516,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -7535,7 +7535,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7573,7 +7573,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -7593,7 +7593,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 2eb9362947e2..01e366814005 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2703,7 +2703,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: PR22377: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; SSE-NEXT: addps %xmm0, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2711,7 +2711,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { ; ; AVX-LABEL: PR22377: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]