From 3d881a02309db79f1e3f79ec5fa872176a629920 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 19 Jun 2016 18:03:52 +0000
Subject: [PATCH] [X86][SSE] Allow target shuffle combining to match masks with
 SM_Sentinel values

We currently only allow exact matches of shuffle mask patterns during target
shuffle combining. This patch relaxes this to permit SM_SentinelUndef in the
combined shuffle to always be accepted as well as allowing exact matching of
the SM_SentinelZero value.

I've adjusted some tests that were requiring exact shuffle masks to now
include undef values.

Differential Revision: http://reviews.llvm.org/D21495

llvm-svn: 273119
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 70 +++++++++++++------
 llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll    |  2 +-
 .../X86/vector-shuffle-combining-avx.ll       |  8 +--
 .../X86/vector-shuffle-combining-avx512bw.ll  | 16 ++---
 .../X86/vector-shuffle-combining-ssse3.ll     |  8 +--
 .../X86/vector-shuffle-combining-xop.ll       |  2 +-
 6 files changed, 66 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da9c13e0274d..3a7940cf7d61 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7123,6 +7123,31 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
   return true;
 }
 
+/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
+///
+/// The masks must be exactly the same width.
+///
+/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
+/// value in ExpectedMask is always accepted. Otherwise the indices must match.
+///
+/// SM_SentinelZero is accepted as a valid negative index but must match in both.
+static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
+                                      ArrayRef<int> ExpectedMask) {
+  int Size = Mask.size();
+  if (Size != ExpectedMask.size())
+    return false;
+
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] == SM_SentinelUndef)
+      continue;
+    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
+      return false;
+    else if (Mask[i] != ExpectedMask[i])
+      return false;
+
+  return true;
+}
+
 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
 ///
 /// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -24541,15 +24566,14 @@ static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
 // Attempt to match a combined shuffle mask against supported unary shuffle
 // instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
-// TODO: Investigate using isShuffleEquivalent() instead of Mask.equals().
 static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &ShuffleVT) {
   bool FloatDomain = SrcVT.isFloatingPoint();
   // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
-  if (!FloatDomain && SrcVT.is128BitVector() && Mask.size() == 2 &&
-      Mask[0] == 0 && Mask[1] < 0) {
+  if (!FloatDomain && SrcVT.is128BitVector() &&
+      isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
     Shuffle = X86ISD::VZEXT_MOVL;
     ShuffleVT = MVT::v2i64;
     return true;
   }
@@ -24562,17 +24586,17 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
   if (SrcVT.is128BitVector() && Subtarget.hasSSE3()) {
-    if (Mask.equals({0, 0})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
       ShuffleVT = MVT::v2f64;
       return true;
     }
-    if (Mask.equals({0, 0, 2, 2})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVSLDUP;
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (Mask.equals({1, 1, 3, 3})) {
+    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
       Shuffle = X86ISD::MOVSHDUP;
       ShuffleVT = MVT::v4f32;
       return true;
@@ -24581,17 +24605,17 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
 
   if (SrcVT.is256BitVector()) {
     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
-    if (Mask.equals({0, 0, 2, 2})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVDDUP;
       ShuffleVT = MVT::v4f64;
       return true;
     }
-    if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
       Shuffle = X86ISD::MOVSLDUP;
       ShuffleVT = MVT::v8f32;
       return true;
     }
-    if (Mask.equals({1, 1, 3, 3, 5, 5, 7, 7})) {
+    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
       Shuffle = X86ISD::MOVSHDUP;
       ShuffleVT = MVT::v8f32;
       return true;
@@ -24601,17 +24625,19 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
   if (SrcVT.is512BitVector()) {
     assert(Subtarget.hasAVX512() &&
            "AVX512 required for 512-bit vector shuffles");
-    if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
       Shuffle = X86ISD::MOVDDUP;
       ShuffleVT = MVT::v8f64;
       return true;
     }
-    if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+    if (isTargetShuffleEquivalent(
+            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
       Shuffle = X86ISD::MOVSLDUP;
       ShuffleVT = MVT::v16f32;
       return true;
     }
-    if (Mask.equals({1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+    if (isTargetShuffleEquivalent(
+            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
       Shuffle = X86ISD::MOVSHDUP;
       ShuffleVT = MVT::v16f32;
       return true;
@@ -24624,41 +24650,41 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
 // Attempt to match a combined unary shuffle mask against supported binary
 // shuffle instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
-// TODO: Investigate using isShuffleEquivalent() instead of Mask.equals().
 static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
                                      unsigned &Shuffle, MVT &ShuffleVT) {
   bool FloatDomain = SrcVT.isFloatingPoint();
 
   if (SrcVT.is128BitVector()) {
-    if (Mask.equals({0, 0}) && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
       Shuffle = X86ISD::MOVLHPS;
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (Mask.equals({1, 1}) && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
       Shuffle = X86ISD::MOVHLPS;
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (Mask.equals({0, 0, 1, 1}) && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
       Shuffle = X86ISD::UNPCKL;
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (Mask.equals({2, 2, 3, 3}) && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
       Shuffle = X86ISD::UNPCKH;
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
-        Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
+        isTargetShuffleEquivalent(
+            Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
       Shuffle = X86ISD::UNPCKL;
       ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
       return true;
     }
-    if (Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
-        Mask.equals(
-            {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15})) {
+    if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
+        isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
+                                         13, 14, 14, 15, 15})) {
       Shuffle = X86ISD::UNPCKH;
       ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
       return true;
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index 79317e4576b9..3f47d987aeda 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -388,7 +388,7 @@ define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %A, i32 1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 39288424cb34..3ab7d0be530b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -51,7 +51,7 @@ define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; ALL-NEXT:    retq
-  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> )
+  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> )
   ret <4 x float> %1
 }
 
@@ -60,7 +60,7 @@ define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; ALL-NEXT:    retq
-  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> )
+  %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> )
   ret <4 x float> %1
 }
 
@@ -86,7 +86,7 @@ define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
 ; ALL-LABEL: combine_vpermilvar_8f32_identity:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    retq
-  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> )
+  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> )
   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> )
   ret <8 x float> %2
 }
@@ -114,7 +114,7 @@ define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
 ; ALL-NEXT:    retq
-  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> )
+  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> )
   ret <8 x float> %1
 }
 
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 7a6e81828962..658aab594f49 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -37,7 +37,7 @@ define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x doub
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
-  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %x0, <8 x double> %x1, i8 -1)
+  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %x0, <8 x double> %x1, i8 -1)
   ret <8 x double> %res0
 }
 define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
@@ -63,8 +63,8 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1)
 ; CHECK-LABEL: combine_vpermt2var_8i64_identity:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    retq
-  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1)
-  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 -1)
+  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1)
+  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 -1)
   ret <8 x i64> %res1
 }
 define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
@@ -198,7 +198,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
-  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m)
+  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m)
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
@@ -208,7 +208,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %x0 = load <16 x float>, <16 x float> *%p0
-  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m)
+  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m)
   ret <16 x float> %res0
 }
 
@@ -216,8 +216,8 @@ define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32>
 ; CHECK-LABEL: combine_vpermt2var_16i32_identity:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    retq
-  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1)
-  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %res0, <16 x i32> %res0, i16 -1)
+  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1)
+  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %res0, <16 x i32> %res0, i16 -1)
   ret <16 x i32> %res1
 }
 define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
@@ -261,7 +261,7 @@ define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    retq
   %select = bitcast <8 x i64> to <64 x i8>
-  %mask = bitcast <16 x i32> to <64 x i8>
+  %mask = bitcast <16 x i32> to <64 x i8>
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
   ret <64 x i8> %res1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 54af924d2af8..e893d9f82628 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -140,12 +140,12 @@ define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
 define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,u,u,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,u,u,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX-NEXT:    retq
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %1
@@ -154,12 +154,12 @@ define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
 define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,10,11,10,11,12,13,12,13,14,15,u,u]
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,10,11,10,11,12,13,12,13,14,15,u,u]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; AVX-NEXT:    retq
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index f39aa4e93e79..76226065fd7c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -119,7 +119,7 @@ define <16 x i8> @combine_vpperm_as_unary_unpckhwd(<16 x i8> %a0, <16 x i8> %a1)
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; CHECK-NEXT:    retq
-  %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> )
+  %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
 }
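
For readers who want the matching rule outside the LLVM tree, here is a minimal
standalone C++ sketch of the relaxed comparison that the new
isTargetShuffleEquivalent() helper applies during target shuffle combining.
The names matchesPattern, SentinelUndef and SentinelZero are illustrative
stand-ins for this sketch only (not LLVM identifiers); the sentinel values
simply mirror SM_SentinelUndef (-1) and SM_SentinelZero (-2).

// Standalone illustration of the relaxed shuffle-mask matching rule.
#include <cassert>
#include <cstddef>
#include <vector>

// Sentinel values mirroring the X86 shuffle combiner's conventions:
// -1 means "undef/don't care", -2 means "force this element to zero".
constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// Returns true if Mask matches ExpectedMask under the relaxed rule:
//  * an undef element in Mask matches anything in ExpectedMask,
//  * a zero sentinel must appear at the same position in both masks,
//  * any other (non-negative) index must match exactly.
static bool matchesPattern(const std::vector<int> &Mask,
                           const std::vector<int> &ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  for (std::size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SentinelUndef)
      continue; // undef acts as a wildcard
    if (Mask[i] < 0 && Mask[i] != SentinelZero)
      return false; // unknown negative sentinel
    if (Mask[i] != ExpectedMask[i])
      return false; // indices (or zero sentinels) must line up
  }
  return true;
}

int main() {
  // {undef,1,undef,3} now matches the MOVSHDUP pattern {1,1,3,3} ...
  assert(matchesPattern({SentinelUndef, 1, SentinelUndef, 3}, {1, 1, 3, 3}));
  // ... while a concrete mismatch still fails.
  assert(!matchesPattern({0, 1, 3, 3}, {1, 1, 3, 3}));
  // A zeroed element only matches if the pattern expects zero there too.
  assert(matchesPattern({0, SentinelZero}, {0, SentinelZero}));
  assert(!matchesPattern({0, SentinelZero}, {0, 1}));
  return 0;
}

Treating undef elements as wildcards is what lets the combiner accept the
adjusted tests above, where some mask elements were replaced with undef.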