[X86][SSE] Add support for target shuffle combining to PSHUFLW/PSHUFHW

llvm-svn: 275022
Simon Pilgrim 2016-07-10 21:02:47 +00:00
parent 9bedcdb5f5
commit 2191faa433
4 changed files with 63 additions and 22 deletions


@@ -4192,6 +4192,16 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}
+
+/// Return true if every element in Mask is undef or if its value
+/// falls within the specified range [Low, Hi).
+static bool isUndefOrInRange(ArrayRef<int> Mask,
+                             int Low, int Hi) {
+  for (int M : Mask)
+    if (!isUndefOrInRange(M, Low, Hi))
+      return false;
+  return true;
+}

/// Val is either less than zero (undef) or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return (Val < 0 || Val == CmpVal);
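
The new ArrayRef overload keeps the same [Low, Hi) semantics as the scalar isUndefOrInRange above, with negative elements treated as undef. A minimal standalone sketch of that behaviour (illustrative values only, not part of the patch; assumes the LLVM ADT headers are available):

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

// Same [Low, Hi) check as the helpers above; negative elements count as undef.
static bool undefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}
static bool undefOrInRange(llvm::ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!undefOrInRange(M, Low, Hi))
      return false;
  return true;
}

int main() {
  int LoHalf[4] = {1, 0, -1, 2}; // -1 plays the role of SM_SentinelUndef
  int HiHalf[4] = {5, 4, 7, 6};
  assert(undefOrInRange(LoHalf, 0, 4));  // every lane undef or in [0, 4)
  assert(!undefOrInRange(HiHalf, 0, 4)); // 5..7 fall outside [0, 4)
  assert(undefOrInRange(HiHalf, 4, 8));
  return 0;
}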
@@ -24834,12 +24844,47 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
           "Expected unary shuffle");
  }

-  // We only support permutation of 32/64 bit elements.
-  // TODO - support PSHUFLW/PSHUFHW.
  unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
+  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+
+  // Handle PSHUFLW/PSHUFHW repeated patterns.
+  if (MaskScalarSizeInBits == 16) {
+    SmallVector<int, 4> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+      ArrayRef<int> LoMask(Mask.data() + 0, 4);
+      ArrayRef<int> HiMask(Mask.data() + 4, 4);
+
+      // PSHUFLW: permute lower 4 elements only.
+      if (isUndefOrInRange(LoMask, 0, 4) &&
+          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+        Shuffle = X86ISD::PSHUFLW;
+        ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+        PermuteImm = getV4X86ShuffleImm(LoMask);
+        return true;
+      }
+
+      // PSHUFHW: permute upper 4 elements only.
+      if (isUndefOrInRange(HiMask, 4, 8) &&
+          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+        // Offset the HiMask so that we can create the shuffle immediate.
+        int OffsetHiMask[4];
+        for (int i = 0; i != 4; ++i)
+          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+
+        Shuffle = X86ISD::PSHUFHW;
+        ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
+        return true;
+      }
+
+      return false;
+    }
+    return false;
+  }
+
+  // We only support permutation of 32/64 bit elements after this.
  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
    return false;
-  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
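
As a worked example of the new 16-bit path (a sketch, not part of the patch): for the repeated 8 x i16 mask <0,1,2,3,5,4,7,6> exercised by the tests below, LoMask = {0,1,2,3} is the identity sequence and HiMask = {5,4,7,6} stays within [4,8), so the PSHUFHW branch fires. Assuming getV4X86ShuffleImm packs two bits per lane with undef lanes kept at their identity index (as the existing 32-bit PSHUFD path does), the offset HiMask {1,0,3,2} produces the immediate computed here:

#include <cassert>

// Hypothetical stand-in for getV4X86ShuffleImm: two bits per lane,
// undef (negative) lanes kept at their identity index.
static unsigned packV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(Mask[i] < 0 ? i : Mask[i]) << (2 * i);
  return Imm;
}

int main() {
  int HiMask[4] = {5, 4, 7, 6};
  int OffsetHiMask[4];
  for (int i = 0; i != 4; ++i)
    OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); // {1,0,3,2}
  // 1 | 0<<2 | 3<<4 | 2<<6 == 0xB1: a PSHUFHW immediate that swaps each pair
  // of words in the upper half while leaving the lower half untouched.
  assert(packV4ShuffleImm(OffsetHiMask) == 0xB1);
  return 0;
}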


@@ -229,7 +229,7 @@ define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshuflw:
; CHECK: # BB#0:
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,8,9,10,11,12,13,14,15,18,19,16,17,22,23,20,21,24,25,26,27,28,29,30,31]
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; CHECK-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
ret <32 x i8> %res0
@@ -238,14 +238,14 @@ define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK: # BB#0:
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,10,11,8,9,14,15,12,13,16,17,18,19,20,21,22,23,26,27,24,25,30,31,28,29]
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; CHECK-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
ret <32 x i8> %res0
}
-define <32 x i8> @combine_pshufb_as_pshufw(<32 x i8> %a0) {
-; CHECK-LABEL: combine_pshufb_as_pshufw:
+define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_not_as_pshufw:
; CHECK: # BB#0:
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; CHECK-NEXT: retq
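
The rename reflects that, viewed as 8 x i16, this byte mask is <1,0,3,2,5,4,7,6>: it permutes both halves, so neither the PSHUFLW pattern (the high half must stay in sequence) nor the PSHUFHW pattern (the low half must stay in sequence) applies, and the single pshufb is kept rather than being split into a pshuflw + pshufhw pair. A small sketch of those two failed checks, using hypothetical helpers that mirror isUndefOrInRange/isSequentialOrUndefInRange (not the real signatures):

#include <cassert>

// Hypothetical mirrors of the matcher's range/sequence checks on 4 lanes.
static bool allInRange(const int *M, int Low, int Hi) {
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 0 && (M[i] < Low || M[i] >= Hi))
      return false;
  return true;
}
static bool sequentialFrom(const int *M, int Base) {
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 0 && M[i] != Base + i)
      return false;
  return true;
}

int main() {
  int Lo[4] = {1, 0, 3, 2}, Hi[4] = {5, 4, 7, 6};
  assert(!(allInRange(Lo, 0, 4) && sequentialFrom(Hi, 4))); // not a PSHUFLW
  assert(!(allInRange(Hi, 4, 8) && sequentialFrom(Lo, 0))); // not a PSHUFHW
  return 0;
}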


@@ -476,8 +476,7 @@ define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
; CHECK-LABEL: combine_permvar_as_pshuflw:
; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
-; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
; CHECK-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
ret <32 x i16> %res0
@@ -486,8 +485,7 @@ define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
-; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; CHECK-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)
ret <32 x i16> %res0
@@ -496,10 +494,8 @@ define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
define <32 x i16> @combine_pshufb_as_pshufw(<32 x i16> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufw:
; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
-; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
-; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; CHECK-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %res0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)


@@ -167,12 +167,12 @@ define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,8,9,10,11,12,13,14,15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
ret <16 x i8> %res0
@@ -181,24 +181,24 @@ define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,10,11,8,9,14,15,12,13]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,10,11,8,9,14,15,12,13]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
ret <16 x i8> %res0
}
-define <16 x i8> @combine_pshufb_as_pshufw(<16 x i8> %a0) {
-; SSE-LABEL: combine_pshufb_as_pshufw:
+define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_not_as_pshufw:
; SSE: # BB#0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_pshufb_as_pshufw:
+; AVX-LABEL: combine_pshufb_not_as_pshufw:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX-NEXT: retq