AVX-512: Implemented DAG lowering for shuff62x2/shufi62x2 instuctions ( Shuffle Packed Values at 128-bit Granularity )

Tests added , vector-shuffle-512-v8.ll test re-generated. Differential Revision: http://reviews.llvm.org/D10300 llvm-svn: 239697
2015-06-14 13:07:47 +00:00 · 2015-06-14 13:07:47 +00:00 · 5e49697138
parent ce1ce989e2
commit 5e49697138
3 changed files with 88 additions and 4 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -9383,6 +9383,30 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                     DAG.getConstant(PermMask, DL, MVT::i8));
 }

+/// \brief Handle lowering 4-lane 128-bit shuffles.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+                                        SDValue V2, ArrayRef<int> WidenedMask,
+                                        SelectionDAG &DAG) {
+
+  assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!");
+  // form a 128-bit permutation.
+  // convert the 64-bit shuffle mask selection values into 128-bit selection
+  // bits defined by a vshuf64x2 instruction's immediate control byte.
+  unsigned PermMask = 0, Imm = 0;
+
+  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+    if(WidenedMask[i] == SM_SentinelZero)
+      return SDValue();
+
+    // use first element in place of undef musk
+    Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+    PermMask |= (Imm % 4) << (i * 2);
+  }
+
+  return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
 /// shuffling each lane.
 ///
@ -10176,6 +10200,10 @@ static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    if(SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG))
+      return Op;
  // X86 has dedicated unpack instructions that can handle specific blend
  // operations: UNPCKH and UNPCKL.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
--- a/llvm/test/CodeGen/X86/avx512-shuffle.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffle.ll
@ -116,10 +116,10 @@ define <16 x i32> @test15(<16 x i32> %a) {
 ret <16 x i32> %b
 }
 ; CHECK-LABEL: test16
-; CHECK: valignq $2, %zmm0, %zmm1
+; CHECK: valignq $3, %zmm0, %zmm1
 ; CHECK: ret
 define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
  ret <8 x double> %c
 }

@ -252,6 +252,62 @@ define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind {
  ret <8 x double> %c
 }

+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $136, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1,  i32 4, i32 5>
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxwq %xmm2, %zmm1
+; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1,  i32 4, i32 5>
+  %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_vshufi64x2_512_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxwq %xmm2, %zmm1
+; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5,  i32 4, i32 5>
+  %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
+  ret <8 x i64> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $40, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %x1   = load <8 x double>,<8 x double> *%ptr,align 1
+  %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5,  i32 0, i32 1>
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float> *%ptr) nounwind {
+; CHECK-LABEL: test_vshuff32x4_512_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $20, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %x1   = load <16 x float>,<16 x float> *%ptr,align 1
+  %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x float> %res
+}
+
 define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; CHECK-LABEL: test_align_v16i32_rr:
 ; CHECK:       ## BB#0:
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@ -88,7 +88,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_01014545:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermpd $68, %zmm0, %zmm0
+; ALL-NEXT:    vshuff64x2 $160, %zmm0, %zmm0, %zmm0
 ; ALL-NEXT:    retq
  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x double> %shuffle
@ -650,7 +650,7 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_01014545:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermq $68, %zmm0, %zmm0
+; ALL-NEXT:    vshufi64x2 $160, %zmm0, %zmm0, %zmm0
 ; ALL-NEXT:    retq
  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i64> %shuffle