[X86] lowerShuffleWithVPMOV - support direct lowering to VPMOV on VLX targets

lowerShuffleWithVPMOV currently only matches shuffle(truncate(x)) patterns, but on VLX targets the truncate isn't usually necessary to make the VPMOV node worthwhile (since we only target v16i8/v8i16 shuffles, we almost always end up with a PSHUFB node instead). PACKSS/PACKUS are still preferred over VPMOV due to their lower uop count.

Fixes the remaining regression introduced by the fixes in rG293899c64b75.
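
For illustration, here is the kind of truncating v8i16 shuffle this change covers. This is my own sketch (the function name shuffle_as_vpmovdw is hypothetical, not one of the patch's test cases): there is no ISD::TRUNCATE node to match, so before this change the shuffle would typically lower to a PSHUFB, whereas an AVX512VL target can now bitcast the source to v4i32 and emit a single VPMOV node.

; Hypothetical example (not from this commit's tests): keep the low i16 of
; each i32 lane and zero the upper half of the result, i.e. a truncation in
; disguise. With this patch an AVX512VL target should be able to lower this
; to a single vpmovdw; without VLX (or a matching truncate) it generally
; becomes a pshufb.
define <8 x i16> @shuffle_as_vpmovdw(<4 x i32> %x) {
  %b = bitcast <4 x i32> %x to <8 x i16>
  %r = shufflevector <8 x i16> %b, <8 x i16> zeroinitializer,
                     <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 8, i32 8>
  ret <8 x i16> %r
}
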
Simon Pilgrim 2022-08-11 17:35:44 +01:00
parent dd4c838da3
commit 6ba5fc2dee
6 changed files with 45 additions and 29 deletions


@@ -12406,22 +12406,33 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   unsigned MaxScale = 64 / EltSizeInBits;
   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+    unsigned SrcEltBits = EltSizeInBits * Scale;
     unsigned NumSrcElts = NumElts / Scale;
     unsigned UpperElts = NumElts - NumSrcElts;
     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
       continue;

+    // Attempt to find a matching source truncation, but as a fall back VLX
+    // cases can use the VPMOV directly.
     SDValue Src = peekThroughBitcasts(V1);
-    if (Src.getOpcode() != ISD::TRUNCATE ||
-        Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
+    if (Src.getOpcode() == ISD::TRUNCATE &&
+        Src.getScalarValueSizeInBits() == SrcEltBits) {
+      Src = Src.getOperand(0);
+    } else if (Subtarget.hasVLX()) {
+      MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+      MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+      Src = DAG.getBitcast(SrcVT, Src);
+      // Don't do this if PACKSS/PACKUS could perform it cheaper.
+      if (Scale == 2 &&
+          ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
+           (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
+        return SDValue();
+    } else
       return SDValue();
-    Src = Src.getOperand(0);

     // VPMOVWB is only available with avx512bw.
-    MVT SrcVT = Src.getSimpleValueType();
-    if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
-        !Subtarget.hasBWI())
+    if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
       return SDValue();

     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
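
The Scale == 2 guard above is what keeps PACKSS/PACKUS preferred when the source already has enough sign or zero bits for a pack. A rough sketch of such a case, again my own example (prefer_packss is a hypothetical name) rather than anything from the patch:

; Hypothetical example: the ashr guarantees at least 17 sign bits per i32
; lane, so DAG.ComputeNumSignBits(Src) > 16 and the VLX fallback returns
; SDValue(), leaving the truncating shuffle to the existing PACKSS-based
; lowering, which the commit message notes is cheaper in uops than VPMOV.
define <8 x i16> @prefer_packss(<4 x i32> %x) {
  %s = ashr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %b = bitcast <4 x i32> %s to <8 x i16>
  %r = shufflevector <8 x i16> %b, <8 x i16> zeroinitializer,
                     <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 8, i32 8>
  ret <8 x i16> %r
}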


@@ -187,7 +187,7 @@ define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
 ;
 ; SKX-LABEL: trunc_qw_128:
 ; SKX: ## %bb.0:
-; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT: vpmovqw %xmm0, %xmm0
 ; SKX-NEXT: retq
   %x = trunc <2 x i64> %i to <2 x i16>
   ret <2 x i16> %x


@@ -739,8 +739,7 @@ define <2 x half> @test_s8tofp2(<2 x i8> %arg0) {
 define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
 ; CHECK-LABEL: test_u1tofp2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0
 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0
 ; CHECK-NEXT: retq


@@ -73,10 +73,9 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
 ; AVX512-LABEL: vf4:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm1, (%rsi)
-; AVX512-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
 ; AVX512-NEXT: retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32


@@ -42,16 +42,15 @@ define void @vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
 ; AVX512-LABEL: vf2:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovqw %xmm0, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovd %xmm1, (%rsi)
-; AVX512-NEXT: vmovd %xmm2, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512-NEXT: vmovd %xmm1, (%rdx)
 ; AVX512-NEXT: vmovd %xmm3, (%rcx)
-; AVX512-NEXT: vmovd %xmm0, (%r8)
+; AVX512-NEXT: vmovd %xmm2, (%r8)
 ; AVX512-NEXT: retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32


@@ -1935,13 +1935,21 @@ define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT: retq
 ;
-; AVX512-LABEL: rot16_trunc:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $11, %xmm0, %xmm1
-; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: retq
+; AVX512NOVLX-LABEL: rot16_trunc:
+; AVX512NOVLX: # %bb.0:
+; AVX512NOVLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512NOVLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512NOVLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512NOVLX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512NOVLX-NEXT: retq
+;
+; AVX512VLX-LABEL: rot16_trunc:
+; AVX512VLX: # %bb.0:
+; AVX512VLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512VLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512VLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VLX-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512VLX-NEXT: retq
 ;
 ; XOP-LABEL: rot16_trunc:
 ; XOP: # %bb.0: