[X86] Add DAG combine to combine a v8i32->v8i16 truncate with a packuswb that truncates v8i16->v8i8.

Summary: Under -x86-experimental-vector-widening-legalization, fp_to_uint/fp_to_sint with a smaller than 128 bit vector type results are custom type legalized by promoting the result to a 128 bit vector by promoting the elements, inserting an assertzext/assertsext, then truncating back to original type. The truncate will be further legalizdd to a pack shuffle. In the case of a v8i8 result type, we'll end up with a v8i16 fp_to_sint. This will need to be further legalized during vector op legalization by promoting to v8i32 and then truncating again. Under avx2 this produces good code with two pack instructions, but Under avx512 this will result in a truncate instruction and a packuswb instruction. But we should be able to get away with a single truncate instruction. The other option is to promote all the way to vXi32 result type during the first type legalization. But in some experimentation that seemed to require more work to produce good code for other configurations. Reviewers: RKSimon, spatel Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D54836 llvm-svn: 348158
2018-12-03 18:26:24 +00:00 · 2018-12-03 18:26:24 +00:00 · e35b01f8ea
parent ff90e627ba
commit e35b01f8ea
3 changed files with 42 additions and 4 deletions
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -9084,6 +9084,26 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {
    return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
  }

+  // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
+  // than X. Just move the AssertZext in front of the truncate and drop the
+  // AssertSExt.
+  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+      N0.getOperand(0).getOpcode() == ISD::AssertSext &&
+      Opcode == ISD::AssertZext) {
+    SDValue BigA = N0.getOperand(0);
+    EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+    assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
+           "Asserting zero/sign-extended bits to a type larger than the "
+           "truncated destination does not provide information");
+
+    if (AssertVT.bitsLT(BigA_AssertVT)) {
+      SDLoc DL(N);
+      SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
+                                      BigA.getOperand(0), N1);
+      return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
+    }
+  }
+
  return SDValue();
 }

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -35398,6 +35398,26 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
  }

+  // Try to combine a PACKUSWB implemented truncate with a regular truncate to
+  // create a larger truncate.
+  // TODO: Match PACKSSWB as well?
+  if (Subtarget.hasAVX512() && Opcode == X86ISD::PACKUS &&
+      N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
+      N0.getOperand(0).getValueType() == MVT::v8i32) {
+
+    APInt ZeroMask = APInt::getHighBitsSet(16, 8);
+    if (DAG.MaskedValueIsZero(N0, ZeroMask)) {
+      if (Subtarget.hasVLX())
+        return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
+
+      // Widen input to v16i32 so we can truncate that.
+      SDLoc dl(N);
+      SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
+                                   N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
+    }
+  }
+
  // Attempt to combine as shuffle.
  SDValue Op(N, 0);
  if (SDValue Res =
--- a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll
@ -505,16 +505,14 @@ define <8 x i8> @f64to8uc(<8 x double> %f) {
 ; NOVL-LABEL: f64to8uc:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
-; NOVL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; NOVL-NEXT:    vpmovdb %zmm0, %xmm0
 ; NOVL-NEXT:    vzeroupper
 ; NOVL-NEXT:    retq
 ;
 ; VL-LABEL: f64to8uc:
 ; VL:       # %bb.0:
 ; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT:    vpmovdw %ymm0, %xmm0
-; VL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; VL-NEXT:    vpmovdb %ymm0, %xmm0
 ; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
  %res = fptoui <8 x double> %f to <8 x i8>