forked from OSchip/llvm-project
[X86] Attempt to pre-truncate arithmetic operations if useful
In some cases its more efficient to combine TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) if the binop is legal for the truncated types. This is true for vector integer multiplication (especially vXi64), as well as ADD/AND/XOR/OR in cases where we only need to truncate one of the inputs at runtime (e.g. a duplicated input or an one use constant we can fold). Further work could be done here - scalar cases (especially i64) could often benefit (if we avoid partial registers etc.), other opcodes, and better analysis of when truncating the inputs reduces costs. I have considered implementing this for all targets within the DAGCombiner but wasn't sure we could devise a suitable cost model system that would give us the range we need. Differential Revision: https://reviews.llvm.org/D28219 llvm-svn: 290947
This commit is contained in:
parent d0aa53b9ae
commit c76ea4b638
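As an illustration of the transform (a hedged sketch in LLVM IR; the function name and constants are illustrative and not taken from the patch's test files), consider a binop where one operand is a one-use build-vector constant:

; Hypothetical example: the add is performed at i64 and only then truncated.
define <4 x i32> @trunc_add_const(<4 x i64> %x) {
  %a = add <4 x i64> %x, <i64 1, i64 2, i64 3, i64 4>
  %t = trunc <4 x i64> %a to <4 x i32>
  ret <4 x i32> %t
}
; After the combine the DAG is equivalent to
;   add <4 x i32> (trunc <4 x i64> %x to <4 x i32>), <i32 1, i32 2, i32 3, i32 4>
; so only %x needs a runtime truncation - the constant vector is simply
; re-materialised at the narrower type.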
@@ -31833,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget,
+                                          SDLoc &DL) {
+  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+  SDValue Src = N->getOperand(0);
+  unsigned Opcode = Src.getOpcode();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  EVT VT = N->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+
+  auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
+    // TODO: Add extra cases where we can truncate both inputs for the
+    // cost of one (or none).
+    // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+    if (Op0 == Op1)
+      return true;
+
+    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+  };
+
+  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+  };
+
+  // Don't combine if the operation has other uses.
+  if (!N->isOnlyUserOf(Src.getNode()))
+    return SDValue();
+
+  // Only support vector truncation for now.
+  // TODO: i64 scalar math would benefit as well.
+  if (!VT.isVector())
+    return SDValue();
+
+  // In most cases it's only worth pre-truncating if we're only facing the cost
+  // of one truncation.
+  // i.e. if one of the inputs will constant fold or the input is repeated.
+  switch (Opcode) {
+  case ISD::AND:
+  case ISD::XOR:
+  case ISD::OR: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+
+  case ISD::MUL:
+    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
+    // it's better to truncate if we have the chance.
+    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+        !TLI.isOperationLegal(Opcode, SrcVT))
+      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+    LLVM_FALLTHROUGH;
+  case ISD::ADD: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegal(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
 static SDValue
 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
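The ISD::MUL special case in the hunk above is worth a concrete example: until AVX512DQ there is no single-instruction vector i64 multiply, so the combine truncates both multiply operands (accepting two truncations) to let the multiply execute at 32 bits. A hedged sketch, with an illustrative function name:

; Hypothetical example of the vXi64 multiply case. Before the combine, the
; v4i64 multiply is expanded into several pmuludq/shift/add steps; afterwards
; it becomes a v4i32 multiply, a single pmulld/vpmulld on SSE4.1/AVX2 targets.
define <4 x i32> @trunc_mul_v4i64(<4 x i64> %x, <4 x i64> %y) {
  %m = mul <4 x i64> %x, %y
  %t = trunc <4 x i64> %m to <4 x i32>
  ret <4 x i32> %t
}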
@@ -32019,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   SDValue Src = N->getOperand(0);
   SDLoc DL(N);
 
+  // Attempt to pre-truncate inputs to arithmetic ops instead.
+  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+    return V;
+
   // Try to detect AVG pattern first.
   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
     return Avg;
@@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
 define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
 ; KNL-LABEL: any_extend_load_v8i32:
 ; KNL: # BB#0:
-; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; KNL-NEXT: vmovq %xmm0, (%rdi)
 ; KNL-NEXT: retq
@@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind {
 define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT: retl
 ;
 ; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT: vzeroupper
 ; X32-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT: vzeroupper
 ; X64-AVX-NEXT: retq
@@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_uitofp_4i64_4f32:
 ; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT: retl
 ;
 ; X32-AVX-LABEL: mask_uitofp_4i64_4f32:
 ; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT: vzeroupper
 ; X32-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: mask_uitofp_4i64_4f32:
 ; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mask_uitofp_4i64_4f32:
 ; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT: vzeroupper
 ; X64-AVX-NEXT: retq
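The two mask_*itofp hunks above both reflect the same change in the generated checks: the AND mask is now applied once, after the backend's internal v4i64 -> v4i32 truncation, instead of twice (SSE) or at 256-bit width (AVX) before it. A minimal, hypothetical IR function with the same shape (the mask constants and exact test body are assumptions, not copied from the test file) would be:

; Hypothetical reproduction of the pattern the mask_sitofp/mask_uitofp tests
; exercise; mask values that fit in 32 bits let the backend convert via a
; v4i64 -> v4i32 truncation followed by cvtdq2ps.
define <4 x float> @mask_sitofp_4i64_4f32_example(<4 x i64> %a) nounwind {
  %and = and <4 x i64> %a, <i64 127, i64 255, i64 4095, i64 65535>
  %cvt = sitofp <4 x i64> %and to <4 x float>
  ret <4 x float> %cvt
}
; With the new combine, the AND (a one-use constant operand) is pre-truncated,
; so a single 128-bit andps after the shuffle replaces the wider or duplicated
; andps instructions seen in the old check lines.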
File diff suppressed because it is too large