[X86] Attempt to pre-truncate arithmetic operations if useful
In some cases it's more efficient to combine TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) if the binop is legal for the truncated types.

This is true for vector integer multiplication (especially vXi64), as well as ADD/AND/XOR/OR in cases where we only need to truncate one of the inputs at runtime (e.g. a duplicated input or a one-use constant we can fold).

Further work could be done here - scalar cases (especially i64) could often benefit (if we avoid partial registers etc.), other opcodes, and better analysis of when truncating the inputs reduces costs.

I have considered implementing this for all targets within the DAGCombiner but wasn't sure we could devise a suitable cost model that would give us the coverage we need.

Differential Revision: https://reviews.llvm.org/D28219

llvm-svn: 290947
parent d0aa53b9ae
commit c76ea4b638
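As a rough illustration of the pattern this patch targets (the IR below is a sketch with an illustrative function name, not one of the committed tests), a vXi64 multiply whose result is immediately truncated can now be lowered as a 32-bit vector multiply of the pre-truncated operands, avoiding the expensive vXi64 multiply expansion on targets without AVX512DQ:

```llvm
; Sketch only: trunc(mul(x, y)) where the i64 product is only needed as i32.
; With the combine, the operands are truncated first and a single 32-bit
; vector multiply (e.g. pmulld) is used instead of the vXi64 expansion.
define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a, <4 x i64> %b) {
  %mul = mul <4 x i64> %a, %b
  %res = trunc <4 x i64> %mul to <4 x i32>
  ret <4 x i32> %res
}
```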
@@ -31833,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget,
+                                          SDLoc &DL) {
+  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+  SDValue Src = N->getOperand(0);
+  unsigned Opcode = Src.getOpcode();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  EVT VT = N->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+
+  auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
+    // TODO: Add extra cases where we can truncate both inputs for the
+    // cost of one (or none).
+    // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+    if (Op0 == Op1)
+      return true;
+
+    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+  };
+
+  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+  };
+
+  // Don't combine if the operation has other uses.
+  if (!N->isOnlyUserOf(Src.getNode()))
+    return SDValue();
+
+  // Only support vector truncation for now.
+  // TODO: i64 scalar math would benefit as well.
+  if (!VT.isVector())
+    return SDValue();
+
+  // In most cases it's only worth pre-truncating if we're only facing the cost
+  // of one truncation.
+  // i.e. if one of the inputs will constant fold or the input is repeated.
+  switch (Opcode) {
+  case ISD::AND:
+  case ISD::XOR:
+  case ISD::OR: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+
+  case ISD::MUL:
+    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
+    // it's better to truncate if we have the chance.
+    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+        !TLI.isOperationLegal(Opcode, SrcVT))
+      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+    LLVM_FALLTHROUGH;
+  case ISD::ADD: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegal(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
 static SDValue
 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
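For the ADD/AND/XOR/OR cases the combine only fires when the second truncation is effectively free, which is what the IsRepeatedOpOrOneUseConstant guard above checks. The following IR is a hedged sketch of the two accepted shapes (illustrative names, not committed tests):

```llvm
; A build-vector constant operand: truncating the constant folds away,
; so only the variable input pays for a runtime truncation.
define <4 x i32> @trunc_and_constant(<4 x i64> %a) {
  %and = and <4 x i64> %a, <i64 255, i64 255, i64 255, i64 255>
  %res = trunc <4 x i64> %and to <4 x i32>
  ret <4 x i32> %res
}

; A repeated operand: both binop inputs share one truncation.
define <4 x i32> @trunc_add_repeated(<4 x i64> %a) {
  %add = add <4 x i64> %a, %a
  %res = trunc <4 x i64> %add to <4 x i32>
  ret <4 x i32> %res
}
```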
@@ -32019,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   SDValue Src = N->getOperand(0);
   SDLoc DL(N);
 
+  // Attempt to pre-truncate inputs to arithmetic ops instead.
+  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+    return V;
+
   // Try to detect AVG pattern first.
   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
     return Avg;
@@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
 define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
 ; KNL-LABEL: any_extend_load_v8i32:
 ; KNL: # BB#0:
-; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; KNL-NEXT: vmovq %xmm0, (%rdi)
 ; KNL-NEXT: retq
@@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind {
 define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT: retl
 ;
 ; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT: vzeroupper
 ; X32-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT: vzeroupper
 ; X64-AVX-NEXT: retq
@@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_uitofp_4i64_4f32:
 ; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT: retl
 ;
 ; X32-AVX-LABEL: mask_uitofp_4i64_4f32:
 ; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT: vzeroupper
 ; X32-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: mask_uitofp_4i64_4f32:
 ; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mask_uitofp_4i64_4f32:
 ; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT: vzeroupper
 ; X64-AVX-NEXT: retq
File diff suppressed because it is too large