forked from OSchip/llvm-project
[X86] Attempt to pre-truncate arithmetic operations if useful
In some cases its more efficient to combine TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) if the binop is legal for the truncated types. This is true for vector integer multiplication (especially vXi64), as well as ADD/AND/XOR/OR in cases where we only need to truncate one of the inputs at runtime (e.g. a duplicated input or an one use constant we can fold). Further work could be done here - scalar cases (especially i64) could often benefit (if we avoid partial registers etc.), other opcodes, and better analysis of when truncating the inputs reduces costs. I have considered implementing this for all targets within the DAGCombiner but wasn't sure we could devise a suitable cost model system that would give us the range we need. Differential Revision: https://reviews.llvm.org/D28219 llvm-svn: 290947
This commit is contained in:
parent d0aa53b9ae
commit c76ea4b638
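As an illustration of the transform (a hedged sketch in LLVM IR; the function name and constants are illustrative and not taken from the patch's test files), consider a binop where one operand is a one-use build-vector constant:

; Hypothetical example: the add is performed at i64 and only then truncated.
define <4 x i32> @trunc_add_const(<4 x i64> %x) {
  %a = add <4 x i64> %x, <i64 1, i64 2, i64 3, i64 4>
  %t = trunc <4 x i64> %a to <4 x i32>
  ret <4 x i32> %t
}
; After the combine the DAG is equivalent to
;   add <4 x i32> (trunc <4 x i64> %x to <4 x i32>), <i32 1, i32 2, i32 3, i32 4>
; so only %x needs a runtime truncation - the constant vector is simply
; re-materialised at the narrower type.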
@@ -31833,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget,
+                                          SDLoc &DL) {
+  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+  SDValue Src = N->getOperand(0);
+  unsigned Opcode = Src.getOpcode();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  EVT VT = N->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+
+  auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
+    // TODO: Add extra cases where we can truncate both inputs for the
+    // cost of one (or none).
+    // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+    if (Op0 == Op1)
+      return true;
+
+    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+  };
+
+  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+  };
+
+  // Don't combine if the operation has other uses.
+  if (!N->isOnlyUserOf(Src.getNode()))
+    return SDValue();
+
+  // Only support vector truncation for now.
+  // TODO: i64 scalar math would benefit as well.
+  if (!VT.isVector())
+    return SDValue();
+
+  // In most cases it's only worth pre-truncating if we're only facing the cost
+  // of one truncation.
+  // i.e. if one of the inputs will constant fold or the input is repeated.
+  switch (Opcode) {
+  case ISD::AND:
+  case ISD::XOR:
+  case ISD::OR: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+
+  case ISD::MUL:
+    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
+    // it's better to truncate if we have the chance.
+    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+        !TLI.isOperationLegal(Opcode, SrcVT))
+      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+    LLVM_FALLTHROUGH;
+  case ISD::ADD: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegal(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
 static SDValue
 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
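The ISD::MUL special case in the hunk above is worth a concrete example: until AVX512DQ there is no single-instruction vector i64 multiply, so the combine truncates both multiply operands (accepting two truncations) to let the multiply execute at 32 bits. A hedged sketch, with an illustrative function name:

; Hypothetical example of the vXi64 multiply case. Before the combine, the
; v4i64 multiply is expanded into several pmuludq/shift/add steps; afterwards
; it becomes a v4i32 multiply, a single pmulld/vpmulld on SSE4.1/AVX2 targets.
define <4 x i32> @trunc_mul_v4i64(<4 x i64> %x, <4 x i64> %y) {
  %m = mul <4 x i64> %x, %y
  %t = trunc <4 x i64> %m to <4 x i32>
  ret <4 x i32> %t
}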
@@ -32019,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   SDValue Src = N->getOperand(0);
   SDLoc DL(N);
 
+  // Attempt to pre-truncate inputs to arithmetic ops instead.
+  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+    return V;
+
   // Try to detect AVG pattern first.
   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
     return Avg;
@@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
 define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
 ; KNL-LABEL: any_extend_load_v8i32:
 ; KNL: # BB#0:
-; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; KNL-NEXT: vmovq %xmm0, (%rdi)
 ; KNL-NEXT: retq
@@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind {
 define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT: retl
 ;
 ; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT: vzeroupper
 ; X32-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT: vzeroupper
 ; X64-AVX-NEXT: retq
@@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_uitofp_4i64_4f32:
 ; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT: retl
 ;
 ; X32-AVX-LABEL: mask_uitofp_4i64_4f32:
 ; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT: vzeroupper
 ; X32-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: mask_uitofp_4i64_4f32:
 ; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mask_uitofp_4i64_4f32:
 ; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT: vzeroupper
 ; X64-AVX-NEXT: retq
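The two mask_*itofp hunks above both reflect the same change in the generated checks: the AND mask is now applied once, after the backend's internal v4i64 -> v4i32 truncation, instead of twice (SSE) or at 256-bit width (AVX) before it. A minimal, hypothetical IR function with the same shape (the mask constants and exact test body are assumptions, not copied from the test file) would be:

; Hypothetical reproduction of the pattern the mask_sitofp/mask_uitofp tests
; exercise; mask values that fit in 32 bits let the backend convert via a
; v4i64 -> v4i32 truncation followed by cvtdq2ps.
define <4 x float> @mask_sitofp_4i64_4f32_example(<4 x i64> %a) nounwind {
  %and = and <4 x i64> %a, <i64 127, i64 255, i64 4095, i64 65535>
  %cvt = sitofp <4 x i64> %and to <4 x float>
  ret <4 x float> %cvt
}
; With the new combine, the AND (a one-use constant operand) is pre-truncated,
; so a single 128-bit andps after the shuffle replaces the wider or duplicated
; andps instructions seen in the old check lines.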
File diff suppressed because it is too large