forked from OSchip/llvm-project
I optimized the following patterns:
sext <4 x i1> to <4 x i64>, sext <4 x i8> to <4 x i64>, and sext <4 x i16> to <4 x i64>. I run a DAG combine on SIGN_EXTEND_INREG and rewrite the SEXT patterns: (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) -> (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT))). The sext_in_reg (v4i32 x) may be lowered to shl+sar operations. The "sar" operation does not exist for vectors of 64-bit elements, so lowering sext_in_reg (v4i64 x) has no vector solution. I also added the cost of these operations to the AVX cost table. llvm-svn: 175619
This commit is contained in:
parent
ac05bc0556
commit
0ccdd1315b
|
@ -1323,6 +1323,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
|
|||
setTargetDAGCombine(ISD::ZERO_EXTEND);
|
||||
setTargetDAGCombine(ISD::ANY_EXTEND);
|
||||
setTargetDAGCombine(ISD::SIGN_EXTEND);
|
||||
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
|
||||
setTargetDAGCombine(ISD::TRUNCATE);
|
||||
setTargetDAGCombine(ISD::SINT_TO_FP);
|
||||
setTargetDAGCombine(ISD::SETCC);
|
||||
|
@ -17076,6 +17077,41 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
// Target DAG combine for ISD::SIGN_EXTEND_INREG nodes.
//
// Only vector results are considered. When the result type is v4i64 and the
// operand is an ANY_EXTEND or SIGN_EXTEND of a v4i32 value, the in-register
// sign extension is moved to the v4i32 value and followed by a full
// SIGN_EXTEND to v4i64. In every other case an empty SDValue is returned,
// i.e. no replacement node is produced.
//
// N         - the SIGN_EXTEND_INREG node; operand 0 is the value, operand 1
//             is a VTSDNode carrying ExtraVT.
// DAG       - used to create the replacement nodes.
// Subtarget - queried for hasInt256() to decide whether to skip loads that
//             are not "normal" loads.
static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
|
||||
const X86Subtarget *Subtarget) {
|
||||
EVT VT = N->getValueType(0);
|
||||
if (!VT.isVector())
|
||||
return SDValue();
|
||||
|
||||
SDValue N0 = N->getOperand(0);
|
||||
SDValue N1 = N->getOperand(1);
|
||||
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
|
||||
DebugLoc dl = N->getDebugLoc();
|
||||
|
||||
// SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
|
||||
// SSE and AVX2, since there is no arithmetic (sign-propagating) shift
|
||||
// right on a vector with 64-bit elements. Rewrite:
|
||||
// (sext_in_reg (v4i64 anyext|sext (v4i32 x)), ExtraVT) ->
|
||||
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
|
||||
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
|
||||
N0.getOpcode() == ISD::SIGN_EXTEND)) {
|
||||
SDValue N00 = N0.getOperand(0);
|
||||
|
||||
// An EXTLOAD has a better solution on AVX2: it may be replaced with an
|
||||
// X86ISD::VSEXT node, so leave loads that are not normal loads untouched.
|
||||
if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
|
||||
if (!ISD::isNormalLoad(N00.getNode()))
|
||||
return SDValue();
|
||||
|
||||
// Only rewrite when the inner value is v4i32 and ExtraVT is narrower than
// 128 bits in total.
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
|
||||
SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
|
||||
N00, N1);
|
||||
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
|
||||
}
|
||||
}
|
||||
// No pattern matched: return an empty SDValue.
return SDValue();
|
||||
}
|
||||
|
||||
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const X86Subtarget *Subtarget) {
|
||||
|
@ -17468,6 +17504,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
|
|||
case ISD::ANY_EXTEND:
|
||||
case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
|
||||
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
|
||||
case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
|
||||
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
|
||||
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
|
||||
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
|
||||
|
|
|
@ -232,6 +232,9 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
|
|||
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 },
|
||||
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
|
||||
};
|
||||
|
||||
|
|
|
@ -44,6 +44,10 @@ define i32 @zext_sext(<8 x i1> %in) {
|
|||
%B = zext <8 x i16> undef to <8 x i32>
|
||||
;CHECK: cost of 1 {{.*}} sext
|
||||
%C = sext <4 x i32> undef to <4 x i64>
|
||||
;CHECK: cost of 8 {{.*}} sext
|
||||
%C1 = sext <4 x i8> undef to <4 x i64>
|
||||
;CHECK: cost of 8 {{.*}} sext
|
||||
%C2 = sext <4 x i16> undef to <4 x i64>
|
||||
|
||||
;CHECK: cost of 1 {{.*}} zext
|
||||
%D = zext <4 x i32> undef to <4 x i64>
|
||||
|
@ -59,7 +63,7 @@ define i32 @zext_sext(<8 x i1> %in) {
|
|||
ret i32 undef
|
||||
}
|
||||
|
||||
define i32 @masks(<8 x i1> %in) {
|
||||
define i32 @masks8(<8 x i1> %in) {
|
||||
;CHECK: cost of 6 {{.*}} zext
|
||||
%Z = zext <8 x i1> %in to <8 x i32>
|
||||
;CHECK: cost of 9 {{.*}} sext
|
||||
|
@ -67,3 +71,9 @@ define i32 @masks(<8 x i1> %in) {
|
|||
ret i32 undef
|
||||
}
|
||||
|
||||
; Cost-model test: the sign extension of a <4 x i1> mask to <4 x i64> below
; is expected to be reported with a cost of 8.
define i32 @masks4(<4 x i1> %in) {
|
||||
;CHECK: cost of 8 {{.*}} sext
|
||||
%S = sext <4 x i1> %in to <4 x i64>
|
||||
ret i32 undef
|
||||
}
|
||||
|
||||
|
|
|
@ -142,3 +142,26 @@ define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
|
|||
%Y = sext <8 x i8> %X to <8 x i16>
|
||||
ret <8 x i16>%Y
|
||||
}
|
||||
|
||||
; Codegen test for sext <4 x i1> to <4 x i64>. The expected sequence is a
; vpslld/vpsrad pair by 31 followed by two vpmovsxdq instructions before the
; return.
; AVX: sext_4i1_to_4i64
|
||||
; AVX: vpslld $31
|
||||
; AVX: vpsrad $31
|
||||
; AVX: vpmovsxdq
|
||||
; AVX: vpmovsxdq
|
||||
; AVX: ret
|
||||
define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
|
||||
%extmask = sext <4 x i1> %mask to <4 x i64>
|
||||
ret <4 x i64> %extmask
|
||||
}
|
||||
|
||||
; Codegen test for sext <4 x i8> to <4 x i64>. The expected sequence is a
; vpslld/vpsrad pair by 24 followed by two vpmovsxdq instructions before the
; return.
; AVX: sext_4i8_to_4i64
|
||||
; AVX: vpslld $24
|
||||
; AVX: vpsrad $24
|
||||
; AVX: vpmovsxdq
|
||||
; AVX: vpmovsxdq
|
||||
; AVX: ret
|
||||
define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
|
||||
%extmask = sext <4 x i8> %mask to <4 x i64>
|
||||
ret <4 x i64> %extmask
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue