[CodeGen][X86] Expand UADDSAT to NOT+UMIN+ADD

Followup to D56636, this time handling the UADDSAT case by expanding
uadd.sat(a, b) to umin(a, ~b) + b.

Differential Revision: https://reviews.llvm.org/D56869

llvm-svn: 352409
commit 8e1a464e6a (parent 1c3694a4d4)
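A minimal scalar model of why the expansion is correct (illustration only, not the patch code; uadd_sat_via_umin is just an illustrative name): if a + b would overflow then a > ~b, so the min selects ~b, and ~b + b is the all-ones value; otherwise the min selects a and the addition is exact.

#include <algorithm>
#include <cstdint>

// Scalar model of the expansion: uadd.sat(a, b) -> umin(a, ~b) + b.
// If a + b would overflow, then a > ~b (since ~b == UINT32_MAX - b), so the
// min picks ~b and ~b + b == UINT32_MAX, the saturated result. Otherwise the
// min picks a and the addition is exact.
uint32_t uadd_sat_via_umin(uint32_t a, uint32_t b) {
  return std::min(a, ~b) + b;
}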
@@ -5287,6 +5287,12 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
     return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
   }
 
+  if (Opcode == ISD::UADDSAT && isOperationLegalOrCustom(ISD::UMIN, VT)) {
+    SDValue InvRHS = DAG.getNOT(dl, RHS, VT);
+    SDValue Min = DAG.getNode(ISD::UMIN, dl, VT, LHS, InvRHS);
+    return DAG.getNode(ISD::ADD, dl, VT, Min, RHS);
+  }
+
   if (VT.isVector()) {
     // TODO: Consider not scalarizing here.
     return SDValue();
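As a quick sanity check of the identity used above (again illustrative only, not part of the patch), it can be verified exhaustively at 8 bits:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Exhaustively verify uadd.sat(a, b) == umin(a, ~b) + b for all 8-bit pairs.
int main() {
  for (unsigned a = 0; a < 256; ++a) {
    for (unsigned b = 0; b < 256; ++b) {
      uint8_t Expected = a + b > 0xFF ? 0xFF : uint8_t(a + b);
      uint8_t NotB = uint8_t(~b);                            // 8-bit NOT of b
      uint8_t Via = uint8_t(std::min(uint8_t(a), NotB) + b); // umin(a, ~b) + b
      assert(Via == Expected);
    }
  }
}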
@@ -1876,6 +1876,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
     { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
     { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
+    { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
+    { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
+    { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
+    { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
   };
   static const CostTblEntry XOPCostTbl[] = {
     { ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -1917,6 +1921,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::SSUBSAT, MVT::v32i8, 1 },
     { ISD::UADDSAT, MVT::v16i16, 1 },
     { ISD::UADDSAT, MVT::v32i8, 1 },
+    { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
     { ISD::USUBSAT, MVT::v16i16, 1 },
     { ISD::USUBSAT, MVT::v32i8, 1 },
     { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
@@ -1953,6 +1958,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
     { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
     { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
     { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
     { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
     { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
@@ -1977,6 +1983,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   };
   static const CostTblEntry SSE42CostTbl[] = {
     { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
+    { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
     { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
     { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
   };
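A worked example of where the new cost-model numbers come from (my arithmetic, reading the tables above): on AVX1, 256-bit UADDSAT v8i32 is split into two 128-bit halves at cost 3 each (not + pminud + paddd) plus 2 for extract/insert, giving 8, and v16i32 then splits into two v8i32 halves for 2 x 8 = 16. That matches the updated AVX1 and BTVER2 check lines below (9 -> 8 and 18 -> 16).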
@@ -80,8 +80,8 @@ define i32 @add(i32 %arg) {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -213,8 +213,8 @@ define i32 @add(i32 %arg) {
 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -111,37 +111,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X64-LABEL: vec:
 ; X64: # %bb.0:
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; X64-NEXT: movd %xmm2, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; X64-NEXT: movd %xmm2, %ecx
-; X64-NEXT: addl %eax, %ecx
-; X64-NEXT: movl $-1, %eax
-; X64-NEXT: cmovbl %eax, %ecx
-; X64-NEXT: movd %ecx, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; X64-NEXT: movd %xmm3, %ecx
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; X64-NEXT: movd %xmm3, %edx
-; X64-NEXT: addl %ecx, %edx
-; X64-NEXT: cmovbl %eax, %edx
-; X64-NEXT: movd %edx, %xmm3
-; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT: movd %xmm1, %ecx
-; X64-NEXT: movd %xmm0, %edx
-; X64-NEXT: addl %ecx, %edx
-; X64-NEXT: cmovbl %eax, %edx
-; X64-NEXT: movd %edx, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X64-NEXT: movd %xmm1, %ecx
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64-NEXT: movd %xmm0, %edx
-; X64-NEXT: addl %ecx, %edx
-; X64-NEXT: cmovbl %eax, %edx
-; X64-NEXT: movd %edx, %xmm0
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-NEXT: pxor %xmm0, %xmm2
+; X64-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; X64-NEXT: pxor %xmm1, %xmm3
+; X64-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-NEXT: pand %xmm3, %xmm0
+; X64-NEXT: pcmpeqd %xmm2, %xmm2
+; X64-NEXT: pxor %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm1, %xmm3
+; X64-NEXT: pandn %xmm2, %xmm3
+; X64-NEXT: por %xmm3, %xmm0
+; X64-NEXT: paddd %xmm1, %xmm0
 ; X64-NEXT: retq
   %tmp = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
   ret <4 x i32> %tmp;
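Reading the new X64 sequence, it looks like the same umin(a, ~b) + b expansion with the unsigned min/compare lowered via the standard SSE2 sign-flip identity: a <u b is equivalent to (a ^ 0x80000000) <s (b ^ 0x80000000), which is where the 2147483648 and 2147483647 constants come from (~b ^ 0x80000000 == b ^ 0x7FFFFFFF). The 31-instruction scalarized version with its four cmovb chains becomes 12 vector instructions.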
[diff for one additional file suppressed because it is too large]