[CodeGen][X86] Expand UADDSAT to NOT+UMIN+ADD

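Follow-up to D56636, this time handling the UADDSAT case by expanding
uadd.sat(a, b) to umin(a, ~b) + b. Since ~b is the all-ones value minus b,
clamping a to ~b first guarantees that the addition cannot wrap, and the
result saturates at the all-ones value exactly when a + b would overflow.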

Differential Revision: https://reviews.llvm.org/D56869

llvm-svn: 352409
This commit is contained in:
Nikita Popov 2019-01-28 19:19:09 +00:00
parent 1c3694a4d4
commit 8e1a464e6a
5 changed files with 871 additions and 1624 deletions

View File

@ -5287,6 +5287,12 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
}
// Unsigned saturating add via uadd.sat(a, b) == umin(a, ~b) + b.
// ~b equals (all-ones - b), i.e. the largest value that can be added to b
// without wrapping; clamping LHS to it makes the final ADD overflow-free
// and yields all-ones whenever LHS + RHS would have overflowed.
// Only taken when UMIN is legal or custom for VT.
if (Opcode == ISD::UADDSAT && isOperationLegalOrCustom(ISD::UMIN, VT)) {
// InvRHS = ~RHS (headroom left above RHS).
SDValue InvRHS = DAG.getNOT(dl, RHS, VT);
// Min = umin(LHS, ~RHS): LHS limited to that headroom.
SDValue Min = DAG.getNode(ISD::UMIN, dl, VT, LHS, InvRHS);
// Min + RHS cannot wrap, so a plain ADD produces the saturated result.
return DAG.getNode(ISD::ADD, dl, VT, Min, RHS);
}
if (VT.isVector()) {
// TODO: Consider not scalarizing here.
return SDValue();

View File

@ -1876,6 +1876,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
{ ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
{ ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@ -1917,6 +1921,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::SSUBSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v16i16, 1 },
{ ISD::UADDSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
{ ISD::USUBSAT, MVT::v16i16, 1 },
{ ISD::USUBSAT, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
@ -1953,6 +1958,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
@ -1977,6 +1983,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
};
// Cost entries for the SSE4.2 feature level (per the table name).
// Each entry is { ISD opcode, MVT, value }; the value appears to be the
// estimated cost, as it matches the number of instructions listed in the
// trailing comments (e.g. not + pminud + paddd == 3 for UADDSAT v4i32).
// NOTE(review): how this table is selected relative to the other feature
// tables is not visible in this hunk.
static const CostTblEntry SSE42CostTbl[] = {
{ ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
{ ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};

View File

@ -80,8 +80,8 @@ define i32 @add(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@ -213,8 +213,8 @@ define i32 @add(i32 %arg) {
; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)

View File

@ -111,37 +111,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
;
; X64-LABEL: vec:
; X64: # %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
; X64-NEXT: movd %xmm2, %eax
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; X64-NEXT: movd %xmm2, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: movl $-1, %eax
; X64-NEXT: cmovbl %eax, %ecx
; X64-NEXT: movd %ecx, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; X64-NEXT: movd %xmm3, %ecx
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; X64-NEXT: movd %xmm3, %edx
; X64-NEXT: addl %ecx, %edx
; X64-NEXT: cmovbl %eax, %edx
; X64-NEXT: movd %edx, %xmm3
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT: movd %xmm1, %ecx
; X64-NEXT: movd %xmm0, %edx
; X64-NEXT: addl %ecx, %edx
; X64-NEXT: cmovbl %eax, %edx
; X64-NEXT: movd %edx, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X64-NEXT: movd %xmm1, %ecx
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm0, %edx
; X64-NEXT: addl %ecx, %edx
; X64-NEXT: cmovbl %eax, %edx
; X64-NEXT: movd %edx, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: movdqa %xmm2, %xmm0
; X64-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; X64-NEXT: pxor %xmm0, %xmm2
; X64-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
; X64-NEXT: pxor %xmm1, %xmm3
; X64-NEXT: pcmpgtd %xmm2, %xmm3
; X64-NEXT: pand %xmm3, %xmm0
; X64-NEXT: pcmpeqd %xmm2, %xmm2
; X64-NEXT: pxor %xmm3, %xmm2
; X64-NEXT: movdqa %xmm1, %xmm3
; X64-NEXT: pandn %xmm2, %xmm3
; X64-NEXT: por %xmm3, %xmm0
; X64-NEXT: paddd %xmm1, %xmm0
; X64-NEXT: retq
%tmp = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
ret <4 x i32> %tmp;

File diff suppressed because it is too large Load Diff