forked from OSchip/llvm-project
[X86] Use min/max to optimize unsigned vector comparison on X86
Use PMIN/PMAX for UGE/ULE vector comparisons to reduce the number of required instructions. This trick also works for UGT/ULT, but there is no advantage in doing so. It wouldn't reduce the number of instructions and it would actually reduce performance. Reviewer: Ben radar:5972691 llvm-svn: 186432
This commit is contained in:
parent
8b77f18da0
commit
3d527d80b8
|
@ -9351,7 +9351,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
|
||||||
// GT and EQ comparisons for integer, swapping operands and multiple
|
// GT and EQ comparisons for integer, swapping operands and multiple
|
||||||
// operations may be required for some comparisons.
|
// operations may be required for some comparisons.
|
||||||
unsigned Opc;
|
unsigned Opc;
|
||||||
bool Swap = false, Invert = false, FlipSigns = false;
|
bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
|
||||||
|
|
||||||
switch (SetCCOpcode) {
|
switch (SetCCOpcode) {
|
||||||
default: llvm_unreachable("Unexpected SETCC condition");
|
default: llvm_unreachable("Unexpected SETCC condition");
|
||||||
|
@ -9366,6 +9366,23 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
|
||||||
case ISD::SETUGE: Swap = true;
|
case ISD::SETUGE: Swap = true;
|
||||||
case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
|
case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Special case: Use min/max operations for SETULE/SETUGE
|
||||||
|
MVT VET = VT.getVectorElementType();
|
||||||
|
bool hasMinMax =
|
||||||
|
(Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
|
||||||
|
|| (Subtarget->hasSSE2() && (VET == MVT::i8));
|
||||||
|
|
||||||
|
if (hasMinMax) {
|
||||||
|
switch (SetCCOpcode) {
|
||||||
|
default: break;
|
||||||
|
case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
|
||||||
|
case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
|
||||||
|
}
|
||||||
|
|
||||||
if (Swap)
|
if (Swap)
|
||||||
std::swap(Op0, Op1);
|
std::swap(Op0, Op1);
|
||||||
|
|
||||||
|
@ -9453,6 +9470,9 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
|
||||||
if (Invert)
|
if (Invert)
|
||||||
Result = DAG.getNOT(dl, Result, VT);
|
Result = DAG.getNOT(dl, Result, VT);
|
||||||
|
|
||||||
|
if (MinMax)
|
||||||
|
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
|
||||||
|
|
||||||
return Result;
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,126 @@
|
||||||
|
; RUN: llc < %s -mcpu=x86-64 -mattr=sse2 | FileCheck %s -check-prefix=SSE2
|
||||||
|
; RUN: llc < %s -mcpu=x86-64 -mattr=sse41 | FileCheck %s -check-prefix=SSE41
|
||||||
|
; RUN: llc < %s -mcpu=x86-64 -mattr=avx | FileCheck %s -check-prefix=AVX
|
||||||
|
|
||||||
|
define <16 x i8> @v16i8_icmp_uge(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
|
||||||
|
%1 = icmp uge <16 x i8> %a, %b
|
||||||
|
%2 = sext <16 x i1> %1 to <16 x i8>
|
||||||
|
ret <16 x i8> %2
|
||||||
|
; SSE2: _v16i8_icmp_uge:
|
||||||
|
; SSE2: pmaxub %xmm0, %xmm1
|
||||||
|
; SSE2: pcmpeqb %xmm1, %xmm0
|
||||||
|
|
||||||
|
; SSE41: _v16i8_icmp_uge:
|
||||||
|
; SSE41: pmaxub %xmm0, %xmm1
|
||||||
|
; SSE41: pcmpeqb %xmm1, %xmm0
|
||||||
|
|
||||||
|
; AVX: _v16i8_icmp_uge:
|
||||||
|
; AVX: vpmaxub %xmm1, %xmm0, %xmm1
|
||||||
|
; AVX: vpcmpeqb %xmm1, %xmm0, %xmm0
|
||||||
|
}
|
||||||
|
|
||||||
|
define <16 x i8> @v16i8_icmp_ule(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
|
||||||
|
%1 = icmp ule <16 x i8> %a, %b
|
||||||
|
%2 = sext <16 x i1> %1 to <16 x i8>
|
||||||
|
ret <16 x i8> %2
|
||||||
|
; SSE2: _v16i8_icmp_ule:
|
||||||
|
; SSE2: pminub %xmm0, %xmm1
|
||||||
|
; SSE2: pcmpeqb %xmm1, %xmm0
|
||||||
|
|
||||||
|
; SSE41: _v16i8_icmp_ule:
|
||||||
|
; SSE41: pminub %xmm0, %xmm1
|
||||||
|
; SSE41: pcmpeqb %xmm1, %xmm0
|
||||||
|
|
||||||
|
; AVX: _v16i8_icmp_ule:
|
||||||
|
; AVX: vpminub %xmm1, %xmm0, %xmm1
|
||||||
|
; AVX: vpcmpeqb %xmm1, %xmm0, %xmm0
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define <8 x i16> @v8i16_icmp_uge(<8 x i16> %a, <8 x i16> %b) nounwind readnone ssp uwtable {
|
||||||
|
%1 = icmp uge <8 x i16> %a, %b
|
||||||
|
%2 = sext <8 x i1> %1 to <8 x i16>
|
||||||
|
ret <8 x i16> %2
|
||||||
|
; SSE2: _v8i16_icmp_uge:
|
||||||
|
; SSE2: movdqa LCPI2_0(%rip), %xmm2
|
||||||
|
; SSE2: pxor %xmm2, %xmm0
|
||||||
|
; SSE2: pxor %xmm1, %xmm2
|
||||||
|
; SSE2: pcmpgtw %xmm0, %xmm2
|
||||||
|
; SSE2: pcmpeqd %xmm0, %xmm0
|
||||||
|
; SSE2: pxor %xmm2, %xmm0
|
||||||
|
|
||||||
|
; SSE41: _v8i16_icmp_uge:
|
||||||
|
; SSE41: pmaxuw %xmm0, %xmm1
|
||||||
|
; SSE41: pcmpeqw %xmm1, %xmm0
|
||||||
|
|
||||||
|
; AVX: _v8i16_icmp_uge:
|
||||||
|
; AVX: vpmaxuw %xmm1, %xmm0, %xmm1
|
||||||
|
; AVX: vpcmpeqw %xmm1, %xmm0, %xmm0
|
||||||
|
}
|
||||||
|
|
||||||
|
define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone ssp uwtable {
|
||||||
|
%1 = icmp ule <8 x i16> %a, %b
|
||||||
|
%2 = sext <8 x i1> %1 to <8 x i16>
|
||||||
|
ret <8 x i16> %2
|
||||||
|
; SSE2: _v8i16_icmp_ule:
|
||||||
|
; SSE2: movdqa LCPI3_0(%rip), %xmm2
|
||||||
|
; SSE2: pxor %xmm2, %xmm1
|
||||||
|
; SSE2: pxor %xmm2, %xmm0
|
||||||
|
; SSE2: pcmpgtw %xmm1, %xmm0
|
||||||
|
; SSE2: pcmpeqd %xmm1, %xmm1
|
||||||
|
; SSE2: pxor %xmm0, %xmm1
|
||||||
|
; SSE2: movdqa %xmm1, %xmm0
|
||||||
|
|
||||||
|
; SSE41: _v8i16_icmp_ule:
|
||||||
|
; SSE41: pminuw %xmm0, %xmm1
|
||||||
|
; SSE41: pcmpeqw %xmm1, %xmm0
|
||||||
|
|
||||||
|
; AVX: _v8i16_icmp_ule:
|
||||||
|
; AVX: vpminuw %xmm1, %xmm0, %xmm1
|
||||||
|
; AVX: vpcmpeqw %xmm1, %xmm0, %xmm0
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define <4 x i32> @v4i32_icmp_uge(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp uwtable {
|
||||||
|
%1 = icmp uge <4 x i32> %a, %b
|
||||||
|
%2 = sext <4 x i1> %1 to <4 x i32>
|
||||||
|
ret <4 x i32> %2
|
||||||
|
; SSE2: _v4i32_icmp_uge:
|
||||||
|
; SSE2: movdqa LCPI4_0(%rip), %xmm2
|
||||||
|
; SSE2: pxor %xmm2, %xmm0
|
||||||
|
; SSE2: pxor %xmm1, %xmm2
|
||||||
|
; SSE2: pcmpgtd %xmm0, %xmm2
|
||||||
|
; SSE2: pcmpeqd %xmm0, %xmm0
|
||||||
|
; SSE2: pxor %xmm2, %xmm0
|
||||||
|
|
||||||
|
; SSE41: _v4i32_icmp_uge:
|
||||||
|
; SSE41: pmaxud %xmm0, %xmm1
|
||||||
|
; SSE41: pcmpeqd %xmm1, %xmm0
|
||||||
|
|
||||||
|
; AVX: _v4i32_icmp_uge:
|
||||||
|
; AVX: vpmaxud %xmm1, %xmm0, %xmm1
|
||||||
|
; AVX: vpcmpeqd %xmm1, %xmm0, %xmm0
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp uwtable {
|
||||||
|
%1 = icmp ule <4 x i32> %a, %b
|
||||||
|
%2 = sext <4 x i1> %1 to <4 x i32>
|
||||||
|
ret <4 x i32> %2
|
||||||
|
; SSE2: _v4i32_icmp_ule:
|
||||||
|
; SSE2: movdqa LCPI5_0(%rip), %xmm2
|
||||||
|
; SSE2: pxor %xmm2, %xmm1
|
||||||
|
; SSE2: pxor %xmm2, %xmm0
|
||||||
|
; SSE2: pcmpgtd %xmm1, %xmm0
|
||||||
|
; SSE2: pcmpeqd %xmm1, %xmm1
|
||||||
|
; SSE2: pxor %xmm0, %xmm1
|
||||||
|
; SSE2: movdqa %xmm1, %xmm0
|
||||||
|
|
||||||
|
; SSE41: _v4i32_icmp_ule:
|
||||||
|
; SSE41: pminud %xmm0, %xmm1
|
||||||
|
; SSE41: pcmpeqd %xmm1, %xmm0
|
||||||
|
|
||||||
|
; AVX: _v4i32_icmp_ule:
|
||||||
|
; AVX: vpminud %xmm1, %xmm0, %xmm1
|
||||||
|
; AVX: vpcmpeqd %xmm1, %xmm0, %xmm0
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue