[x86] use psubus for more vsetcc lowering (PR39859)

Circling back to a leftover bit from PR39859:
https://bugs.llvm.org/show_bug.cgi?id=39859#c1

...we have this opportunity to use 'psubus'. It is counter-intuitive based on the test
diffs, because the new sequence has more instructions than the old one, but it appears
to be the better perf option for both Haswell and Jaguar based on llvm-mca.
We already do this transform for the SETULT predicate, so this makes the code more
symmetrical too. If we have pminub/pminuw, we prefer those, so this should not affect
anything but pre-SSE4.1 subtargets.

  $ cat before.s
	movdqa	-16(%rip), %xmm2    ## xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
	pxor	%xmm0, %xmm2
	pcmpgtw	-32(%rip), %xmm2 ## xmm2 = [255,255,255,255,255,255,255,255]
	pand	%xmm2, %xmm0
	pandn	%xmm1, %xmm2
	por	%xmm2, %xmm0

  $ cat after.s
	movdqa	-16(%rip), %xmm2    ## xmm2 = [256,256,256,256,256,256,256,256]
	psubusw	%xmm0, %xmm2
	pxor	%xmm3, %xmm3
	pcmpeqw	%xmm2, %xmm3
	pand	%xmm3, %xmm0
	pandn	%xmm1, %xmm3
	por	%xmm3, %xmm0

  $ llvm-mca before.s -mcpu=haswell
  Iterations:        100
  Instructions:      600
  Total Cycles:      909
  Total uOps:        700

  Dispatch Width:    4
  uOps Per Cycle:    0.77
  IPC:               0.66
  Block RThroughput: 1.8

  $ llvm-mca after.s -mcpu=haswell
  Iterations:        100
  Instructions:      700
  Total Cycles:      409
  Total uOps:        700

  Dispatch Width:    4
  uOps Per Cycle:    1.71
  IPC:               1.71
  Block RThroughput: 1.8

Differential Revision: https://reviews.llvm.org/D60838

llvm-svn: 358999
This commit is contained in:
Sanjay Patel 2019-04-23 15:20:17 +00:00
parent 6e7cc49d5c
commit 12a561fa1b
2 changed files with 30 additions and 13 deletions

View File

@ -19747,10 +19747,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
/// Given a simple buildvector constant, return a new vector constant with each
/// element decremented. If decrementing would result in underflow or this
/// is not a simple vector constant, return an empty value.
static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
/// Given a buildvector constant, return a new vector constant with each element
/// incremented or decremented. If incrementing or decrementing would result in
/// unsigned overflow or underflow or this is not a simple vector constant,
/// return an empty value.
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
if (!BV)
return SDValue();
@ -19765,11 +19766,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
return SDValue();
// Avoid underflow.
if (Elt->getAPIntValue().isNullValue())
// Avoid overflow/underflow.
const APInt &EltC = Elt->getAPIntValue();
if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
return SDValue();
NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
}
return DAG.getBuildVector(VT, DL, NewVecC);
@ -19801,12 +19803,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
break;
}
case ISD::SETUGT: {
// If the comparison is against a constant, we can turn this into a setuge.
// This is beneficial because materializing a constant 0 for the PCMPEQ is
// probably cheaper than XOR+PCMPGT using 2 different vector constants:
// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
if (!UGEOp1)
return SDValue();
Op1 = Op0;
Op0 = UGEOp1;
break;
}
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:
std::swap(Op0, Op1);

View File

@ -194,8 +194,10 @@ define <16 x i1> @ugt_v16i8_splat(<16 x i8> %x) {
define <8 x i1> @ugt_v8i16_splat(<8 x i16> %x) {
; SSE2-LABEL: ugt_v8i16_splat:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: ugt_v8i16_splat:
@ -541,9 +543,10 @@ define <4 x i1> @ugt_v4i32_splat_commute(<4 x i32> %x) {
define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: PR39859:
; SSE2: ## %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [43,43,43,43,43,43,43,43]
; SSE2-NEXT: psubusw %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2