[DAGCombiner] fold bit-hack form of usubsat

(i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128

I haven't found a generalization of this identity:
https://alive2.llvm.org/ce/z/_sriEQ
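
As a sanity check (not part of the patch), a brute-force verification of the i8
form of the identity in plain C++; the usubsat8 helper is just an illustrative
model of ISD::USUBSAT:

#include <cassert>
#include <cstdint>

// Reference model of unsigned saturating subtraction: clamp at 0, no wrap.
static uint8_t usubsat8(uint8_t X, uint8_t Y) {
  return X > Y ? uint8_t(X - Y) : uint8_t(0);
}

int main() {
  for (unsigned I = 0; I < 256; ++I) {
    uint8_t X = uint8_t(I);
    // Models "i8 X s>> 7": all-zeros when the sign bit is clear,
    // all-ones when it is set.
    uint8_t SignSplat = (X & 0x80) ? 0xff : 0x00;
    // (i8 X ^ 128) & (i8 X s>> 7) == usubsat X, 128 for every X.
    assert(uint8_t((X ^ 0x80) & SignSplat) == usubsat8(X, 0x80));
  }
  return 0;
}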

Note: I was actually looking at the first form of the pattern in that link,
but that's part of a long chain of potential missed transforms in codegen
and IR... that I hope ends here!

The predicates for when this is profitable are a bit tricky. This version of
the patch excludes multi-use patterns but allows custom lowering (as opposed
to legal-only).

On x86, for example, we have custom lowering for some vector types, and that
uses umax and sub. So to enable that fold, we need to add use checks to avoid
regressions. Even with legal-only lowering, we could see code with extra
register-move instructions for extra uses, so that constraint would have to be
eased very carefully to avoid penalties.
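
For reference, a scalar sketch (not from the patch) of why the umax+sub custom
expansion is still profitable in the single-use case: it matches usubsat in two
operations versus the three (ashr, xor, and) of the bit-hack. The helper names
below are made up for illustration; the umax-based form is what the x86
vpmaxud+vpsubd output in the tests corresponds to.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Reference model of unsigned saturating subtraction.
static uint32_t usubsat32(uint32_t X, uint32_t C) {
  return X > C ? X - C : 0u;
}

// The umax+sub form a custom lowering can use: max(X, C) - C never wraps
// and gives the same result as usubsat(X, C).
static uint32_t usubsatViaUmax(uint32_t X, uint32_t C) {
  return std::max(X, C) - C;
}

int main() {
  const uint32_t SignMask = 0x80000000u;
  // Spot-check values on both sides of the sign mask.
  for (uint64_t I = 0; I <= UINT32_MAX; I += 65521)
    assert(usubsatViaUmax(uint32_t(I), SignMask) == usubsat32(uint32_t(I), SignMask));
  return 0;
}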

Differential Revision: https://reviews.llvm.org/D112085
Sanjay Patel 2021-10-21 09:06:37 -04:00
parent fa111d3085
commit d2198771e9
3 changed files with 75 additions and 65 deletions


@@ -5627,6 +5627,35 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
  return DAG.getZExtOrTrunc(Setcc, DL, VT);
}
/// For targets that support usubsat, match a bit-hack form of that operation
/// that ends in 'and' and convert it.
static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N1.getValueType();

  // Canonicalize xor as operand 0.
  if (N1.getOpcode() == ISD::XOR)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::XOR || N1.getOpcode() != ISD::SRA ||
      !N0.hasOneUse() || !N1.hasOneUse() ||
      N0.getOperand(0) != N1.getOperand(0))
    return SDValue();

  unsigned BitWidth = VT.getScalarSizeInBits();
  ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true);
  ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true);
  if (!XorC || !XorC->getAPIntValue().isSignMask() ||
      !SraC || SraC->getAPIntValue() != BitWidth - 1)
    return SDValue();

  // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
  SDLoc DL(N);
  SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
  return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
}
SDValue DAGCombiner::visitAND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
@@ -5989,6 +6018,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
  if (IsAndZeroExtMask(N0, N1))
    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));

  if (hasOperation(ISD::USUBSAT, VT))
    if (SDValue V = foldAndToUsubsat(N, DAG))
      return V;

  return SDValue();
}


@@ -86,26 +86,22 @@ define i16 @usubsat_as_bithack_i16(i16 %x) {
; GFX8-LABEL: usubsat_as_bithack_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v0
; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
; GFX8-NEXT: s_movk_i32 s4, 0x8000
; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ashrrev_i16_e32 v1, 15, v0
; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: usubsat_as_bithack_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_ashrrev_i16 v1, 15, v0
; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX10-NEXT: v_and_b32_e32 v0, v1, v0
; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
%signsplat = ashr i16 %x, 15
%flipsign = xor i16 %x, 32768


@@ -34,37 +34,21 @@ vector.ph:
define <8 x i16> @ashr_xor_and(<8 x i16> %x) nounwind {
; SSE-LABEL: ashr_xor_and:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $15, %xmm1
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX512-NEXT: vpternlogq $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: ashr_xor_and:
; AVX: # %bb.0:
; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%flipsign = xor <8 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
%res = and <8 x i16> %signsplat, %flipsign
ret <8 x i16> %res
}
; negative test - extra uses may lead to extra instructions when custom-lowered
define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, <16 x i8>* %p1, <16 x i8>* %p2) nounwind {
; SSE-LABEL: ashr_xor_and_commute_uses:
; SSE: # %bb.0:
@@ -94,27 +78,33 @@ define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, <16 x i8>* %p1, <16 x
}
define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind {
; SSE-LABEL: ashr_xor_and_custom:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: retq
; SSE2OR3-LABEL: ashr_xor_and_custom:
; SSE2OR3: # %bb.0:
; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
; SSE2OR3-NEXT: psrad $31, %xmm1
; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2OR3-NEXT: pand %xmm1, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: ashr_xor_and_custom:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and_custom:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and_custom:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and_custom:
@@ -349,37 +339,28 @@ vector.ph:
define <16 x i16> @ashr_xor_and_v16i16(<16 x i16> %x) nounwind {
; SSE-LABEL: ashr_xor_and_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psraw $15, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psraw $15, %xmm3
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $15, %ymm0, %ymm1
; AVX512-NEXT: vpternlogq $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
%signsplat = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%flipsign = xor <16 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>