[SDAG] match rotate pattern with extra 'or' operation

This is another fold generalized from D111530.
We can find a common source for a rotate operation hidden inside an 'or':
https://alive2.llvm.org/ce/z/9pV8hn

Deciding when this is profitable vs. a funnel-shift is tricky, but this
does not show any regressions: if a target has a rotate but it does not
have a funnel-shift, then try to form the rotate here. That is why we
don't have x86 test diffs for the scalar tests that are duplicated from
AArch64 (74a65e3834) - shld/shrd are available. That also makes it
difficult to show vector diffs - the only case where I found a diff was
on x86 AVX512 or XOP with i64 elements.

There's an additional check for a legal type to avoid a problem seen
with x86-32 where we form a 64-bit rotate but then it gets split
inefficiently. We might avoid that by adding more rotate folds, but
I didn't check to see what is missing on that path.

This gets most of the motivating patterns for AArch64 / ARM that are in
D111530.

We still need a couple of enhancements to setcc pattern matching with
rotate/funnel-shift to get the rest.

Differential Revision: https://reviews.llvm.org/D120933
This commit is contained in:
Sanjay Patel 2022-03-09 13:12:43 -05:00
parent 0f20a35b9e
commit 341623653d
6 changed files with 85 additions and 72 deletions

View File

@ -7405,11 +7405,6 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
if (LHSShift.getOpcode() == RHSShift.getOpcode())
return SDValue(); // Shifts must disagree.
// TODO: Support pre-legalization funnel-shift by constant.
bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0);
if (!IsRotate && !(HasFSHL || HasFSHR))
return SDValue(); // Requires funnel shift support.
// Canonicalize shl to left side in a shl/srl pair.
if (RHSShift.getOpcode() == ISD::SHL) {
std::swap(LHS, RHS);
@ -7423,15 +7418,57 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
SDValue RHSShiftArg = RHSShift.getOperand(0);
SDValue RHSShiftAmt = RHSShift.getOperand(1);
auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
};
// TODO: Support pre-legalization funnel-shift by constant.
bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0);
if (!IsRotate && !(HasFSHL || HasFSHR)) {
if (TLI.isTypeLegal(VT) && LHS.hasOneUse() && RHS.hasOneUse() &&
ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
// Look for a disguised rotate by constant.
// The common shifted operand X may be hidden inside another 'or'.
SDValue X, Y;
auto matchOr = [&X, &Y](SDValue Or, SDValue CommonOp) {
if (!Or.hasOneUse() || Or.getOpcode() != ISD::OR)
return false;
if (CommonOp == Or.getOperand(0)) {
X = CommonOp;
Y = Or.getOperand(1);
return true;
}
if (CommonOp == Or.getOperand(1)) {
X = CommonOp;
Y = Or.getOperand(0);
return true;
}
return false;
};
// (shl (X | Y), C1) | (srl X, C2) --> (rotl X, C1) | (shl Y, C1)
if (matchOr(LHSShiftArg, RHSShiftArg)) {
SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt);
SDValue ShlY = DAG.getNode(ISD::SHL, DL, VT, Y, LHSShiftAmt);
return DAG.getNode(ISD::OR, DL, VT, RotX, ShlY);
}
// (shl X, C1) | (srl (X | Y), C2) --> (rotl X, C1) | (srl Y, C2)
if (matchOr(RHSShiftArg, LHSShiftArg)) {
SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt);
SDValue SrlY = DAG.getNode(ISD::SRL, DL, VT, Y, RHSShiftAmt);
return DAG.getNode(ISD::OR, DL, VT, RotX, SrlY);
}
}
return SDValue(); // Requires funnel shift support.
}
// fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
// fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
// fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
// fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
// iff C1+C2 == EltSizeInBits
auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
};
if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
SDValue Res;
if (IsRotate && (HasROTL || HasROTR || !(HasFSHL || HasFSHR))) {

View File

@ -12,8 +12,7 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind {
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds x0, x0, #1
; CHECK-NEXT: adcs x1, x1, xzr
; CHECK-NEXT: orr x8, x0, x1
; CHECK-NEXT: extr x8, x1, x8, #60
; CHECK-NEXT: orr x8, x1, x0, lsr #60
; CHECK-NEXT: cbnz x8, .LBB0_1
; CHECK-NEXT: // %bb.2: // %exit
; CHECK-NEXT: ret
@ -32,8 +31,7 @@ exit:
define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
; CHECK-LABEL: opt_setcc_srl_eq_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: orr x8, x0, x1
; CHECK-NEXT: extr x8, x1, x8, #17
; CHECK-NEXT: orr x8, x1, x0, lsr #17
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
@ -45,8 +43,7 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
; CHECK-LABEL: opt_setcc_srl_ne_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: orr x8, x0, x1
; CHECK-NEXT: extr x8, x1, x8, #17
; CHECK-NEXT: orr x8, x1, x0, lsr #17
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
@ -58,8 +55,7 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
; CHECK-LABEL: opt_setcc_shl_eq_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: orr x8, x1, x0
; CHECK-NEXT: extr x8, x8, x0, #47
; CHECK-NEXT: orr x8, x0, x1, lsl #17
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
@ -71,8 +67,7 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
; CHECK-LABEL: opt_setcc_shl_ne_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: orr x8, x1, x0
; CHECK-NEXT: extr x8, x8, x0, #47
; CHECK-NEXT: orr x8, x0, x1, lsl #17
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
@ -106,8 +101,7 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind {
define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts:
; CHECK: // %bb.0:
; CHECK-NEXT: orr x8, x0, x1
; CHECK-NEXT: extr x8, x8, x1, #47
; CHECK-NEXT: orr x8, x1, x0, lsl #17
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret

View File

@ -690,8 +690,8 @@ define i64 @mix_logic_shl(i64 %x0, i64 %x1, i64 %y, i64 %z) {
define i32 @or_fshl_commute0(i32 %x, i32 %y) {
; CHECK-LABEL: or_fshl_commute0:
; CHECK: // %bb.0:
; CHECK-NEXT: orr w8, w0, w1
; CHECK-NEXT: extr w0, w8, w0, #27
; CHECK-NEXT: ror w8, w0, #27
; CHECK-NEXT: orr w0, w8, w1, lsl #5
; CHECK-NEXT: ret
%or1 = or i32 %x, %y
%sh1 = shl i32 %or1, 5
@ -703,8 +703,8 @@ define i32 @or_fshl_commute0(i32 %x, i32 %y) {
define i64 @or_fshl_commute1(i64 %x, i64 %y) {
; CHECK-LABEL: or_fshl_commute1:
; CHECK: // %bb.0:
; CHECK-NEXT: orr w8, w1, w0
; CHECK-NEXT: extr x0, x8, x0, #29
; CHECK-NEXT: ror x8, x0, #29
; CHECK-NEXT: orr x0, x8, x1, lsl #35
; CHECK-NEXT: ret
%or1 = or i64 %y, %x
%sh1 = shl i64 %or1, 35
@ -762,8 +762,8 @@ define i32 @or_fshl_wrong_shift(i32 %x, i32 %y) {
define i64 @or_fshr_commute0(i64 %x, i64 %y) {
; CHECK-LABEL: or_fshr_commute0:
; CHECK: // %bb.0:
; CHECK-NEXT: orr x8, x0, x1
; CHECK-NEXT: extr x0, x0, x8, #24
; CHECK-NEXT: ror x8, x0, #24
; CHECK-NEXT: orr x0, x8, x1, lsr #24
; CHECK-NEXT: ret
%or1 = or i64 %x, %y
%sh1 = shl i64 %x, 40
@ -775,8 +775,8 @@ define i64 @or_fshr_commute0(i64 %x, i64 %y) {
define i32 @or_fshr_commute1(i32 %x, i32 %y) {
; CHECK-LABEL: or_fshr_commute1:
; CHECK: // %bb.0:
; CHECK-NEXT: orr w8, w1, w0
; CHECK-NEXT: extr w0, w0, w8, #29
; CHECK-NEXT: ror w8, w0, #29
; CHECK-NEXT: orr w0, w8, w1, lsr #29
; CHECK-NEXT: ret
%or1 = or i32 %y, %x
%sh1 = shl i32 %x, 3

View File

@ -630,14 +630,10 @@ define i32 @icmp64_uge_m2(i64 %x, i64 %y, i32 %a, i32 %b, i1 %c) {
; CHECKV7M-NEXT: ldrd lr, r0, [sp, #8]
; CHECKV7M-NEXT: beq .LBB6_2
; CHECKV7M-NEXT: @ %bb.1: @ %then
; CHECKV7M-NEXT: orrs r2, r3
; CHECKV7M-NEXT: lsrs r2, r2, #17
; CHECKV7M-NEXT: orr.w r2, r2, r3, lsl #15
; CHECKV7M-NEXT: orr.w r3, r12, r1
; CHECKV7M-NEXT: orr.w r2, r3, r2, lsr #17
; CHECKV7M-NEXT: orr.w r1, r1, r12, lsr #17
; CHECKV7M-NEXT: cmp r2, #0
; CHECKV7M-NEXT: mov r2, r0
; CHECKV7M-NEXT: lsr.w r3, r3, #17
; CHECKV7M-NEXT: orr.w r1, r3, r1, lsl #15
; CHECKV7M-NEXT: it ne
; CHECKV7M-NEXT: movne r2, lr
; CHECKV7M-NEXT: cmp r1, #0
@ -646,9 +642,7 @@ define i32 @icmp64_uge_m2(i64 %x, i64 %y, i32 %a, i32 %b, i1 %c) {
; CHECKV7M-NEXT: add r0, r2
; CHECKV7M-NEXT: pop {r7, pc}
; CHECKV7M-NEXT: .LBB6_2: @ %else
; CHECKV7M-NEXT: orr.w r1, r2, r3
; CHECKV7M-NEXT: lsrs r1, r1, #17
; CHECKV7M-NEXT: orr.w r1, r1, r3, lsl #15
; CHECKV7M-NEXT: orr.w r1, r3, r2, lsr #17
; CHECKV7M-NEXT: cmp r1, #0
; CHECKV7M-NEXT: it ne
; CHECKV7M-NEXT: movne r0, lr
@ -664,14 +658,10 @@ define i32 @icmp64_uge_m2(i64 %x, i64 %y, i32 %a, i32 %b, i1 %c) {
; CHECKV7A-NEXT: lsls r4, r4, #31
; CHECKV7A-NEXT: beq .LBB6_2
; CHECKV7A-NEXT: @ %bb.1: @ %then
; CHECKV7A-NEXT: orrs r2, r3
; CHECKV7A-NEXT: lsrs r2, r2, #17
; CHECKV7A-NEXT: orr.w r2, r2, r3, lsl #15
; CHECKV7A-NEXT: orr.w r3, r12, r1
; CHECKV7A-NEXT: orr.w r2, r3, r2, lsr #17
; CHECKV7A-NEXT: orr.w r1, r1, r12, lsr #17
; CHECKV7A-NEXT: cmp r2, #0
; CHECKV7A-NEXT: mov r2, r0
; CHECKV7A-NEXT: lsr.w r3, r3, #17
; CHECKV7A-NEXT: orr.w r1, r3, r1, lsl #15
; CHECKV7A-NEXT: it ne
; CHECKV7A-NEXT: movne r2, lr
; CHECKV7A-NEXT: cmp r1, #0
@ -680,9 +670,7 @@ define i32 @icmp64_uge_m2(i64 %x, i64 %y, i32 %a, i32 %b, i1 %c) {
; CHECKV7A-NEXT: add r0, r2
; CHECKV7A-NEXT: pop {r4, pc}
; CHECKV7A-NEXT: .LBB6_2: @ %else
; CHECKV7A-NEXT: orr.w r1, r2, r3
; CHECKV7A-NEXT: lsrs r1, r1, #17
; CHECKV7A-NEXT: orr.w r1, r1, r3, lsl #15
; CHECKV7A-NEXT: orr.w r1, r3, r2, lsr #17
; CHECKV7A-NEXT: cmp r1, #0
; CHECKV7A-NEXT: it ne
; CHECKV7A-NEXT: movne r0, lr

View File

@ -12,9 +12,7 @@ define i64 @opt_setcc_lt_power_of_2(i64 %a) nounwind {
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r0, r0, #1
; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: orr r2, r0, r1
; CHECK-NEXT: uxth r3, r1
; CHECK-NEXT: orr r2, r3, r2, lsr #16
; CHECK-NEXT: orr r2, r1, r0, lsr #16
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: bne .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %exit
@ -34,9 +32,7 @@ exit:
define i1 @opt_setcc_srl_eq_zero(i64 %a) nounwind {
; CHECK-LABEL: opt_setcc_srl_eq_zero:
; CHECK: @ %bb.0:
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: lsr r0, r0, #17
; CHECK-NEXT: orr r0, r0, r1, lsl #15
; CHECK-NEXT: orr r0, r1, r0, lsr #17
; CHECK-NEXT: clz r0, r0
; CHECK-NEXT: lsr r0, r0, #5
; CHECK-NEXT: bx lr
@ -48,9 +44,7 @@ define i1 @opt_setcc_srl_eq_zero(i64 %a) nounwind {
define i1 @opt_setcc_srl_ne_zero(i64 %a) nounwind {
; CHECK-LABEL: opt_setcc_srl_ne_zero:
; CHECK: @ %bb.0:
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: lsr r0, r0, #17
; CHECK-NEXT: orr r0, r0, r1, lsl #15
; CHECK-NEXT: orr r0, r1, r0, lsr #17
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: movwne r0, #1
; CHECK-NEXT: bx lr
@ -62,9 +56,7 @@ define i1 @opt_setcc_srl_ne_zero(i64 %a) nounwind {
define i1 @opt_setcc_shl_eq_zero(i64 %a) nounwind {
; CHECK-LABEL: opt_setcc_shl_eq_zero:
; CHECK: @ %bb.0:
; CHECK-NEXT: orr r1, r1, r0
; CHECK-NEXT: lsl r1, r1, #17
; CHECK-NEXT: orr r0, r1, r0, lsr #15
; CHECK-NEXT: orr r0, r0, r1, lsl #17
; CHECK-NEXT: clz r0, r0
; CHECK-NEXT: lsr r0, r0, #5
; CHECK-NEXT: bx lr
@ -76,9 +68,7 @@ define i1 @opt_setcc_shl_eq_zero(i64 %a) nounwind {
define i1 @opt_setcc_shl_ne_zero(i64 %a) nounwind {
; CHECK-LABEL: opt_setcc_shl_ne_zero:
; CHECK: @ %bb.0:
; CHECK-NEXT: orr r1, r1, r0
; CHECK-NEXT: lsl r1, r1, #17
; CHECK-NEXT: orr r0, r1, r0, lsr #15
; CHECK-NEXT: orr r0, r0, r1, lsl #17
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: movwne r0, #1
; CHECK-NEXT: bx lr
@ -113,9 +103,7 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i64 %a) nounwind {
define i1 @opt_setcc_expanded_shl_correct_shifts(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts:
; CHECK: @ %bb.0:
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: lsl r0, r0, #17
; CHECK-NEXT: orr r0, r0, r1, lsr #15
; CHECK-NEXT: orr r0, r1, r0, lsl #17
; CHECK-NEXT: clz r0, r0
; CHECK-NEXT: lsr r0, r0, #5
; CHECK-NEXT: bx lr

View File

@ -230,13 +230,19 @@ define <4 x i32> @or_fshl_v4i32(<4 x i32> %x, <4 x i32> %y) {
}
define <2 x i64> @or_fshr_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: or_fshr_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm1
; CHECK-NEXT: vpsllq $42, %xmm0, %xmm0
; CHECK-NEXT: vpsrlq $22, %xmm1, %xmm1
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
; XOP-LABEL: or_fshr_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpsrlq $22, %xmm1, %xmm1
; XOP-NEXT: vprotq $42, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: or_fshr_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $22, %xmm1, %xmm1
; AVX512-NEXT: vprolq $42, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%or1 = or <2 x i64> %x, %y
%sh1 = shl <2 x i64> %x, <i64 42, i64 42>
%sh2 = lshr <2 x i64> %or1, <i64 22, i64 22>