From 341623653d891386b7943445981565ed1dff2a18 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 9 Mar 2022 13:12:43 -0500 Subject: [PATCH] [SDAG] match rotate pattern with extra 'or' operation This is another fold generalized from D111530. We can find a common source for a rotate operation hidden inside an 'or': https://alive2.llvm.org/ce/z/9pV8hn Deciding when this is profitable vs. a funnel-shift is tricky, but this does not show any regressions: if a target has a rotate but it does not have a funnel-shift, then try to form the rotate here. That is why we don't have x86 test diffs for the scalar tests that are duplicated from AArch64 (74a65e3834d9487) - shld/shrd are available. That also makes it difficult to show vector diffs - the only case where I found a diff was on x86 AVX512 or XOP with i64 elements. There's an additional check for a legal type to avoid a problem seen with x86-32 where we form a 64-bit rotate but then it gets split inefficiently. We might avoid that by adding more rotate folds, but I didn't check to see what is missing on that path. This gets most of the motivating patterns for AArch64 / ARM that are in D111530. We still need a couple of enhancements to setcc pattern matching with rotate/funnel-shift to get the rest. 
Differential Revision: https://reviews.llvm.org/D120933 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 55 ++++++++++++++++--- llvm/test/CodeGen/AArch64/icmp-shift-opt.ll | 18 ++---- llvm/test/CodeGen/AArch64/logic-shift.ll | 16 +++--- llvm/test/CodeGen/ARM/consthoist-icmpimm.ll | 24 ++------ llvm/test/CodeGen/ARM/icmp-shift-opt.ll | 24 ++------ llvm/test/CodeGen/X86/rotate_vec.ll | 20 ++++--- 6 files changed, 85 insertions(+), 72 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b76f4711322f..a3e85cecf39b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7405,11 +7405,6 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { if (LHSShift.getOpcode() == RHSShift.getOpcode()) return SDValue(); // Shifts must disagree. - // TODO: Support pre-legalization funnel-shift by constant. - bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0); - if (!IsRotate && !(HasFSHL || HasFSHR)) - return SDValue(); // Requires funnel shift support. - // Canonicalize shl to left side in a shl/srl pair. if (RHSShift.getOpcode() == ISD::SHL) { std::swap(LHS, RHS); @@ -7423,15 +7418,57 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { SDValue RHSShiftArg = RHSShift.getOperand(0); SDValue RHSShiftAmt = RHSShift.getOperand(1); + auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; + }; + + // TODO: Support pre-legalization funnel-shift by constant. + bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0); + if (!IsRotate && !(HasFSHL || HasFSHR)) { + if (TLI.isTypeLegal(VT) && LHS.hasOneUse() && RHS.hasOneUse() && + ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { + // Look for a disguised rotate by constant. 
+ // The common shifted operand X may be hidden inside another 'or'. + SDValue X, Y; + auto matchOr = [&X, &Y](SDValue Or, SDValue CommonOp) { + if (!Or.hasOneUse() || Or.getOpcode() != ISD::OR) + return false; + if (CommonOp == Or.getOperand(0)) { + X = CommonOp; + Y = Or.getOperand(1); + return true; + } + if (CommonOp == Or.getOperand(1)) { + X = CommonOp; + Y = Or.getOperand(0); + return true; + } + return false; + }; + + // (shl (X | Y), C1) | (srl X, C2) --> (rotl X, C1) | (shl Y, C1) + if (matchOr(LHSShiftArg, RHSShiftArg)) { + SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt); + SDValue ShlY = DAG.getNode(ISD::SHL, DL, VT, Y, LHSShiftAmt); + return DAG.getNode(ISD::OR, DL, VT, RotX, ShlY); + } + // (shl X, C1) | (srl (X | Y), C2) --> (rotl X, C1) | (srl Y, C2) + if (matchOr(RHSShiftArg, LHSShiftArg)) { + SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt); + SDValue SrlY = DAG.getNode(ISD::SRL, DL, VT, Y, RHSShiftAmt); + return DAG.getNode(ISD::OR, DL, VT, RotX, SrlY); + } + } + + return SDValue(); // Requires funnel shift support. 
+ } + // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) // iff C1+C2 == EltSizeInBits - auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, - ConstantSDNode *RHS) { - return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; - }; if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { SDValue Res; if (IsRotate && (HasROTL || HasROTR || !(HasFSHL || HasFSHR))) { diff --git a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll index 6b4d898a41a8..368246dbb242 100644 --- a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll @@ -12,8 +12,7 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds x0, x0, #1 ; CHECK-NEXT: adcs x1, x1, xzr -; CHECK-NEXT: orr x8, x0, x1 -; CHECK-NEXT: extr x8, x1, x8, #60 +; CHECK-NEXT: orr x8, x1, x0, lsr #60 ; CHECK-NEXT: cbnz x8, .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret @@ -32,8 +31,7 @@ exit: define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_eq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1 -; CHECK-NEXT: extr x8, x1, x8, #17 +; CHECK-NEXT: orr x8, x1, x0, lsr #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -45,8 +43,7 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_ne_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1 -; CHECK-NEXT: extr x8, x1, x8, #17 +; CHECK-NEXT: orr x8, x1, x0, lsr #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -58,8 +55,7 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; CHECK-LABEL: 
opt_setcc_shl_eq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x1, x0 -; CHECK-NEXT: extr x8, x8, x0, #47 +; CHECK-NEXT: orr x8, x0, x1, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -71,8 +67,7 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x1, x0 -; CHECK-NEXT: extr x8, x8, x0, #47 +; CHECK-NEXT: orr x8, x0, x1, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -106,8 +101,7 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1 -; CHECK-NEXT: extr x8, x8, x1, #47 +; CHECK-NEXT: orr x8, x1, x0, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/logic-shift.ll b/llvm/test/CodeGen/AArch64/logic-shift.ll index 7889bda08a4f..12c3e18317f8 100644 --- a/llvm/test/CodeGen/AArch64/logic-shift.ll +++ b/llvm/test/CodeGen/AArch64/logic-shift.ll @@ -690,8 +690,8 @@ define i64 @mix_logic_shl(i64 %x0, i64 %x1, i64 %y, i64 %z) { define i32 @or_fshl_commute0(i32 %x, i32 %y) { ; CHECK-LABEL: or_fshl_commute0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w0, w1 -; CHECK-NEXT: extr w0, w8, w0, #27 +; CHECK-NEXT: ror w8, w0, #27 +; CHECK-NEXT: orr w0, w8, w1, lsl #5 ; CHECK-NEXT: ret %or1 = or i32 %x, %y %sh1 = shl i32 %or1, 5 @@ -703,8 +703,8 @@ define i32 @or_fshl_commute0(i32 %x, i32 %y) { define i64 @or_fshl_commute1(i64 %x, i64 %y) { ; CHECK-LABEL: or_fshl_commute1: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w1, w0 -; CHECK-NEXT: extr x0, x8, x0, #29 +; CHECK-NEXT: ror x8, x0, #29 +; CHECK-NEXT: orr x0, x8, x1, lsl #35 ; CHECK-NEXT: ret %or1 = or i64 %y, %x %sh1 = shl i64 %or1, 35 @@ -762,8 +762,8 @@ define i32 
@or_fshl_wrong_shift(i32 %x, i32 %y) { define i64 @or_fshr_commute0(i64 %x, i64 %y) { ; CHECK-LABEL: or_fshr_commute0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1 -; CHECK-NEXT: extr x0, x0, x8, #24 +; CHECK-NEXT: ror x8, x0, #24 +; CHECK-NEXT: orr x0, x8, x1, lsr #24 ; CHECK-NEXT: ret %or1 = or i64 %x, %y %sh1 = shl i64 %x, 40 @@ -775,8 +775,8 @@ define i64 @or_fshr_commute0(i64 %x, i64 %y) { define i32 @or_fshr_commute1(i32 %x, i32 %y) { ; CHECK-LABEL: or_fshr_commute1: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w1, w0 -; CHECK-NEXT: extr w0, w0, w8, #29 +; CHECK-NEXT: ror w8, w0, #29 +; CHECK-NEXT: orr w0, w8, w1, lsr #29 ; CHECK-NEXT: ret %or1 = or i32 %y, %x %sh1 = shl i32 %x, 3 diff --git a/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll b/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll index 9675c32393da..ef1d61dc6631 100644 --- a/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll +++ b/llvm/test/CodeGen/ARM/consthoist-icmpimm.ll @@ -630,14 +630,10 @@ define i32 @icmp64_uge_m2(i64 %x, i64 %y, i32 %a, i32 %b, i1 %c) { ; CHECKV7M-NEXT: ldrd lr, r0, [sp, #8] ; CHECKV7M-NEXT: beq .LBB6_2 ; CHECKV7M-NEXT: @ %bb.1: @ %then -; CHECKV7M-NEXT: orrs r2, r3 -; CHECKV7M-NEXT: lsrs r2, r2, #17 -; CHECKV7M-NEXT: orr.w r2, r2, r3, lsl #15 -; CHECKV7M-NEXT: orr.w r3, r12, r1 +; CHECKV7M-NEXT: orr.w r2, r3, r2, lsr #17 +; CHECKV7M-NEXT: orr.w r1, r1, r12, lsr #17 ; CHECKV7M-NEXT: cmp r2, #0 ; CHECKV7M-NEXT: mov r2, r0 -; CHECKV7M-NEXT: lsr.w r3, r3, #17 -; CHECKV7M-NEXT: orr.w r1, r3, r1, lsl #15 ; CHECKV7M-NEXT: it ne ; CHECKV7M-NEXT: movne r2, lr ; CHECKV7M-NEXT: cmp r1, #0 @@ -646,9 +642,7 @@ define i32 @icmp64_uge_m2(i64 %x, i64 %y, i32 %a, i32 %b, i1 %c) { ; CHECKV7M-NEXT: add r0, r2 ; CHECKV7M-NEXT: pop {r7, pc} ; CHECKV7M-NEXT: .LBB6_2: @ %else -; CHECKV7M-NEXT: orr.w r1, r2, r3 -; CHECKV7M-NEXT: lsrs r1, r1, #17 -; CHECKV7M-NEXT: orr.w r1, r1, r3, lsl #15 +; CHECKV7M-NEXT: orr.w r1, r3, r2, lsr #17 ; CHECKV7M-NEXT: cmp r1, #0 ; CHECKV7M-NEXT: it ne ; CHECKV7M-NEXT: movne r0, lr 
@@ -664,14 +658,10 @@ define i32 @icmp64_uge_m2(i64 %x, i64 %y, i32 %a, i32 %b, i1 %c) { ; CHECKV7A-NEXT: lsls r4, r4, #31 ; CHECKV7A-NEXT: beq .LBB6_2 ; CHECKV7A-NEXT: @ %bb.1: @ %then -; CHECKV7A-NEXT: orrs r2, r3 -; CHECKV7A-NEXT: lsrs r2, r2, #17 -; CHECKV7A-NEXT: orr.w r2, r2, r3, lsl #15 -; CHECKV7A-NEXT: orr.w r3, r12, r1 +; CHECKV7A-NEXT: orr.w r2, r3, r2, lsr #17 +; CHECKV7A-NEXT: orr.w r1, r1, r12, lsr #17 ; CHECKV7A-NEXT: cmp r2, #0 ; CHECKV7A-NEXT: mov r2, r0 -; CHECKV7A-NEXT: lsr.w r3, r3, #17 -; CHECKV7A-NEXT: orr.w r1, r3, r1, lsl #15 ; CHECKV7A-NEXT: it ne ; CHECKV7A-NEXT: movne r2, lr ; CHECKV7A-NEXT: cmp r1, #0 @@ -680,9 +670,7 @@ define i32 @icmp64_uge_m2(i64 %x, i64 %y, i32 %a, i32 %b, i1 %c) { ; CHECKV7A-NEXT: add r0, r2 ; CHECKV7A-NEXT: pop {r4, pc} ; CHECKV7A-NEXT: .LBB6_2: @ %else -; CHECKV7A-NEXT: orr.w r1, r2, r3 -; CHECKV7A-NEXT: lsrs r1, r1, #17 -; CHECKV7A-NEXT: orr.w r1, r1, r3, lsl #15 +; CHECKV7A-NEXT: orr.w r1, r3, r2, lsr #17 ; CHECKV7A-NEXT: cmp r1, #0 ; CHECKV7A-NEXT: it ne ; CHECKV7A-NEXT: movne r0, lr diff --git a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll index 492d6477c735..a78978f977f8 100644 --- a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll @@ -12,9 +12,7 @@ define i64 @opt_setcc_lt_power_of_2(i64 %a) nounwind { ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r0, r0, #1 ; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: orr r2, r0, r1 -; CHECK-NEXT: uxth r3, r1 -; CHECK-NEXT: orr r2, r3, r2, lsr #16 +; CHECK-NEXT: orr r2, r1, r0, lsr #16 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %exit @@ -34,9 +32,7 @@ exit: define i1 @opt_setcc_srl_eq_zero(i64 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_eq_zero: ; CHECK: @ %bb.0: -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: lsr r0, r0, #17 -; CHECK-NEXT: orr r0, r0, r1, lsl #15 +; CHECK-NEXT: orr r0, r1, r0, lsr #17 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: lsr r0, 
r0, #5 ; CHECK-NEXT: bx lr @@ -48,9 +44,7 @@ define i1 @opt_setcc_srl_eq_zero(i64 %a) nounwind { define i1 @opt_setcc_srl_ne_zero(i64 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_ne_zero: ; CHECK: @ %bb.0: -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: lsr r0, r0, #17 -; CHECK-NEXT: orr r0, r0, r1, lsl #15 +; CHECK-NEXT: orr r0, r1, r0, lsr #17 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: movwne r0, #1 ; CHECK-NEXT: bx lr @@ -62,9 +56,7 @@ define i1 @opt_setcc_srl_ne_zero(i64 %a) nounwind { define i1 @opt_setcc_shl_eq_zero(i64 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_eq_zero: ; CHECK: @ %bb.0: -; CHECK-NEXT: orr r1, r1, r0 -; CHECK-NEXT: lsl r1, r1, #17 -; CHECK-NEXT: orr r0, r1, r0, lsr #15 +; CHECK-NEXT: orr r0, r0, r1, lsl #17 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: lsr r0, r0, #5 ; CHECK-NEXT: bx lr @@ -76,9 +68,7 @@ define i1 @opt_setcc_shl_eq_zero(i64 %a) nounwind { define i1 @opt_setcc_shl_ne_zero(i64 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero: ; CHECK: @ %bb.0: -; CHECK-NEXT: orr r1, r1, r0 -; CHECK-NEXT: lsl r1, r1, #17 -; CHECK-NEXT: orr r0, r1, r0, lsr #15 +; CHECK-NEXT: orr r0, r0, r1, lsl #17 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: movwne r0, #1 ; CHECK-NEXT: bx lr @@ -113,9 +103,7 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i64 %a) nounwind { define i1 @opt_setcc_expanded_shl_correct_shifts(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts: ; CHECK: @ %bb.0: -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: lsl r0, r0, #17 -; CHECK-NEXT: orr r0, r0, r1, lsr #15 +; CHECK-NEXT: orr r0, r1, r0, lsl #17 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: lsr r0, r0, #5 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll index c08110fdda06..af7e24887328 100644 --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -230,13 +230,19 @@ define <4 x i32> @or_fshl_v4i32(<4 x i32> %x, <4 x i32> %y) { } define <2 x i64> @or_fshr_v2i64(<2 x i64> %x, <2 x 
i64> %y) { -; CHECK-LABEL: or_fshr_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: vpsllq $42, %xmm0, %xmm0 -; CHECK-NEXT: vpsrlq $22, %xmm1, %xmm1 -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq +; XOP-LABEL: or_fshr_v2i64: +; XOP: # %bb.0: +; XOP-NEXT: vpsrlq $22, %xmm1, %xmm1 +; XOP-NEXT: vprotq $42, %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: or_fshr_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $22, %xmm1, %xmm1 +; AVX512-NEXT: vprolq $42, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %or1 = or <2 x i64> %x, %y %sh1 = shl <2 x i64> %x, %sh2 = lshr <2 x i64> %or1,