From 1eaa04d682d684f7e83c263aa40e78ae8faab378 Mon Sep 17 00:00:00 2001
From: Eli Friedman
Date: Tue, 22 Jan 2019 01:51:37 +0000
Subject: [PATCH] [ARM] Combine ands+lsls to lsls+lsrs for Thumb1.

This patch may seem familiar... but my previous patch handled the
equivalent lsls+and, not this case. Usually instcombine puts the "and"
after the shift, so this case doesn't come up. However, if the shift
comes out of a GEP, it won't get canonicalized by instcombine, and
DAGCombine doesn't have an equivalent transform.

This also modifies isDesirableToCommuteWithShift to suppress DAGCombine
transforms which would make the overall code worse.

I'm not really happy adding a bunch of code to handle this, but it
would probably be tricky to substantially improve the behavior of
DAGCombine here.

(A standalone sketch of the bit manipulation behind the combine is
appended after the patch.)

Differential Revision: https://reviews.llvm.org/D56032

llvm-svn: 351776
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 66 +++++++++++++++++++++--
 llvm/test/CodeGen/Thumb/shift-and.ll    | 70 +++++++++++++++++++++++++
 2 files changed, 131 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 18d441eb20d1..5cf9a2fc2bfe 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1176,6 +1176,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
 
   if (Subtarget->hasV6Ops())
     setTargetDAGCombine(ISD::SRL);
+  if (Subtarget->isThumb1Only())
+    setTargetDAGCombine(ISD::SHL);
 
   setStackPointerRegisterToSaveRestore(ARM::SP);
 
@@ -10418,12 +10420,29 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
   if (Level == BeforeLegalizeTypes)
     return true;
 
-  if (Subtarget->isThumb() && Subtarget->isThumb1Only())
-    return true;
-
   if (N->getOpcode() != ISD::SHL)
     return true;
 
+  if (Subtarget->isThumb1Only()) {
+    // Avoid making expensive immediates by commuting shifts. (This logic
+    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
+    // for free.)
+    if (N->getOpcode() != ISD::SHL)
+      return true;
+    SDValue N1 = N->getOperand(0);
+    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
+        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
+      return true;
+    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
+      if (Const->getAPIntValue().ult(256))
+        return false;
+      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
+          Const->getAPIntValue().sgt(-256))
+        return false;
+    }
+    return true;
+  }
+
   // Turn off commute-with-shift transform after legalization, so it doesn't
   // conflict with PerformSHLSimplify. (We could try to detect when
   // PerformSHLSimplify would trigger more precisely, but it isn't
@@ -12419,8 +12438,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
 /// combining instead of DAG legalizing because the build_vectors for 64-bit
 /// vector element shift counts are generally not legal, and it is hard to see
 /// their values after they get legalized to loads from a constant pool.
-static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformShiftCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {
+  SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
@@ -12435,6 +12456,40 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
+      N->getOperand(0)->getOpcode() == ISD::AND &&
+      N->getOperand(0)->hasOneUse()) {
+    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+      return SDValue();
+    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
+    // usually show up because instcombine prefers to canonicalize it to
+    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
+    // out of GEP lowering in some cases.
+    SDValue N0 = N->getOperand(0);
+    ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!ShiftAmtNode)
+      return SDValue();
+    uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
+    ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!AndMaskNode)
+      return SDValue();
+    uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
+    // Don't transform uxtb/uxth.
+    if (AndMask == 255 || AndMask == 65535)
+      return SDValue();
+    if (isMask_32(AndMask)) {
+      uint32_t MaskedBits = countLeadingZeros(AndMask);
+      if (MaskedBits > ShiftAmt) {
+        SDLoc DL(N);
+        SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+                                  DAG.getConstant(MaskedBits, DL, MVT::i32));
+        return DAG.getNode(
+            ISD::SRL, DL, MVT::i32, SHL,
+            DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
+      }
+    }
+  }
+
   // Nothing to be done for scalar shifts.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!VT.isVector() || !TLI.isTypeLegal(VT))
@@ -12853,7 +12908,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
   case ISD::SHL:
   case ISD::SRA:
-  case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget);
+  case ISD::SRL:
+    return PerformShiftCombine(N, DCI, Subtarget);
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
diff --git a/llvm/test/CodeGen/Thumb/shift-and.ll b/llvm/test/CodeGen/Thumb/shift-and.ll
index 3981d2c9702a..b8e3416adbb8 100644
--- a/llvm/test/CodeGen/Thumb/shift-and.ll
+++ b/llvm/test/CodeGen/Thumb/shift-and.ll
@@ -188,3 +188,73 @@ entry:
   %shl = shl i32 %shr, 3
   ret i32 %shl
 }
+
+define i32 @test16(i32 %x) {
+; CHECK-LABEL: test16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    lsls r0, r0, #28
+; CHECK-NEXT:    lsrs r0, r0, #26
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and i32 %x, 15
+  %shl = shl i32 %0, 2
+  ret i32 %shl
+}
+
+define i32* @test17(i32* %p, i32 %x) {
+; CHECK-LABEL: test17:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    lsrs r1, r1, #26
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and i32 %x, 15
+  %shl = getelementptr i32, i32* %p, i32 %0
+  ret i32* %shl
+}
+
+define i32* @test18(i32* %p, i32 %x) {
+; CHECK-LABEL: test18:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    adds r1, r1, #1
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    lsrs r1, r1, #26
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = add i32 %x, 1
+  %0 = and i32 %xx, 15
+  %shl = getelementptr i32, i32* %p, i32 %0
+  ret i32* %shl
+}
+
+define i32* @test19(i32* %p, i32 %x) {
+; CHECK-LABEL: test19:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    subs r1, r1, #1
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    lsrs r1, r1, #26
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = sub i32 %x, 1
+  %0 = and i32 %xx, 15
+  %shl = getelementptr i32, i32* %p, i32 %0
+  ret i32* %shl
+}
+
+define i32* @test20(i32* %p, i32 %x) {
+; CHECK-LABEL: test20:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    subs r1, r1, #1
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    lsrs r1, r1, #26
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = add i32 %x, 15
+  %0 = and i32 %xx, 15
+  %shl = getelementptr i32, i32* %p, i32 %0
+  ret i32* %shl
+}
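
The combine above rests on a simple bit identity: for a contiguous low-bit
mask AndMask with MaskedBits leading zero bits (in a 32-bit word) and
MaskedBits > ShiftAmt, (x & AndMask) << ShiftAmt equals
(x << MaskedBits) >> (MaskedBits - ShiftAmt). The second form needs only
lsls+lsrs on Thumb1, while the first needs the mask materialized in a
register plus ands+lsls. The sketch below is illustrative only and is not
part of the patch; the helper names maskThenShift and shiftUpThenDown are
made up for the example, and the constants are taken from test16/test17.

#include <cassert>
#include <cstdint>

// Original form: materialize the mask, and, then shift left.
// On Thumb1: constant load + ands + lsls.
static uint32_t maskThenShift(uint32_t X, uint32_t AndMask, uint32_t ShiftAmt) {
  return (X & AndMask) << ShiftAmt;
}

// Rewritten form: shift the wanted low bits up to the top of the word
// (dropping everything the mask would have cleared), then shift back down so
// they land ShiftAmt bits above bit 0. On Thumb1: lsls + lsrs.
static uint32_t shiftUpThenDown(uint32_t X, uint32_t MaskedBits,
                                uint32_t ShiftAmt) {
  return (X << MaskedBits) >> (MaskedBits - ShiftAmt);
}

int main() {
  const uint32_t AndMask = 15;    // low 4-bit mask, as in test16/test17
  const uint32_t ShiftAmt = 2;
  const uint32_t MaskedBits = 28; // countLeadingZeros(15) for a 32-bit value
  for (uint32_t X = 0; X <= 0xFFFF; ++X)
    assert(maskThenShift(X, AndMask, ShiftAmt) ==
           shiftUpThenDown(X, MaskedBits, ShiftAmt));
  return 0;
}

This is exactly the lsls #28 / lsrs #26 pair the new FileCheck lines expect.
The AndMask == 255 / 65535 bail-out in the patch keeps single-instruction
uxtb/uxth zero-extensions intact, and the ult(256) checks in
isDesirableToCommuteWithShift reflect the fact that Thumb1 movs/adds/subs can
only encode immediates in the range 0-255, so commuting a shift past a larger
constant would force an expensive constant materialization.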