From 02f25a95655e9e320c524978e54df8e4a22ef921 Mon Sep 17 00:00:00 2001
From: Chad Rosier
Date: Thu, 19 May 2016 14:19:47 +0000
Subject: [PATCH] [AArch64 ] Generate a BFXIL from 'or (and X, Mask0Imm),(and Y, Mask1Imm)'.

Mask0Imm and ~Mask1Imm must be equivalent and one of the MaskImms is a shifted
mask (e.g., 0x000ffff0). Both 'and's must have a single use.

This changes code like:

  and w8, w0, #0xffff000f
  and w9, w1, #0x0000fff0
  orr w0, w9, w8

into

  lsr w8, w1, #4
  bfi w0, w8, #4, #12

llvm-svn: 270063
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp   | 59 ++++++++++++++++
 llvm/test/CodeGen/AArch64/bitfield-insert.ll | 67 +++++++++++++++++++
 2 files changed, 126 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index b8ee03a35585..fe5d6045bde6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1974,6 +1974,13 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
   return true;
 }
 
+static bool isShiftedMask(uint64_t Mask, EVT VT) {
+  assert(VT == MVT::i32 || VT == MVT::i64);
+  if (VT == MVT::i32)
+    return isShiftedMask_32(Mask);
+  return isShiftedMask_64(Mask);
+}
+
 static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
                                       SelectionDAG *CurDAG) {
   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
@@ -2084,6 +2091,58 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
     CurDAG->SelectNodeTo(N, Opc, VT, Ops);
     return true;
   }
+
+  // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
+  // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
+  // mask (e.g., 0x000ffff0).
+  uint64_t Mask0Imm, Mask1Imm;
+  SDValue And0 = N->getOperand(0);
+  SDValue And1 = N->getOperand(1);
+  if (And0.hasOneUse() && And1.hasOneUse() &&
+      isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
+      isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
+      APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
+      (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
+
+    // We should have already caught the case where we extract hi and low parts.
+    // E.g. BFXIL from 'or (and X, 0xffff0000), (and Y, 0x0000ffff)'.
+    assert(!(isShiftedMask(Mask0Imm, VT) && isShiftedMask(Mask1Imm, VT)) &&
+           "BFXIL should have already been optimized.");
+
+    // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
+    // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
+    // bits to be inserted.
+    if (isShiftedMask(Mask0Imm, VT)) {
+      std::swap(And0, And1);
+      std::swap(Mask0Imm, Mask1Imm);
+    }
+
+    SDValue Src = And1->getOperand(0);
+    SDValue Dst = And0->getOperand(0);
+    unsigned LSB = countTrailingZeros(Mask1Imm);
+    int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
+
+    // The BFXIL inserts the low-order bits from a source register, so right
+    // shift the needed bits into place.
+    SDLoc DL(N);
+    unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+    SDNode *LSR = CurDAG->getMachineNode(
+        ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
+        CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
+
+    // BFXIL is an alias of BFM, so translate to BFM operands.
+    unsigned ImmR = (BitWidth - LSB) % BitWidth;
+    unsigned ImmS = Width - 1;
+
+    // Create the BFXIL instruction.
+    SDValue Ops[] = {Dst, SDValue(LSR, 0),
+                     CurDAG->getTargetConstant(ImmR, DL, VT),
+                     CurDAG->getTargetConstant(ImmS, DL, VT)};
+    unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+    CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index b89d06be84de..8c517f85c38f 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -311,3 +311,70 @@ entry:
   store i16 %trunc, i16* %gep
   ret void
 }
+
+; The next set of tests generate a BFXIL from 'or (and X, Mask0Imm),
+; (and Y, Mask1Imm)' iff Mask0Imm and ~Mask1Imm are equivalent and one of the
+; MaskImms is a shifted mask (e.g., 0x000ffff0).
+
+; CHECK-LABEL: @test_or_and_and1
+; CHECK: lsr w8, w1, #4
+; CHECK: bfi w0, w8, #4, #12
+define i32 @test_or_and_and1(i32 %a, i32 %b) {
+entry:
+  %and = and i32 %a, -65521 ; 0xffff000f
+  %and1 = and i32 %b, 65520 ; 0x0000fff0
+  %or = or i32 %and1, %and
+  ret i32 %or
+}
+
+; CHECK-LABEL: @test_or_and_and2
+; CHECK: lsr w8, w0, #4
+; CHECK: bfi w1, w8, #4, #12
+define i32 @test_or_and_and2(i32 %a, i32 %b) {
+entry:
+  %and = and i32 %a, 65520 ; 0x0000fff0
+  %and1 = and i32 %b, -65521 ; 0xffff000f
+  %or = or i32 %and1, %and
+  ret i32 %or
+}
+
+; CHECK-LABEL: @test_or_and_and3
+; CHECK: lsr x8, x1, #16
+; CHECK: bfi x0, x8, #16, #32
+define i64 @test_or_and_and3(i64 %a, i64 %b) {
+entry:
+  %and = and i64 %a, -281474976645121 ; 0xffff00000000ffff
+  %and1 = and i64 %b, 281474976645120 ; 0x0000ffffffff0000
+  %or = or i64 %and1, %and
+  ret i64 %or
+}
+
+; Don't convert 'and' with multiple uses.
+; CHECK-LABEL: @test_or_and_and4
+; CHECK: and w8, w0, #0xffff000f
+; CHECK: and w9, w1, #0xfff0
+; CHECK: orr w0, w9, w8
+; CHECK: str w8, [x2
+define i32 @test_or_and_and4(i32 %a, i32 %b, i32* %ptr) {
+entry:
+  %and = and i32 %a, -65521
+  store i32 %and, i32* %ptr, align 4
+  %and2 = and i32 %b, 65520
+  %or = or i32 %and2, %and
+  ret i32 %or
+}
+
+; Don't convert 'and' with multiple uses.
+; CHECK-LABEL: @test_or_and_and5
+; CHECK: and w8, w1, #0xfff0
+; CHECK: and w9, w0, #0xffff000f
+; CHECK: orr w0, w8, w9
+; CHECK: str w8, [x2]
+define i32 @test_or_and_and5(i32 %a, i32 %b, i32* %ptr) {
+entry:
+  %and = and i32 %b, 65520
+  store i32 %and, i32* %ptr, align 4
+  %and1 = and i32 %a, -65521
+  %or = or i32 %and, %and1
+  ret i32 %or
+}
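
For readers who want to sanity-check the transform outside of LLVM, the short standalone C++ sketch below (not part of the patch; function names are illustrative) verifies that the or-of-masked-ands pattern from test_or_and_and1 computes the same value as the 'lsr w8, w1, #4 ; bfi w0, w8, #4, #12' sequence the new code selects, and notes how the BFM immediates fall out for that case.

  // Standalone illustration, not LLVM code. Checks that
  //   (a & 0xffff000f) | (b & 0x0000fff0)
  // equals the lsr+bfi sequence selected for test_or_and_and1.
  #include <cassert>
  #include <cstdint>

  static uint32_t orOfMaskedAnds(uint32_t A, uint32_t B) {
    return (A & 0xffff000fU) | (B & 0x0000fff0U);
  }

  static uint32_t lsrThenBfi(uint32_t A, uint32_t B) {
    uint32_t Src = B >> 4;                              // lsr w8, w1, #4
    uint32_t FieldMask = ((1U << 12) - 1) << 4;         // 12-bit field at bit 4
    return (A & ~FieldMask) | ((Src << 4) & FieldMask); // bfi w0, w8, #4, #12
  }

  int main() {
    const uint32_t Vals[] = {0u, 0xffff000fu, 0x0000fff0u, 0xdeadbeefu, ~0u};
    for (uint32_t A : Vals)
      for (uint32_t B : Vals)
        assert(orOfMaskedAnds(A, B) == lsrThenBfi(A, B));

    // BFM operands for this case, mirroring the patch's computation:
    // LSB = countTrailingZeros(0x0000fff0) = 4,
    // Width = 32 - popcount(0xffff000f) = 12,
    // so ImmR = (32 - 4) % 32 = 28 and ImmS = 12 - 1 = 11, which the
    // assembler prints as 'bfi w0, w8, #4, #12' (the test's expected output).
    return 0;
  }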