[X86] X86DAGToDAGISel::matchBEXTRFromAndImm(): if can't use BEXTR, fallback to BZHI is profitable (PR43381)

Summary: PR43381 notes that while we are good at matching `(X >> C1) & C2` as BEXTR/BEXTRI, we only do that if we either have BEXTRI (TBM), or if BEXTR is marked as being fast (`-mattr=+fast-bextr`). In all other cases we don't match. But that is mainly only true for AMD CPU's. However, for all the CPU's for which we have sched models, the BZHI is always fast (or the sched models are all bad.) So if we decide that it's unprofitable to emit BEXTR/BEXTRI, we should consider falling-back to BZHI if it is available, and follow-up with the shift. While it's really tempting to do something because it's cool it is wise to first think whether it actually makes sense to do. We shouldn't just use BZHI because we can, but only it it is beneficial. In particular, it isn't really worth it if the input is a register, mask is small, or we can fold a load. But it is worth it if the mask does not fit into 32-bits. (careful, i don't know much about intel cpu's, my choice of `-mcpu` may be bad here) Thus we manage to fold a load: https://godbolt.org/z/Er0OQz Or if we'd end up using BZHI anyways because the mask is large: https://godbolt.org/z/dBJ_5h But this isn'r actually profitable in general case, e.g. here we'd increase microop count (the register renaming is free, mca does not model that there it seems) https://godbolt.org/z/k6wFoz Likewise, not worth it if we just get load folding: https://godbolt.org/z/1M1deG https://bugs.llvm.org/show_bug.cgi?id=43381 Reviewers: RKSimon, craig.topper, davezarzycki, spatel Reviewed By: craig.topper, davezarzycki Subscribers: andreadb, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67875 llvm-svn: 372532
2019-09-22 22:04:29 +00:00 · 2019-09-22 22:04:29 +00:00 · 7c3d6f5a1b
parent fb218170b4
commit 7c3d6f5a1b
2 changed files with 52 additions and 17 deletions
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@ -3441,8 +3441,9 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
-  if (!Subtarget->hasTBM() &&
-      !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
+  bool PreferBEXTR =
+      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
+  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
@ -3481,23 +3482,50 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

-  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
-  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
-  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
+  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
+  // does not fit into 32 bits. Load folding is not a sufficient reason.
+  if (!PreferBEXTR && MaskSize <= 32)
+    return nullptr;

-  // BMI requires the immediate to placed in a register.
-  if (!Subtarget->hasTBM()) {
-    ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
-    MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+  SDValue Control;
+  unsigned ROpc, MOpc;
+
+  if (!PreferBEXTR) {
+    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
+    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
+    // Let's perform the mask first, and apply shift later. Note that we need to
+    // widen the mask to account for the fact that we'll apply shift afterwards!
+    Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
+    ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
+    MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
-    New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
+    Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+  } else {
+    // The 'control' of BEXTR has the pattern of:
+    // [15...8 bit][ 7...0 bit] location
+    // [ bit count][     shift] name
+    // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
+    Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+    if (Subtarget->hasTBM()) {
+      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+    } else {
+      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
+      // BMI requires the immediate to placed in a register.
+      ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+      MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
-    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+    SDValue Ops[] = {
+        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
@ -3505,7 +3533,15 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
    // Record the mem-refs
    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  } else {
-    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
+    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
+  }
+
+  if (!PreferBEXTR) {
+    // We still need to apply the shift.
+    SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
+    unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
+    NewNode =
+        CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
  }

  return NewNode;
--- a/llvm/test/CodeGen/X86/bmi-x86_64.ll
+++ b/llvm/test/CodeGen/X86/bmi-x86_64.ll
@ -86,9 +86,9 @@ define i64 @bextr64d(i64 %a) {
 ;
 ; BMI2-SLOW-LABEL: bextr64d:
 ; BMI2-SLOW:       # %bb.0: # %entry
-; BMI2-SLOW-NEXT:    shrq $2, %rdi
-; BMI2-SLOW-NEXT:    movb $33, %al
+; BMI2-SLOW-NEXT:    movl $35, %eax
 ; BMI2-SLOW-NEXT:    bzhiq %rax, %rdi, %rax
+; BMI2-SLOW-NEXT:    shrq $2, %rax
 ; BMI2-SLOW-NEXT:    retq
 ;
 ; BEXTR-FAST-LABEL: bextr64d:
@ -113,10 +113,9 @@ define i64 @bextr64d_load(i64* %aptr) {
 ;
 ; BMI2-SLOW-LABEL: bextr64d_load:
 ; BMI2-SLOW:       # %bb.0: # %entry
-; BMI2-SLOW-NEXT:    movq (%rdi), %rax
+; BMI2-SLOW-NEXT:    movl $35, %eax
+; BMI2-SLOW-NEXT:    bzhiq %rax, (%rdi), %rax
 ; BMI2-SLOW-NEXT:    shrq $2, %rax
-; BMI2-SLOW-NEXT:    movb $33, %cl
-; BMI2-SLOW-NEXT:    bzhiq %rcx, %rax, %rax
 ; BMI2-SLOW-NEXT:    retq
 ;
 ; BEXTR-FAST-LABEL: bextr64d_load: