[X86] Add isel patterns to match BMI/TBMI instructions when lowering has turned the root nodes into one of the flag producing binops.

This fixes the patterns that have or/xor as a root. 'and' is handled differently since they usually have a CMP wrapped around them.

I had to look for uses of the CF flag because all these nodes have non-standard CF flag behavior. A real or/xor would always clear CF. In practice we shouldn't be using the CF flag from these nodes as far as I know.

Differential Revision: https://reviews.llvm.org/D55813

llvm-svn: 349962
This commit is contained in:
Craig Topper 2018-12-21 21:42:43 +00:00
parent c6027e20d4
commit e58cd9cbc6
4 changed files with 77 additions and 48 deletions

View File

@ -472,6 +472,9 @@ namespace {
SDValue &InFlag);
bool tryOptimizeRem8Extend(SDNode *N);
bool hasNoSignFlagUses(SDValue Flags) const;
bool hasNoCarryFlagUses(SDValue Flags) const;
};
}
@ -2225,7 +2228,7 @@ static X86::CondCode getCondFromOpc(unsigned Opc) {
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// flag to be accurate.
static bool hasNoSignFlagUses(SDValue Flags) {
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
// Examine each user of the node.
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
@ -2265,7 +2268,7 @@ static bool hasNoSignFlagUses(SDValue Flags) {
/// Test whether the given node which sets flags has any uses which require the
/// CF flag to be accurate.
static bool hasNoCarryFlagUses(SDValue Flags) {
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
// Examine each user of the node.
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {

View File

@ -2387,6 +2387,16 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
// Pattern fragments to auto generate BMI instructions.
//===----------------------------------------------------------------------===//
def or_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
(X86or_flag node:$lhs, node:$rhs), [{
return hasNoCarryFlagUses(SDValue(N, 1));
}]>;
def xor_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
(X86xor_flag node:$lhs, node:$rhs), [{
return hasNoCarryFlagUses(SDValue(N, 1));
}]>;
let Predicates = [HasBMI] in {
// FIXME: patterns for the load versions are not implemented
def : Pat<(and GR32:$src, (add GR32:$src, -1)),
@ -2403,6 +2413,14 @@ let Predicates = [HasBMI] in {
(BLSI32rr GR32:$src)>;
def : Pat<(and GR64:$src, (ineg GR64:$src)),
(BLSI64rr GR64:$src)>;
// Versions to match flag producing ops.
// X86and_flag nodes are rarely created. Those should use CMP+AND. We do
// TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed.
def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
}
multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
@ -2801,6 +2819,45 @@ let Predicates = [HasTBM] in {
(TZMSK32rr GR32:$src)>;
def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
(TZMSK64rr GR64:$src)>;
// Patterns to match flag producing ops.
// X86and_flag nodes are rarely created. Those should use CMP+AND. We do
// TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed.
def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))),
(BLCI32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))),
(BLCI64rr GR64:$src)>;
// Extra patterns because opt can optimize the above patterns to this.
def : Pat<(or_flag_nocf GR32:$src, (sub -2, GR32:$src)),
(BLCI32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (sub -2, GR64:$src)),
(BLCI64rr GR64:$src)>;
def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, 1)),
(BLCMSK32rr GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, 1)),
(BLCMSK64rr GR64:$src)>;
def : Pat<(or_flag_nocf GR32:$src, (add GR32:$src, 1)),
(BLCS32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (add GR64:$src, 1)),
(BLCS64rr GR64:$src)>;
def : Pat<(or_flag_nocf GR32:$src, (add GR32:$src, -1)),
(BLSFILL32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (add GR64:$src, -1)),
(BLSFILL64rr GR64:$src)>;
def : Pat<(or_flag_nocf (not GR32:$src), (add GR32:$src, -1)),
(BLSIC32rr GR32:$src)>;
def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, -1)),
(BLSIC64rr GR64:$src)>;
def : Pat<(or_flag_nocf (not GR32:$src), (add GR32:$src, 1)),
(T1MSKC32rr GR32:$src)>;
def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
(T1MSKC64rr GR64:$src)>;
} // HasTBM
//===----------------------------------------------------------------------===//

View File

@ -691,9 +691,7 @@ define i32 @blsmsk32_z(i32 %a, i32 %b) nounwind {
define i32 @blsmsk32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; X86-LABEL: blsmsk32_z2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal -1(%eax), %ecx
; X86-NEXT: xorl %eax, %ecx
; X86-NEXT: blsmskl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmovel %eax, %ecx
@ -703,9 +701,7 @@ define i32 @blsmsk32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; X64-LABEL: blsmsk32_z2:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: leal -1(%rdi), %ecx
; X64-NEXT: xorl %edi, %ecx
; X64-NEXT: blsmskl %edi, %ecx
; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
%t0 = sub i32 %a, 1
@ -800,8 +796,7 @@ define i64 @blsmsk64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; X64-LABEL: blsmsk64_z2:
; X64: # %bb.0:
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: leaq -1(%rdi), %rcx
; X64-NEXT: xorq %rdi, %rcx
; X64-NEXT: blsmskq %rdi, %rcx
; X64-NEXT: cmovneq %rdx, %rax
; X64-NEXT: retq
%t0 = sub i64 %a, 1

View File

@ -226,10 +226,7 @@ define i32 @test_x86_tbm_blci_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blci_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 1(%rdi), %ecx
; CHECK-NEXT: notl %ecx
; CHECK-NEXT: orl %edi, %ecx
; CHECK-NEXT: blcil %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = add i32 1, %a
@ -269,9 +266,7 @@ define i64 @test_x86_tbm_blci_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blci_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: leaq 1(%rdi), %rcx
; CHECK-NEXT: notq %rcx
; CHECK-NEXT: orq %rdi, %rcx
; CHECK-NEXT: blciq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = add i64 1, %a
@ -409,9 +404,7 @@ define i32 @test_x86_tbm_blcmsk_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blcmsk_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 1(%rdi), %ecx
; CHECK-NEXT: xorl %edi, %ecx
; CHECK-NEXT: blcmskl %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, 1
@ -448,8 +441,7 @@ define i64 @test_x86_tbm_blcmsk_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blcmsk_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: leaq 1(%rdi), %rcx
; CHECK-NEXT: xorq %rdi, %rcx
; CHECK-NEXT: blcmskq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, 1
@ -486,9 +478,7 @@ define i32 @test_x86_tbm_blcs_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blcs_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 1(%rdi), %ecx
; CHECK-NEXT: orl %edi, %ecx
; CHECK-NEXT: blcsl %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, 1
@ -525,8 +515,7 @@ define i64 @test_x86_tbm_blcs_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blcs_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: leaq 1(%rdi), %rcx
; CHECK-NEXT: orq %rdi, %rcx
; CHECK-NEXT: blcsq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, 1
@ -563,9 +552,7 @@ define i32 @test_x86_tbm_blsfill_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blsfill_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal -1(%rdi), %ecx
; CHECK-NEXT: orl %edi, %ecx
; CHECK-NEXT: blsfilll %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, -1
@ -602,8 +589,7 @@ define i64 @test_x86_tbm_blsfill_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blsfill_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: leaq -1(%rdi), %rcx
; CHECK-NEXT: orq %rdi, %rcx
; CHECK-NEXT: blsfillq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, -1
@ -642,10 +628,7 @@ define i32 @test_x86_tbm_blsic_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blsic_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: notl %ecx
; CHECK-NEXT: decl %edi
; CHECK-NEXT: orl %ecx, %edi
; CHECK-NEXT: blsicl %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = xor i32 %a, -1
@ -685,10 +668,7 @@ define i64 @test_x86_tbm_blsic_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_blsic_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: movq %rdi, %rcx
; CHECK-NEXT: notq %rcx
; CHECK-NEXT: decq %rdi
; CHECK-NEXT: orq %rcx, %rdi
; CHECK-NEXT: blsicq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = xor i64 %a, -1
@ -728,10 +708,7 @@ define i32 @test_x86_tbm_t1mskc_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_t1mskc_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: notl %ecx
; CHECK-NEXT: incl %edi
; CHECK-NEXT: orl %ecx, %edi
; CHECK-NEXT: t1mskcl %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = xor i32 %a, -1
@ -771,10 +748,7 @@ define i64 @test_x86_tbm_t1mskc_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_t1mskc_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: movq %rdi, %rcx
; CHECK-NEXT: notq %rcx
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: orq %rcx, %rdi
; CHECK-NEXT: t1mskcq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = xor i64 %a, -1