[AArch64][GlobalISel] Fold in G_ANYEXT/G_ZEXT into TB(N)Z

This is similar to the code in getTestBitOperand in AArch64ISelLowering. Instead
of implementing all of the TB(N)Z optimizations at once, this patch implements
the simplest case first. The way that this is set up should make it fairly easy
to add the rest as we go along.

The idea here is that after determining that we can use a TB(N)Z, we can
continue looking through instructions and perform further folding.

In this case, when we have a G_ZEXT or G_ANYEXT where the extended bits are not
used, we can fold it into the TB(N)Z.
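
To see why this is sound: for any bit index below the width of the source type, bit b of (zext x) or (anyext x) is just bit b of x, so the TB(N)Z can test the narrower register directly. A standalone sketch of that equivalence (illustration only, not part of the patch):

  // Illustration only: bit b of zext(x) (and of anyext(x), for the defined
  // low bits) matches bit b of x whenever b is below the source width.
  #include <cassert>
  #include <cstdint>

  static bool testBit(uint64_t V, unsigned B) { return (V >> B) & 1; }

  int main() {
    for (uint32_t X : {0u, 8u, 0xdeadbeefu, 0xffffffffu}) {
      uint64_t Zext = static_cast<uint64_t>(X); // Models G_ZEXT s32 -> s64.
      for (unsigned B = 0; B != 32; ++B)
        assert(testBit(Zext, B) == testBit(X, B));
    }
  }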

Differential Revision: https://reviews.llvm.org/D73673
Author: Jessica Paquette
Date:   2020-01-29 13:28:30 -08:00
Commit: c8c987d310 (parent 6170272ab9)
2 changed files, 161 insertions(+), 1 deletion(-)

@@ -990,6 +990,27 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
  }
}

/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();
    Register NextReg;

    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT)
      NextReg = MI->getOperand(1).getReg();

    // Did we find something worth folding?
    if (!NextReg.isValid() || !MRI.hasOneUse(NextReg))
      break;

    // NextReg is worth folding. Keep looking.
    Reg = NextReg;
  }
  return Reg;
}
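// Illustration only, not part of this diff: the remaining folds from
// getTestBitOperand in AArch64ISelLowering are expected to slot into the
// loop above as further opcode checks that compute NextReg, e.g. a shift
// fold along the lines of
//   (tbz (shl x, c), b) -> (tbz x, b - c) if b - c is a valid bit index
// which would also mean threading the tested bit through this helper so it
// can be adjusted.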

bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
    MachineInstr *AndInst, int64_t CmpConstant, const CmpInst::Predicate &Pred,
    MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const {
@@ -1018,7 +1039,6 @@ bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
    return false;

  MachineRegisterInfo &MRI = *MIB.getMRI();
  Register TestReg = AndInst->getOperand(1).getReg();

  // Only support EQ and NE. If we have LT, then it *is* possible to fold, but
  // we don't want to do this. When we have an AND and LT, we need a TST/ANDS,
@@ -1034,7 +1054,11 @@
      getConstantVRegValWithLookThrough(AndInst->getOperand(2).getReg(), MRI);
  if (!MaybeBit || !isPowerOf2_64(MaybeBit->Value))
    return false;

  // Try to optimize the TB(N)Z.
  uint64_t Bit = Log2_64(static_cast<uint64_t>(MaybeBit->Value));
  Register TestReg = AndInst->getOperand(1).getReg();
  TestReg = getTestBitReg(TestReg, MRI);

  // Choose the correct TB(N)Z opcode to use.
  unsigned Opc = 0;

@@ -0,0 +1,136 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
#
# Check that we can continue matching when we are in a situation where we will
# emit a TB(N)Z.
...
---
name: fold_zext
alignment: 4
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
  ; CHECK-LABEL: name: fold_zext
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: liveins: $x0
  ; CHECK: %copy:gpr32 = COPY $w0
  ; CHECK: TBNZW %copy, 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: RET_ReallyLR
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $x0
    %copy:gpr(s32) = COPY $w0
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0
    %fold_me:gpr(s64) = G_ZEXT %copy(s32)
    %and:gpr(s64) = G_AND %fold_me, %bit
    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    RET_ReallyLR
...
---
name: fold_anyext
alignment: 4
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
  ; CHECK-LABEL: name: fold_anyext
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: liveins: $x0
  ; CHECK: %copy:gpr32 = COPY $w0
  ; CHECK: TBNZW %copy, 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: RET_ReallyLR
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $x0
    %copy:gpr(s32) = COPY $w0
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0
    %fold_me:gpr(s64) = G_ANYEXT %copy(s32)
    %and:gpr(s64) = G_AND %fold_me, %bit
    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    RET_ReallyLR
...
---
name: fold_multiple
alignment: 4
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
  ; CHECK-LABEL: name: fold_multiple
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: liveins: $h0
  ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, $h0, %subreg.hsub
  ; CHECK: %copy:gpr32all = COPY [[SUBREG_TO_REG]]
  ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %copy
  ; CHECK: TBNZW [[COPY]], 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: RET_ReallyLR
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $h0
    %copy:gpr(s16) = COPY $h0
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0
    %ext1:gpr(s32) = G_ZEXT %copy(s16)
    %ext2:gpr(s64) = G_ANYEXT %ext1(s32)
    %and:gpr(s64) = G_AND %ext2, %bit
    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    RET_ReallyLR
...
---
name: dont_fold_more_than_one_use
alignment: 4
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
  ; CHECK-LABEL: name: dont_fold_more_than_one_use
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: liveins: $x0
  ; CHECK: %copy:gpr32 = COPY $w0
  ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %copy, %subreg.sub_32
  ; CHECK: %zext:gpr64 = UBFMXri [[SUBREG_TO_REG]], 0, 31
  ; CHECK: TBNZW %copy, 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: $x0 = COPY %zext
  ; CHECK: RET_ReallyLR implicit $x0
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $x0
    %copy:gpr(s32) = COPY $w0
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0
    %zext:gpr(s64) = G_ZEXT %copy(s32)
    %and:gpr(s64) = G_AND %zext, %bit
    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    $x0 = COPY %zext:gpr(s64)
    RET_ReallyLR implicit $x0