[GlobalISel][AArch64] Add support for base register + offset register loads

Add support for folding G_GEPs into loads of the form

```
ldr reg, [base, off]
```

when possible. This can save an add before the load. Currently, this is only
supported for 64-bit loads into 64-bit registers.
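
For example (the register choices here are purely illustrative, with the pointer
in x0 and the offset in x1), the fold turns

```
add x8, x0, x1
ldr x0, [x8]
```

into

```
ldr x0, [x0, x1]
```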

Add a new addressing mode function, `selectAddrModeRegisterOffset`, which
performs this folding when it is profitable.

Also add a test for addressing modes for G_LOAD.

Differential Revision: https://reviews.llvm.org/D64944

llvm-svn: 366503
Jessica Paquette 2019-07-18 21:50:11 +00:00
parent 50057f3288
commit 7a1dcc5ff1
2 changed files with 183 additions and 0 deletions


@@ -67,6 +67,7 @@ private:
bool earlySelect(MachineInstr &I) const;
bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool earlySelectLoad(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
@@ -182,6 +183,7 @@ private:
ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
return selectAddrModeIndexed(Root, Width / 8);
}
ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
@@ -1158,6 +1160,57 @@ bool AArch64InstructionSelector::earlySelectSHL(
return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
}

bool AArch64InstructionSelector::earlySelectLoad(
MachineInstr &I, MachineRegisterInfo &MRI) const {
// Try to fold in shifts, etc into the addressing mode of a load.
assert(I.getOpcode() == TargetOpcode::G_LOAD && "unexpected op");
// Don't handle atomic loads/stores yet.
auto &MemOp = **I.memoperands_begin();
if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
unsigned MemBytes = MemOp.getSize();
// Only support 64-bit loads for now.
if (MemBytes != 8)
return false;
Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
// Don't handle vectors.
if (DstTy.isVector())
return false;
unsigned DstSize = DstTy.getSizeInBits();
// TODO: 32-bit destinations.
if (DstSize != 64)
return false;
// Check if we can do any folding from GEPs etc. into the load.
auto ImmFn = selectAddrModeRegisterOffset(I.getOperand(1));
if (!ImmFn)
return false;
// We can fold something. Emit the load here.
MachineIRBuilder MIB(I);
// Choose the instruction based off the size of the element being loaded, and
// whether or not we're loading into a FPR.
const RegisterBank &RB = *RBI.getRegBank(DstReg, MRI, TRI);
unsigned Opc =
RB.getID() == AArch64::GPRRegBankID ? AArch64::LDRXroX : AArch64::LDRDroX;
// Construct the load.
auto LoadMI = MIB.buildInstr(Opc, {DstReg}, {});
for (auto &RenderFn : *ImmFn)
RenderFn(LoadMI);
LoadMI.addMemOperand(*I.memoperands_begin());
I.eraseFromParent();
return constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
}

bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -1169,6 +1222,8 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
switch (I.getOpcode()) {
case TargetOpcode::G_SHL:
return earlySelectSHL(I, MRI);
case TargetOpcode::G_LOAD:
return earlySelectLoad(I, MRI);
default:
return false;
}
@@ -3891,6 +3946,44 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
}};
}

/// This is used for computing addresses like this:
///
/// ldr x1, [x2, x3]
///
/// Where x2 is the base register, and x3 is an offset register.
///
/// When possible (or profitable) to fold a G_GEP into the address calculation,
/// this will do so. Otherwise, it will return None.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeRegisterOffset(
MachineOperand &Root) const {
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
// If we have a constant offset, then we probably don't want to match a
// register offset.
if (isBaseWithConstantOffset(Root, MRI))
return None;
// We need a GEP.
MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
return None;
// If this is used more than once, let's not bother folding.
// TODO: Check if they are memory ops. If they are, then we can still fold
// without having to recompute anything.
if (!MRI.hasOneUse(Gep->getOperand(0).getReg()))
return None;
// Base is the GEP's LHS, offset is its RHS.
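// The two trailing zero immediates select no extend and no shift for the
// offset register.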
return {{
[=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
[=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(2)); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
}};
}

/// Select a "register plus unscaled signed 9-bit immediate" address. This
/// should only match when there is an offset that is not valid for a scaled
/// immediate addressing mode. The "Size" argument is the size in bytes of the


@@ -0,0 +1,90 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
--- |
define void @ldrxrox_breg_oreg(i64* %addr) { ret void }
define void @ldrdrox_breg_oreg(i64* %addr) { ret void }
define void @more_than_one_use(i64* %addr) { ret void }
...
---
name: ldrxrox_breg_oreg
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
machineFunctionInfo: {}
body: |
bb.0:
liveins: $x0, $x1
; CHECK-LABEL: name: ldrxrox_breg_oreg
; CHECK: liveins: $x0, $x1
; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY]], [[COPY1]], 0, 0 :: (load 8 from %ir.addr)
; CHECK: $x0 = COPY [[LDRXroX]]
; CHECK: RET_ReallyLR implicit $x0
%0:gpr(p0) = COPY $x0
%1:gpr(s64) = COPY $x1
%2:gpr(p0) = G_GEP %0, %1
%4:gpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr)
$x0 = COPY %4(s64)
RET_ReallyLR implicit $x0
...
---
name: ldrdrox_breg_oreg
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
machineFunctionInfo: {}
body: |
bb.0:
liveins: $d0, $x1
; CHECK-LABEL: name: ldrdrox_breg_oreg
; CHECK: liveins: $d0, $x1
; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0
; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY]], [[COPY1]], 0, 0 :: (load 8 from %ir.addr)
; CHECK: $d0 = COPY [[LDRDroX]]
; CHECK: RET_ReallyLR implicit $d0
%0:gpr(p0) = COPY $d0
%1:gpr(s64) = COPY $x1
%2:gpr(p0) = G_GEP %0, %1
%4:fpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr)
$d0 = COPY %4(s64)
RET_ReallyLR implicit $d0
...
---
name: more_than_one_use
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
machineFunctionInfo: {}
body: |
bb.0:
liveins: $x0, $x1
; This shouldn't be folded, since we reuse the result of the G_GEP outside
; the G_LOAD
; CHECK-LABEL: name: more_than_one_use
; CHECK: liveins: $x0, $x1
; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY]], [[COPY1]]
; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[LDRXui]]
; CHECK: $x0 = COPY [[ADDXrr1]]
; CHECK: RET_ReallyLR implicit $x0
%0:gpr(p0) = COPY $x0
%1:gpr(s64) = COPY $x1
%2:gpr(p0) = G_GEP %0, %1
%4:gpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr)
%5:gpr(s64) = G_PTRTOINT %2
%6:gpr(s64) = G_ADD %5, %4
$x0 = COPY %6(s64)
RET_ReallyLR implicit $x0