diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 4e13fb8e2027..f8e15c88ef08 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -67,6 +67,7 @@ private:
   bool earlySelect(MachineInstr &I) const;
 
   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool earlySelectLoad(MachineInstr &I, MachineRegisterInfo &MRI) const;
 
   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;
@@ -182,6 +183,7 @@ private:
   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
     return selectAddrModeIndexed(Root, Width / 8);
   }
+  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
 
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
 
@@ -1158,6 +1160,57 @@ bool AArch64InstructionSelector::earlySelectSHL(
   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
 }
 
+bool AArch64InstructionSelector::earlySelectLoad(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  // Try to fold in shifts, etc into the addressing mode of a load.
+  assert(I.getOpcode() == TargetOpcode::G_LOAD && "unexpected op");
+
+  // Don't handle atomic loads/stores yet.
+  auto &MemOp = **I.memoperands_begin();
+  if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+    LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+    return false;
+  }
+
+  unsigned MemBytes = MemOp.getSize();
+
+  // Only support 64-bit loads for now.
+  if (MemBytes != 8)
+    return false;
+
+  Register DstReg = I.getOperand(0).getReg();
+  const LLT DstTy = MRI.getType(DstReg);
+  // Don't handle vectors.
+  if (DstTy.isVector())
+    return false;
+
+  unsigned DstSize = DstTy.getSizeInBits();
+  // TODO: 32-bit destinations.
+  if (DstSize != 64)
+    return false;
+
+  // Check if we can do any folding from GEPs etc. into the load.
+  auto ImmFn = selectAddrModeRegisterOffset(I.getOperand(1));
+  if (!ImmFn)
+    return false;
+
+  // We can fold something. Emit the load here.
+  MachineIRBuilder MIB(I);
+
+  // Choose the instruction based off the size of the element being loaded, and
+  // whether or not we're loading into a FPR.
+  const RegisterBank &RB = *RBI.getRegBank(DstReg, MRI, TRI);
+  unsigned Opc =
+      RB.getID() == AArch64::GPRRegBankID ? AArch64::LDRXroX : AArch64::LDRDroX;
+  // Construct the load.
+  auto LoadMI = MIB.buildInstr(Opc, {DstReg}, {});
+  for (auto &RenderFn : *ImmFn)
+    RenderFn(LoadMI);
+  LoadMI.addMemOperand(*I.memoperands_begin());
+  I.eraseFromParent();
+  return constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
+}
+
 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
   assert(I.getParent() && "Instruction should be in a basic block!");
   assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -1169,6 +1222,8 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
   switch (I.getOpcode()) {
   case TargetOpcode::G_SHL:
     return earlySelectSHL(I, MRI);
+  case TargetOpcode::G_LOAD:
+    return earlySelectLoad(I, MRI);
   default:
     return false;
   }
@@ -3891,6 +3946,44 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
   }};
 }
 
+/// This is used for computing addresses like this:
+///
+///   ldr x1, [x2, x3]
+///
+/// Where x2 is the base register, and x3 is an offset register.
+///
+/// When possible (or profitable) to fold a G_GEP into the address calculation,
+/// this will do so. Otherwise, it will return None.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeRegisterOffset(
+    MachineOperand &Root) const {
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // If we have a constant offset, then we probably don't want to match a
+  // register offset.
+  if (isBaseWithConstantOffset(Root, MRI))
+    return None;
+
+  // We need a GEP.
+  MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
+  if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
+    return None;
+
+  // If this is used more than once, let's not bother folding.
+  // TODO: Check if they are memory ops. If they are, then we can still fold
+  // without having to recompute anything.
+  if (!MRI.hasOneUse(Gep->getOperand(0).getReg()))
+    return None;
+
+  // Base is the GEP's LHS, offset is its RHS.
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(2)); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+  }};
+}
+
 /// Select a "register plus unscaled signed 9-bit immediate" address. This
 /// should only match when there is an offset that is not valid for a scaled
 /// immediate addressing mode. The "Size" argument is the size in bytes of the
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
new file mode 100644
index 000000000000..aecb772be3f3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -0,0 +1,90 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  define void @ldrxrox_breg_oreg(i64* %addr) { ret void }
+  define void @ldrdrox_breg_oreg(i64* %addr) { ret void }
+  define void @more_than_one_use(i64* %addr) { ret void }
+...
+
+---
+name: ldrxrox_breg_oreg
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+  bb.0:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: ldrxrox_breg_oreg
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY]], [[COPY1]], 0, 0 :: (load 8 from %ir.addr)
+    ; CHECK: $x0 = COPY [[LDRXroX]]
+    ; CHECK: RET_ReallyLR implicit $x0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = COPY $x1
+    %2:gpr(p0) = G_GEP %0, %1
+    %4:gpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr)
+    $x0 = COPY %4(s64)
+    RET_ReallyLR implicit $x0
+...
+
+---
+name: ldrdrox_breg_oreg
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+  bb.0:
+    liveins: $d0, $x1
+    ; CHECK-LABEL: name: ldrdrox_breg_oreg
+    ; CHECK: liveins: $d0, $x1
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY]], [[COPY1]], 0, 0 :: (load 8 from %ir.addr)
+    ; CHECK: $d0 = COPY [[LDRDroX]]
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $d0
+    %1:gpr(s64) = COPY $x1
+    %2:gpr(p0) = G_GEP %0, %1
+    %4:fpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr)
+    $d0 = COPY %4(s64)
+    RET_ReallyLR implicit $d0
+...
+
+---
+name: more_than_one_use
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+  bb.0:
+    liveins: $x0, $x1
+    ; This shouldn't be folded, since we reuse the result of the G_GEP outside
+    ; the G_LOAD.
+    ; CHECK-LABEL: name: more_than_one_use
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY]], [[COPY1]]
+    ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[LDRXui]]
+    ; CHECK: $x0 = COPY [[ADDXrr1]]
+    ; CHECK: RET_ReallyLR implicit $x0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = COPY $x1
+    %2:gpr(p0) = G_GEP %0, %1
+    %4:gpr(s64) = G_LOAD %2(p0) :: (load 8 from %ir.addr)
+    %5:gpr(s64) = G_PTRTOINT %2
+    %6:gpr(s64) = G_ADD %5, %4
+    $x0 = COPY %6(s64)
+    RET_ReallyLR implicit $x0