diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3f242fdb6d8e..5b6fcb78bcbe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3235,7 +3235,7 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) - return false; + return Register(); if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { return Def->getOperand(1).getReg(); @@ -3851,27 +3851,36 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { getAddrModeInfo(*MI, *MRI, AddrInfo); // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, - // then we can select all ptr + 32-bit offsets not just immediate offsets. - if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + // then we can select all ptr + 32-bit offsets. + if (AddrInfo.empty()) return None; const GEPInfo &GEPInfo = AddrInfo[0]; - // SGPR offset is unsigned. - if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) - return None; - - // If we make it this far we have a load with an 32-bit immediate offset. - // It is OK to select this using a sgpr offset, because we have already - // failed trying to select this load into one of the _IMM variants since - // the _IMM Patterns are considered before the _SGPR patterns. Register PtrReg = GEPInfo.SgprParts[0]; - Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(GEPInfo.Imm); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } - }}; + + // SGPR offset is unsigned. + if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) && + GEPInfo.Imm != 0) { + // If we make it this far we have a load with an 32-bit immediate offset. + // It is OK to select this using a sgpr offset, because we have already + // failed trying to select this load into one of the _IMM variants since + // the _IMM Patterns are considered before the _SGPR patterns. + Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; + } + + if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) { + if (Register OffsetReg = + matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) { + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; + } + } + + return None; } std::pair diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir index 03a4c8a6fadc..530f42dff2aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -6,6 +6,7 @@ define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void } define amdgpu_kernel void @smrd_wide() { ret void } define amdgpu_kernel void @constant_address_positive() { ret void } + define amdgpu_kernel void @smrd_sgpr() { ret void } ... --- @@ -210,3 +211,24 @@ body: | %3:sgpr(s32) = G_LOAD %2 :: (dereferenceable invariant load (s32), align 4, addrspace 4) S_ENDPGM 0, implicit %3 ... + +--- + +# Test a load with a register offset. +# GCN-LABEL: name: smrd_sgpr{{$}} +# GCN: S_LOAD_DWORD_SGPR %0, %1, 0 + +name: smrd_sgpr +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2 + %0:sgpr(p4) = COPY $sgpr0_sgpr1 + %1:sgpr(s32) = COPY $sgpr2 + %2:sgpr(s64) = G_ZEXT %1:sgpr(s32) + %4:sgpr(p4) = G_PTR_ADD %0, %2 + %5:sgpr(s32) = G_LOAD %4 :: (dereferenceable invariant load (s32), align 4, addrspace 4) + S_ENDPGM 0, implicit %5 +... diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll index 6523621098a8..a4ff0b449dbd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll @@ -2,10 +2,14 @@ ; from a register. ; RUN: llc -march=amdgcn -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -global-isel -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GISEL %s ; GCN: %[[OFFSET:[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @DescriptorBuffer ; GCN: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR killed %{{[0-9]+}}, killed %[[OFFSET]], 0 :: (invariant load (s128) from %ir.13, addrspace 4) +; GISEL: $[[OFFSET:.*]] = S_MOV_B32 target-flags(amdgpu-abs32-lo) @DescriptorBuffer +; GISEL: S_LOAD_DWORDX4_SGPR killed renamable {{.*}}, killed renamable $[[OFFSET]], 0 :: (invariant load (<4 x s32>) from {{.*}}, addrspace 4) + define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %resNode0, i32 inreg %resNode1, <3 x i32> inreg %2, i32 inreg %3, <3 x i32> %4) local_unnamed_addr #2 { .entry: %5 = call i64 @llvm.amdgcn.s.getpc() #3