From 96352e0a1bda0fc04729ff90d0d576e8f366760f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jan 2020 23:20:38 -0500 Subject: [PATCH] AMDGPU/GlobalISel: Handle LDS with relocations case --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 6 ++-- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 6 ++++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 12 ++++++-- llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 +++ .../CodeGen/AMDGPU/GlobalISel/lds-relocs.ll | 28 +++++++++++++++++++ .../AMDGPU/GlobalISel/lds-zero-initializer.ll | 2 -- .../GlobalISel/llvm.amdgcn.atomic.dec.ll | 4 +-- .../GlobalISel/llvm.amdgcn.atomic.inc.ll | 4 +-- llvm/test/CodeGen/AMDGPU/lds-relocs.ll | 4 +-- 10 files changed, 57 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 39c8fb22f944..f50817f669f9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1627,7 +1627,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { return true; } -bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( + MachineInstr &I) const { Register DstReg = I.getOperand(0).getReg(); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; @@ -1961,7 +1962,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); case TargetOpcode::G_FRAME_INDEX: - return selectG_FRAME_INDEX(I); + case TargetOpcode::G_GLOBAL_VALUE: + return selectG_FRAME_INDEX_GLOBAL_VALUE(I); case TargetOpcode::G_PTR_MASK: return selectG_PTR_MASK(I); case TargetOpcode::G_EXTRACT_VECTOR_ELT: diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 19bd0315b8c0..f4d9defd33f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -119,7 +119,7 @@ private: bool selectG_STORE(MachineInstr &I) const; bool selectG_SELECT(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; - bool selectG_FRAME_INDEX(MachineInstr &I) const; + bool selectG_FRAME_INDEX_GLOBAL_VALUE(MachineInstr &I) const; bool selectG_PTR_MASK(MachineInstr &I) const; bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5ea35bd0da11..d24bc5066f71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1734,6 +1734,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( // TODO: We could emit code to handle the initialization somewhere. 
if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { + const SITargetLowering *TLI = ST.getTargetLowering(); + if (!TLI->shouldUseLDSConstAddress(GV)) { + MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); + return true; // Leave in place; + } + B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 7541c758207a..dd4a9960f6ee 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4411,6 +4411,14 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); } +bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const { + if (!GV->hasExternalLinkage()) + return true; + + const auto OS = getTargetMachine().getTargetTriple().getOS(); + return OS == Triple::AMDHSA || OS == Triple::AMDPAL; +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, @@ -5046,9 +5054,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSD->getGlobal(); if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - (!GV->hasExternalLinkage() || - getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || - getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) || + shouldUseLDSConstAddress(GV)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index d59495b052a4..dbdac2722c87 100644 --- 
a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -199,6 +199,10 @@ public: /// global value \p GV, false otherwise. bool shouldEmitPCReloc(const GlobalValue *GV) const; + /// \returns true if this should use a literal constant for an LDS address, + /// and not emit a relocation for an LDS global. + bool shouldUseLDSConstAddress(const GlobalValue *GV) const; + private: // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll new file mode 100644 index 000000000000..704cf594a861 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll @@ -0,0 +1,28 @@ +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; FIXME: Merge with DAG test + +@lds.external = external unnamed_addr addrspace(3) global [0 x i32] +@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8 + +; GCN-LABEL: {{^}}test_basic: +; GCN: s_add_u32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x80,A,A,A,A] +; GCN: v_add_u32_e32 v0, lds.external@abs32@lo, v0 ; encoding: [0xff,0x00,0x00,0x68,A,A,A,A] + +; GCN: .globl lds.external +; GCN: .amdgpu_lds lds.external, 0, 4 +; GCN: .globl lds.defined +; GCN: .amdgpu_lds lds.defined, 32, 8 +define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 { +main_body: + %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1 + %tmp = load i32, i32 addrspace(3)* %gep0 + + %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave + store i32 123, i32 addrspace(3)* %gep1 + + %r = bitcast i32 %tmp to float + ret float %r +} + +attributes #0 = { "no-signed-zeros-fp-math"="true" } +attributes #4 = { convergent nounwind readnone 
} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index e18895d1e51c..02f77141b411 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -1,5 +1,3 @@ ; RUN: not llc -global-isel -march=amdgcn -mcpu=tonga < %S/../lds-zero-initializer.ll 2>&1 | FileCheck %s -; FIXME: Select should succeed ; CHECK: error: <unknown>:0:0: in function load_zeroinit_lds_global void (i32 addrspace(1)*, i1): unsupported initializer for address space -; CHECK: LLVM ERROR: cannot select: %16:sreg_32(p3) = G_GLOBAL_VALUE @lds (in function: load_zeroinit_lds_global) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll index 4945f7a338b6..eb82ca55b5d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -1174,7 +1174,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 ret void } -@lds0 = addrspace(3) global [512 x i32] undef +@lds0 = internal addrspace(3) global [512 x i32] undef define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_dec_shl_base_lds_0: @@ -1759,7 +1759,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa ret void } -@lds1 = addrspace(3) global [512 x i64] undef, align 8 +@lds1 = internal addrspace(3) global [512 x i64] undef, align 8 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll index c77c0f2e0ff4..916f4e7fc664 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -516,7 +516,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa ret void } -@lds0 = addrspace(3) global [512 x i32] undef, align 4 +@lds0 = internal addrspace(3) global [512 x i32] undef, align 4 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: @@ -1331,7 +1331,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 ret void } -@lds1 = addrspace(3) global [512 x i64] undef, align 8 +@lds1 = internal addrspace(3) global [512 x i64] undef, align 8 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll index 63e3dd880bae..dd6bb1f0db2a 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -47,10 +47,8 @@ main_body: %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1 %tmp = load i32, i32 addrspace(3)* %gep0 - %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0) - %mask.32 = trunc i64 %mask to i32 %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave - store i32 %mask.32, i32 addrspace(3)* %gep1 + store i32 123, i32 addrspace(3)* %gep1 %r = bitcast i32 %tmp to float ret float %r