From eb1f75d561762123224c17b763f68def84a05715 Mon Sep 17 00:00:00 2001 From: Scott Linder Date: Thu, 26 Jul 2018 19:47:51 +0000 Subject: [PATCH] [AMDGPU] Fix VGPR spills where offset doesn't fit in 12 bits Scale the offset of VGPR spills by the wave size when it cannot fit in the 12-bit offset immediate field and so is added to the soffset SGPR. This accounts for hardware swizzling of scratch memory. Differential Revision: https://reviews.llvm.org/D49448 llvm-svn: 338060 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 27 ++- .../AMDGPU/spill-offset-calculation.ll | 213 ++++++++++++++++++ 2 files changed, 229 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 5bfe071c00e9..624607f6ea54 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -532,22 +532,29 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, const DebugLoc &DL = MI->getDebugLoc(); bool IsStore = Desc.mayStore(); - bool RanOutOfSGPRs = false; bool Scavenged = false; unsigned SOffset = ScratchOffsetReg; + const unsigned EltSize = 4; const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); - unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32; - unsigned Size = NumSubRegs * 4; + unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT); + unsigned Size = NumSubRegs * EltSize; int64_t Offset = InstOffset + MFI.getObjectOffset(Index); - const int64_t OriginalImmOffset = Offset; + int64_t ScratchOffsetRegDelta = 0; unsigned Align = MFI.getObjectAlignment(Index); const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); - if (!isUInt<12>(Offset + Size)) { + assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); + + if (!isUInt<12>(Offset + Size - EltSize)) { SOffset = AMDGPU::NoRegister; + // We currently only support spilling VGPRs to EltSize boundaries, meaning + // we can simplify the adjustment of Offset here to just scale with + // WavefrontSize. + Offset *= ST.getWavefrontSize(); + // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs(). if (RS) @@ -561,8 +568,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, // add the offset directly to the ScratchOffset register, and then // subtract the offset after the spill to return ScratchOffset to it's // original value. - RanOutOfSGPRs = true; SOffset = ScratchOffsetReg; + ScratchOffsetRegDelta = Offset; } else { Scavenged = true; } @@ -574,8 +581,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, Offset = 0; } - const unsigned EltSize = 4; - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { unsigned SubReg = NumSubRegs == 1 ? ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i)); @@ -607,11 +612,11 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); } - if (RanOutOfSGPRs) { + if (ScratchOffsetRegDelta != 0) { // Subtract the offset we added to the ScratchOffset register. BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) - .addReg(ScratchOffsetReg) - .addImm(OriginalImmOffset); + .addReg(ScratchOffsetReg) + .addImm(ScratchOffsetRegDelta); } } diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll new file mode 100644 index 000000000000..3c179b580fc0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -0,0 +1,213 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s + +; Test that the VGPR spiller correctly switches to SGPR offsets when the +; instruction offset field would overflow, and that it accounts for memory +; swizzling. + +; CHECK-LABEL: test_inst_offset_kernel +define amdgpu_kernel void @test_inst_offset_kernel() { +entry: + ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in + ; the instruction offset field. + %alloca = alloca i8, i32 4088, align 4, addrspace(5) + %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + + %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + %a = load volatile i32, i32 addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + store volatile i32 %a, i32 addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: test_sgpr_offset_kernel +define amdgpu_kernel void @test_sgpr_offset_kernel() { +entry: + ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not + ; fit in the instruction, and has to live in the SGPR offset. + %alloca = alloca i8, i32 4092, align 4, addrspace(5) + %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + + %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + ; 0x40000 / 64 = 4096 (for wave64) + ; CHECK: s_add_u32 s7, s7, 0x40000 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Spill + ; CHECK: s_sub_u32 s7, s7, 0x40000 + %a = load volatile i32, i32 addrspace(5)* %aptr + + ; Force %a to spill + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + store volatile i32 %a, i32 addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: test_sgpr_offset_subregs_kernel +define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { +entry: + ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a + ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in + ; the instruction offset field. + %alloca = alloca i8, i32 4084, align 4, addrspace(5) + %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* + + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 + %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + ; Ensure the alloca sticks around. + %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 + %b = load volatile i32, i32 addrspace(5)* %bptr + + ; Ensure the spill is of the full super-reg. + call void asm sideeffect "; $0", "r"(<2 x i32> %a) + + ret void +} + +; CHECK-LABEL: test_inst_offset_subregs_kernel +define amdgpu_kernel void @test_inst_offset_subregs_kernel() { +entry: + ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a + ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live + ; in the SGPR offset. + %alloca = alloca i8, i32 4088, align 4, addrspace(5) + %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* + + ; 0x3ff00 / 64 = 4092 (for wave64) + ; CHECK: s_add_u32 s7, s7, 0x3ff00 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 offset:4 ; 4-byte Folded Spill + ; CHECK: s_sub_u32 s7, s7, 0x3ff00 + %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 + %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + ; Ensure the alloca sticks around. + %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 + %b = load volatile i32, i32 addrspace(5)* %bptr + + ; Ensure the spill is of the full super-reg. + call void asm sideeffect "; $0", "r"(<2 x i32> %a) + + ret void +} + +; CHECK-LABEL: test_inst_offset_function +define void @test_inst_offset_function() { +entry: + ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in + ; the instruction offset field. + %alloca = alloca i8, i32 4088, align 4, addrspace(5) + %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + + %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + %a = load volatile i32, i32 addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + store volatile i32 %a, i32 addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: test_sgpr_offset_function +define void @test_sgpr_offset_function() { +entry: + ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not + ; fit in the instruction, and has to live in the SGPR offset. + %alloca = alloca i8, i32 4092, align 4, addrspace(5) + %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + + %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + ; 0x40000 / 64 = 4096 (for wave64) + ; CHECK: s_add_u32 s5, s5, 0x40000 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s5 ; 4-byte Folded Spill + ; CHECK: s_sub_u32 s5, s5, 0x40000 + %a = load volatile i32, i32 addrspace(5)* %aptr + + ; Force %a to spill + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + store volatile i32 %a, i32 addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: test_sgpr_offset_subregs_function +define void @test_sgpr_offset_subregs_function() { +entry: + ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a + ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in + ; the instruction offset field. + %alloca = alloca i8, i32 4084, align 4, addrspace(5) + %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* + + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 + %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + ; Ensure the alloca sticks around. + %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 + %b = load volatile i32, i32 addrspace(5)* %bptr + + ; Ensure the spill is of the full super-reg. + call void asm sideeffect "; $0", "r"(<2 x i32> %a) + + ret void +} + +; CHECK-LABEL: test_inst_offset_subregs_function +define void @test_inst_offset_subregs_function() { +entry: + ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a + ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live + ; in the SGPR offset. + %alloca = alloca i8, i32 4088, align 4, addrspace(5) + %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* + + ; 0x3ff00 / 64 = 4092 (for wave64) + ; CHECK: s_add_u32 s5, s5, 0x3ff00 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s5 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s5 offset:4 ; 4-byte Folded Spill + ; CHECK: s_sub_u32 s5, s5, 0x3ff00 + %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 + %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + ; Ensure the alloca sticks around. + %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 + %b = load volatile i32, i32 addrspace(5)* %bptr + + ; Ensure the spill is of the full super-reg. + call void asm sideeffect "; $0", "r"(<2 x i32> %a) + + ret void +}