[AMDGPU] Omit buffer resource with flat scratch.

Differential Revision: https://reviews.llvm.org/D90979
Author: Stanislav Mekhanoshin
Date:   2020-11-06 13:00:10 -08:00
parent 91d2e5c81a
commit d5a465866e
15 changed files with 350 additions and 283 deletions
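The change applies one pattern throughout call lowering, frame lowering, call selection, and spill code: every place that previously reserved, initialized, or passed the scratch buffer resource descriptor (the SRSRC quadruple, s[0:3] for non-entry functions) is now skipped when the subtarget uses flat scratch instructions. Below is a minimal standalone sketch of that guard; SubtargetStub, reserveScratchRsrc, and the register id are illustrative stand-ins, not the real LLVM classes.

// Minimal sketch of the guard the patch adds around SRSRC handling.
// SubtargetStub and the register id are stand-ins, not LLVM's classes.
#include <iostream>
#include <optional>

struct SubtargetStub {                        // stand-in for GCNSubtarget
  bool UseFlatScratch = false;
  bool enableFlatScratch() const { return UseFlatScratch; }
};

// With MUBUF scratch, a four-SGPR buffer descriptor has to be reserved and
// handed to callees; with flat scratch instructions it is simply omitted.
std::optional<unsigned> reserveScratchRsrc(const SubtargetStub &ST) {
  constexpr unsigned SGPR0_SGPR1_SGPR2_SGPR3 = 0;  // hypothetical register id
  if (ST.enableFlatScratch())
    return std::nullopt;                      // no buffer resource needed
  return SGPR0_SGPR1_SGPR2_SGPR3;
}

int main() {
  for (bool FlatScratch : {false, true}) {
    SubtargetStub ST{FlatScratch};
    std::cout << (FlatScratch ? "flatscr: " : "mubuf:   ")
              << (reserveScratchRsrc(ST) ? "SRSRC reserved" : "SRSRC omitted")
              << '\n';
  }
  return 0;
}

The hunks below all instantiate this same check (ST.enableFlatScratch(), Subtarget.enableFlatScratch(), or Subtarget->enableFlatScratch()) around the existing SRSRC code.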


@@ -939,7 +939,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (IsEntryFunc) {
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
if (!Subtarget.enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -1227,12 +1228,14 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
MFI->getScratchRSrcReg());
MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
if (!ST.enableFlatScratch()) {
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
MFI->getScratchRSrcReg());
MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
}
for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);


@@ -467,7 +467,9 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
//
// This will return `Register()` in cases where there are no actual
// uses of the SRSRC.
Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
Register ScratchRsrcReg;
if (!ST.enableFlatScratch())
ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
// Make the selected register live throughout the function.
if (ScratchRsrcReg) {
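In the entry-function prologue the query for a reserved SRSRC register is skipped entirely, so ScratchRsrcReg keeps its default "no register" value and the existing if (ScratchRsrcReg) guard that follows suppresses all of the descriptor setup. A small self-contained sketch of that sentinel behaviour, with a simplified stand-in for llvm::Register and an invented register number:

#include <iostream>

// Simplified stand-in for llvm::Register: zero means "no register" and the
// object converts to false, which is what the prologue code relies on here.
class Register {
  unsigned Reg = 0;
public:
  Register() = default;
  explicit Register(unsigned R) : Reg(R) {}
  explicit operator bool() const { return Reg != 0; }
};

int main() {
  const bool EnableFlatScratch = true;  // assumed setting for this example
  Register ScratchRsrcReg;              // stays unset under flat scratch
  if (!EnableFlatScratch)
    ScratchRsrcReg = Register(96);      // invented SGPR-tuple encoding
  if (ScratchRsrcReg)
    std::cout << "emit SRSRC setup in the entry prologue\n";
  else
    std::cout << "no SRSRC setup emitted\n";
  return 0;
}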


@@ -2117,26 +2117,28 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
if (!ST.enableFlatScratch()) {
if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
Register PrivateSegmentBufferReg =
Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
// We tentatively reserve the last registers (skipping the last registers
// which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
// we'll replace these with the ones immediately after those which were
// really allocated. In the prologue copies will be inserted from the
// argument to these reserved registers.
Register PrivateSegmentBufferReg =
Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
// We tentatively reserve the last registers (skipping the last registers
// which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
// we'll replace these with the ones immediately after those which were
// really allocated. In the prologue copies will be inserted from the
// argument to these reserved registers.
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
}
}
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3012,14 +3014,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SmallVector<SDValue, 4> CopyFromChains;
if (!Subtarget->enableFlatScratch()) {
SmallVector<SDValue, 4> CopyFromChains;
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
Chain = DAG.getTokenFactor(DL, CopyFromChains);
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
}
MVT PtrVT = MVT::i32;


@@ -1379,11 +1379,14 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
Register ScratchRSrc =
ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
: MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(ScratchRSrc, RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
// Add the scratch resource registers as implicit uses because we may end up
// needing them, and need to ensure that the reserved registers are
@@ -1397,10 +1400,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
: getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
Register ScratchRSrc =
ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
: MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(ScratchRSrc) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
@@ -1513,21 +1519,27 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
Register ScratchRSrc =
ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
: MFI->getScratchRSrcReg();
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(ScratchRSrc, RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
return;
}
unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
: getVGPRSpillRestoreOpcode(SpillSize);
Register ScratchRSrc =
ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
: MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(ScratchRSrc) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
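The spill and reload pseudo-instructions keep their scratch_rsrc operand slot even when no buffer descriptor exists, so the flat-scratch path plugs in TTMP0_TTMP1_TTMP2_TTMP3 as a dummy that is never read; keeping the operand shape unchanged lets the BuildMI sites above stay structurally identical for both modes. A compile-only sketch of that selection, with Register and the register constants reduced to plain integers for illustration:

#include <cassert>
#include <cstdint>

using Register = uint32_t;                     // stand-in for llvm::Register
constexpr Register NoRegister = 0;
constexpr Register TTMP0_TTMP1_TTMP2_TTMP3 = 0x70;  // made-up encoding

// The operand slot must always be filled; only the MUBUF path needs the real
// buffer descriptor.
Register pickSpillScratchRsrc(bool EnableFlatScratch, Register ScratchRSrcReg) {
  if (EnableFlatScratch)
    return TTMP0_TTMP1_TTMP2_TTMP3;            // dummy, never dereferenced
  assert(ScratchRSrcReg != NoRegister && "MUBUF spills need a real SRSRC");
  return ScratchRSrcReg;
}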


@@ -75,16 +75,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
if (!isEntryFunction()) {
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
// TODO: Pick a high register, and shift down, similar to a kernel.
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(ScratchRSrcReg);
if (!ST.enableFlatScratch()) {
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(ScratchRSrcReg);
}
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
@@ -142,7 +144,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
if (isAmdHsaOrMesa) {
PrivateSegmentBuffer = true;
if (!ST.enableFlatScratch())
PrivateSegmentBuffer = true;
if (UseFixedABI) {
DispatchPtr = true;


@@ -35,8 +35,10 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v40, 2
; GCN-DAG: v_readlane_b32 s5, v40, 3
; MUBUF-DAG: v_readlane_b32 s4, v40, 2
; MUBUF-DAG: v_readlane_b32 s5, v40, 3
; FLATSCR-DAG: v_readlane_b32 s0, v40, 2
; FLATSCR-DAG: v_readlane_b32 s1, v40, 3
; GCN: v_readlane_b32 s35, v40, 1
; GCN: v_readlane_b32 s34, v40, 0
@@ -134,14 +136,18 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(
; FIXME: What is the expected behavior for reserved registers here?
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN: #ASMSTART
; GCN-NEXT: ; def s33
; GCN-NEXT: #ASMEND
; GCN: s_swappc_b64 s[30:31], s[4:5]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN: ;;#ASMSTART
; GCN-NEXT: ; use s33
; GCN-NEXT: ;;#ASMEND
@@ -157,9 +163,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
; GCN-NOT: s34
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: s34
@@ -168,7 +177,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
; GCN-NEXT: ;;#ASMEND
; GCN-NOT: s34
; GCN: s_swappc_b64 s[30:31], s[4:5]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: s34
@@ -186,9 +196,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
; GCN-NOT: v32
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: v40
@@ -196,7 +209,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
; GCN-NEXT: ; def v40
; GCN-NEXT: ;;#ASMEND
; GCN: s_swappc_b64 s[30:31], s[4:5]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: v40
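The register renumbering these checks encode (s[4:5] under MUBUF versus s[0:1] under FLATSCR) falls straight out of the change: non-entry functions no longer pin the buffer descriptor in s[0:3], so scalar allocation can begin at s0. A rough model of where the first allocatable SGPR lands; the function name is invented and the only real input is the four-register size of the descriptor:

#include <cstdio>

// Rough model: MUBUF reserves s[0:3] for the buffer descriptor in non-entry
// functions, so temporaries start at s4; flat scratch frees those registers.
unsigned firstAllocatableSGPR(bool EnableFlatScratch) {
  const unsigned NumDescriptorSGPRs = 4;  // s0..s3 hold the SRSRC
  return EnableFlatScratch ? 0 : NumDescriptorSGPRs;
}

int main() {
  std::printf("mubuf  : first free SGPR = s%u\n", firstAllocatableSGPR(false));
  std::printf("flatscr: first free SGPR = s%u\n", firstAllocatableSGPR(true));
  return 0;
}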


@@ -13,9 +13,10 @@ define void @callee_no_stack() #0 {
; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_no_fp_elim_all() #1 {
ret void
@@ -48,7 +49,8 @@ define void @callee_with_stack() #0 {
; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_add_u32 s32, s32, 8
@@ -57,7 +59,7 @@ define void @callee_with_stack() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}}
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_all() #1 {
@@ -100,8 +102,10 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
; MUBUF-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
; MUBUF-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
; FLATSCR-DAG: v_readlane_b32 s0, [[CSR_VGPR]]
; FLATSCR-DAG: v_readlane_b32 s1, [[CSR_VGPR]]
; MUBUF: s_sub_u32 s32, s32, 0x400{{$}}
; FLATSCR: s_sub_u32 s32, s32, 16{{$}}
@@ -140,8 +144,10 @@ define void @callee_with_stack_and_call() #0 {
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v40, 0
; GCN-DAG: v_readlane_b32 s5, v40, 1
; MUBUF-DAG: v_readlane_b32 s4, v40, 0
; MUBUF-DAG: v_readlane_b32 s5, v40, 1
; FLATSCR-DAG: v_readlane_b32 s0, v40, 0
; FLATSCR-DAG: v_readlane_b32 s1, v40, 1
; MUBUF: s_sub_u32 s32, s32, 0x400
; FLATSCR: s_sub_u32 s32, s32, 16
@@ -238,9 +244,10 @@ define void @spill_only_csr_sgpr() {
; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
; MUBUF: s_add_u32 s32, s32, 0x300
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
; MUBUF-NEXT: s_mov_b32 s33, s4
; FLATSCR: s_add_u32 s32, s32, 12
; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
; GCN-NEXT: s_mov_b32 s33, s4
; FLATSCR-NEXT: s_mov_b32 s33, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
@@ -330,7 +337,8 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; GCN: s_waitcnt
; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0
; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff
; GCN-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000
; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000
; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000
@@ -340,7 +348,7 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @realign_stack_no_fp_elim() #1 {
@@ -359,15 +367,18 @@ define void @realign_stack_no_fp_elim() #1 {
; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s4, v1, 0
; MUBUF: v_readlane_b32 s4, v1, 0
; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
; MUBUF-NEXT: v_readlane_b32 s5, v1, 1
; FLATSCR: v_readlane_b32 s0, v1, 0
; FLATSCR-NEXT: s_add_u32 s32, s32, 8
; GCN-NEXT: v_readlane_b32 s5, v1, 1
; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
; GCN-NEXT: v_readlane_b32 s33, v1, 2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN-NEXT: v_readlane_b32 s33, v1, 2
; GCN-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[4:5]
; FLATSCR-NEXT: s_setpc_b64 s[0:1]
define void @no_unused_non_csr_sgpr_for_fp() #1 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
@@ -399,9 +410,11 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; MUBUF: s_add_u32 s32, s32, 0x300{{$}}
; FLATSCR: s_add_u32 s32, s32, 12{{$}}
; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0
; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1
; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1
; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300{{$}}
; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
@@ -450,9 +463,11 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; MUBUF-DAG: buffer_store_dword
; FLATSCR-DAG: scratch_store_dword
; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0
; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1
; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1
; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x40300{{$}}
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
@@ -514,20 +529,21 @@ define void @ipra_call_with_stack() #0 {
; With no free registers, we must spill the FP to memory.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:4
; GCN: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN: s_mov_b32 s33, s32
; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
; FLATSCR: s_mov_b32 s0, s33
; GCN: s_mov_b32 s33, s32
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:4
; GCN: s_waitcnt vmcnt(0)
; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; GCN: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
; GCN: ScratchSize: 8
; FLATSCR: s_mov_b32 s33, s0
; MUBUF: s_waitcnt vmcnt(0)
; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
; MUBUF: ScratchSize: 8
; FLATSCR: ScratchSize: 0
define void @callee_need_to_spill_fp_to_memory() #3 {
call void asm sideeffect "; clobber nonpreserved SGPRs",
"~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -547,20 +563,19 @@ define void @callee_need_to_spill_fp_to_memory() #3 {
; need to spill the FP to memory if there are no free lanes in the reserved
; VGPR.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:[[OFF:[0-9]+]]
; GCN: s_mov_b64 exec, [[COPY_EXEC1]]
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NOT: v_writelane_b32 v40, s33
; GCN: s_mov_b32 s33, s32
; MUBUF: s_mov_b32 s33, s32
; FLATSCR: s_mov_b32 s33, s0
; GCN-NOT: v_readlane_b32 s33, v40
; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:[[OFF]]
; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; GCN: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
"~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -585,14 +600,14 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
; If the size of the offset exceeds the MUBUF offset field we need another
; scratch VGPR to hold the offset.
; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
; GCN: s_or_saveexec_b64 s[4:5], -1
; MUBUF: s_or_saveexec_b64 s[4:5], -1
; MUBUF: v_mov_b32_e32 v0, s33
; GCN-NOT: v_mov_b32_e32 v0, 0x1008
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill
; FLATSCR-NEXT: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1008
; FLATSCR-NEXT: v_mov_b32_e32 v0, s33
; FLATSCR-NEXT: scratch_store_dword off, v0, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
; FLATSCR: v_mov_b32_e32 v0, 0
; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #3 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca


@@ -15,11 +15,11 @@ define <2 x half> @chain_hi_to_lo_private() {
; FLATSCR-LABEL: chain_hi_to_lo_private:
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s4, 2
; FLATSCR-NEXT: scratch_load_ushort v0, off, s4
; FLATSCR-NEXT: s_mov_b32 s4, 0
; FLATSCR-NEXT: s_mov_b32 s0, 2
; FLATSCR-NEXT: scratch_load_ushort v0, off, s0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s4
; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -256,13 +256,13 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
;
; FLATSCR-LABEL: vload2_private:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4
@@ -272,8 +272,8 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6
; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, s6
; FLATSCR-NEXT: v_mov_b32_e32 v1, s7
; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
; FLATSCR-NEXT: v_mov_b32_e32 v1, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0


@@ -150,10 +150,10 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff
; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 1
@@ -251,10 +251,10 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff
; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 4


@@ -1109,13 +1109,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x3000
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: scratch_store_dword off, v0, s32
; GFX9-NEXT: s_add_u32 s4, s32, s4
; GFX9-NEXT: s_add_u32 s0, s32, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s4 offset:3712
; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1125,12 +1125,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s4, 0x3800
; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_add_u32 s4, s32, s4
; GFX10-NEXT: s_add_u32 s0, s32, s0
; GFX10-NEXT: scratch_store_dword off, v0, s32
; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664
; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]


@@ -1364,8 +1364,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1413,8 +1413,8 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1463,8 +1463,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1614,8 +1614,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1664,8 +1664,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1716,8 +1716,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)


@@ -56,37 +56,37 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
;
; FLATSCR-LABEL: local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000
; FLATSCR-NEXT: s_mov_b32 s6, 0
; FLATSCR-NEXT: s_mov_b32 s2, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi
; FLATSCR-NEXT: BB0_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_u32 s7, 0x3000, s6
; FLATSCR-NEXT: s_add_i32 s6, s6, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s6, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s7
; FLATSCR-NEXT: s_add_u32 s3, 0x3000, s2
; FLATSCR-NEXT: s_add_i32 s2, s2, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s3
; FLATSCR-NEXT: s_cbranch_scc1 BB0_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s6, 0x20d0
; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6
; FLATSCR-NEXT: scratch_load_dword v1, off, s6 offset:4
; FLATSCR-NEXT: s_movk_i32 s6, 0x2000
; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6
; FLATSCR-NEXT: scratch_load_dword v0, off, s6 offset:208
; FLATSCR-NEXT: s_movk_i32 s6, 0x3000
; FLATSCR-NEXT: scratch_load_dword v2, off, s6 offset:68
; FLATSCR-NEXT: s_movk_i32 s6, 0x3000
; FLATSCR-NEXT: scratch_load_dword v3, off, s6 offset:64
; FLATSCR-NEXT: s_movk_i32 s2, 0x20d0
; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
; FLATSCR-NEXT: scratch_load_dword v1, off, s2 offset:4
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
; FLATSCR-NEXT: scratch_load_dword v0, off, s2 offset:208
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
; FLATSCR-NEXT: scratch_load_dword v2, off, s2 offset:68
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
; FLATSCR-NEXT: scratch_load_dword v3, off, s2 offset:64
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v2, s4
; FLATSCR-NEXT: v_mov_b32_e32 v3, s5
; FLATSCR-NEXT: v_mov_b32_e32 v3, s1
; FLATSCR-NEXT: v_mov_b32_e32 v2, s0
; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; FLATSCR-NEXT: s_endpgm
entry:
@@ -146,36 +146,36 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_add_u32 s4, s32, 0x1fff
; FLATSCR-NEXT: s_mov_b32 s6, s33
; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffe000
; FLATSCR-NEXT: s_add_u32 s0, s32, 0x1fff
; FLATSCR-NEXT: s_mov_b32 s2, s33
; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffe000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: s_mov_b32 s4, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000
; FLATSCR-NEXT: scratch_store_dword off, v2, s33
; FLATSCR-NEXT: BB1_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s5, vcc_hi, s4
; FLATSCR-NEXT: s_add_i32 s4, s4, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s4, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v2, s5
; FLATSCR-NEXT: s_add_u32 s1, vcc_hi, s0
; FLATSCR-NEXT: s_add_i32 s0, s0, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v2, s1
; FLATSCR-NEXT: s_cbranch_scc1 BB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s4, 0x20d0
; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s4, s5, s4
; FLATSCR-NEXT: scratch_load_dword v3, off, s4 offset:4
; FLATSCR-NEXT: s_movk_i32 s4, 0x2000
; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s4, s5, s4
; FLATSCR-NEXT: scratch_load_dword v2, off, s4 offset:208
; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000
; FLATSCR-NEXT: scratch_load_dword v4, off, s4 offset:68
; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000
; FLATSCR-NEXT: scratch_load_dword v5, off, s4 offset:64
; FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s0, s1, s0
; FLATSCR-NEXT: scratch_load_dword v3, off, s0 offset:4
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s0, s1, s0
; FLATSCR-NEXT: scratch_load_dword v2, off, s0 offset:208
; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
; FLATSCR-NEXT: scratch_load_dword v4, off, s0 offset:68
; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
; FLATSCR-NEXT: scratch_load_dword v5, off, s0 offset:64
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000
; FLATSCR-NEXT: s_mov_b32 s33, s6
; FLATSCR-NEXT: s_mov_b32 s33, s2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc


@@ -54,36 +54,36 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_mov_b32 s32, 16
; FLATSCR-NEXT: s_mov_b32 s33, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: s_cmp_lg_u32 s8, 0
; FLATSCR-NEXT: s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB0_3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_cmp_lg_u32 s9, 0
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB0_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
; FLATSCR-NEXT: s_mov_b32 s6, s32
; FLATSCR-NEXT: s_movk_i32 s7, 0x1000
; FLATSCR-NEXT: s_add_i32 s8, s6, s7
; FLATSCR-NEXT: s_add_u32 s6, s6, s7
; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
; FLATSCR-NEXT: s_add_i32 s4, s2, s3
; FLATSCR-NEXT: s_add_u32 s2, s2, s3
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: scratch_store_dword off, v1, s6
; FLATSCR-NEXT: scratch_store_dword off, v1, s2
; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
; FLATSCR-NEXT: s_lshl_b32 s6, s10, 2
; FLATSCR-NEXT: s_mov_b32 s32, s8
; FLATSCR-NEXT: scratch_store_dword off, v1, s8 offset:4
; FLATSCR-NEXT: s_add_i32 s8, s8, s6
; FLATSCR-NEXT: scratch_load_dword v1, off, s8
; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2
; FLATSCR-NEXT: s_mov_b32 s32, s4
; FLATSCR-NEXT: scratch_store_dword off, v1, s4 offset:4
; FLATSCR-NEXT: s_add_i32 s4, s4, s2
; FLATSCR-NEXT: scratch_load_dword v1, off, s4
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB0_3: ; %bb.2
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -162,31 +162,31 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; FLATSCR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; FLATSCR-NEXT: s_mov_b32 s32, 64
; FLATSCR-NEXT: s_mov_b32 s33, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: s_cmp_lg_u32 s6, 0
; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB1_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: scratch_store_dword off, v1, s6
; FLATSCR-NEXT: scratch_store_dword off, v1, s2
; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
; FLATSCR-NEXT: s_lshl_b32 s7, s7, 2
; FLATSCR-NEXT: s_mov_b32 s32, s6
; FLATSCR-NEXT: scratch_store_dword off, v1, s6 offset:4
; FLATSCR-NEXT: s_add_i32 s6, s6, s7
; FLATSCR-NEXT: scratch_load_dword v1, off, s6
; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2
; FLATSCR-NEXT: s_mov_b32 s32, s2
; FLATSCR-NEXT: scratch_store_dword off, v1, s2 offset:4
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
; FLATSCR-NEXT: scratch_load_dword v1, off, s2
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB1_2: ; %bb.1
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -261,38 +261,38 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s9, s33
; FLATSCR-NEXT: s_mov_b32 s5, s33
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT: s_mov_b32 s33, s32
; FLATSCR-NEXT: s_add_u32 s32, s32, 16
; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz BB2_3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
; FLATSCR-NEXT: s_cbranch_execz BB2_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
; FLATSCR-NEXT: s_mov_b32 s6, s32
; FLATSCR-NEXT: s_movk_i32 s7, 0x1000
; FLATSCR-NEXT: s_add_i32 s8, s6, s7
; FLATSCR-NEXT: s_add_u32 s6, s6, s7
; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
; FLATSCR-NEXT: s_add_i32 s4, s2, s3
; FLATSCR-NEXT: s_add_u32 s2, s2, s3
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: scratch_store_dword off, v2, s6
; FLATSCR-NEXT: scratch_store_dword off, v2, s2
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: scratch_store_dword off, v2, s8 offset:4
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s8
; FLATSCR-NEXT: scratch_store_dword off, v2, s4 offset:4
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5
; FLATSCR-NEXT: s_mov_b32 s32, s8
; FLATSCR-NEXT: s_mov_b32 s32, s4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB2_3: ; %bb.2
; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5]
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_sub_u32 s32, s32, 16
; FLATSCR-NEXT: s_mov_b32 s33, s9
; FLATSCR-NEXT: s_mov_b32 s33, s5
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -361,33 +361,33 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_add_u32 s4, s32, 63
; FLATSCR-NEXT: s_mov_b32 s7, s33
; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffffc0
; FLATSCR-NEXT: s_add_u32 s0, s32, 63
; FLATSCR-NEXT: s_mov_b32 s3, s33
; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffffc0
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT: s_add_u32 s32, s32, 0x80
; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz BB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: scratch_store_dword off, v2, s6
; FLATSCR-NEXT: scratch_store_dword off, v2, s2
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: scratch_store_dword off, v2, s6 offset:4
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; FLATSCR-NEXT: scratch_store_dword off, v2, s2 offset:4
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4
; FLATSCR-NEXT: s_mov_b32 s32, s6
; FLATSCR-NEXT: s_mov_b32 s32, s2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB3_2: ; %bb.1
; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5]
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80
; FLATSCR-NEXT: s_mov_b32 s33, s7
; FLATSCR-NEXT: s_mov_b32 s33, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:


@@ -1,10 +1,11 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
; CHECK-LABEL: test_inst_offset_kernel
; GCN-LABEL: test_inst_offset_kernel
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
@@ -13,7 +14,8 @@ entry:
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill.
@@ -25,7 +27,7 @@ entry:
ret void
}
; CHECK-LABEL: test_sgpr_offset_kernel
; GCN-LABEL: test_sgpr_offset_kernel
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -35,8 +37,10 @@ entry:
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
; CHECK: s_mov_b32 s6, 0x40000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; MUBUF: s_mov_b32 s6, 0x40000
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 s2, 0x1000
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill
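The differing immediates in these checks come from how the two forms scale an SGPR offset: a MUBUF soffset counts bytes for the whole wave, while a flat-scratch SGPR offset is per lane, so the same 4096 bytes of per-lane scratch shows up as 0x40000 under MUBUF (wave64) and as 0x1000 under flat scratch. A short worked version of that arithmetic:

#include <cstdio>

int main() {
  const unsigned WaveSize = 64;        // wave64, as the comments above note
  const unsigned PerLaneBytes = 4096;  // scratch already occupied per lane
  std::printf("MUBUF soffset:        0x%x\n", PerLaneBytes * WaveSize);  // 0x40000
  std::printf("flat-scratch soffset: 0x%x\n", PerLaneBytes);             // 0x1000
  return 0;
}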
@@ -51,7 +55,7 @@ entry:
; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
; pointer to temporarily update, so we just crash.
; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
; GCN-LABEL: test_sgpr_offset_function_scavenge_fail
define void @test_sgpr_offset_function_scavenge_fail() #2 {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -74,9 +78,11 @@ entry:
; 0x40000 / 64 = 4096 (for wave64)
%a = load volatile i32, i32 addrspace(5)* %aptr
; CHECK: s_add_u32 s32, s32, 0x40000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
; CHECK: s_sub_u32 s32, s32, 0x40000
; MUBUF: s_add_u32 s32, s32, 0x40000
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
; MUBUF: s_sub_u32 s32, s32, 0x40000
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1000
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
%asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
@@ -91,16 +97,18 @@ entry:
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
; CHECK: s_add_u32 s32, s32, 0x40000
; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
; CHECK: s_sub_u32 s32, s32, 0x40000
; MUBUF: s_add_u32 s32, s32, 0x40000
; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
; MUBUF: s_sub_u32 s32, s32, 0x40000
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1000
; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload
; Force %a to spill with no free SGPRs
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
ret void
}
; CHECK-LABEL: test_sgpr_offset_subregs_kernel
; GCN-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
@@ -110,8 +118,11 @@ entry:
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -128,7 +139,7 @@ entry:
ret void
}
; CHECK-LABEL: test_inst_offset_subregs_kernel
; GCN-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
@@ -139,9 +150,12 @@ entry:
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; 0x3ff00 / 64 = 4092 (for wave64)
; CHECK: s_mov_b32 s6, 0x3ff00
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
; MUBUF: s_mov_b32 s6, 0x3ff00
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -158,7 +172,7 @@ entry:
ret void
}
; CHECK-LABEL: test_inst_offset_function
; GCN-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
@@ -167,7 +181,8 @@ entry:
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill.
@@ -179,7 +194,7 @@ entry:
ret void
}
; CHECK-LABEL: test_sgpr_offset_function
; GCN-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -189,8 +204,10 @@ entry:
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
; CHECK: s_add_u32 s4, s32, 0x40000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; MUBUF: s_add_u32 s4, s32, 0x40000
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; FLATSCR: s_add_u32 s0, s32, 0x1000
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill
@@ -202,7 +219,7 @@ entry:
ret void
}
; CHECK-LABEL: test_sgpr_offset_subregs_function
; GCN-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
@@ -212,8 +229,10 @@ entry:
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4088 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4092 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -230,7 +249,7 @@ entry:
ret void
}
; CHECK-LABEL: test_inst_offset_subregs_function
; GCN-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
@@ -241,9 +260,12 @@ entry:
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; 0x3ff00 / 64 = 4092 (for wave64)
; CHECK: s_add_u32 s4, s32, 0x3ff00
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
; MUBUF: s_add_u32 s4, s32, 0x3ff00
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0xffc
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr


@@ -60,26 +60,18 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
; FLATSCR-NEXT: s_addc_u32 s3, s3, 0
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24
; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; FLATSCR-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; FLATSCR-NEXT: s_mov_b32 s38, -1
; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000
; FLATSCR-NEXT: s_add_u32 s36, s36, s5
; FLATSCR-NEXT: s_addc_u32 s37, s37, 0
; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000
; FLATSCR-NEXT: v_mov_b32_e32 v3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000
; FLATSCR-NEXT: ; implicit-def: $vcc_hi
; FLATSCR-NEXT: s_getpc_b64 s[4:5]
; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37]
; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39]
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo
; FLATSCR-NEXT: s_cbranch_execz BB0_2