[AMDGPU] Omit buffer resource with flat scratch.
Differential Revision: https://reviews.llvm.org/D90979
parent 91d2e5c81a
commit d5a465866e
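Every hunk below follows one pattern: code that set up, copied, or reserved the private-segment buffer resource (SRSRC) is now guarded by the subtarget's flat-scratch query, so configurations that reach scratch through flat_scratch never materialize the four-SGPR buffer descriptor. The following standalone C++ sketch models only that guard; SubtargetInfo and reserveBufferResource are invented names for illustration, not LLVM APIs.

#include <iostream>

// Invented stand-ins: the real patch calls GCNSubtarget::enableFlatScratch()
// at each SRSRC setup site before doing any buffer-descriptor work.
struct SubtargetInfo {
  bool FlatScratch;
  bool enableFlatScratch() const { return FlatScratch; }
};

void reserveBufferResource() {
  std::cout << "reserve s[0:3] as the scratch buffer descriptor\n";
}

void lowerScratchSetup(const SubtargetInfo &ST) {
  // The entire buffer-resource path is skipped when flat scratch is enabled;
  // scratch accesses then use flat_scratch-based addressing instead.
  if (!ST.enableFlatScratch())
    reserveBufferResource();
}

int main() {
  lowerScratchSetup({false}); // MUBUF path: descriptor is reserved
  lowerScratchSetup({true});  // flat-scratch path: descriptor is omitted
}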
@@ -939,7 +939,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   if (IsEntryFunc) {
     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
   } else {
-    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    if (!Subtarget.enableFlatScratch())
+      CCInfo.AllocateReg(Info->getScratchRSrcReg());
     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
@@ -1227,12 +1228,14 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

-  // Insert copies for the SRD. In the HSA case, this should be an identity
-  // copy.
-  auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
-                                             MFI->getScratchRSrcReg());
-  MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-  MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+  if (!ST.enableFlatScratch()) {
+    // Insert copies for the SRD. In the HSA case, this should be an identity
+    // copy.
+    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
+                                               MFI->getScratchRSrcReg());
+    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+    MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+  }

   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
     MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
@@ -467,7 +467,9 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   //
   // This will return `Register()` in cases where there are no actual
   // uses of the SRSRC.
-  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
+  Register ScratchRsrcReg;
+  if (!ST.enableFlatScratch())
+    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

   // Make the selected register live throughout the function.
   if (ScratchRsrcReg) {
@@ -2117,26 +2117,28 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   // the scratch registers to pass in.
   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

-  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
-    // If we have stack objects, we unquestionably need the private buffer
-    // resource. For the Code Object V2 ABI, this will be the first 4 user
-    // SGPR inputs. We can reserve those and use them directly.
+  if (!ST.enableFlatScratch()) {
+    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+      // If we have stack objects, we unquestionably need the private buffer
+      // resource. For the Code Object V2 ABI, this will be the first 4 user
+      // SGPR inputs. We can reserve those and use them directly.

-    Register PrivateSegmentBufferReg =
-        Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
-  } else {
-    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
-    // We tentatively reserve the last registers (skipping the last registers
-    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
-    // we'll replace these with the ones immediately after those which were
-    // really allocated. In the prologue copies will be inserted from the
-    // argument to these reserved registers.
+      Register PrivateSegmentBufferReg =
+          Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
+    } else {
+      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+      // We tentatively reserve the last registers (skipping the last registers
+      // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
+      // we'll replace these with the ones immediately after those which were
+      // really allocated. In the prologue copies will be inserted from the
+      // argument to these reserved registers.

-    // Without HSA, relocations are used for the scratch pointer and the
-    // buffer resource setup is always inserted in the prologue. Scratch wave
-    // offset is still in an input SGPR.
-    Info.setScratchRSrcReg(ReservedBufferReg);
-  }
+      // Without HSA, relocations are used for the scratch pointer and the
+      // buffer resource setup is always inserted in the prologue. Scratch wave
+      // offset is still in an input SGPR.
+      Info.setScratchRSrcReg(ReservedBufferReg);
+    }
+  }

   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3012,14 +3014,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   if (!IsSibCall) {
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

-    SmallVector<SDValue, 4> CopyFromChains;
+    if (!Subtarget->enableFlatScratch()) {
+      SmallVector<SDValue, 4> CopyFromChains;

-    // In the HSA case, this should be an identity copy.
-    SDValue ScratchRSrcReg
-      = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
-    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
-    Chain = DAG.getTokenFactor(DL, CopyFromChains);
+      // In the HSA case, this should be an identity copy.
+      SDValue ScratchRSrcReg
+        = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+      RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+      CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+      Chain = DAG.getTokenFactor(DL, CopyFromChains);
+    }
   }

   MVT PtrVT = MVT::i32;
@@ -1379,11 +1379,14 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }

+    Register ScratchRSrc =
+        ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+                               : MFI->getScratchRSrcReg();
     BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // data
       .addFrameIndex(FrameIndex) // addr
       .addMemOperand(MMO)
-      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(ScratchRSrc, RegState::Implicit)
       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
     // Add the scratch resource registers as implicit uses because we may end up
     // needing them, and need to ensure that the reserved registers are
@@ -1397,10 +1400,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                     : getVGPRSpillSaveOpcode(SpillSize);
   MFI->setHasSpilledVGPRs();

+  Register ScratchRSrc =
+      ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+                             : MFI->getScratchRSrcReg();
   BuildMI(MBB, MI, DL, get(Opcode))
     .addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex) // addr
-    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+    .addReg(ScratchRSrc) // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0) // offset
     .addMemOperand(MMO);
@@ -1513,21 +1519,27 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }

+    Register ScratchRSrc =
+        ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+                               : MFI->getScratchRSrcReg();
     if (RI.spillSGPRToVGPR())
       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
       .addMemOperand(MMO)
-      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(ScratchRSrc, RegState::Implicit)
       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
     return;
   }

   unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                     : getVGPRSpillRestoreOpcode(SpillSize);
+  Register ScratchRSrc =
+      ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+                             : MFI->getScratchRSrcReg();
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
     .addFrameIndex(FrameIndex) // vaddr
-    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+    .addReg(ScratchRSrc) // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0) // offset
     .addMemOperand(MMO);
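The two storeRegToStackSlot hunks and the loadRegFromStackSlot hunk above share a wrinkle: the spill instructions keep their scratch_rsrc operand even when flat scratch is enabled, so the patch feeds them the reserved trap-temporary tuple TTMP0_TTMP1_TTMP2_TTMP3 as a dummy that is never actually read. A minimal C++ sketch of that operand selection, with an invented enum standing in for the real MCRegister values:

enum class RsrcReg { BufferDescriptor, TrapTempTuple };

// Mirrors the ternary in the hunks above: with flat scratch the spill
// addresses memory through flat_scratch, so the resource operand is a
// placeholder; otherwise the real descriptor register tuple is used.
RsrcReg pickScratchRsrc(bool EnableFlatScratch) {
  return EnableFlatScratch ? RsrcReg::TrapTempTuple // dummy, never read
                           : RsrcReg::BufferDescriptor;
}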
@@ -75,16 +75,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   }

   if (!isEntryFunction()) {
-    // Non-entry functions have no special inputs for now, other registers
-    // required for scratch access.
-    ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
-
     // TODO: Pick a high register, and shift down, similar to a kernel.
     FrameOffsetReg = AMDGPU::SGPR33;
     StackPtrOffsetReg = AMDGPU::SGPR32;

-    ArgInfo.PrivateSegmentBuffer =
-      ArgDescriptor::createRegister(ScratchRSrcReg);
+    if (!ST.enableFlatScratch()) {
+      // Non-entry functions have no special inputs for now, other registers
+      // required for scratch access.
+      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+
+      ArgInfo.PrivateSegmentBuffer =
+        ArgDescriptor::createRegister(ScratchRSrcReg);
+    }

     if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
       ImplicitArgPtr = true;
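Non-entry functions previously pinned s[0:3] as the scratch resource descriptor, making s4 the first SGPR free for temporaries; once flat scratch removes that reservation, s0 becomes available, which is exactly the s4/s5 to s0/s1 (and s[4:5] to s[0:1]) renaming that dominates the test updates below. A deliberately simplified model of that effect, ignoring every other ABI register:

#include <iostream>

// Assumption-laden sketch: counts only the buffer-descriptor reservation.
unsigned firstFreeSGPR(bool EnableFlatScratch) {
  return EnableFlatScratch ? 0u  // s0: s[0:3] no longer hold the descriptor
                           : 4u; // s4: s[0:3] hold the scratch descriptor
}

int main() {
  std::cout << "MUBUF first free SGPR:   s" << firstFreeSGPR(false) << "\n"
            << "FLATSCR first free SGPR: s" << firstFreeSGPR(true) << "\n";
}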
@@ -142,7 +144,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
   if (isAmdHsaOrMesa) {
-    PrivateSegmentBuffer = true;
+    if (!ST.enableFlatScratch())
+      PrivateSegmentBuffer = true;

     if (UseFixedABI) {
       DispatchPtr = true;
@@ -35,8 +35,10 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s4, v40, 2
-; GCN-DAG: v_readlane_b32 s5, v40, 3
+; MUBUF-DAG: v_readlane_b32 s4, v40, 2
+; MUBUF-DAG: v_readlane_b32 s5, v40, 3
+; FLATSCR-DAG: v_readlane_b32 s0, v40, 2
+; FLATSCR-DAG: v_readlane_b32 s1, v40, 3
 ; GCN: v_readlane_b32 s35, v40, 1
 ; GCN: v_readlane_b32 s34, v40, 0
@@ -134,14 +136,18 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
 ; FIXME: What is the expected behavior for reserved registers here?

 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; GCN: s_mov_b32 s32, 0
 ; GCN: #ASMSTART
 ; GCN-NEXT: ; def s33
 ; GCN-NEXT: #ASMEND
-; GCN: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
 ; GCN: ;;#ASMSTART
 ; GCN-NEXT: ; use s33
 ; GCN-NEXT: ;;#ASMEND
@@ -157,9 +163,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
 ; GCN-NOT: s34

-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; GCN: s_mov_b32 s32, 0

 ; GCN-NOT: s34
@@ -168,7 +177,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
 ; GCN-NEXT: ;;#ASMEND

 ; GCN-NOT: s34
-; GCN: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]

 ; GCN-NOT: s34
@@ -186,9 +196,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}

 ; GCN-NOT: v32
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; GCN: s_mov_b32 s32, 0
 ; GCN-NOT: v40
@@ -196,7 +209,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
 ; GCN-NEXT: ; def v40
 ; GCN-NEXT: ;;#ASMEND

-; GCN: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]

 ; GCN-NOT: v40
@@ -13,9 +13,10 @@ define void @callee_no_stack() #0 {
 ; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
+; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_setpc_b64
 define void @callee_no_stack_no_fp_elim_all() #1 {
   ret void
@@ -48,7 +49,8 @@ define void @callee_with_stack() #0 {
 ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
+; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
 ; FLATSCR-NEXT: s_add_u32 s32, s32, 8
@@ -57,7 +59,7 @@ define void @callee_with_stack() #0 {
 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}}
 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_no_fp_elim_all() #1 {
@@ -100,8 +102,10 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {

 ; GCN: s_swappc_b64

-; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
-; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
+; MUBUF-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
+; MUBUF-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
+; FLATSCR-DAG: v_readlane_b32 s0, [[CSR_VGPR]]
+; FLATSCR-DAG: v_readlane_b32 s1, [[CSR_VGPR]]

 ; MUBUF: s_sub_u32 s32, s32, 0x400{{$}}
 ; FLATSCR: s_sub_u32 s32, s32, 16{{$}}
@@ -140,8 +144,10 @@ define void @callee_with_stack_and_call() #0 {
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
 ; GCN: s_swappc_b64

-; GCN-DAG: v_readlane_b32 s4, v40, 0
-; GCN-DAG: v_readlane_b32 s5, v40, 1
+; MUBUF-DAG: v_readlane_b32 s4, v40, 0
+; MUBUF-DAG: v_readlane_b32 s5, v40, 1
+; FLATSCR-DAG: v_readlane_b32 s0, v40, 0
+; FLATSCR-DAG: v_readlane_b32 s1, v40, 1

 ; MUBUF: s_sub_u32 s32, s32, 0x400
 ; FLATSCR: s_sub_u32 s32, s32, 16
@@ -238,9 +244,10 @@ define void @spill_only_csr_sgpr() {
 ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
 ; MUBUF: s_add_u32 s32, s32, 0x300
 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
+; MUBUF-NEXT: s_mov_b32 s33, s4
 ; FLATSCR: s_add_u32 s32, s32, 12
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
-; GCN-NEXT: s_mov_b32 s33, s4
+; FLATSCR-NEXT: s_mov_b32 s33, s0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
@@ -330,7 +337,8 @@ define void @no_new_vgpr_for_fp_csr() #1 {
 ; GCN: s_waitcnt
 ; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0
 ; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff
-; GCN-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
+; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
 ; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000
 ; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000
 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000
@@ -340,7 +348,7 @@ define void @no_new_vgpr_for_fp_csr() #1 {
 ; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33
 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define void @realign_stack_no_fp_elim() #1 {
@@ -359,15 +367,18 @@ define void @realign_stack_no_fp_elim() #1 {
 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
 ; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s4, v1, 0
+; MUBUF: v_readlane_b32 s4, v1, 0
+; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
+; MUBUF-NEXT: v_readlane_b32 s5, v1, 1
+; FLATSCR: v_readlane_b32 s0, v1, 0
+; FLATSCR-NEXT: s_add_u32 s32, s32, 8
-; GCN-NEXT: v_readlane_b32 s5, v1, 1
+; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1
 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
-; GCN-NEXT: v_readlane_b32 s33, v1, 2
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[4:5]
+; GCN-NEXT: v_readlane_b32 s33, v1, 2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[4:5]
+; FLATSCR-NEXT: s_setpc_b64 s[0:1]
 define void @no_unused_non_csr_sgpr_for_fp() #1 {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
@@ -399,9 +410,11 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; MUBUF: s_add_u32 s32, s32, 0x300{{$}}
 ; FLATSCR: s_add_u32 s32, s32, 12{{$}}

-; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
+; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0
+; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
 ; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1
+; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1
+; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300{{$}}
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}}
 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
@@ -450,9 +463,11 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; MUBUF-DAG: buffer_store_dword
 ; FLATSCR-DAG: scratch_store_dword

-; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
+; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0
+; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
 ; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1
+; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1
+; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x40300{{$}}
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}}
 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
@@ -514,20 +529,21 @@ define void @ipra_call_with_stack() #0 {

 ; With no free registers, we must spill the FP to memory.
 ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
 ; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:4
-; GCN: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN: s_mov_b32 s33, s32
-; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
+; FLATSCR: s_mov_b32 s0, s33
+; GCN: s_mov_b32 s33, s32
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
 ; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:4
-; GCN: s_waitcnt vmcnt(0)
-; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
-; GCN: s_mov_b64 exec, [[COPY_EXEC2]]
-; GCN: s_setpc_b64
-; GCN: ScratchSize: 8
+; FLATSCR: s_mov_b32 s33, s0
+; MUBUF: s_waitcnt vmcnt(0)
+; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
+; GCN: s_setpc_b64
+; MUBUF: ScratchSize: 8
+; FLATSCR: ScratchSize: 0
 define void @callee_need_to_spill_fp_to_memory() #3 {
   call void asm sideeffect "; clobber nonpreserved SGPRs",
     "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -547,20 +563,19 @@ define void @callee_need_to_spill_fp_to_memory() #3 {
 ; need to spill the FP to memory if there are no free lanes in the reserved
 ; VGPR.
 ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
 ; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:[[OFF:[0-9]+]]
-; GCN: s_mov_b64 exec, [[COPY_EXEC1]]
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NOT: v_writelane_b32 v40, s33
-; GCN: s_mov_b32 s33, s32
+; MUBUF: s_mov_b32 s33, s32
+; FLATSCR: s_mov_b32 s33, s0
 ; GCN-NOT: v_readlane_b32 s33, v40
-; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
 ; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:[[OFF]]
-; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
-; GCN: s_mov_b64 exec, [[COPY_EXEC2]]
-; GCN: s_setpc_b64
+; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
+; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
+; GCN: s_setpc_b64
 define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
   call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
     "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -585,14 +600,14 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
 ; If the size of the offset exceeds the MUBUF offset field we need another
 ; scratch VGPR to hold the offset.
 ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
-; GCN: s_or_saveexec_b64 s[4:5], -1
+; MUBUF: s_or_saveexec_b64 s[4:5], -1
+; MUBUF: v_mov_b32_e32 v0, s33
 ; GCN-NOT: v_mov_b32_e32 v0, 0x1008
 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008
 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1008
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s33
+; FLATSCR-NEXT: scratch_store_dword off, v0, [[SOFF]] ; 4-byte Folded Spill
 ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
 ; FLATSCR: v_mov_b32_e32 v0, 0
 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
 define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #3 {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
@@ -15,11 +15,11 @@ define <2 x half> @chain_hi_to_lo_private() {
 ; FLATSCR-LABEL: chain_hi_to_lo_private:
 ; FLATSCR: ; %bb.0: ; %bb
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_mov_b32 s4, 2
-; FLATSCR-NEXT: scratch_load_ushort v0, off, s4
-; FLATSCR-NEXT: s_mov_b32 s4, 0
+; FLATSCR-NEXT: s_mov_b32 s0, 2
+; FLATSCR-NEXT: scratch_load_ushort v0, off, s0
+; FLATSCR-NEXT: s_mov_b32 s0, 0
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s4
+; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
 bb:
@@ -256,13 +256,13 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
 ;
 ; FLATSCR-LABEL: vload2_private:
 ; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
 ; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4
@@ -272,8 +272,8 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
 ; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6
 ; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4
 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s6
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s7
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s3
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8
 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
@@ -150,10 +150,10 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
 ; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff
 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
-; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1
 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 1
@@ -251,10 +251,10 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
 ; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff
 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
-; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1
 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 4
@@ -1109,13 +1109,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-LABEL: store_load_large_imm_offset_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_movk_i32 s4, 0x3000
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
 ; GFX9-NEXT: v_mov_b32_e32 v0, 13
 ; GFX9-NEXT: scratch_store_dword off, v0, s32
-; GFX9-NEXT: s_add_u32 s4, s32, s4
+; GFX9-NEXT: s_add_u32 s0, s32, s0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: scratch_store_dword off, v0, s4 offset:3712
-; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712
+; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
+; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1125,12 +1125,12 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_mov_b32_e32 v0, 13
 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s4, 0x3800
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_add_u32 s4, s32, s4
+; GFX10-NEXT: s_add_u32 s0, s32, s0
 ; GFX10-NEXT: scratch_store_dword off, v0, s32
-; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664
-; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664
+; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
+; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1364,8 +1364,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX900-FLATSCR: ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1413,8 +1413,8 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX900-FLATSCR: ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1463,8 +1463,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX900-FLATSCR: ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1614,8 +1614,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX900-FLATSCR: ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1664,8 +1664,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX900-FLATSCR: ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1716,8 +1716,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX900-FLATSCR: ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4
+; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -56,37 +56,37 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
 ;
 ; FLATSCR-LABEL: local_stack_offset_uses_sp:
 ; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000
-; FLATSCR-NEXT: s_mov_b32 s6, 0
+; FLATSCR-NEXT: s_mov_b32 s2, 0
 ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi
 ; FLATSCR-NEXT: BB0_1: ; %loadstoreloop
 ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
-; FLATSCR-NEXT: s_add_u32 s7, 0x3000, s6
-; FLATSCR-NEXT: s_add_i32 s6, s6, 1
-; FLATSCR-NEXT: s_cmpk_lt_u32 s6, 0x2120
-; FLATSCR-NEXT: scratch_store_byte off, v0, s7
+; FLATSCR-NEXT: s_add_u32 s3, 0x3000, s2
+; FLATSCR-NEXT: s_add_i32 s2, s2, 1
+; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
+; FLATSCR-NEXT: scratch_store_byte off, v0, s3
 ; FLATSCR-NEXT: s_cbranch_scc1 BB0_1
 ; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s6, 0x20d0
-; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6
-; FLATSCR-NEXT: scratch_load_dword v1, off, s6 offset:4
-; FLATSCR-NEXT: s_movk_i32 s6, 0x2000
-; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6
-; FLATSCR-NEXT: scratch_load_dword v0, off, s6 offset:208
-; FLATSCR-NEXT: s_movk_i32 s6, 0x3000
-; FLATSCR-NEXT: scratch_load_dword v2, off, s6 offset:68
-; FLATSCR-NEXT: s_movk_i32 s6, 0x3000
-; FLATSCR-NEXT: scratch_load_dword v3, off, s6 offset:64
+; FLATSCR-NEXT: s_movk_i32 s2, 0x20d0
+; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
+; FLATSCR-NEXT: scratch_load_dword v1, off, s2 offset:4
+; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
+; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
+; FLATSCR-NEXT: scratch_load_dword v0, off, s2 offset:208
+; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
+; FLATSCR-NEXT: scratch_load_dword v2, off, s2 offset:68
+; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
+; FLATSCR-NEXT: scratch_load_dword v3, off, s2 offset:64
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v2, s4
-; FLATSCR-NEXT: v_mov_b32_e32 v3, s5
+; FLATSCR-NEXT: v_mov_b32_e32 v3, s1
+; FLATSCR-NEXT: v_mov_b32_e32 v2, s0
 ; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
 ; FLATSCR-NEXT: s_endpgm
 entry:
@@ -146,36 +146,36 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
 ; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
 ; FLATSCR: ; %bb.0: ; %entry
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_add_u32 s4, s32, 0x1fff
-; FLATSCR-NEXT: s_mov_b32 s6, s33
-; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffe000
+; FLATSCR-NEXT: s_add_u32 s0, s32, 0x1fff
+; FLATSCR-NEXT: s_mov_b32 s2, s33
+; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffe000
 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT: s_mov_b32 s4, 0
+; FLATSCR-NEXT: s_mov_b32 s0, 0
 ; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000
 ; FLATSCR-NEXT: scratch_store_dword off, v2, s33
 ; FLATSCR-NEXT: BB1_1: ; %loadstoreloop
 ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
 ; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000
-; FLATSCR-NEXT: s_add_u32 s5, vcc_hi, s4
-; FLATSCR-NEXT: s_add_i32 s4, s4, 1
-; FLATSCR-NEXT: s_cmpk_lt_u32 s4, 0x2120
-; FLATSCR-NEXT: scratch_store_byte off, v2, s5
+; FLATSCR-NEXT: s_add_u32 s1, vcc_hi, s0
+; FLATSCR-NEXT: s_add_i32 s0, s0, 1
+; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
+; FLATSCR-NEXT: scratch_store_byte off, v2, s1
 ; FLATSCR-NEXT: s_cbranch_scc1 BB1_1
 ; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s4, 0x20d0
-; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000
-; FLATSCR-NEXT: s_add_u32 s4, s5, s4
-; FLATSCR-NEXT: scratch_load_dword v3, off, s4 offset:4
-; FLATSCR-NEXT: s_movk_i32 s4, 0x2000
-; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000
-; FLATSCR-NEXT: s_add_u32 s4, s5, s4
-; FLATSCR-NEXT: scratch_load_dword v2, off, s4 offset:208
-; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000
-; FLATSCR-NEXT: scratch_load_dword v4, off, s4 offset:68
-; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000
-; FLATSCR-NEXT: scratch_load_dword v5, off, s4 offset:64
+; FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
+; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
+; FLATSCR-NEXT: s_add_u32 s0, s1, s0
+; FLATSCR-NEXT: scratch_load_dword v3, off, s0 offset:4
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
+; FLATSCR-NEXT: s_add_u32 s0, s1, s0
+; FLATSCR-NEXT: scratch_load_dword v2, off, s0 offset:208
+; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
+; FLATSCR-NEXT: scratch_load_dword v4, off, s0 offset:68
+; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
+; FLATSCR-NEXT: scratch_load_dword v5, off, s0 offset:64
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000
-; FLATSCR-NEXT: s_mov_b32 s33, s6
+; FLATSCR-NEXT: s_mov_b32 s33, s2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
@@ -54,36 +54,36 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ;
 ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
 ; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
 ; FLATSCR-NEXT: s_mov_b32 s32, 16
 ; FLATSCR-NEXT: s_mov_b32 s33, 0
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: s_cmp_lg_u32 s8, 0
+; FLATSCR-NEXT: s_cmp_lg_u32 s4, 0
 ; FLATSCR-NEXT: s_cbranch_scc1 BB0_3
 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_cmp_lg_u32 s9, 0
+; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
 ; FLATSCR-NEXT: s_cbranch_scc1 BB0_3
 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_mov_b32 s6, s32
-; FLATSCR-NEXT: s_movk_i32 s7, 0x1000
-; FLATSCR-NEXT: s_add_i32 s8, s6, s7
-; FLATSCR-NEXT: s_add_u32 s6, s6, s7
+; FLATSCR-NEXT: s_mov_b32 s2, s32
+; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
+; FLATSCR-NEXT: s_add_i32 s4, s2, s3
+; FLATSCR-NEXT: s_add_u32 s2, s2, s3
 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: scratch_store_dword off, v1, s6
+; FLATSCR-NEXT: scratch_store_dword off, v1, s2
 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
-; FLATSCR-NEXT: s_lshl_b32 s6, s10, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s8
-; FLATSCR-NEXT: scratch_store_dword off, v1, s8 offset:4
-; FLATSCR-NEXT: s_add_i32 s8, s8, s6
-; FLATSCR-NEXT: scratch_load_dword v1, off, s8
-; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2
+; FLATSCR-NEXT: s_mov_b32 s32, s4
+; FLATSCR-NEXT: scratch_store_dword off, v1, s4 offset:4
+; FLATSCR-NEXT: s_add_i32 s4, s4, s2
+; FLATSCR-NEXT: scratch_load_dword v1, off, s4
+; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
 ; FLATSCR-NEXT: BB0_3: ; %bb.2
 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -162,31 +162,31 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ;
 ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
 ; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; FLATSCR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
 ; FLATSCR-NEXT: s_mov_b32 s32, 64
 ; FLATSCR-NEXT: s_mov_b32 s33, 0
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: s_cmp_lg_u32 s6, 0
+; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0
 ; FLATSCR-NEXT: s_cbranch_scc1 BB1_2
 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000
-; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000
+; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: scratch_store_dword off, v1, s6
+; FLATSCR-NEXT: scratch_store_dword off, v1, s2
 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
-; FLATSCR-NEXT: s_lshl_b32 s7, s7, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s6
-; FLATSCR-NEXT: scratch_store_dword off, v1, s6 offset:4
-; FLATSCR-NEXT: s_add_i32 s6, s6, s7
-; FLATSCR-NEXT: scratch_load_dword v1, off, s6
-; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2
+; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: scratch_store_dword off, v1, s2 offset:4
+; FLATSCR-NEXT: s_add_i32 s2, s2, s3
+; FLATSCR-NEXT: scratch_load_dword v1, off, s2
+; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
 ; FLATSCR-NEXT: BB1_2: ; %bb.1
 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -261,38 +261,38 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
 ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
 ; FLATSCR: ; %bb.0: ; %entry
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_mov_b32 s9, s33
+; FLATSCR-NEXT: s_mov_b32 s5, s33
 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
 ; FLATSCR-NEXT: s_mov_b32 s33, s32
 ; FLATSCR-NEXT: s_add_u32 s32, s32, 16
-; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
 ; FLATSCR-NEXT: s_cbranch_execz BB2_3
 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0
 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
 ; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
 ; FLATSCR-NEXT: s_cbranch_execz BB2_3
 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_mov_b32 s6, s32
-; FLATSCR-NEXT: s_movk_i32 s7, 0x1000
-; FLATSCR-NEXT: s_add_i32 s8, s6, s7
-; FLATSCR-NEXT: s_add_u32 s6, s6, s7
+; FLATSCR-NEXT: s_mov_b32 s2, s32
+; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
+; FLATSCR-NEXT: s_add_i32 s4, s2, s3
+; FLATSCR-NEXT: s_add_u32 s2, s2, s3
 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT: scratch_store_dword off, v2, s6
+; FLATSCR-NEXT: scratch_store_dword off, v2, s2
 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT: scratch_store_dword off, v2, s8 offset:4
-; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s8
+; FLATSCR-NEXT: scratch_store_dword off, v2, s4 offset:4
+; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4
 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off
 ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5
-; FLATSCR-NEXT: s_mov_b32 s32, s8
+; FLATSCR-NEXT: s_mov_b32 s32, s4
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
 ; FLATSCR-NEXT: BB2_3: ; %bb.2
-; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5]
+; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 16
-; FLATSCR-NEXT: s_mov_b32 s33, s9
+; FLATSCR-NEXT: s_mov_b32 s33, s5
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -361,33 +361,33 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
 ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
 ; FLATSCR: ; %bb.0: ; %entry
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_add_u32 s4, s32, 63
-; FLATSCR-NEXT: s_mov_b32 s7, s33
-; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffffc0
+; FLATSCR-NEXT: s_add_u32 s0, s32, 63
+; FLATSCR-NEXT: s_mov_b32 s3, s33
+; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffffc0
 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
 ; FLATSCR-NEXT: s_add_u32 s32, s32, 0x80
-; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
 ; FLATSCR-NEXT: s_cbranch_execz BB3_2
 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000
-; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000
+; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT: scratch_store_dword off, v2, s6
+; FLATSCR-NEXT: scratch_store_dword off, v2, s2
 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT: scratch_store_dword off, v2, s6 offset:4
-; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s6
+; FLATSCR-NEXT: scratch_store_dword off, v2, s2 offset:4
+; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off
 ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4
-; FLATSCR-NEXT: s_mov_b32 s32, s6
+; FLATSCR-NEXT: s_mov_b32 s32, s2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
 ; FLATSCR-NEXT: BB3_2: ; %bb.1
-; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5]
+; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80
-; FLATSCR-NEXT: s_mov_b32 s33, s7
+; FLATSCR-NEXT: s_mov_b32 s33, s3
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -1,10 +1,11 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s

 ; Test that the VGPR spiller correctly switches to SGPR offsets when the
 ; instruction offset field would overflow, and that it accounts for memory
 ; swizzling.

-; CHECK-LABEL: test_inst_offset_kernel
+; GCN-LABEL: test_inst_offset_kernel
 define amdgpu_kernel void @test_inst_offset_kernel() {
 entry:
   ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
@@ -13,7 +14,8 @@ entry:
   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
+  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
+  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
   %a = load volatile i32, i32 addrspace(5)* %aptr

   ; Force %a to spill.
@@ -25,7 +27,7 @@ entry:
   ret void
 }

-; CHECK-LABEL: test_sgpr_offset_kernel
+; GCN-LABEL: test_sgpr_offset_kernel
 define amdgpu_kernel void @test_sgpr_offset_kernel() {
 entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -35,8 +37,10 @@ entry:

   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
   ; 0x40000 / 64 = 4096 (for wave64)
-  ; CHECK: s_mov_b32 s6, 0x40000
-  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
+  ; MUBUF: s_mov_b32 s6, 0x40000
+  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
+  ; FLATSCR: s_movk_i32 s2, 0x1000
+  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
   %a = load volatile i32, i32 addrspace(5)* %aptr

   ; Force %a to spill
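The constants in this hunk restate the arithmetic in the test's own comment: a MUBUF scratch offset is swizzled per wave, so byte 4096 of a wave64 frame is encoded as 0x40000, while a flat-scratch offset is a plain byte offset and the same slot is simply 0x1000. A small self-contained C++ check of that relationship:

// 0x40000 / 64 = 4096 = 0x1000, per the "(for wave64)" comment above.
int main() {
  constexpr unsigned WaveSize = 64;          // wave64
  constexpr unsigned MubufOffset = 0x40000;  // per-wave swizzled encoding
  constexpr unsigned FlatScrOffset = 0x1000; // per-lane byte offset
  static_assert(MubufOffset / WaveSize == FlatScrOffset,
                "both encodings address the same 4096-byte slot");
  return 0;
}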
@@ -51,7 +55,7 @@ entry:
 ; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
 ; pointer to temporarily update, so we just crash.

-; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
+; GCN-LABEL: test_sgpr_offset_function_scavenge_fail
 define void @test_sgpr_offset_function_scavenge_fail() #2 {
 entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -74,9 +78,11 @@ entry:
   ; 0x40000 / 64 = 4096 (for wave64)
   %a = load volatile i32, i32 addrspace(5)* %aptr

-  ; CHECK: s_add_u32 s32, s32, 0x40000
-  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
-  ; CHECK: s_sub_u32 s32, s32, 0x40000
+  ; MUBUF: s_add_u32 s32, s32, 0x40000
+  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
+  ; MUBUF: s_sub_u32 s32, s32, 0x40000
+  ; FLATSCR: s_add_u32 [[SOFF:s[0-9+]]], s32, 0x1000
+  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

   %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
@@ -91,16 +97,18 @@ entry:

   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

-  ; CHECK: s_add_u32 s32, s32, 0x40000
-  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
-  ; CHECK: s_sub_u32 s32, s32, 0x40000
+  ; MUBUF: s_add_u32 s32, s32, 0x40000
+  ; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
+  ; MUBUF: s_sub_u32 s32, s32, 0x40000
+  ; FLATSCR: s_add_u32 [[SOFF:s[0-9+]]], s32, 0x1000
+  ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload

   ; Force %a to spill with no free SGPRs
   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
   ret void
 }

-; CHECK-LABEL: test_sgpr_offset_subregs_kernel
+; GCN-LABEL: test_sgpr_offset_subregs_kernel
 define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
 entry:
   ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
@@ -110,8 +118,11 @@ entry:
   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

-  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
-  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
+  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
+  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
+  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
+  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
+  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
   %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
   %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
|
|||
ret void
}

; CHECK-LABEL: test_inst_offset_subregs_kernel
; GCN-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
@@ -139,9 +150,12 @@ entry:
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

; 0x3ff00 / 64 = 4092 (for wave64)
; CHECK: s_mov_b32 s6, 0x3ff00
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
; MUBUF: s_mov_b32 s6, 0x3ff00
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -158,7 +172,7 @@ entry:
ret void
}
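
; [editor's note, not part of the original diff] In the two subreg tests
; above, a <2 x i32> spill is split into two dword stores, so the second
; subreg lands 4 bytes past the first (offsets 4088/4092, and 0xff8 or 0xffc
; plus offset:4). In a kernel compiled for flat scratch there is no stack
; pointer SGPR set up to serve as the scratch base, so the base offset is
; presumably materialized into a scavenged SGPR with s_movk_i32 and the
; second store folds the remaining +4 into its immediate offset field.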

; CHECK-LABEL: test_inst_offset_function
; GCN-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
@@ -167,7 +181,8 @@ entry:
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr

; Force %a to spill.
@@ -179,7 +194,7 @@ entry:
ret void
}

; CHECK-LABEL: test_sgpr_offset_function
; GCN-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -189,8 +204,10 @@ entry:

%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
; CHECK: s_add_u32 s4, s32, 0x40000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; MUBUF: s_add_u32 s4, s32, 0x40000
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; FLATSCR: s_add_u32 s0, s32, 0x1000
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr

; Force %a to spill
@@ -202,7 +219,7 @@ entry:
ret void
}
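
; [editor's note, not part of the original diff] test_inst_offset_function
; and test_sgpr_offset_function bracket the 12-bit unsigned immediate offset
; of MUBUF instructions: 4092 still fits in the offset field, while 4096 does
; not and must go through an SGPR. Since the MUBUF soffset operand is
; wave-scaled, that SGPR holds 4096 * 64 = 0x40000; the flat-scratch form
; only needs the per-lane value, s32 + 0x1000.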

; CHECK-LABEL: test_sgpr_offset_subregs_function
; GCN-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
@@ -212,8 +229,10 @@ entry:
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4088 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4092 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -230,7 +249,7 @@ entry:
ret void
}

; CHECK-LABEL: test_inst_offset_subregs_function
; GCN-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
@@ -241,9 +260,12 @@ entry:
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

; 0x3ff00 / 64 = 4092 (for wave64)
; CHECK: s_add_u32 s4, s32, 0x3ff00
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
; MUBUF: s_add_u32 s4, s32, 0x3ff00
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0xffc
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

@@ -60,26 +60,18 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
; FLATSCR-NEXT: s_addc_u32 s3, s3, 0
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24
; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; FLATSCR-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; FLATSCR-NEXT: s_mov_b32 s38, -1
; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000
; FLATSCR-NEXT: s_add_u32 s36, s36, s5
; FLATSCR-NEXT: s_addc_u32 s37, s37, 0
; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000
; FLATSCR-NEXT: v_mov_b32_e32 v3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000
; FLATSCR-NEXT: ; implicit-def: $vcc_hi
; FLATSCR-NEXT: s_getpc_b64 s[4:5]
; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37]
; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39]
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo
; FLATSCR-NEXT: s_cbranch_execz BB0_2
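
; [editor's note, not part of the original diff] The removed FLATSCR-NEXT
; lines are the old checks: before this change, even flat-scratch code still
; built the scratch buffer descriptor (SCRATCH_RSRC_DWORD0/1, -1, 0x31c16000
; into s[36:39]) and copied it into s[0:3] before the call. With the
; descriptor omitted, s[0:3] no longer needs to be reserved, which is why the
; updated checks renumber the call sequence (s[4:5] -> s[0:1] for the
; s_swappc_b64 target, s0 -> s2 for the loaded kernarg) and drop the two
; s_mov_b64 copies of the descriptor.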