From d5a465866eea7f1779869c679a4c25ac2cbae59e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Fri, 6 Nov 2020 13:00:10 -0800
Subject: [PATCH] [AMDGPU] Omit buffer resource with flat scratch.

Differential Revision: https://reviews.llvm.org/D90979
---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  17 +--
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |   4 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  54 +++++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  20 ++-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  17 +--
 .../AMDGPU/call-preserved-registers.ll        |  42 ++++---
 .../test/CodeGen/AMDGPU/callee-frame-setup.ll | 107 +++++++++-------
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |  22 ++--
 .../fast-unaligned-load-store.private.ll      |   8 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |  16 +--
 llvm/test/CodeGen/AMDGPU/load-lo16.ll         |  24 ++--
 .../local-stack-alloc-block-sp-reference.ll   |  82 ++++++-------
 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll  | 114 +++++++++---------
 .../AMDGPU/spill-offset-calculation.ll        |  86 ++++++-----
 ...tack-pointer-offset-relative-frameindex.ll |  20 +--
 15 files changed, 350 insertions(+), 283 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index af58df181918..cdea537a09dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -939,7 +939,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   if (IsEntryFunc) {
     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
   } else {
-    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    if (!Subtarget.enableFlatScratch())
+      CCInfo.AllocateReg(Info->getScratchRSrcReg());
     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }

@@ -1227,12 +1228,14 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

-  // Insert copies for the SRD. In the HSA case, this should be an identity
-  // copy.
-  auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
-                                             MFI->getScratchRSrcReg());
-  MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-  MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+  if (!ST.enableFlatScratch()) {
+    // Insert copies for the SRD. In the HSA case, this should be an identity
+    // copy.
+    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
+                                               MFI->getScratchRSrcReg());
+    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+    MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+  }

   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
     MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index f8cc31c0503a..1158f9360b03 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -467,7 +467,9 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   //
   // This will return `Register()` in cases where there are no actual
   // uses of the SRSRC.
-  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
+  Register ScratchRsrcReg;
+  if (!ST.enableFlatScratch())
+    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

   // Make the selected register live throughout the function.
   if (ScratchRsrcReg) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7987ac72e451..217b6387f266 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2117,26 +2117,28 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   // the scratch registers to pass in.
   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

-  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
-    // If we have stack objects, we unquestionably need the private buffer
-    // resource. For the Code Object V2 ABI, this will be the first 4 user
-    // SGPR inputs. We can reserve those and use them directly.
+  if (!ST.enableFlatScratch()) {
+    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+      // If we have stack objects, we unquestionably need the private buffer
+      // resource. For the Code Object V2 ABI, this will be the first 4 user
+      // SGPR inputs. We can reserve those and use them directly.

-    Register PrivateSegmentBufferReg =
-        Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
-  } else {
-    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
-    // We tentatively reserve the last registers (skipping the last registers
-    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
-    // we'll replace these with the ones immediately after those which were
-    // really allocated. In the prologue copies will be inserted from the
-    // argument to these reserved registers.
+      Register PrivateSegmentBufferReg =
+          Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
+    } else {
+      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+      // We tentatively reserve the last registers (skipping the last registers
+      // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
+      // we'll replace these with the ones immediately after those which were
+      // really allocated. In the prologue copies will be inserted from the
+      // argument to these reserved registers.

-    // Without HSA, relocations are used for the scratch pointer and the
-    // buffer resource setup is always inserted in the prologue. Scratch wave
-    // offset is still in an input SGPR.
-    Info.setScratchRSrcReg(ReservedBufferReg);
+      // Without HSA, relocations are used for the scratch pointer and the
+      // buffer resource setup is always inserted in the prologue. Scratch wave
+      // offset is still in an input SGPR.
+      Info.setScratchRSrcReg(ReservedBufferReg);
+    }
   }

   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3012,14 +3014,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   if (!IsSibCall) {
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

-    SmallVector<SDValue, 8> CopyFromChains;
+    if (!Subtarget->enableFlatScratch()) {
+      SmallVector<SDValue, 8> CopyFromChains;

-    // In the HSA case, this should be an identity copy.
-    SDValue ScratchRSrcReg
-        = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
-    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
-    Chain = DAG.getTokenFactor(DL, CopyFromChains);
+      // In the HSA case, this should be an identity copy.
+      SDValue ScratchRSrcReg
+          = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+      RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+      CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+      Chain = DAG.getTokenFactor(DL, CopyFromChains);
+    }
   }

   MVT PtrVT = MVT::i32;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6c4e9cd6610..8c10a971115f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1379,11 +1379,14 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }

+    Register ScratchRSrc =
+        ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+                               : MFI->getScratchRSrcReg();
     BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // data
       .addFrameIndex(FrameIndex)               // addr
       .addMemOperand(MMO)
-      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(ScratchRSrc, RegState::Implicit)
       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
     // Add the scratch resource registers as implicit uses because we may end up
     // needing them, and need to ensure that the reserved registers are
@@ -1397,10 +1400,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                     : getVGPRSpillSaveOpcode(SpillSize);
   MFI->setHasSpilledVGPRs();

+  Register ScratchRSrc =
+      ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+                             : MFI->getScratchRSrcReg();
   BuildMI(MBB, MI, DL, get(Opcode))
     .addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex)               // addr
-    .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
+    .addReg(ScratchRSrc)                     // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
     .addImm(0)                               // offset
     .addMemOperand(MMO);
@@ -1513,21 +1519,27 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }

+    Register ScratchRSrc =
+        ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+                               : MFI->getScratchRSrcReg();
     if (RI.spillSGPRToVGPR())
       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
       .addMemOperand(MMO)
-      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(ScratchRSrc, RegState::Implicit)
       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
     return;
   }

   unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                     : getVGPRSpillRestoreOpcode(SpillSize);
+  Register ScratchRSrc =
+      ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
+                             : MFI->getScratchRSrcReg();
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
     .addFrameIndex(FrameIndex)           // vaddr
-    .addReg(MFI->getScratchRSrcReg())    // scratch_rsrc
+    .addReg(ScratchRSrc)                 // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0)                           // offset
     .addMemOperand(MMO);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 046dcf42a02a..8032bc5f9de9 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -75,16 +75,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   }

   if (!isEntryFunction()) {
-    // Non-entry functions have no special inputs for now, other registers
-    // required for scratch access.
- ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; - // TODO: Pick a high register, and shift down, similar to a kernel. FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; - ArgInfo.PrivateSegmentBuffer = - ArgDescriptor::createRegister(ScratchRSrcReg); + if (!ST.enableFlatScratch()) { + // Non-entry functions have no special inputs for now, other registers + // required for scratch access. + ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + + ArgInfo.PrivateSegmentBuffer = + ArgDescriptor::createRegister(ScratchRSrcReg); + } if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) ImplicitArgPtr = true; @@ -142,7 +144,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); if (isAmdHsaOrMesa) { - PrivateSegmentBuffer = true; + if (!ST.enableFlatScratch()) + PrivateSegmentBuffer = true; if (UseFixedABI) { DispatchPtr = true; diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index 56748eafab28..b3af5fc946df 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -35,8 +35,10 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v40, 2 -; GCN-DAG: v_readlane_b32 s5, v40, 3 +; MUBUF-DAG: v_readlane_b32 s4, v40, 2 +; MUBUF-DAG: v_readlane_b32 s5, v40, 3 +; FLATSCR-DAG: v_readlane_b32 s0, v40, 2 +; FLATSCR-DAG: v_readlane_b32 s1, v40, 3 ; GCN: v_readlane_b32 s35, v40, 1 ; GCN: v_readlane_b32 s34, v40, 0 @@ -134,14 +136,18 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace ; FIXME: What is the expected behavior for reserved registers here? 
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN: #ASMSTART ; GCN-NEXT: ; def s33 ; GCN-NEXT: #ASMEND -; GCN: s_swappc_b64 s[30:31], s[4:5] +; MUBUF: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR: s_swappc_b64 s[30:31], s[0:1] ; GCN: ;;#ASMSTART ; GCN-NEXT: ; use s33 ; GCN-NEXT: ;;#ASMEND @@ -157,9 +163,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace( ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} ; GCN-NOT: s34 -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN-NOT: s34 @@ -168,7 +177,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace( ; GCN-NEXT: ;;#ASMEND ; GCN-NOT: s34 -; GCN: s_swappc_b64 s[30:31], s[4:5] +; MUBUF: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR: s_swappc_b64 s[30:31], s[0:1] ; GCN-NOT: s34 @@ -186,9 +196,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace( ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} ; GCN-NOT: v32 -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN-NOT: v40 @@ -196,7 +209,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace( ; GCN-NEXT: ; def v40 ; GCN-NEXT: ;;#ASMEND -; GCN: s_swappc_b64 s[30:31], s[4:5] +; MUBUF: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR: s_swappc_b64 s[30:31], s[0:1] ; GCN-NOT: v40 diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 7bc9dcfb20a8..20d50200e3a2 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -13,9 +13,10 @@ define void @callee_no_stack() #0 { ; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 +; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_no_fp_elim_all() #1 { ret void @@ -48,7 +49,8 @@ define 
void @callee_with_stack() #0 { ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 +; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 ; FLATSCR-NEXT: s_add_u32 s32, s32, 8 @@ -57,7 +59,7 @@ define void @callee_with_stack() #0 { ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}} ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 -; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_all() #1 { @@ -100,8 +102,10 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 { ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]] -; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]] +; MUBUF-DAG: v_readlane_b32 s5, [[CSR_VGPR]] +; MUBUF-DAG: v_readlane_b32 s4, [[CSR_VGPR]] +; FLATSCR-DAG: v_readlane_b32 s0, [[CSR_VGPR]] +; FLATSCR-DAG: v_readlane_b32 s1, [[CSR_VGPR]] ; MUBUF: s_sub_u32 s32, s32, 0x400{{$}} ; FLATSCR: s_sub_u32 s32, s32, 16{{$}} @@ -140,8 +144,10 @@ define void @callee_with_stack_and_call() #0 { ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v40, 0 -; GCN-DAG: v_readlane_b32 s5, v40, 1 +; MUBUF-DAG: v_readlane_b32 s4, v40, 0 +; MUBUF-DAG: v_readlane_b32 s5, v40, 1 +; FLATSCR-DAG: v_readlane_b32 s0, v40, 0 +; FLATSCR-DAG: v_readlane_b32 s1, v40, 1 ; MUBUF: s_sub_u32 s32, s32, 0x400 ; FLATSCR: s_sub_u32 s32, s32, 16 @@ -238,9 +244,10 @@ define void @spill_only_csr_sgpr() { ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_add_u32 s32, s32, 0x300 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_mov_b32 s33, s4 ; FLATSCR: s_add_u32 s32, s32, 12 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 -; GCN-NEXT: s_mov_b32 s33, s4 +; FLATSCR-NEXT: s_mov_b32 s33, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { @@ -330,7 +337,8 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; GCN: s_waitcnt ; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 ; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff -; GCN-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 +; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 ; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 ; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000 @@ -340,7 +348,7 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000 -; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { @@ -359,15 +367,18 @@ define void @realign_stack_no_fp_elim() #1 { ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, v1, 0 +; MUBUF: v_readlane_b32 s4, v1, 0 ; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 +; MUBUF-NEXT: v_readlane_b32 s5, v1, 1 +; FLATSCR: v_readlane_b32 s0, v1, 0 ; FLATSCR-NEXT: s_add_u32 s32, s32, 8 -; GCN-NEXT: v_readlane_b32 s5, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 -; 
GCN-NEXT: v_readlane_b32 s33, v1, 2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: v_readlane_b32 s33, v1, 2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[4:5] +; FLATSCR-NEXT: s_setpc_b64 s[0:1] define void @no_unused_non_csr_sgpr_for_fp() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -399,9 +410,11 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; MUBUF: s_add_u32 s32, s32, 0x300{{$}} ; FLATSCR: s_add_u32 s32, s32, 12{{$}} -; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0 ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1 +; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1 +; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300{{$}} ; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 @@ -450,9 +463,11 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; MUBUF-DAG: buffer_store_dword ; FLATSCR-DAG: scratch_store_dword -; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0 ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1 +; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1 +; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1 ; MUBUF-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 @@ -514,20 +529,21 @@ define void @ipra_call_with_stack() #0 { ; With no free registers, we must spill the FP to memory. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 +; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 -; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:4 -; GCN: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN: s_mov_b32 s33, s32 -; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] +; FLATSCR: s_mov_b32 s0, s33 +; GCN: s_mov_b32 s33, s32 +; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 -; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:4 -; GCN: s_waitcnt vmcnt(0) -; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]] -; GCN: s_mov_b64 exec, [[COPY_EXEC2]] -; GCN: s_setpc_b64 -; GCN: ScratchSize: 8 +; FLATSCR: s_mov_b32 s33, s0 +; MUBUF: s_waitcnt vmcnt(0) +; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]] +; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]] +; GCN: s_setpc_b64 +; MUBUF: ScratchSize: 8 +; FLATSCR: ScratchSize: 0 define void @callee_need_to_spill_fp_to_memory() #3 { call void asm sideeffect "; clobber nonpreserved SGPRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} @@ -547,20 +563,19 @@ define void @callee_need_to_spill_fp_to_memory() #3 { ; need to spill the FP to memory if there are no free lanes in the reserved ; VGPR. 
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 +; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 ; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] -; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:[[OFF:[0-9]+]] -; GCN: s_mov_b64 exec, [[COPY_EXEC1]] +; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NOT: v_writelane_b32 v40, s33 -; GCN: s_mov_b32 s33, s32 +; MUBUF: s_mov_b32 s33, s32 +; FLATSCR: s_mov_b32 s33, s0 ; GCN-NOT: v_readlane_b32 s33, v40 -; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]] -; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:[[OFF]] -; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]] -; GCN: s_mov_b64 exec, [[COPY_EXEC2]] -; GCN: s_setpc_b64 +; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]] +; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]] +; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} @@ -585,14 +600,14 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { ; If the size of the offset exceeds the MUBUF offset field we need another ; scratch VGPR to hold the offset. ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset -; GCN: s_or_saveexec_b64 s[4:5], -1 +; MUBUF: s_or_saveexec_b64 s[4:5], -1 ; MUBUF: v_mov_b32_e32 v0, s33 ; GCN-NOT: v_mov_b32_e32 v0, 0x1008 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill -; FLATSCR-NEXT: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1008 -; FLATSCR-NEXT: v_mov_b32_e32 v0, s33 -; FLATSCR-NEXT: scratch_store_dword off, v0, [[SOFF]] ; 4-byte Folded Spill +; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004 +; FLATSCR: v_mov_b32_e32 v0, 0 +; FLATSCR: scratch_store_dword off, v0, [[SOFF]] define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #3 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 354bc0f0d7f2..9e3ced2e7f42 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -15,11 +15,11 @@ define <2 x half> @chain_hi_to_lo_private() { ; FLATSCR-LABEL: chain_hi_to_lo_private: ; FLATSCR: ; %bb.0: ; %bb ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s4, 2 -; FLATSCR-NEXT: scratch_load_ushort v0, off, s4 -; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_mov_b32 s0, 2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 +; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s4 +; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: @@ -256,13 +256,13 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; ; FLATSCR-LABEL: vload2_private: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; 
FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 ; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4 @@ -272,8 +272,8 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6 ; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: v_mov_b32_e32 v0, s6 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s7 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s2 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 92761a692115..1711f3a517cb 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -150,10 +150,10 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 { ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff +; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 -; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 1 @@ -251,10 +251,10 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff +; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 -; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 1e2732e39136..b0532e36ed54 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -1109,13 +1109,13 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x3000 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: scratch_store_dword off, v0, s32 -; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: s_add_u32 s0, s32, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: scratch_store_dword off, v0, 
s4 offset:3712 -; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712 +; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 +; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1125,12 +1125,12 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s4, 0x3800 +; GFX10-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: s_add_u32 s0, s32, s0 ; GFX10-NEXT: scratch_store_dword off, v0, s32 -; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664 -; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664 +; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 +; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index e75873ee2ce3..44fe6cdf915b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1364,8 +1364,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1413,8 +1413,8 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1463,8 +1463,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1614,8 +1614,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, 
off, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1664,8 +1664,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1716,8 +1716,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index c725b0c339ff..c8f7fdebc4e4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -56,37 +56,37 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 ; ; FLATSCR-LABEL: local_stack_offset_uses_sp: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000 -; FLATSCR-NEXT: s_mov_b32 s6, 0 +; FLATSCR-NEXT: s_mov_b32 s2, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi ; FLATSCR-NEXT: BB0_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_u32 s7, 0x3000, s6 -; FLATSCR-NEXT: s_add_i32 s6, s6, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s6, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s7 +; FLATSCR-NEXT: s_add_u32 s3, 0x3000, s2 +; FLATSCR-NEXT: s_add_i32 s2, s2, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s3 ; FLATSCR-NEXT: s_cbranch_scc1 BB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s6, 0x20d0 -; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6 -; FLATSCR-NEXT: scratch_load_dword v1, off, s6 offset:4 -; FLATSCR-NEXT: s_movk_i32 s6, 0x2000 -; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6 -; FLATSCR-NEXT: scratch_load_dword v0, off, s6 offset:208 -; FLATSCR-NEXT: s_movk_i32 s6, 0x3000 -; FLATSCR-NEXT: scratch_load_dword v2, off, s6 offset:68 -; FLATSCR-NEXT: s_movk_i32 s6, 0x3000 -; FLATSCR-NEXT: scratch_load_dword v3, off, s6 offset:64 +; FLATSCR-NEXT: s_movk_i32 s2, 0x20d0 +; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2 +; FLATSCR-NEXT: scratch_load_dword v1, off, s2 offset:4 +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 
+; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2 +; FLATSCR-NEXT: scratch_load_dword v0, off, s2 offset:208 +; FLATSCR-NEXT: s_movk_i32 s2, 0x3000 +; FLATSCR-NEXT: scratch_load_dword v2, off, s2 offset:68 +; FLATSCR-NEXT: s_movk_i32 s2, 0x3000 +; FLATSCR-NEXT: scratch_load_dword v3, off, s2 offset:64 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v2, s4 -; FLATSCR-NEXT: v_mov_b32_e32 v3, s5 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, s0 ; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; FLATSCR-NEXT: s_endpgm entry: @@ -146,36 +146,36 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac ; FLATSCR-LABEL: func_local_stack_offset_uses_sp: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_add_u32 s4, s32, 0x1fff -; FLATSCR-NEXT: s_mov_b32 s6, s33 -; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffe000 +; FLATSCR-NEXT: s_add_u32 s0, s32, 0x1fff +; FLATSCR-NEXT: s_mov_b32 s2, s33 +; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffe000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000 ; FLATSCR-NEXT: scratch_store_dword off, v2, s33 ; FLATSCR-NEXT: BB1_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000 -; FLATSCR-NEXT: s_add_u32 s5, vcc_hi, s4 -; FLATSCR-NEXT: s_add_i32 s4, s4, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v2, s5 +; FLATSCR-NEXT: s_add_u32 s1, vcc_hi, s0 +; FLATSCR-NEXT: s_add_i32 s0, s0, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v2, s1 ; FLATSCR-NEXT: s_cbranch_scc1 BB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s4, 0x20d0 -; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000 -; FLATSCR-NEXT: s_add_u32 s4, s5, s4 -; FLATSCR-NEXT: scratch_load_dword v3, off, s4 offset:4 -; FLATSCR-NEXT: s_movk_i32 s4, 0x2000 -; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000 -; FLATSCR-NEXT: s_add_u32 s4, s5, s4 -; FLATSCR-NEXT: scratch_load_dword v2, off, s4 offset:208 -; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000 -; FLATSCR-NEXT: scratch_load_dword v4, off, s4 offset:68 -; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000 -; FLATSCR-NEXT: scratch_load_dword v5, off, s4 offset:64 +; FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 +; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s0, s1, s0 +; FLATSCR-NEXT: scratch_load_dword v3, off, s0 offset:4 +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s0, s1, s0 +; FLATSCR-NEXT: scratch_load_dword v2, off, s0 offset:208 +; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000 +; FLATSCR-NEXT: scratch_load_dword v4, off, s0 offset:68 +; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000 +; FLATSCR-NEXT: scratch_load_dword v5, off, s0 offset:64 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000 -; FLATSCR-NEXT: s_mov_b32 s33, s6 +; FLATSCR-NEXT: s_mov_b32 s33, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 7aa0c8f70205..4ec62515b103 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ 
b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -54,36 +54,36 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; FLATSCR-NEXT: s_mov_b32 s32, 16 ; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: s_cmp_lg_u32 s8, 0 +; FLATSCR-NEXT: s_cmp_lg_u32 s4, 0 ; FLATSCR-NEXT: s_cbranch_scc1 BB0_3 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 -; FLATSCR-NEXT: s_cmp_lg_u32 s9, 0 +; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0 ; FLATSCR-NEXT: s_cbranch_scc1 BB0_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: s_mov_b32 s6, s32 -; FLATSCR-NEXT: s_movk_i32 s7, 0x1000 -; FLATSCR-NEXT: s_add_i32 s8, s6, s7 -; FLATSCR-NEXT: s_add_u32 s6, s6, s7 +; FLATSCR-NEXT: s_mov_b32 s2, s32 +; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 +; FLATSCR-NEXT: s_add_i32 s4, s2, s3 +; FLATSCR-NEXT: s_add_u32 s2, s2, s3 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: scratch_store_dword off, v1, s6 +; FLATSCR-NEXT: scratch_store_dword off, v1, s2 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 -; FLATSCR-NEXT: s_lshl_b32 s6, s10, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s8 -; FLATSCR-NEXT: scratch_store_dword off, v1, s8 offset:4 -; FLATSCR-NEXT: s_add_i32 s8, s8, s6 -; FLATSCR-NEXT: scratch_load_dword v1, off, s8 -; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s4 +; FLATSCR-NEXT: scratch_store_dword off, v1, s4 offset:4 +; FLATSCR-NEXT: s_add_i32 s4, s4, s2 +; FLATSCR-NEXT: scratch_load_dword v1, off, s4 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off ; FLATSCR-NEXT: BB0_3: ; %bb.2 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 @@ -162,31 +162,31 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; FLATSCR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 ; FLATSCR-NEXT: s_mov_b32 s32, 64 ; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: s_cmp_lg_u32 s6, 0 +; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0 ; FLATSCR-NEXT: s_cbranch_scc1 BB1_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 -; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000 -; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000 +; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: scratch_store_dword off, v1, s6 +; FLATSCR-NEXT: scratch_store_dword off, v1, s2 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 -; FLATSCR-NEXT: s_lshl_b32 s7, s7, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s6 -; FLATSCR-NEXT: 
scratch_store_dword off, v1, s6 offset:4 -; FLATSCR-NEXT: s_add_i32 s6, s6, s7 -; FLATSCR-NEXT: scratch_load_dword v1, off, s6 -; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s2 +; FLATSCR-NEXT: scratch_store_dword off, v1, s2 offset:4 +; FLATSCR-NEXT: s_add_i32 s2, s2, s3 +; FLATSCR-NEXT: scratch_load_dword v1, off, s2 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off ; FLATSCR-NEXT: BB1_2: ; %bb.1 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 @@ -261,38 +261,38 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s9, s33 +; FLATSCR-NEXT: s_mov_b32 s5, s33 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_u32 s32, s32, 16 -; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz BB2_3 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; FLATSCR-NEXT: s_and_b64 exec, exec, vcc ; FLATSCR-NEXT: s_cbranch_execz BB2_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: s_mov_b32 s6, s32 -; FLATSCR-NEXT: s_movk_i32 s7, 0x1000 -; FLATSCR-NEXT: s_add_i32 s8, s6, s7 -; FLATSCR-NEXT: s_add_u32 s6, s6, s7 +; FLATSCR-NEXT: s_mov_b32 s2, s32 +; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 +; FLATSCR-NEXT: s_add_i32 s4, s2, s3 +; FLATSCR-NEXT: s_add_u32 s2, s2, s3 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; FLATSCR-NEXT: scratch_store_dword off, v2, s6 +; FLATSCR-NEXT: scratch_store_dword off, v2, s2 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: scratch_store_dword off, v2, s8 offset:4 -; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s8 +; FLATSCR-NEXT: scratch_store_dword off, v2, s4 offset:4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 -; FLATSCR-NEXT: s_mov_b32 s32, s8 +; FLATSCR-NEXT: s_mov_b32 s32, s4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off ; FLATSCR-NEXT: BB2_3: ; %bb.2 -; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5] +; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_sub_u32 s32, s32, 16 -; FLATSCR-NEXT: s_mov_b32 s33, s9 +; FLATSCR-NEXT: s_mov_b32 s33, s5 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -361,33 +361,33 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_add_u32 s4, s32, 63 -; FLATSCR-NEXT: s_mov_b32 s7, s33 -; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffffc0 +; FLATSCR-NEXT: s_add_u32 s0, s32, 63 +; FLATSCR-NEXT: s_mov_b32 s3, s33 +; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffffc0 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_add_u32 s32, s32, 0x80 -; FLATSCR-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz BB3_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 -; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000 -; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000 +; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; FLATSCR-NEXT: scratch_store_dword off, v2, s6 +; FLATSCR-NEXT: scratch_store_dword off, v2, s2 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: scratch_store_dword off, v2, s6 offset:4 -; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s6 +; FLATSCR-NEXT: scratch_store_dword off, v2, s2 offset:4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 -; FLATSCR-NEXT: s_mov_b32 s32, s6 +; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off ; FLATSCR-NEXT: BB3_2: ; %bb.1 -; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5] +; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80 -; FLATSCR-NEXT: s_mov_b32 s33, s7 +; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index 969edbf12647..62213b2e04e5 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -1,10 +1,11 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s ; Test that the VGPR spiller correctly switches to SGPR offsets when the ; instruction offset field would overflow, and that it accounts for memory ; swizzling. -; CHECK-LABEL: test_inst_offset_kernel +; GCN-LABEL: test_inst_offset_kernel define amdgpu_kernel void @test_inst_offset_kernel() { entry: ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in @@ -13,7 +14,8 @@ entry: %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill. 
@@ -25,7 +27,7 @@ entry: ret void } -; CHECK-LABEL: test_sgpr_offset_kernel +; GCN-LABEL: test_sgpr_offset_kernel define amdgpu_kernel void @test_sgpr_offset_kernel() { entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not @@ -35,8 +37,10 @@ entry: %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; CHECK: s_mov_b32 s6, 0x40000 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill + ; MUBUF: s_mov_b32 s6, 0x40000 + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill + ; FLATSCR: s_movk_i32 s2, 0x1000 + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill @@ -51,7 +55,7 @@ entry: ; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack ; pointer to temporarily update, so we just crash. -; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail +; GCN-LABEL: test_sgpr_offset_function_scavenge_fail define void @test_sgpr_offset_function_scavenge_fail() #2 { entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not @@ -74,9 +78,11 @@ entry: ; 0x40000 / 64 = 4096 (for wave64) %a = load volatile i32, i32 addrspace(5)* %aptr - ; CHECK: s_add_u32 s32, s32, 0x40000 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill - ; CHECK: s_sub_u32 s32, s32, 0x40000 + ; MUBUF: s_add_u32 s32, s32, 0x40000 + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill + ; MUBUF: s_sub_u32 s32, s32, 0x40000 + ; FLATSCR: s_add_u32 [[SOFF:s[0-9+]]], s32, 0x1000 + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() @@ -91,16 +97,18 @@ entry: call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 - ; CHECK: s_add_u32 s32, s32, 0x40000 - ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload - ; CHECK: s_sub_u32 s32, s32, 0x40000 + ; MUBUF: s_add_u32 s32, s32, 0x40000 + ; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload + ; MUBUF: s_sub_u32 s32, s32, 0x40000 + ; FLATSCR: s_add_u32 [[SOFF:s[0-9+]]], s32, 0x1000 + ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload ; Force %a to spill with no free SGPRs call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) ret void } -; CHECK-LABEL: test_sgpr_offset_subregs_kernel +; GCN-LABEL: test_sgpr_offset_subregs_kernel define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { entry: ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a @@ -110,8 +118,11 @@ entry: %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte 
Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill + ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8 + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr @@ -128,7 +139,7 @@ entry: ret void } -; CHECK-LABEL: test_inst_offset_subregs_kernel +; GCN-LABEL: test_inst_offset_subregs_kernel define amdgpu_kernel void @test_inst_offset_subregs_kernel() { entry: ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a @@ -139,9 +150,12 @@ entry: %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) - ; CHECK: s_mov_b32 s6, 0x3ff00 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill + ; MUBUF: s_mov_b32 s6, 0x3ff00 + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill + ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr @@ -158,7 +172,7 @@ entry: ret void } -; CHECK-LABEL: test_inst_offset_function +; GCN-LABEL: test_inst_offset_function define void @test_inst_offset_function() { entry: ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in @@ -167,7 +181,8 @@ entry: %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill. 
@@ -179,7 +194,7 @@ entry: ret void } -; CHECK-LABEL: test_sgpr_offset_function +; GCN-LABEL: test_sgpr_offset_function define void @test_sgpr_offset_function() { entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not @@ -189,8 +204,10 @@ entry: %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; CHECK: s_add_u32 s4, s32, 0x40000 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill + ; MUBUF: s_add_u32 s4, s32, 0x40000 + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill + ; FLATSCR: s_add_u32 s0, s32, 0x1000 + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill @@ -202,7 +219,7 @@ entry: ret void } -; CHECK-LABEL: test_sgpr_offset_subregs_function +; GCN-LABEL: test_sgpr_offset_subregs_function define void @test_sgpr_offset_subregs_function() { entry: ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a @@ -212,8 +229,10 @@ entry: %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4088 ; 4-byte Folded Spill + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4092 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr @@ -230,7 +249,7 @@ entry: ret void } -; CHECK-LABEL: test_inst_offset_subregs_function +; GCN-LABEL: test_inst_offset_subregs_function define void @test_inst_offset_subregs_function() { entry: ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a @@ -241,9 +260,12 @@ entry: %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) - ; CHECK: s_add_u32 s4, s32, 0x3ff00 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill + ; MUBUF: s_add_u32 s4, s32, 0x3ff00 + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill + ; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0xffc + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index 62b5222f9621..8c4c7069fffa 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ 
b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -60,26 +60,18 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, < ; FLATSCR-NEXT: s_addc_u32 s3, s3, 0 ; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 -; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; FLATSCR-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; FLATSCR-NEXT: s_mov_b32 s38, -1 -; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000 -; FLATSCR-NEXT: s_add_u32 s36, s36, s5 -; FLATSCR-NEXT: s_addc_u32 s37, s37, 0 +; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000 ; FLATSCR-NEXT: ; implicit-def: $vcc_hi -; FLATSCR-NEXT: s_getpc_b64 s[4:5] -; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 -; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37] -; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39] -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR-NEXT: v_mov_b32_e32 v0, s2 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo ; FLATSCR-NEXT: s_cbranch_execz BB0_2