[AMDGPU] Omit buffer resource with flat scratch.

Differential Revision: https://reviews.llvm.org/D90979
Author: Stanislav Mekhanoshin
Date:   2020-11-06 13:00:10 -08:00
parent 91d2e5c81a
commit d5a465866e
15 changed files with 350 additions and 283 deletions
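The change applies one pattern throughout call lowering, frame lowering, call selection, and spill code: every place that previously reserved, initialized, or passed the scratch buffer resource descriptor (the SRSRC quadruple, s[0:3] for non-entry functions) is now skipped when the subtarget uses flat scratch instructions. Below is a minimal standalone sketch of that guard; SubtargetStub, reserveScratchRsrc, and the register id are illustrative stand-ins, not the real LLVM classes.

// Minimal sketch of the guard the patch adds around SRSRC handling.
// SubtargetStub and the register id are stand-ins, not LLVM's classes.
#include <iostream>
#include <optional>

struct SubtargetStub {                        // stand-in for GCNSubtarget
  bool UseFlatScratch = false;
  bool enableFlatScratch() const { return UseFlatScratch; }
};

// With MUBUF scratch, a four-SGPR buffer descriptor has to be reserved and
// handed to callees; with flat scratch instructions it is simply omitted.
std::optional<unsigned> reserveScratchRsrc(const SubtargetStub &ST) {
  constexpr unsigned SGPR0_SGPR1_SGPR2_SGPR3 = 0;  // hypothetical register id
  if (ST.enableFlatScratch())
    return std::nullopt;                      // no buffer resource needed
  return SGPR0_SGPR1_SGPR2_SGPR3;
}

int main() {
  for (bool FlatScratch : {false, true}) {
    SubtargetStub ST{FlatScratch};
    std::cout << (FlatScratch ? "flatscr: " : "mubuf:   ")
              << (reserveScratchRsrc(ST) ? "SRSRC reserved" : "SRSRC omitted")
              << '\n';
  }
  return 0;
}

The hunks below all instantiate this same check (ST.enableFlatScratch(), Subtarget.enableFlatScratch(), or Subtarget->enableFlatScratch()) around the existing SRSRC code.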


@@ -939,7 +939,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (IsEntryFunc) {
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
if (!Subtarget.enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -1227,12 +1228,14 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
MFI->getScratchRSrcReg());
MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
if (!ST.enableFlatScratch()) {
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
MFI->getScratchRSrcReg());
MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
}
for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);


@@ -467,7 +467,9 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
//
// This will return `Register()` in cases where there are no actual
// uses of the SRSRC.
Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
Register ScratchRsrcReg;
if (!ST.enableFlatScratch())
ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
// Make the selected register live throughout the function.
if (ScratchRsrcReg) {
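In the entry-function prologue the query for a reserved SRSRC register is skipped entirely, so ScratchRsrcReg keeps its default "no register" value and the existing if (ScratchRsrcReg) guard that follows suppresses all of the descriptor setup. A small self-contained sketch of that sentinel behaviour, with a simplified stand-in for llvm::Register and an invented register number:

#include <iostream>

// Simplified stand-in for llvm::Register: zero means "no register" and the
// object converts to false, which is what the prologue code relies on here.
class Register {
  unsigned Reg = 0;
public:
  Register() = default;
  explicit Register(unsigned R) : Reg(R) {}
  explicit operator bool() const { return Reg != 0; }
};

int main() {
  const bool EnableFlatScratch = true;  // assumed setting for this example
  Register ScratchRsrcReg;              // stays unset under flat scratch
  if (!EnableFlatScratch)
    ScratchRsrcReg = Register(96);      // invented SGPR-tuple encoding
  if (ScratchRsrcReg)
    std::cout << "emit SRSRC setup in the entry prologue\n";
  else
    std::cout << "no SRSRC setup emitted\n";
  return 0;
}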


@@ -2117,26 +2117,28 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
if (!ST.enableFlatScratch()) {
if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
Register PrivateSegmentBufferReg =
Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
// We tentatively reserve the last registers (skipping the last registers
// which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
// we'll replace these with the ones immediately after those which were
// really allocated. In the prologue copies will be inserted from the
// argument to these reserved registers.
Register PrivateSegmentBufferReg =
Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
// We tentatively reserve the last registers (skipping the last registers
// which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
// we'll replace these with the ones immediately after those which were
// really allocated. In the prologue copies will be inserted from the
// argument to these reserved registers.
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
}
}
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3012,14 +3014,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SmallVector<SDValue, 4> CopyFromChains;
if (!Subtarget->enableFlatScratch()) {
SmallVector<SDValue, 4> CopyFromChains;
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
Chain = DAG.getTokenFactor(DL, CopyFromChains);
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
}
MVT PtrVT = MVT::i32;


@@ -1379,11 +1379,14 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
Register ScratchRSrc =
ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
: MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(ScratchRSrc, RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
// Add the scratch resource registers as implicit uses because we may end up
// needing them, and need to ensure that the reserved registers are
@@ -1397,10 +1400,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
: getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
Register ScratchRSrc =
ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
: MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(ScratchRSrc) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
@@ -1513,21 +1519,27 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
Register ScratchRSrc =
ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
: MFI->getScratchRSrcReg();
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(ScratchRSrc, RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
return;
}
unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
: getVGPRSpillRestoreOpcode(SpillSize);
Register ScratchRSrc =
ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
: MFI->getScratchRSrcReg();
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(ScratchRSrc) // scratch_rsrc
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
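The spill and reload pseudo-instructions keep their scratch_rsrc operand slot even when no buffer descriptor exists, so the flat-scratch path plugs in TTMP0_TTMP1_TTMP2_TTMP3 as a dummy that is never read; keeping the operand shape unchanged lets the BuildMI sites above stay structurally identical for both modes. A compile-only sketch of that selection, with Register and the register constants reduced to plain integers for illustration:

#include <cassert>
#include <cstdint>

using Register = uint32_t;                     // stand-in for llvm::Register
constexpr Register NoRegister = 0;
constexpr Register TTMP0_TTMP1_TTMP2_TTMP3 = 0x70;  // made-up encoding

// The operand slot must always be filled; only the MUBUF path needs the real
// buffer descriptor.
Register pickSpillScratchRsrc(bool EnableFlatScratch, Register ScratchRSrcReg) {
  if (EnableFlatScratch)
    return TTMP0_TTMP1_TTMP2_TTMP3;            // dummy, never dereferenced
  assert(ScratchRSrcReg != NoRegister && "MUBUF spills need a real SRSRC");
  return ScratchRSrcReg;
}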


@@ -75,16 +75,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
if (!isEntryFunction()) {
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
// TODO: Pick a high register, and shift down, similar to a kernel.
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(ScratchRSrcReg);
if (!ST.enableFlatScratch()) {
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(ScratchRSrcReg);
}
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
@@ -142,7 +144,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
if (isAmdHsaOrMesa) {
PrivateSegmentBuffer = true;
if (!ST.enableFlatScratch())
PrivateSegmentBuffer = true;
if (UseFixedABI) {
DispatchPtr = true;


@@ -35,8 +35,10 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v40, 2
; GCN-DAG: v_readlane_b32 s5, v40, 3
; MUBUF-DAG: v_readlane_b32 s4, v40, 2
; MUBUF-DAG: v_readlane_b32 s5, v40, 3
; FLATSCR-DAG: v_readlane_b32 s0, v40, 2
; FLATSCR-DAG: v_readlane_b32 s1, v40, 3
; GCN: v_readlane_b32 s35, v40, 1
; GCN: v_readlane_b32 s34, v40, 0
@@ -134,14 +136,18 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(
; FIXME: What is the expected behavior for reserved registers here?
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN: #ASMSTART
; GCN-NEXT: ; def s33
; GCN-NEXT: #ASMEND
; GCN: s_swappc_b64 s[30:31], s[4:5]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN: ;;#ASMSTART
; GCN-NEXT: ; use s33
; GCN-NEXT: ;;#ASMEND
@@ -157,9 +163,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
; GCN-NOT: s34
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: s34
@@ -168,7 +177,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
; GCN-NEXT: ;;#ASMEND
; GCN-NOT: s34
; GCN: s_swappc_b64 s[30:31], s[4:5]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: s34
@@ -186,9 +196,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
; GCN-NOT: v32
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: v40
@@ -196,7 +209,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
; GCN-NEXT: ; def v40
; GCN-NEXT: ;;#ASMEND
; GCN: s_swappc_b64 s[30:31], s[4:5]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: v40
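The register renumbering these checks encode (s[4:5] under MUBUF versus s[0:1] under FLATSCR) falls straight out of the change: non-entry functions no longer pin the buffer descriptor in s[0:3], so scalar allocation can begin at s0. A rough model of where the first allocatable SGPR lands; the function name is invented and the only real input is the four-register size of the descriptor:

#include <cstdio>

// Rough model: MUBUF reserves s[0:3] for the buffer descriptor in non-entry
// functions, so temporaries start at s4; flat scratch frees those registers.
unsigned firstAllocatableSGPR(bool EnableFlatScratch) {
  const unsigned NumDescriptorSGPRs = 4;  // s0..s3 hold the SRSRC
  return EnableFlatScratch ? 0 : NumDescriptorSGPRs;
}

int main() {
  std::printf("mubuf  : first free SGPR = s%u\n", firstAllocatableSGPR(false));
  std::printf("flatscr: first free SGPR = s%u\n", firstAllocatableSGPR(true));
  return 0;
}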


@@ -13,9 +13,10 @@ define void @callee_no_stack() #0 {
; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_no_fp_elim_all() #1 {
ret void
@@ -48,7 +49,8 @@ define void @callee_with_stack() #0 {
; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_add_u32 s32, s32, 8
@@ -57,7 +59,7 @@ define void @callee_with_stack() #0 {
; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}}
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_all() #1 {
@@ -100,8 +102,10 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
; MUBUF-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
; MUBUF-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
; FLATSCR-DAG: v_readlane_b32 s0, [[CSR_VGPR]]
; FLATSCR-DAG: v_readlane_b32 s1, [[CSR_VGPR]]
; MUBUF: s_sub_u32 s32, s32, 0x400{{$}}
; FLATSCR: s_sub_u32 s32, s32, 16{{$}}
@@ -140,8 +144,10 @@ define void @callee_with_stack_and_call() #0 {
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v40, 0
; GCN-DAG: v_readlane_b32 s5, v40, 1
; MUBUF-DAG: v_readlane_b32 s4, v40, 0
; MUBUF-DAG: v_readlane_b32 s5, v40, 1
; FLATSCR-DAG: v_readlane_b32 s0, v40, 0
; FLATSCR-DAG: v_readlane_b32 s1, v40, 1
; MUBUF: s_sub_u32 s32, s32, 0x400
; FLATSCR: s_sub_u32 s32, s32, 16
@@ -238,9 +244,10 @@ define void @spill_only_csr_sgpr() {
; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
; MUBUF: s_add_u32 s32, s32, 0x300
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
; MUBUF-NEXT: s_mov_b32 s33, s4
; FLATSCR: s_add_u32 s32, s32, 12
; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
; GCN-NEXT: s_mov_b32 s33, s4
; FLATSCR-NEXT: s_mov_b32 s33, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
@@ -330,7 +337,8 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; GCN: s_waitcnt
; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0
; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff
; GCN-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000
; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000
; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000
@@ -340,7 +348,7 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @realign_stack_no_fp_elim() #1 {
@@ -359,15 +367,18 @@ define void @realign_stack_no_fp_elim() #1 {
; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s4, v1, 0
; MUBUF: v_readlane_b32 s4, v1, 0
; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
; MUBUF-NEXT: v_readlane_b32 s5, v1, 1
; FLATSCR: v_readlane_b32 s0, v1, 0
; FLATSCR-NEXT: s_add_u32 s32, s32, 8
; GCN-NEXT: v_readlane_b32 s5, v1, 1
; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
; GCN-NEXT: v_readlane_b32 s33, v1, 2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN-NEXT: v_readlane_b32 s33, v1, 2
; GCN-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[4:5]
; FLATSCR-NEXT: s_setpc_b64 s[0:1]
define void @no_unused_non_csr_sgpr_for_fp() #1 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
@@ -399,9 +410,11 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; MUBUF: s_add_u32 s32, s32, 0x300{{$}}
; FLATSCR: s_add_u32 s32, s32, 12{{$}}
; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0
; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1
; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1
; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300{{$}}
; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
@@ -450,9 +463,11 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; MUBUF-DAG: buffer_store_dword
; FLATSCR-DAG: scratch_store_dword
; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
; MUBUF: v_readlane_b32 s4, [[CSR_VGPR]], 0
; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s5, [[CSR_VGPR]], 1
; MUBUF: v_readlane_b32 s5, [[CSR_VGPR]], 1
; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x40300{{$}}
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
@@ -514,20 +529,21 @@ define void @ipra_call_with_stack() #0 {
; With no free registers, we must spill the FP to memory.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:4
; GCN: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN: s_mov_b32 s33, s32
; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
; FLATSCR: s_mov_b32 s0, s33
; GCN: s_mov_b32 s33, s32
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:4
; GCN: s_waitcnt vmcnt(0)
; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; GCN: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
; GCN: ScratchSize: 8
; FLATSCR: s_mov_b32 s33, s0
; MUBUF: s_waitcnt vmcnt(0)
; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
; MUBUF: ScratchSize: 8
; FLATSCR: ScratchSize: 0
define void @callee_need_to_spill_fp_to_memory() #3 {
call void asm sideeffect "; clobber nonpreserved SGPRs",
"~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -547,20 +563,19 @@ define void @callee_need_to_spill_fp_to_memory() #3 {
; need to spill the FP to memory if there are no free lanes in the reserved
; VGPR.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:[[OFF:[0-9]+]]
; GCN: s_mov_b64 exec, [[COPY_EXEC1]]
; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NOT: v_writelane_b32 v40, s33
; GCN: s_mov_b32 s33, s32
; MUBUF: s_mov_b32 s33, s32
; FLATSCR: s_mov_b32 s33, s0
; GCN-NOT: v_readlane_b32 s33, v40
; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:[[OFF]]
; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; GCN: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]]
; GCN: s_setpc_b64
define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
"~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
@@ -585,14 +600,14 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
; If the size of the offset exceeds the MUBUF offset field we need another
; scratch VGPR to hold the offset.
; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
; GCN: s_or_saveexec_b64 s[4:5], -1
; MUBUF: s_or_saveexec_b64 s[4:5], -1
; MUBUF: v_mov_b32_e32 v0, s33
; GCN-NOT: v_mov_b32_e32 v0, 0x1008
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill
; FLATSCR-NEXT: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1008
; FLATSCR-NEXT: v_mov_b32_e32 v0, s33
; FLATSCR-NEXT: scratch_store_dword off, v0, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s33, 0x1004
; FLATSCR: v_mov_b32_e32 v0, 0
; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #3 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca


@@ -15,11 +15,11 @@ define <2 x half> @chain_hi_to_lo_private() {
; FLATSCR-LABEL: chain_hi_to_lo_private:
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s4, 2
; FLATSCR-NEXT: scratch_load_ushort v0, off, s4
; FLATSCR-NEXT: s_mov_b32 s4, 0
; FLATSCR-NEXT: s_mov_b32 s0, 2
; FLATSCR-NEXT: scratch_load_ushort v0, off, s0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s4
; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -256,13 +256,13 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
;
; FLATSCR-LABEL: vload2_private:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4
@@ -272,8 +272,8 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6
; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, s6
; FLATSCR-NEXT: v_mov_b32_e32 v1, s7
; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
; FLATSCR-NEXT: v_mov_b32_e32 v1, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0


@@ -150,10 +150,10 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff
; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 1
@@ -251,10 +251,10 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-FLASTSCR-NEXT: s_mov_b32 s4, 0xffff
; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 4


@@ -1109,13 +1109,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x3000
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: scratch_store_dword off, v0, s32
; GFX9-NEXT: s_add_u32 s4, s32, s4
; GFX9-NEXT: s_add_u32 s0, s32, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s4 offset:3712
; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1125,12 +1125,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s4, 0x3800
; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_add_u32 s4, s32, s4
; GFX10-NEXT: s_add_u32 s0, s32, s0
; GFX10-NEXT: scratch_store_dword off, v0, s32
; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664
; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]


@@ -1364,8 +1364,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1413,8 +1413,8 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1463,8 +1463,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1614,8 +1614,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1664,8 +1664,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -1716,8 +1716,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)


@@ -56,37 +56,37 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
;
; FLATSCR-LABEL: local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000
; FLATSCR-NEXT: s_mov_b32 s6, 0
; FLATSCR-NEXT: s_mov_b32 s2, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi
; FLATSCR-NEXT: BB0_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_u32 s7, 0x3000, s6
; FLATSCR-NEXT: s_add_i32 s6, s6, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s6, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s7
; FLATSCR-NEXT: s_add_u32 s3, 0x3000, s2
; FLATSCR-NEXT: s_add_i32 s2, s2, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s3
; FLATSCR-NEXT: s_cbranch_scc1 BB0_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s6, 0x20d0
; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6
; FLATSCR-NEXT: scratch_load_dword v1, off, s6 offset:4
; FLATSCR-NEXT: s_movk_i32 s6, 0x2000
; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6
; FLATSCR-NEXT: scratch_load_dword v0, off, s6 offset:208
; FLATSCR-NEXT: s_movk_i32 s6, 0x3000
; FLATSCR-NEXT: scratch_load_dword v2, off, s6 offset:68
; FLATSCR-NEXT: s_movk_i32 s6, 0x3000
; FLATSCR-NEXT: scratch_load_dword v3, off, s6 offset:64
; FLATSCR-NEXT: s_movk_i32 s2, 0x20d0
; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
; FLATSCR-NEXT: scratch_load_dword v1, off, s2 offset:4
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
; FLATSCR-NEXT: scratch_load_dword v0, off, s2 offset:208
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
; FLATSCR-NEXT: scratch_load_dword v2, off, s2 offset:68
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
; FLATSCR-NEXT: scratch_load_dword v3, off, s2 offset:64
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v2, s4
; FLATSCR-NEXT: v_mov_b32_e32 v3, s5
; FLATSCR-NEXT: v_mov_b32_e32 v3, s1
; FLATSCR-NEXT: v_mov_b32_e32 v2, s0
; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; FLATSCR-NEXT: s_endpgm
entry:
@@ -146,36 +146,36 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_add_u32 s4, s32, 0x1fff
; FLATSCR-NEXT: s_mov_b32 s6, s33
; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffe000
; FLATSCR-NEXT: s_add_u32 s0, s32, 0x1fff
; FLATSCR-NEXT: s_mov_b32 s2, s33
; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffe000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: s_mov_b32 s4, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000
; FLATSCR-NEXT: scratch_store_dword off, v2, s33
; FLATSCR-NEXT: BB1_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s5, vcc_hi, s4
; FLATSCR-NEXT: s_add_i32 s4, s4, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s4, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v2, s5
; FLATSCR-NEXT: s_add_u32 s1, vcc_hi, s0
; FLATSCR-NEXT: s_add_i32 s0, s0, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v2, s1
; FLATSCR-NEXT: s_cbranch_scc1 BB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s4, 0x20d0
; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s4, s5, s4
; FLATSCR-NEXT: scratch_load_dword v3, off, s4 offset:4
; FLATSCR-NEXT: s_movk_i32 s4, 0x2000
; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s4, s5, s4
; FLATSCR-NEXT: scratch_load_dword v2, off, s4 offset:208
; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000
; FLATSCR-NEXT: scratch_load_dword v4, off, s4 offset:68
; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000
; FLATSCR-NEXT: scratch_load_dword v5, off, s4 offset:64
; FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s0, s1, s0
; FLATSCR-NEXT: scratch_load_dword v3, off, s0 offset:4
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
; FLATSCR-NEXT: s_add_u32 s0, s1, s0
; FLATSCR-NEXT: scratch_load_dword v2, off, s0 offset:208
; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
; FLATSCR-NEXT: scratch_load_dword v4, off, s0 offset:68
; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
; FLATSCR-NEXT: scratch_load_dword v5, off, s0 offset:64
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000
; FLATSCR-NEXT: s_mov_b32 s33, s6
; FLATSCR-NEXT: s_mov_b32 s33, s2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc


@@ -54,36 +54,36 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_mov_b32 s32, 16
; FLATSCR-NEXT: s_mov_b32 s33, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: s_cmp_lg_u32 s8, 0
; FLATSCR-NEXT: s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB0_3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_cmp_lg_u32 s9, 0
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB0_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
; FLATSCR-NEXT: s_mov_b32 s6, s32
; FLATSCR-NEXT: s_movk_i32 s7, 0x1000
; FLATSCR-NEXT: s_add_i32 s8, s6, s7
; FLATSCR-NEXT: s_add_u32 s6, s6, s7
; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
; FLATSCR-NEXT: s_add_i32 s4, s2, s3
; FLATSCR-NEXT: s_add_u32 s2, s2, s3
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: scratch_store_dword off, v1, s6
; FLATSCR-NEXT: scratch_store_dword off, v1, s2
; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
; FLATSCR-NEXT: s_lshl_b32 s6, s10, 2
; FLATSCR-NEXT: s_mov_b32 s32, s8
; FLATSCR-NEXT: scratch_store_dword off, v1, s8 offset:4
; FLATSCR-NEXT: s_add_i32 s8, s8, s6
; FLATSCR-NEXT: scratch_load_dword v1, off, s8
; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2
; FLATSCR-NEXT: s_mov_b32 s32, s4
; FLATSCR-NEXT: scratch_store_dword off, v1, s4 offset:4
; FLATSCR-NEXT: s_add_i32 s4, s4, s2
; FLATSCR-NEXT: scratch_load_dword v1, off, s4
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB0_3: ; %bb.2
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -162,31 +162,31 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; FLATSCR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; FLATSCR-NEXT: s_mov_b32 s32, 64
; FLATSCR-NEXT: s_mov_b32 s33, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: s_cmp_lg_u32 s6, 0
; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0
; FLATSCR-NEXT: s_cbranch_scc1 BB1_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: scratch_store_dword off, v1, s6
; FLATSCR-NEXT: scratch_store_dword off, v1, s2
; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
; FLATSCR-NEXT: s_lshl_b32 s7, s7, 2
; FLATSCR-NEXT: s_mov_b32 s32, s6
; FLATSCR-NEXT: scratch_store_dword off, v1, s6 offset:4
; FLATSCR-NEXT: s_add_i32 s6, s6, s7
; FLATSCR-NEXT: scratch_load_dword v1, off, s6
; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2
; FLATSCR-NEXT: s_mov_b32 s32, s2
; FLATSCR-NEXT: scratch_store_dword off, v1, s2 offset:4
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
; FLATSCR-NEXT: scratch_load_dword v1, off, s2
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s4
; FLATSCR-NEXT: v_mov_b32_e32 v1, s5
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB1_2: ; %bb.1
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -261,38 +261,38 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s9, s33
; FLATSCR-NEXT: s_mov_b32 s5, s33
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT: s_mov_b32 s33, s32
; FLATSCR-NEXT: s_add_u32 s32, s32, 16
; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz BB2_3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
; FLATSCR-NEXT: s_cbranch_execz BB2_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
; FLATSCR-NEXT: s_mov_b32 s6, s32
; FLATSCR-NEXT: s_movk_i32 s7, 0x1000
; FLATSCR-NEXT: s_add_i32 s8, s6, s7
; FLATSCR-NEXT: s_add_u32 s6, s6, s7
; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
; FLATSCR-NEXT: s_add_i32 s4, s2, s3
; FLATSCR-NEXT: s_add_u32 s2, s2, s3
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: scratch_store_dword off, v2, s6
; FLATSCR-NEXT: scratch_store_dword off, v2, s2
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: scratch_store_dword off, v2, s8 offset:4
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s8
; FLATSCR-NEXT: scratch_store_dword off, v2, s4 offset:4
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5
; FLATSCR-NEXT: s_mov_b32 s32, s8
; FLATSCR-NEXT: s_mov_b32 s32, s4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB2_3: ; %bb.2
; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5]
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_sub_u32 s32, s32, 16
; FLATSCR-NEXT: s_mov_b32 s33, s9
; FLATSCR-NEXT: s_mov_b32 s33, s5
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -361,33 +361,33 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_add_u32 s4, s32, 63
; FLATSCR-NEXT: s_mov_b32 s7, s33
; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffffc0
; FLATSCR-NEXT: s_add_u32 s0, s32, 63
; FLATSCR-NEXT: s_mov_b32 s3, s33
; FLATSCR-NEXT: s_and_b32 s33, s0, 0xffffffc0
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT: s_add_u32 s32, s32, 0x80
; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz BB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: scratch_store_dword off, v2, s6
; FLATSCR-NEXT: scratch_store_dword off, v2, s2
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: scratch_store_dword off, v2, s6 offset:4
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; FLATSCR-NEXT: scratch_store_dword off, v2, s2 offset:4
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4
; FLATSCR-NEXT: s_mov_b32 s32, s6
; FLATSCR-NEXT: s_mov_b32 s32, s2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: BB3_2: ; %bb.1
; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5]
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80
; FLATSCR-NEXT: s_mov_b32 s33, s7
; FLATSCR-NEXT: s_mov_b32 s33, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:


@@ -1,10 +1,11 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
; CHECK-LABEL: test_inst_offset_kernel
; GCN-LABEL: test_inst_offset_kernel
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
@@ -13,7 +14,8 @@ entry:
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill.
@@ -25,7 +27,7 @@ entry:
ret void
}
; CHECK-LABEL: test_sgpr_offset_kernel
; GCN-LABEL: test_sgpr_offset_kernel
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -35,8 +37,10 @@ entry:
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
; CHECK: s_mov_b32 s6, 0x40000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; MUBUF: s_mov_b32 s6, 0x40000
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 s2, 0x1000
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill
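The differing immediates in these checks come from how the two forms scale an SGPR offset: a MUBUF soffset counts bytes for the whole wave, while a flat-scratch SGPR offset is per lane, so the same 4096 bytes of per-lane scratch shows up as 0x40000 under MUBUF (wave64) and as 0x1000 under flat scratch. A short worked version of that arithmetic:

#include <cstdio>

int main() {
  const unsigned WaveSize = 64;        // wave64, as the comments above note
  const unsigned PerLaneBytes = 4096;  // scratch already occupied per lane
  std::printf("MUBUF soffset:        0x%x\n", PerLaneBytes * WaveSize);  // 0x40000
  std::printf("flat-scratch soffset: 0x%x\n", PerLaneBytes);             // 0x1000
  return 0;
}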
@@ -51,7 +55,7 @@ entry:
; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
; pointer to temporarily update, so we just crash.
; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
; GCN-LABEL: test_sgpr_offset_function_scavenge_fail
define void @test_sgpr_offset_function_scavenge_fail() #2 {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -74,9 +78,11 @@ entry:
; 0x40000 / 64 = 4096 (for wave64)
%a = load volatile i32, i32 addrspace(5)* %aptr
; CHECK: s_add_u32 s32, s32, 0x40000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
; CHECK: s_sub_u32 s32, s32, 0x40000
; MUBUF: s_add_u32 s32, s32, 0x40000
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
; MUBUF: s_sub_u32 s32, s32, 0x40000
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1000
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
%asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
@@ -91,16 +97,18 @@ entry:
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
; CHECK: s_add_u32 s32, s32, 0x40000
; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
; CHECK: s_sub_u32 s32, s32, 0x40000
; MUBUF: s_add_u32 s32, s32, 0x40000
; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
; MUBUF: s_sub_u32 s32, s32, 0x40000
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1000
; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload
; Force %a to spill with no free SGPRs
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
ret void
}
; CHECK-LABEL: test_sgpr_offset_subregs_kernel
; GCN-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
@@ -110,8 +118,11 @@ entry:
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -128,7 +139,7 @@ entry:
ret void
}
; CHECK-LABEL: test_inst_offset_subregs_kernel
; GCN-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
@@ -139,9 +150,12 @@ entry:
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; 0x3ff00 / 64 = 4092 (for wave64)
; CHECK: s_mov_b32 s6, 0x3ff00
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
; MUBUF: s_mov_b32 s6, 0x3ff00
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -158,7 +172,7 @@ entry:
ret void
}
; CHECK-LABEL: test_inst_offset_function
; GCN-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
@@ -167,7 +181,8 @@ entry:
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill.
@@ -179,7 +194,7 @@ entry:
ret void
}
; CHECK-LABEL: test_sgpr_offset_function
; GCN-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -189,8 +204,10 @@ entry:
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
; CHECK: s_add_u32 s4, s32, 0x40000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; MUBUF: s_add_u32 s4, s32, 0x40000
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; FLATSCR: s_add_u32 s0, s32, 0x1000
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill
@@ -202,7 +219,7 @@ entry:
ret void
}
; CHECK-LABEL: test_sgpr_offset_subregs_function
; GCN-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
@@ -212,8 +229,10 @@ entry:
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4088 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s32 offset:4092 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
@@ -230,7 +249,7 @@ entry:
ret void
}
; CHECK-LABEL: test_inst_offset_subregs_function
; GCN-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
@@ -241,9 +260,12 @@ entry:
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; 0x3ff00 / 64 = 4092 (for wave64)
; CHECK: s_add_u32 s4, s32, 0x3ff00
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
; MUBUF: s_add_u32 s4, s32, 0x3ff00
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
; FLATSCR: s_add_u32 [[SOFF:s[0-9]+]], s32, 0xffc
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] offset:4 ; 4-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr


@@ -60,26 +60,18 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
; FLATSCR-NEXT: s_addc_u32 s3, s3, 0
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24
; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; FLATSCR-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; FLATSCR-NEXT: s_mov_b32 s38, -1
; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000
; FLATSCR-NEXT: s_add_u32 s36, s36, s5
; FLATSCR-NEXT: s_addc_u32 s37, s37, 0
; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000
; FLATSCR-NEXT: v_mov_b32_e32 v3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000
; FLATSCR-NEXT: ; implicit-def: $vcc_hi
; FLATSCR-NEXT: s_getpc_b64 s[4:5]
; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37]
; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39]
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo
; FLATSCR-NEXT: s_cbranch_execz BB0_2