[AMDGPU] Implement flat scratch init for PAL

Extract the scratch offset from the scratch buffer descriptor that is
stored in the global table.

Differential Revision: https://reviews.llvm.org/D91701
This commit is contained in:
Sebastian Neubauer 2020-10-15 13:26:44 +02:00
parent 1b5921f4d8
commit 7a18bdb350
7 changed files with 1249 additions and 28 deletions

View File

@ -296,6 +296,31 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
.addMemOperand(MMO);
}
// Emit, before \p I in \p MBB, the instructions that form the pointer to the
// GIT in the 64-bit register \p TargetReg.
//
// The high dword is the value returned by getGITPtrHigh(), unless that value
// is 0xffffffff, in which case the whole register is first filled from the PC
// with S_GETPC_B64. The low dword is then overwritten with a copy of the GIT
// pointer low register, which is also recorded as a live-in of both the
// function and \p MBB.
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction &Fn = *MBB.getParent();
  const SIMachineFunctionInfo &FuncInfo = *Fn.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo &RegInfo = TII->getRegisterInfo();

  // Step 1: establish the high half of the pointer.
  if (FuncInfo.getGITPtrHigh() == 0xffffffff) {
    // No usable explicit high half: take it from the program counter. This
    // defines all of TargetReg; the low half is replaced in step 2.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETPC_B64), TargetReg);
  } else {
    Register HighHalf = RegInfo.getSubReg(TargetReg, AMDGPU::sub1);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), HighHalf)
        .addImm(FuncInfo.getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  }

  // Step 2: the low half comes from the incoming GIT pointer register, which
  // must be marked live-in before it is read here.
  Register IncomingLo = FuncInfo.getGITPtrLoReg(Fn);
  Fn.getRegInfo().addLiveIn(IncomingLo);
  MBB.addLiveIn(IncomingLo);

  Register LowHalf = RegInfo.getSubReg(TargetReg, AMDGPU::sub0);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), LowHalf)
      .addReg(IncomingLo);
}
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@ -315,16 +340,74 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.
Register FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
assert(FlatScratchInitReg);
Register FlatScrInitLo;
Register FlatScrInitHi;
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);
if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
LivePhysRegs LiveRegs;
LiveRegs.init(*TRI);
LiveRegs.addLiveIns(MBB);
Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
Register FlatScrInit = AMDGPU::NoRegister;
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
AllSGPR64s = AllSGPR64s.slice(
std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
for (MCPhysReg Reg : AllSGPR64s) {
if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
!TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
FlatScrInit = Reg;
break;
}
}
assert(FlatScrInit && "Failed to find free register for scratch init");
FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
buildGitPtr(MBB, I, DL, TII, FlatScrInit);
// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
auto *MMO = MF.getMachineMemOperand(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
8, Align(4));
unsigned Offset =
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
.addReg(FlatScrInit)
.addImm(EncodedOffset) // offset
.addImm(0) // glc
.addImm(0) // dlc
.addMemOperand(MMO);
// Keep only the offset, which occupies bits [47:0] of the descriptor, by
// clearing the upper 16 bits of the high dword
const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
.addReg(FlatScrInitHi)
.addImm(0xffff);
} else {
Register FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
assert(FlatScratchInitReg);
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);
FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
}
// Do a 64-bit pointer add.
if (ST.flatScratchIsPointer()) {
@ -582,26 +665,9 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
if (ST.isAmdPalOS()) {
// The pointer to the GIT is formed from the offset passed in and either
// the amdgpu-git-ptr-high function attribute or the top part of the PC
Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
if (MFI->getGITPtrHigh() != 0xffffffff) {
BuildMI(MBB, I, DL, SMovB32, RsrcHi)
.addImm(MFI->getGITPtrHigh())
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else {
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
BuildMI(MBB, I, DL, GetPC64, Rsrc01);
}
Register GitPtrLo = MFI->getGITPtrLoReg(MF);
MF.getRegInfo().addLiveIn(GitPtrLo);
MBB.addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
.addReg(GitPtrLo)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
buildGitPtr(MBB, I, DL, TII, Rsrc01);
// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).

View File

@ -2030,7 +2030,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchIDReg);
}
if (Info.hasFlatScratchInit()) {
if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);

View File

@ -2151,6 +2151,12 @@ SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
ST.getMaxNumSGPRs(MF) / 4);
}
ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  // The SGPR budget for this function is halved to get the number of SGPR_64
  // entries to expose (presumably because each entry spans two 32-bit SGPRs).
  const unsigned NumSGPR64 = ST.getMaxNumSGPRs(MF) / 2;
  return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(), NumSGPR64);
}
ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));

View File

@ -324,6 +324,10 @@ public:
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
/// Return all SGPR64 which satisfy the waves per execution unit requirement
/// of the subtarget. The result is the leading `getMaxNumSGPRs(MF) / 2`
/// entries of the SGPR_64 register class for \p MF.
ArrayRef<MCPhysReg> getAllSGPR64(const MachineFunction &MF) const;
/// Return all SGPR32 which satisfy the waves per execution unit requirement
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;

View File

@ -23,7 +23,7 @@ machineFunctionInfo:
body: |
; CHECK: $sgpr1 = COPY killed $sgpr5
; CHECK: $sgpr4_sgpr5 = S_GETPC_B64
; CHECK: $sgpr4 = S_MOV_B32 $sgpr8, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
; CHECK: $sgpr4 = S_MOV_B32 $sgpr8
; CHECK: $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM $sgpr4_sgpr5, 0, 0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 :: (dereferenceable invariant load 16, align 4, addrspace 4)
bb.0:
successors: %bb.1, %bb.2

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,8 @@
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR,GFX9_10-FLATSCR %s
; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR-PAL %s
; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR-PAL,GFX9_10-FLATSCR %s
; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 0x0
; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 0x0
@ -25,6 +27,28 @@
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-FLATSCR-PAL-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; MUBUF-DAG: s_mov_b32 s2, -1
@ -44,6 +68,7 @@
; MUBUF-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
; MUBUF-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
; GCN-NOT: s_mov_b32 s0
; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
@ -68,6 +93,27 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s0
@ -98,6 +144,27 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x10
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x10
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
@ -152,6 +219,27 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
@ -184,6 +272,27 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
@ -217,6 +326,27 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; FLATSCR-NOT: SCRATCH_RSRC_DWORD