forked from OSchip/llvm-project
[AMDGPU] Implement flat scratch init for pal
Extract the scratch offset from the scratch buffer descriptor that is stored in the global table. Differential Revision: https://reviews.llvm.org/D91701
This commit is contained in:
parent
1b5921f4d8
commit
7a18bdb350
|
@ -296,6 +296,31 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
|
|||
.addMemOperand(MMO);
|
||||
}
|
||||
|
||||
/// Build the 64-bit GIT pointer in \p TargetReg, inserting the instructions
/// before \p I in \p MBB.
///
/// If getGITPtrHigh() is 0xffffffff, S_GETPC_B64 first writes the whole of
/// \p TargetReg; otherwise only the sub1 half is set to the getGITPtrHigh()
/// immediate. In both cases the sub0 half is then overwritten with a copy of
/// the register returned by getGITPtrLoReg(), which is also added as a
/// live-in of the function and of \p MBB.
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction &Fn = *MBB.getParent();
  const auto *FuncInfo = Fn.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo &RegInfo = TII->getRegisterInfo();
  const MCInstrDesc &MovDesc = TII->get(AMDGPU::S_MOV_B32);

  // 0xffffffff means no explicit high half was supplied: use the PC instead.
  const bool UsePCForHigh = FuncInfo->getGITPtrHigh() == 0xffffffff;
  if (UsePCForHigh) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETPC_B64), TargetReg);
  } else {
    Register HighHalf = RegInfo.getSubReg(TargetReg, AMDGPU::sub1);
    BuildMI(MBB, I, DL, MovDesc, HighHalf)
        .addImm(FuncInfo->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  }

  // The low half always comes from the incoming GIT-pointer-low register.
  Register GitLowSrc = FuncInfo->getGITPtrLoReg(Fn);
  Fn.getRegInfo().addLiveIn(GitLowSrc);
  MBB.addLiveIn(GitLowSrc);

  Register LowHalf = RegInfo.getSubReg(TargetReg, AMDGPU::sub0);
  BuildMI(MBB, I, DL, MovDesc, LowHalf).addReg(GitLowSrc);
}
|
||||
|
||||
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
|
||||
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
|
||||
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
|
||||
|
@ -315,16 +340,74 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
|
|||
// pointer. Because we only detect if flat instructions are used at all,
|
||||
// this will be used more often than necessary on VI.
|
||||
|
||||
Register FlatScratchInitReg =
|
||||
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
|
||||
assert(FlatScratchInitReg);
|
||||
Register FlatScrInitLo;
|
||||
Register FlatScrInitHi;
|
||||
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
MRI.addLiveIn(FlatScratchInitReg);
|
||||
MBB.addLiveIn(FlatScratchInitReg);
|
||||
if (ST.isAmdPalOS()) {
|
||||
// Extract the scratch offset from the descriptor in the GIT
|
||||
LivePhysRegs LiveRegs;
|
||||
LiveRegs.init(*TRI);
|
||||
LiveRegs.addLiveIns(MBB);
|
||||
|
||||
Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
|
||||
Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
|
||||
// Find unused reg to load flat scratch init into
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
Register FlatScrInit = AMDGPU::NoRegister;
|
||||
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
|
||||
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
|
||||
AllSGPR64s = AllSGPR64s.slice(
|
||||
std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
|
||||
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
|
||||
for (MCPhysReg Reg : AllSGPR64s) {
|
||||
if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
|
||||
!TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
|
||||
FlatScrInit = Reg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(FlatScrInit && "Failed to find free register for scratch init");
|
||||
|
||||
FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
|
||||
FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
|
||||
|
||||
buildGitPtr(MBB, I, DL, TII, FlatScrInit);
|
||||
|
||||
// We now have the GIT ptr - now get the scratch descriptor from the entry
|
||||
// at offset 0 (or offset 16 for a compute shader).
|
||||
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
|
||||
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
|
||||
auto *MMO = MF.getMachineMemOperand(
|
||||
PtrInfo,
|
||||
MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
|
||||
MachineMemOperand::MODereferenceable,
|
||||
8, Align(4));
|
||||
unsigned Offset =
|
||||
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
|
||||
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
|
||||
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
|
||||
BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
|
||||
.addReg(FlatScrInit)
|
||||
.addImm(EncodedOffset) // offset
|
||||
.addImm(0) // glc
|
||||
.addImm(0) // dlc
|
||||
.addMemOperand(MMO);
|
||||
|
||||
// Mask the offset in [47:0] of the descriptor
|
||||
const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
|
||||
BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
|
||||
.addReg(FlatScrInitHi)
|
||||
.addImm(0xffff);
|
||||
} else {
|
||||
Register FlatScratchInitReg =
|
||||
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
|
||||
assert(FlatScratchInitReg);
|
||||
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
MRI.addLiveIn(FlatScratchInitReg);
|
||||
MBB.addLiveIn(FlatScratchInitReg);
|
||||
|
||||
FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
|
||||
FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
|
||||
}
|
||||
|
||||
// Do a 64-bit pointer add.
|
||||
if (ST.flatScratchIsPointer()) {
|
||||
|
@ -582,26 +665,9 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
|
|||
if (ST.isAmdPalOS()) {
|
||||
// The pointer to the GIT is formed from the offset passed in and either
|
||||
// the amdgpu-git-ptr-high function attribute or the top part of the PC
|
||||
Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
|
||||
Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
|
||||
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
|
||||
|
||||
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
|
||||
|
||||
if (MFI->getGITPtrHigh() != 0xffffffff) {
|
||||
BuildMI(MBB, I, DL, SMovB32, RsrcHi)
|
||||
.addImm(MFI->getGITPtrHigh())
|
||||
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
||||
} else {
|
||||
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
|
||||
BuildMI(MBB, I, DL, GetPC64, Rsrc01);
|
||||
}
|
||||
Register GitPtrLo = MFI->getGITPtrLoReg(MF);
|
||||
MF.getRegInfo().addLiveIn(GitPtrLo);
|
||||
MBB.addLiveIn(GitPtrLo);
|
||||
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
|
||||
.addReg(GitPtrLo)
|
||||
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
||||
buildGitPtr(MBB, I, DL, TII, Rsrc01);
|
||||
|
||||
// We now have the GIT ptr - now get the scratch descriptor from the entry
|
||||
// at offset 0 (or offset 16 for a compute shader).
|
||||
|
|
|
@ -2030,7 +2030,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
|
|||
CCInfo.AllocateReg(DispatchIDReg);
|
||||
}
|
||||
|
||||
if (Info.hasFlatScratchInit()) {
|
||||
if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
|
||||
Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
|
||||
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
|
||||
CCInfo.AllocateReg(FlatScratchInitReg);
|
||||
|
|
|
@ -2151,6 +2151,12 @@ SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
|
|||
ST.getMaxNumSGPRs(MF) / 4);
|
||||
}
|
||||
|
||||
/// Return a view of the first ST.getMaxNumSGPRs(MF) / 2 registers of the
/// SGPR_64 register class.
ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  // Each 64-bit register spans two 32-bit SGPRs, hence the division by two.
  const unsigned NumPairs = ST.getMaxNumSGPRs(MF) / 2;
  return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(), NumPairs);
}
|
||||
|
||||
ArrayRef<MCPhysReg>
|
||||
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
|
||||
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
|
||||
|
|
|
@ -324,6 +324,10 @@ public:
|
|||
/// of the subtarget.
|
||||
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
|
||||
|
||||
/// Return all SGPR64 which satisfy the waves per execution unit requirement
|
||||
/// of the subtarget.
|
||||
ArrayRef<MCPhysReg> getAllSGPR64(const MachineFunction &MF) const;
|
||||
|
||||
/// Return all SGPR32 which satisfy the waves per execution unit requirement
|
||||
/// of the subtarget.
|
||||
ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
|
||||
|
|
|
@ -23,7 +23,7 @@ machineFunctionInfo:
|
|||
body: |
|
||||
; CHECK: $sgpr1 = COPY killed $sgpr5
|
||||
; CHECK: $sgpr4_sgpr5 = S_GETPC_B64
|
||||
; CHECK: $sgpr4 = S_MOV_B32 $sgpr8, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; CHECK: $sgpr4 = S_MOV_B32 $sgpr8
|
||||
; CHECK: $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM $sgpr4_sgpr5, 0, 0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 :: (dereferenceable invariant load 16, align 4, addrspace 4)
|
||||
bb.0:
|
||||
successors: %bb.1, %bb.2
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -6,6 +6,8 @@
|
|||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR,GFX9_10-FLATSCR %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR-PAL %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR-PAL,GFX9_10-FLATSCR %s
|
||||
|
||||
; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 0x0
|
||||
; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 0x0
|
||||
|
@ -25,6 +27,28 @@
|
|||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
|
||||
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
|
||||
|
||||
; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
|
||||
; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
|
||||
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
|
||||
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
|
||||
; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
|
||||
; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
|
||||
|
||||
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
|
||||
; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
|
||||
; MUBUF-DAG: s_mov_b32 s2, -1
|
||||
|
@ -44,6 +68,7 @@
|
|||
; MUBUF-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
|
||||
; MUBUF-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
|
||||
; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
|
||||
; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
|
||||
; GCN-NOT: s_mov_b32 s0
|
||||
|
||||
; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
|
||||
|
@ -68,6 +93,27 @@ define amdgpu_ps float @ps_main(i32 %idx) {
|
|||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
|
||||
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
|
||||
|
||||
; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
|
||||
; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
|
||||
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
|
||||
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
|
||||
; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
|
||||
; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
|
||||
|
||||
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
|
||||
; GCN-NOT: s_mov_b32 s0
|
||||
|
||||
|
@ -98,6 +144,27 @@ define amdgpu_vs float @vs_main(i32 %idx) {
|
|||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x10
|
||||
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
|
||||
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
|
||||
|
||||
; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
|
||||
; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
|
||||
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x10
|
||||
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
|
||||
; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
|
||||
; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
|
||||
|
||||
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
|
||||
|
||||
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
|
||||
|
@ -152,6 +219,27 @@ define amdgpu_hs float @hs_main(i32 %idx) {
|
|||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
|
||||
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
|
||||
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
|
||||
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
|
||||
|
||||
; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
|
||||
; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
|
||||
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
|
||||
; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
|
||||
; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
|
||||
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
|
||||
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
|
||||
|
@ -184,6 +272,27 @@ define amdgpu_gs float @gs_main(i32 %idx) {
|
|||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
|
||||
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
|
||||
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
|
||||
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
|
||||
|
||||
; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
|
||||
; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
|
||||
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
|
||||
; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
|
||||
; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
|
||||
|
||||
|
@ -217,6 +326,27 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
|
|||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
|
||||
; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
|
||||
; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
|
||||
; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
|
||||
; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
|
||||
; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
|
||||
|
||||
; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
|
||||
; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
|
||||
; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
|
||||
; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
|
||||
; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
|
||||
; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
|
||||
|
||||
; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
|
||||
|
||||
|
|
Loading…
Reference in New Issue