[AMDGPU] Make SGPR spills exec mask agnostic
Explicitly set the exec mask for SGPR spills and reloads. This fixes a bug
where SGPR spills to memory could be incorrect if the exec mask was 0 (or
differed between spill and reload).

Additionally, pack scalar subregisters into VGPR lanes (up to 16 per VGPR on
Wave32, 32 on Wave64), so that the majority of scalar types can be spilled or
reloaded with a single memory access. This should amortize some of the
additional overhead of manipulating the exec mask.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D80282
This commit is contained in:
parent a09bb6d77b
commit da33c96d47
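For reference, the lane-packing arithmetic this patch introduces (the three
expressions are taken verbatim from the spillSGPR/restoreSGPR changes below;
presenting them standalone like this is only a sketch):

    // Lanes available per VGPR: 16 on Wave32, 32 on Wave64.
    unsigned PerVGPR = isWave32 ? 16 : 32;
    // VGPRs needed to hold NumSubRegs scalar subregisters, rounded up.
    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
    // Exec mask enabling exactly the lanes one packed VGPR uses,
    // e.g. 3 for an S64 spill or 15 for S128.
    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;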
@@ -863,6 +863,145 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
  }
}

// Generate a VMEM access which loads or stores the VGPR containing an SGPR
// spill such that all the lanes set in VGPRLanes are loaded or stored.
// This generates exec mask manipulation and will use SGPRs available in MI
// or VGPR lanes in the VGPR to save and restore the exec mask.
void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
                                             int Index, int Offset,
                                             unsigned EltSize, Register VGPR,
                                             int64_t VGPRLanes,
                                             RegScavenger *RS,
                                             bool IsLoad) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  Register SuperReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
  unsigned FirstPart = isWave32 ? Offset * 16 : Offset * 32;

  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  const bool SuperRegIsExec =
      SuperReg == AMDGPU::EXEC || SuperReg == AMDGPU::EXEC_LO;

  // If exec mask is stored in the VGPR, make sure it is stored after
  // any lanes used by the spill (16 lanes on Wave32, 32 lanes on Wave64).
  const unsigned ExecLoLane = SuperRegIsExec ? 0 : (isWave32 ? 16 : 32);
  const unsigned ExecHiLane = SuperRegIsExec ? 1 : (isWave32 ? 17 : 33);

  // Try to use the src/dst SGPRs to hold a copy of the exec mask.
  // Use VGPR lanes when this is not possible, i.e. the src value
  // must be valid after the spill or src is smaller than exec mask.
  bool StoreExecInVGPR = !IsLoad && (SuperRegIsExec || !IsKill);

  // On Wave32 only handle EXEC_LO.
  // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
  bool OnlyExecLo = isWave32 || NumSubRegs == 1;

  unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  Register SavedExecReg;

  // Backup EXEC
  if (SuperRegIsExec) {
    // Do nothing; exec is already stored in VGPR or will be overwritten
  } else if (StoreExecInVGPR) {
    BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            VGPR)
        .addReg(AMDGPU::EXEC_LO)
        .addImm(ExecLoLane)
        .addReg(VGPR, getUndefRegState(IsLoad));

    if (!isWave32) {
      BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              VGPR)
          .addReg(AMDGPU::EXEC_HI)
          .addImm(ExecHiLane)
          .addReg(VGPR);
    }
  } else {
    if (OnlyExecLo) {
      SavedExecReg = NumSubRegs == 1
                         ? SuperReg
                         : getSubReg(SuperReg, SplitParts[FirstPart]);
    } else {
      SavedExecReg =
          getMatchingSuperReg(getSubReg(SuperReg, SplitParts[FirstPart]),
                              AMDGPU::sub0, &AMDGPU::SGPR_64RegClass);
      // If src/dst is an odd size it is possible subreg0 is not aligned.
      if (!SavedExecReg && NumSubRegs > 2)
        SavedExecReg =
            getMatchingSuperReg(getSubReg(SuperReg, SplitParts[FirstPart + 1]),
                                AMDGPU::sub0, &AMDGPU::SGPR_64RegClass);
    }

    assert(SavedExecReg);
    BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
  }

  // Setup EXEC
  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);

  // Load/store VGPR
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);

  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
                          ? getBaseRegister()
                          : getFrameRegister(*MF);

  Align Alignment = FrameInfo.getObjectAlign(Index);
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(*MF, Index);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
      EltSize, Alignment);

  if (IsLoad) {
    buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
                        Index,
                        VGPR, false,
                        MFI->getScratchRSrcReg(), FrameReg,
                        Offset * EltSize, MMO,
                        RS);
  } else {
    buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
                        Index,
                        VGPR, !StoreExecInVGPR,
                        MFI->getScratchRSrcReg(), FrameReg,
                        Offset * EltSize, MMO,
                        RS);
    // This only ever adds one VGPR spill
    MFI->addToSpilledVGPRs(1);
  }

  // Restore EXEC
  if (SuperRegIsExec && IsLoad) {
    // Do nothing; exec will be overwritten
  } else if (StoreExecInVGPR) {
    BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
            AMDGPU::EXEC_LO)
        .addReg(VGPR, getKillRegState(!IsLoad && isWave32))
        .addImm(ExecLoLane);
    if (!isWave32) {
      BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              AMDGPU::EXEC_HI)
          .addReg(VGPR, getKillRegState(!IsLoad))
          .addImm(ExecHiLane);
    }
  } else {
    assert(SavedExecReg);
    BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
        .addReg(SavedExecReg, RegState::Kill);
  }
}

bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
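For context, a sketch of how this helper is driven once per packed VGPR
(mirroring the spillSGPR loop shown further down; the restore path makes the
same call with IsLoad set to true):

    // One VMEM access per packed VGPR; Offset selects the 16/32-lane chunk.
    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
      // ... V_WRITELANE_B32 each SubReg into a lane of TmpVGPR ...
      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
                              RS, /*IsLoad=*/false);
    }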
@@ -884,8 +1023,6 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
                         SuperReg != MFI->getFrameOffsetReg()));
@@ -897,17 +1034,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // Scavenged temporary VGPR to use. It must be scavenged once for any number
  // of spilled subregs.
  Register TmpVGPR;

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    Register SubReg =
        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToVGPR) {
  if (SpillToVGPR) {
    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
      Register SubReg =
          NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      // During SGPR spilling to VGPR, determine if the VGPR is defined. The
@@ -929,42 +1059,52 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // XXX - Can to VGPR spill fail for some subregisters but not others?
      if (OnlyToVGPR)
        return false;
    }
  } else {
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

    // Spill SGPR to a frame index.
    if (!TmpVGPR.isValid())
      TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
    // SubReg carries the "Kill" flag when SubReg == SuperReg.
    unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);

    MachineInstrBuilder Mov
      = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(SubReg, SubKillState);
    unsigned PerVGPR = isWave32 ? 16 : 32;
    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;

    // There could be undef components of a spilled super register.
    // TODO: Can we detect this and skip the spill?
    if (NumSubRegs > 1) {
      // The last implicit use of the SuperReg carries the "Kill" flag.
      unsigned SuperKillState = 0;
      if (i + 1 == e)
        SuperKillState |= getKillRegState(IsKill);
      Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
      unsigned TmpVGPRFlags = RegState::Undef;

      // Write sub registers into the VGPR
      for (unsigned i = Offset * PerVGPR,
                    e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

        MachineInstrBuilder WriteLane =
            BuildMI(*MBB, MI, DL,
                    TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
                    TmpVGPR)
                .addReg(SubReg, SubKillState)
                .addImm(i % PerVGPR)
                .addReg(TmpVGPR, TmpVGPRFlags);
        TmpVGPRFlags = 0;

        // There could be undef components of a spilled super register.
        // TODO: Can we detect this and skip the spill?
        if (NumSubRegs > 1) {
          // The last implicit use of the SuperReg carries the "Kill" flag.
          unsigned SuperKillState = 0;
          if (i + 1 == NumSubRegs)
            SuperKillState |= getKillRegState(IsKill);
          WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
        }
      }

      Align Alignment = FrameInfo.getObjectAlign(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO =
          MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize,
                                   commonAlignment(Alignment, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
          .addReg(TmpVGPR, RegState::Kill)     // src
          .addFrameIndex(Index)                // vaddr
          .addReg(MFI->getScratchRSrcReg())    // srsrc
          .addReg(MFI->getStackPtrOffsetReg()) // soffset
          .addImm(i * 4)                       // offset
          .addMemOperand(MMO);
      // Write out VGPR
      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
                              RS, false);
    }
  }
@@ -987,7 +1127,6 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();
@@ -1002,13 +1141,11 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  Register TmpVGPR;
  if (SpillToVGPR) {
    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
      Register SubReg =
          NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    Register SubReg =
        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB =
          BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
@@ -1018,36 +1155,36 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,

      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
    } else {
      if (OnlyToVGPR)
        return false;
    }
  } else {
    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

    // Restore SGPR from a stack slot.
    // FIXME: We should use S_LOAD_DWORD here for VI.
    if (!TmpVGPR.isValid())
      TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
    Align Alignment = FrameInfo.getObjectAlign(Index);
    unsigned PerVGPR = isWave32 ? 16 : 32;
    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;

    MachinePointerInfo PtrInfo
      = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
      // Load in VGPR data
      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
                              RS, true);

      MachineMemOperand *MMO =
          MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, EltSize,
                                   commonAlignment(Alignment, EltSize * i));
      // Unpack lanes
      for (unsigned i = Offset * PerVGPR,
                    e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
            .addFrameIndex(Index)                // vaddr
            .addReg(MFI->getScratchRSrcReg())    // srsrc
            .addReg(MFI->getStackPtrOffsetReg()) // soffset
            .addImm(i * 4)                       // offset
            .addMemOperand(MMO);
        bool LastSubReg = (i + 1 == e);
        auto MIB =
            BuildMI(*MBB, MI, DL,
                    TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
                .addReg(TmpVGPR, getKillRegState(LastSubReg))
                .addImm(i);

        auto MIB =
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
                .addReg(TmpVGPR, RegState::Kill);

        if (NumSubRegs > 1)
          MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
        if (NumSubRegs > 1 && i == 0)
          MIB.addReg(SuperReg, RegState::ImplicitDefine);
      }
    }
  }
@@ -103,6 +103,11 @@ public:
  const TargetRegisterClass *getPointerRegClass(
      const MachineFunction &MF, unsigned Kind = 0) const override;

  void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
                               int Offset, unsigned EltSize, Register VGPR,
                               int64_t VGPRLanes, RegScavenger *RS,
                               bool IsLoad) const;

  /// If \p OnlyToVGPR is true, this will only succeed if this
  bool spillSGPR(MachineBasicBlock::iterator MI,
                 int FI, RegScavenger *RS,
@@ -28,10 +28,9 @@
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:20 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill
; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:20 ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
@@ -56,13 +55,10 @@

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:20 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:20 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
@@ -109,10 +105,9 @@ endif:
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:28 ; 4-byte Folded Spill
; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
@@ -132,13 +127,10 @@ endif:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:28 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
@@ -186,10 +178,9 @@ end:
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, [[CMP0]]
@@ -202,13 +193,10 @@ end:
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET]]
; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]]
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]]
; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
@@ -221,10 +209,9 @@ end:
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
@@ -249,13 +236,10 @@ end:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
@@ -563,9 +563,8 @@ ret:
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}

; GCN: buffer_load_dword v[[RESTORE_TMP:[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0
; GCN: v_readfirstlane_b32 s[[USE_TMP_LO:[0-9]+]], v[[RESTORE_TMP]]
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
; GCN: v_readfirstlane_b32 s[[USE_TMP_HI:[0-9]+]], v[[RESTORE_TMP]]
; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v[[RESTORE_TMP]], 0
; GCN: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v[[RESTORE_TMP]], 1
; GCN: ;;#ASMSTART
; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
@@ -0,0 +1,445 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN64 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN32 %s

# CHECK-LABEL: name: check_spill

# S32 with kill
# CHECK: V_WRITELANE
# CHECK: $sgpr12 = S_MOV_B32 $exec_lo
# CHECK: $exec_lo = S_MOV_B32 1
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# CHECK: $exec_lo = S_MOV_B32 killed $sgpr12

# S32 without kill
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: $exec_lo = S_MOV_B32 1
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# CHECK: $exec_lo = V_READLANE

# S64 with kill
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 3
# GCN64: $exec = S_MOV_B64 3
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

# S64 without kill
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN64: V_WRITELANE
# GCN32: $exec_lo = S_MOV_B32 3
# GCN64: $exec = S_MOV_B64 3
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# CHECK: $exec_lo = V_READLANE
# GCN64: $exec_hi = V_READLANE

# S96
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 7
# GCN64: $exec = S_MOV_B64 7
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

# S128
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 15
# GCN64: $exec = S_MOV_B64 15
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

# S160
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 31
# GCN64: $exec = S_MOV_B64 31
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

# S256
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 255
# GCN64: $exec = S_MOV_B64 255
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

# S512
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 65535
# GCN64: $exec = S_MOV_B64 65535
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

# S1024
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN32: $sgpr64 = S_MOV_B32 $exec_lo
# GCN32: $exec_lo = S_MOV_B32 65535
# GCN32: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# CHECK: V_WRITELANE
# GCN32: $sgpr80 = S_MOV_B32 $exec_lo
# GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 65535
# GCN64: $exec = S_MOV_B64 4294967295
# GCN32: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 164
# GCN64: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr80
# GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65

--- |

  define amdgpu_kernel void @check_spill() #0 {
    ret void
  }

  define amdgpu_kernel void @check_reload() #0 {
    ret void
  }

  attributes #0 = { "frame-pointer"="all" }
...
---
name: check_spill
tracksRegLiveness: true
liveins:
  - { reg: '$sgpr4_sgpr5' }
  - { reg: '$sgpr6_sgpr7' }
  - { reg: '$sgpr8' }
frameInfo:
  maxAlignment: 4
stack:
  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
  - { id: 1, type: spill-slot, size: 8, alignment: 4 }
  - { id: 2, type: spill-slot, size: 12, alignment: 4 }
  - { id: 3, type: spill-slot, size: 16, alignment: 4 }
  - { id: 4, type: spill-slot, size: 20, alignment: 4 }
  - { id: 5, type: spill-slot, size: 32, alignment: 4 }
  - { id: 6, type: spill-slot, size: 64, alignment: 4 }
  - { id: 7, type: spill-slot, size: 128, alignment: 4 }
machineFunctionInfo:
  explicitKernArgSize: 660
  maxKernArgAlign: 4
  isEntryFunction: true
  waveLimiter: true
  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
  stackPtrOffsetReg: '$sgpr32'
  frameOffsetReg: '$sgpr33'
  argumentInfo:
    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
    dispatchPtr: { reg: '$sgpr4_sgpr5' }
    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
    workGroupIDX: { reg: '$sgpr8' }
    privateSegmentWaveByteOffset: { reg: '$sgpr9' }
body: |
  bb.0:
    liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7

    renamable $sgpr12 = IMPLICIT_DEF
    SI_SPILL_S32_SAVE killed $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12 = IMPLICIT_DEF
    SI_SPILL_S32_SAVE $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13 = IMPLICIT_DEF
    SI_SPILL_S64_SAVE killed $sgpr12_sgpr13, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13 = IMPLICIT_DEF
    SI_SPILL_S64_SAVE $sgpr12_sgpr13, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
    SI_SPILL_S96_SAVE killed $sgpr12_sgpr13_sgpr14, %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
    SI_SPILL_S128_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
    SI_SPILL_S160_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16, %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
    SI_SPILL_S256_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
    SI_SPILL_S512_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
    SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.7, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
...

# CHECK-LABEL: name: check_reload

# S32
# CHECK: $sgpr12 = S_MOV_B32 $exec_lo
# CHECK: $exec_lo = S_MOV_B32 1
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# CHECK: $exec_lo = S_MOV_B32 killed $sgpr12
# CHECK: $sgpr12 = V_READLANE

# S64
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 3
# GCN64: $exec = S_MOV_B64 3
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
# CHECK: $sgpr13 = V_READLANE

# S96
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 7
# GCN64: $exec = S_MOV_B64 7
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
# CHECK: $sgpr13 = V_READLANE
# CHECK: $sgpr14 = V_READLANE

# S128
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 15
# GCN64: $exec = S_MOV_B64 15
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
# CHECK: $sgpr13 = V_READLANE
# CHECK: $sgpr14 = V_READLANE
# CHECK: $sgpr15 = V_READLANE

# S160
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 31
# GCN64: $exec = S_MOV_B64 31
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
# CHECK: $sgpr13 = V_READLANE
# CHECK: $sgpr14 = V_READLANE
# CHECK: $sgpr15 = V_READLANE
# CHECK: $sgpr16 = V_READLANE

# S256
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 255
# GCN64: $exec = S_MOV_B64 255
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
# CHECK: $sgpr13 = V_READLANE
# CHECK: $sgpr14 = V_READLANE
# CHECK: $sgpr15 = V_READLANE
# CHECK: $sgpr16 = V_READLANE
# CHECK: $sgpr17 = V_READLANE
# CHECK: $sgpr18 = V_READLANE
# CHECK: $sgpr19 = V_READLANE

# S512
# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 65535
# GCN64: $exec = S_MOV_B64 65535
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
# CHECK: $sgpr13 = V_READLANE
# CHECK: $sgpr14 = V_READLANE
# CHECK: $sgpr15 = V_READLANE
# CHECK: $sgpr16 = V_READLANE
# CHECK: $sgpr17 = V_READLANE
# CHECK: $sgpr18 = V_READLANE
# CHECK: $sgpr19 = V_READLANE
# CHECK: $sgpr20 = V_READLANE
# CHECK: $sgpr21 = V_READLANE
# CHECK: $sgpr22 = V_READLANE
# CHECK: $sgpr23 = V_READLANE
# CHECK: $sgpr24 = V_READLANE
# CHECK: $sgpr25 = V_READLANE
# CHECK: $sgpr26 = V_READLANE
# CHECK: $sgpr27 = V_READLANE

# S1024
# GCN32: $sgpr64 = S_MOV_B32 $exec_lo
# GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 65535
# GCN64: $exec = S_MOV_B64 4294967295
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64
# GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65
# CHECK: $sgpr64 = V_READLANE
# CHECK: $sgpr65 = V_READLANE
# CHECK: $sgpr66 = V_READLANE
# CHECK: $sgpr67 = V_READLANE
# CHECK: $sgpr68 = V_READLANE
# CHECK: $sgpr69 = V_READLANE
# CHECK: $sgpr70 = V_READLANE
# CHECK: $sgpr71 = V_READLANE
# CHECK: $sgpr72 = V_READLANE
# CHECK: $sgpr73 = V_READLANE
# CHECK: $sgpr74 = V_READLANE
# CHECK: $sgpr75 = V_READLANE
# CHECK: $sgpr76 = V_READLANE
# CHECK: $sgpr77 = V_READLANE
# CHECK: $sgpr78 = V_READLANE
# CHECK: $sgpr79 = V_READLANE
# GCN32: $sgpr80 = S_MOV_B32 $exec_lo
# GCN32: $exec_lo = S_MOV_B32 65535
# GCN32: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 164
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr80
# CHECK: $sgpr80 = V_READLANE
# CHECK: $sgpr81 = V_READLANE
# CHECK: $sgpr82 = V_READLANE
# CHECK: $sgpr83 = V_READLANE
# CHECK: $sgpr84 = V_READLANE
# CHECK: $sgpr85 = V_READLANE
# CHECK: $sgpr86 = V_READLANE
# CHECK: $sgpr87 = V_READLANE
# CHECK: $sgpr88 = V_READLANE
# CHECK: $sgpr89 = V_READLANE
# CHECK: $sgpr90 = V_READLANE
# CHECK: $sgpr91 = V_READLANE
# CHECK: $sgpr92 = V_READLANE
# CHECK: $sgpr93 = V_READLANE
# CHECK: $sgpr94 = V_READLANE
# CHECK: $sgpr95 = V_READLANE

---
name: check_reload
tracksRegLiveness: true
liveins:
  - { reg: '$sgpr4_sgpr5' }
  - { reg: '$sgpr6_sgpr7' }
  - { reg: '$sgpr8' }
frameInfo:
  maxAlignment: 4
stack:
  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
  - { id: 1, type: spill-slot, size: 8, alignment: 4 }
  - { id: 2, type: spill-slot, size: 12, alignment: 4 }
  - { id: 3, type: spill-slot, size: 16, alignment: 4 }
  - { id: 4, type: spill-slot, size: 20, alignment: 4 }
  - { id: 5, type: spill-slot, size: 32, alignment: 4 }
  - { id: 6, type: spill-slot, size: 64, alignment: 4 }
  - { id: 7, type: spill-slot, size: 128, alignment: 4 }
machineFunctionInfo:
  explicitKernArgSize: 660
  maxKernArgAlign: 4
  isEntryFunction: true
  waveLimiter: true
  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
  stackPtrOffsetReg: '$sgpr32'
  frameOffsetReg: '$sgpr33'
  argumentInfo:
    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
    dispatchPtr: { reg: '$sgpr4_sgpr5' }
    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
    workGroupIDX: { reg: '$sgpr8' }
    privateSegmentWaveByteOffset: { reg: '$sgpr9' }
body: |
  bb.0:
    liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7

    renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14 = SI_SPILL_S96_RESTORE %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14_sgpr15 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = SI_SPILL_S160_RESTORE %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S256_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32

    renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.7, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
@@ -6,9 +6,13 @@
; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000

; Make sure we are handling hazards correctly.
; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; SGPR-NEXT: s_mov_b64 exec, s[0:1]
; SGPR-NEXT: s_waitcnt vmcnt(0)
; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0
; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1
; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2
; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3
; SGPR-NEXT: s_nop 4
; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
@@ -13,7 +13,7 @@
; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 2

; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
; TOVMEM-DAG: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0
; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Spill

; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
@@ -24,7 +24,7 @@

; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Reload
; TOVMEM: s_waitcnt vmcnt(0)
; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]]
; TOVMEM: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]], 0
; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]

; GCN: s_add_i32 s{{[0-9]+}}, m0, 1
|
@ -35,9 +35,9 @@ entry:
|
|||
}
|
||||
|
||||
; CHECK-LABEL: test_limited_sgpr
|
||||
; GFX6: s_add_u32 s32, s32, 0x84100
|
||||
; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9]+]]
|
||||
; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32
|
||||
; GFX6-NEXT: s_sub_u32 s32, s32, 0x84100
|
||||
; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9]+]]
|
||||
; GFX6: NumSgprs: 48
|
||||
; GFX6: ScratchSize: 8624
|
||||
define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 {
|
||||
|
|
|
@ -11,11 +11,9 @@
|
|||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
|
||||
|
||||
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: s_cbranch_scc1
|
||||
|
||||
; VMEM: buffer_load_dword
|
||||
; VMEM: buffer_load_dword
|
||||
define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 {
|
||||
%wide.sgpr = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
|
||||
|
@@ -42,13 +40,9 @@ ret:
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
define amdgpu_kernel void @spill_sgpr_x3(i32 addrspace(1)* %out, i32 %in) #0 {
  %wide.sgpr = call <3 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -77,15 +71,9 @@ ret:
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 {
  %wide.sgpr = call <4 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -116,17 +104,9 @@ ret:
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
define amdgpu_kernel void @spill_sgpr_x5(i32 addrspace(1)* %out, i32 %in) #0 {
  %wide.sgpr = call <5 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -162,23 +142,9 @@ ret:
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 {
  %wide.sgpr = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -230,39 +196,9 @@ ret:
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 14
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 15

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
  %wide.sgpr = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -346,71 +282,9 @@ ret:
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 30
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 31

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
define amdgpu_kernel void @spill_sgpr_x32(i32 addrspace(1)* %out, i32 %in) #0 {
  %wide.sgpr = call <32 x i32> asm sideeffect "; def $0", "=s" () #0