AMDGPU: Fix MMO when splitting spill

The size and offset were wrong. The size of the whole stack object
was being used for the size of the access, when the access is really
being split into 4-byte pieces here. The size of the underlying
object is carried by the MachinePointerInfo, which also didn't have
the offset set.

llvm-svn: 287806
Matt Arsenault 2016-11-23 20:52:53 +00:00
parent fa6339f321
commit 2669a76f01
5 changed files with 104 additions and 72 deletions
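
For readers skimming the diff below: the core of the change in
buildSpillLoadStore is that each 32-bit piece of a split spill now gets
its own MachineMemOperand, instead of every piece reusing the whole
stack object's size with no offset. A minimal sketch of that
per-element construction, assuming the circa-2016 MachineMemOperand API
the patch uses (the helper buildSplitSpillMMOs and its parameter list
are illustrative, not part of the commit):

// Sketch only: one 4-byte MMO per 32-bit lane of a split spill. The
// helper is hypothetical; its body mirrors the new code in the patch.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

static void buildSplitSpillMMOs(MachineFunction &MF, MachineMemOperand *MMO,
                                unsigned NumSubRegs, unsigned Align) {
  const unsigned EltSize = 4; // each split access is one dword
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  for (unsigned i = 0; i != NumSubRegs; ++i) {
    // Step the pointer info forward by i dwords; the underlying fixed
    // stack object (and hence its size) stays attached to it.
    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);

    // The access size is EltSize, not the object size, and the provable
    // alignment shrinks with the offset: MinAlign(16, 0) == 16, but
    // MinAlign(16, 4) == 4.
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(PInfo, MMO->getFlags(), EltSize,
                                MinAlign(Align, EltSize * i));
    (void)NewMMO; // attached to the i-th BUFFER_*_DWORD via addMemOperand()
  }
}

This is also why the FileCheck expectations in the tests change from
"8-byte"/"16-byte" to "4-byte Folded Spill/Reload": the printed access
size now describes the 4-byte piece rather than the whole spill slot.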


@@ -397,28 +397,36 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                          unsigned LoadStoreOp,
-                                         const MachineOperand *SrcDst,
+                                         int Index,
+                                         unsigned ValueReg,
+                                         bool IsKill,
                                          unsigned ScratchRsrcReg,
-                                         unsigned ScratchOffset,
-                                         int64_t Offset,
+                                         unsigned ScratchOffsetReg,
+                                         int64_t InstOffset,
+                                         MachineMemOperand *MMO,
                                          RegScavenger *RS) const {
-  unsigned Value = SrcDst->getReg();
-  bool IsKill = SrcDst->isKill();
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MI->getParent()->getParent();
   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
 
-  DebugLoc DL = MI->getDebugLoc();
-  bool IsStore = MI->mayStore();
+  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+  const DebugLoc &DL = MI->getDebugLoc();
+  bool IsStore = Desc.mayStore();
 
   bool RanOutOfSGPRs = false;
   bool Scavenged = false;
-  unsigned SOffset = ScratchOffset;
-  unsigned OriginalImmOffset = Offset;
+  unsigned SOffset = ScratchOffsetReg;
 
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
   unsigned Size = NumSubRegs * 4;
+  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+  const int64_t OriginalImmOffset = Offset;
+
+  unsigned Align = MFI.getObjectAlignment(Index);
+  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
 
   if (!isUInt<12>(Offset + Size)) {
     SOffset = AMDGPU::NoRegister;
@@ -437,19 +445,23 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
       // subtract the offset after the spill to return ScratchOffset to it's
       // original value.
       RanOutOfSGPRs = true;
-      SOffset = ScratchOffset;
+      SOffset = ScratchOffsetReg;
     } else {
       Scavenged = true;
     }
+
     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
-      .addReg(ScratchOffset)
+      .addReg(ScratchOffsetReg)
       .addImm(Offset);
+
     Offset = 0;
   }
 
-  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
+  const unsigned EltSize = 4;
+
+  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
     unsigned SubReg = NumSubRegs == 1 ?
-      Value : getSubReg(Value, getSubRegFromChannel(i));
+      ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
 
     unsigned SOffsetRegState = 0;
     unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -459,7 +471,12 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
       SrcDstRegState |= getKillRegState(IsKill);
     }
 
-    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+    MachineMemOperand *NewMMO
+      = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+                                 EltSize, MinAlign(Align, EltSize * i));
+
+    BuildMI(*MBB, MI, DL, Desc)
       .addReg(SubReg, getDefRegState(!IsStore))
       .addReg(ScratchRsrcReg)
       .addReg(SOffset, SOffsetRegState)
@@ -467,14 +484,15 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
       .addImm(0) // glc
       .addImm(0) // slc
       .addImm(0) // tfe
-      .addReg(Value, RegState::Implicit | SrcDstRegState)
-      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+      .addMemOperand(NewMMO)
+      .addReg(ValueReg, RegState::Implicit | SrcDstRegState);
   }
 
   if (RanOutOfSGPRs) {
     // Subtract the offset we added to the ScratchOffset register.
-    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
-      .addReg(ScratchOffset)
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
+      .addReg(ScratchOffsetReg)
       .addImm(OriginalImmOffset);
   }
 }
@@ -497,6 +515,8 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
   bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
 
+  const unsigned EltSize = 4;
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -518,13 +538,12 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
       }
 
       int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-      unsigned Size = FrameInfo.getObjectSize(Index);
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index);
+        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
       MachineMemOperand *MMO
         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
-                                   Size, Align);
+                                   EltSize, MinAlign(Align, EltSize * i));
 
       unsigned OffsetReg = AMDGPU::M0;
 
       // Add i * 4 wave offset.
@@ -597,13 +616,12 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
         Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
       }
 
-      unsigned Size = FrameInfo.getObjectSize(Index);
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index);
+        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
       MachineMemOperand *MMO
         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
-                                   Size, Align);
+                                   EltSize, MinAlign(Align, EltSize * i));
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
         .addReg(TmpReg, RegState::Kill)    // src
         .addFrameIndex(Index)              // vaddr
@@ -644,18 +662,19 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
 
   int64_t FrOffset = FrameInfo.getObjectOffset(Index);
 
+  const unsigned EltSize = 4;
+
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
       SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
 
     if (SpillToSMEM) {
-      unsigned Size = FrameInfo.getObjectSize(Index);
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index);
+        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
       MachineMemOperand *MMO
         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
-                                   Size, Align);
+                                   EltSize, MinAlign(Align, EltSize * i));
 
       unsigned OffsetReg = AMDGPU::M0;
@@ -693,16 +712,15 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
     } else {
       // Restore SGPR from a stack slot.
       // FIXME: We should use S_LOAD_DWORD here for VI.
       unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
       unsigned Align = FrameInfo.getObjectAlignment(Index);
-      unsigned Size = FrameInfo.getObjectSize(Index);
 
       MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index);
+        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
 
-      MachineMemOperand *MMO = MF->getMachineMemOperand(
-        PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
+        MachineMemOperand::MOLoad, EltSize,
+        MinAlign(Align, EltSize * i));
 
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
         .addFrameIndex(Index)              // vaddr
@@ -710,8 +728,7 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
         .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
         .addMemOperand(MMO);
-      BuildMI(*MBB, MI, DL,
-              TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
         .addReg(TmpReg, RegState::Kill)
         .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
     }
@@ -767,28 +784,38 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
    case AMDGPU::SI_SPILL_V128_SAVE:
     case AMDGPU::SI_SPILL_V96_SAVE:
     case AMDGPU::SI_SPILL_V64_SAVE:
-    case AMDGPU::SI_SPILL_V32_SAVE:
+    case AMDGPU::SI_SPILL_V32_SAVE: {
+      const MachineOperand *VData = TII->getNamedOperand(*MI,
+                                                         AMDGPU::OpName::vdata);
       buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
-            TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
+            Index,
+            VData->getReg(), VData->isKill(),
             TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
             TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
-            FrameInfo.getObjectOffset(Index) +
-            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+            *MI->memoperands_begin(),
+            RS);
       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
       MI->eraseFromParent();
       break;
+    }
     case AMDGPU::SI_SPILL_V32_RESTORE:
     case AMDGPU::SI_SPILL_V64_RESTORE:
     case AMDGPU::SI_SPILL_V96_RESTORE:
     case AMDGPU::SI_SPILL_V128_RESTORE:
     case AMDGPU::SI_SPILL_V256_RESTORE:
     case AMDGPU::SI_SPILL_V512_RESTORE: {
+      const MachineOperand *VData = TII->getNamedOperand(*MI,
+                                                         AMDGPU::OpName::vdata);
       buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
-            TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
+            Index,
+            VData->getReg(), VData->isKill(),
             TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
             TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
-            FrameInfo.getObjectOffset(Index) +
-            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+            *MI->memoperands_begin(),
+            RS);
       MI->eraseFromParent();
       break;
     }


@@ -252,9 +252,14 @@ public:
 private:
   void buildSpillLoadStore(MachineBasicBlock::iterator MI,
-                           unsigned LoadStoreOp, const MachineOperand *SrcDst,
-                           unsigned ScratchRsrcReg, unsigned ScratchOffset,
-                           int64_t Offset,
+                           unsigned LoadStoreOp,
+                           int Index,
+                           unsigned ValueReg,
+                           bool ValueIsKill,
+                           unsigned ScratchRsrcReg,
+                           unsigned ScratchOffsetReg,
+                           int64_t InstrOffset,
+                           MachineMemOperand *MMO,
                            RegScavenger *RS) const;
 };


@@ -26,9 +26,9 @@
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
 
 ; Spill load
 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
@@ -55,11 +55,11 @@
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
@@ -108,9 +108,9 @@ endif:
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
@@ -133,11 +133,11 @@ endif:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
@@ -187,9 +187,9 @@ end:
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, [[CMP0]]
 ; GCN: s_waitcnt vmcnt(0) expcnt(0)
@@ -208,7 +208,7 @@ end:
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]]
@@ -224,9 +224,9 @@ end:
 ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
@@ -255,11 +255,11 @@ end:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]


@@ -17,26 +17,26 @@
 ; Make sure scratch wave offset register is correctly incremented and
 ; then restored.
 ; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
 ; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
 ; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
 ; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
 
 ; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
 ; SMEM: s_add_u32 m0, s91, 0x100{{$}}
 ; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
 ; SMEM: s_add_u32 m0, s91, 0x200{{$}}
 ; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
 ; SMEM: s_add_u32 m0, s91, 0x300{{$}}
 ; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
 
 ; ALL: s_endpgm
 define void @test(i32 addrspace(1)* %out, i32 %in) {


@@ -20,8 +20,8 @@
 ; VI-DAG: s_mov_b32 s15, 0xe80000
 
 ; s11 is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload
 
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024