forked from OSchip/llvm-project
[AMDGPU] Insert waterfall loops for divergent calls
Extend loadSRsrcFromVGPR to allow moving a range of instructions into the loop. The call instruction is surrounded by copies into physical registers which should be part of the waterfall loop. Differential Revision: https://reviews.llvm.org/D88291
This commit is contained in:
parent
974ddb54c9
commit
7f2a641aad
|
@ -4700,9 +4700,13 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
|
|||
.addImm(AMDGPU::sub1);
|
||||
|
||||
Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
|
||||
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
|
||||
.addReg(CurReg)
|
||||
.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
|
||||
auto Cmp =
|
||||
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
|
||||
.addReg(CurReg);
|
||||
if (NumSubRegs <= 2)
|
||||
Cmp.addReg(VRsrc);
|
||||
else
|
||||
Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
|
||||
|
||||
// Combine the comparison results with AND.
|
||||
if (CondReg == AMDGPU::NoRegister) // First.
|
||||
|
@ -4752,13 +4756,20 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
|
|||
// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
|
||||
// with SGPRs by iterating over all unique values across all lanes.
|
||||
static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
|
||||
MachineOperand &Rsrc, MachineDominatorTree *MDT) {
|
||||
MachineOperand &Rsrc, MachineDominatorTree *MDT,
|
||||
MachineBasicBlock::iterator Begin = nullptr,
|
||||
MachineBasicBlock::iterator End = nullptr) {
|
||||
MachineBasicBlock &MBB = *MI.getParent();
|
||||
MachineFunction &MF = *MBB.getParent();
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
MachineBasicBlock::iterator I(&MI);
|
||||
if (!Begin.isValid())
|
||||
Begin = &MI;
|
||||
if (!End.isValid()) {
|
||||
End = &MI;
|
||||
++End;
|
||||
}
|
||||
const DebugLoc &DL = MI.getDebugLoc();
|
||||
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
|
||||
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
||||
|
@ -4767,13 +4778,17 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
|
|||
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
|
||||
|
||||
// Save the EXEC mask
|
||||
BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
|
||||
BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
|
||||
|
||||
// Killed uses in the instruction we are waterfalling around will be
|
||||
// incorrect due to the added control-flow.
|
||||
for (auto &MO : MI.uses()) {
|
||||
if (MO.isReg() && MO.isUse()) {
|
||||
MRI.clearKillFlags(MO.getReg());
|
||||
MachineBasicBlock::iterator AfterMI = MI;
|
||||
++AfterMI;
|
||||
for (auto I = Begin; I != AfterMI; I++) {
|
||||
for (auto &MO : I->uses()) {
|
||||
if (MO.isReg() && MO.isUse()) {
|
||||
MRI.clearKillFlags(MO.getReg());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4790,11 +4805,11 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
|
|||
LoopBB->addSuccessor(LoopBB);
|
||||
LoopBB->addSuccessor(RemainderBB);
|
||||
|
||||
// Move MI to the LoopBB, and the remainder of the block to RemainderBB.
|
||||
MachineBasicBlock::iterator J = I++;
|
||||
// Move Begin to MI to the LoopBB, and the remainder of the block to
|
||||
// RemainderBB.
|
||||
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
|
||||
RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
|
||||
LoopBB->splice(LoopBB->begin(), &MBB, J);
|
||||
RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
|
||||
LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end());
|
||||
|
||||
MBB.addSuccessor(LoopBB);
|
||||
|
||||
|
@ -5016,6 +5031,34 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
|
|||
return;
|
||||
}
|
||||
|
||||
// Legalize SI_CALL
|
||||
if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
|
||||
MachineOperand *Dest = &MI.getOperand(0);
|
||||
if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
|
||||
// Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
|
||||
// copies that follow, into the loop block; the copies from and to
|
||||
// physical registers must move into the loop block as well.
|
||||
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
|
||||
unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
|
||||
unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
|
||||
|
||||
// Also move the copies to physical registers into the loop block
|
||||
MachineBasicBlock &MBB = *MI.getParent();
|
||||
MachineBasicBlock::iterator Start(&MI);
|
||||
while (Start->getOpcode() != FrameSetupOpcode)
|
||||
--Start;
|
||||
MachineBasicBlock::iterator End(&MI);
|
||||
while (End->getOpcode() != FrameDestroyOpcode)
|
||||
++End;
|
||||
// Also include following copies of the return value
|
||||
++End;
|
||||
while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
|
||||
MI.definesRegister(End->getOperand(1).getReg()))
|
||||
++End;
|
||||
loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
|
||||
}
|
||||
}
|
||||
|
||||
// Legalize MUBUF* instructions.
|
||||
int RsrcIdx =
|
||||
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
|
||||
|
|
|
@ -197,13 +197,275 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
|
|||
ret void
|
||||
}
|
||||
|
||||
; FIXME
|
||||
; define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
|
||||
; call void %fptr()
|
||||
; ret void
|
||||
; }
|
||||
define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
|
||||
; GCN-LABEL: test_indirect_call_vgpr_ptr:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: v_writelane_b32 v43, s33, 17
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: v_writelane_b32 v43, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v43, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v43, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v43, s38, 3
|
||||
; GCN-NEXT: v_writelane_b32 v43, s39, 4
|
||||
; GCN-NEXT: v_writelane_b32 v43, s40, 5
|
||||
; GCN-NEXT: v_writelane_b32 v43, s41, 6
|
||||
; GCN-NEXT: v_writelane_b32 v43, s42, 7
|
||||
; GCN-NEXT: v_writelane_b32 v43, s43, 8
|
||||
; GCN-NEXT: v_writelane_b32 v43, s44, 9
|
||||
; GCN-NEXT: v_writelane_b32 v43, s45, 10
|
||||
; GCN-NEXT: v_writelane_b32 v43, s46, 11
|
||||
; GCN-NEXT: v_writelane_b32 v43, s47, 12
|
||||
; GCN-NEXT: v_writelane_b32 v43, s48, 13
|
||||
; GCN-NEXT: v_writelane_b32 v43, s49, 14
|
||||
; GCN-NEXT: v_writelane_b32 v43, s30, 15
|
||||
; GCN-NEXT: v_writelane_b32 v43, s31, 16
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, v31
|
||||
; GCN-NEXT: s_mov_b32 s34, s14
|
||||
; GCN-NEXT: s_mov_b32 s35, s13
|
||||
; GCN-NEXT: s_mov_b32 s36, s12
|
||||
; GCN-NEXT: s_mov_b64 s[38:39], s[10:11]
|
||||
; GCN-NEXT: s_mov_b64 s[40:41], s[8:9]
|
||||
; GCN-NEXT: s_mov_b64 s[42:43], s[6:7]
|
||||
; GCN-NEXT: s_mov_b64 s[44:45], s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v42, v1
|
||||
; GCN-NEXT: v_mov_b32_e32 v41, v0
|
||||
; GCN-NEXT: s_mov_b64 s[46:47], exec
|
||||
; GCN-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s16, v41
|
||||
; GCN-NEXT: v_readfirstlane_b32 s17, v42
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[44:45]
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[42:43]
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[40:41]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[38:39]
|
||||
; GCN-NEXT: s_mov_b32 s12, s36
|
||||
; GCN-NEXT: s_mov_b32 s13, s35
|
||||
; GCN-NEXT: s_mov_b32 s14, s34
|
||||
; GCN-NEXT: v_mov_b32_e32 v31, v40
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[48:49]
|
||||
; GCN-NEXT: s_cbranch_execnz BB2_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
; GCN-NEXT: s_mov_b64 exec, s[46:47]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v43, 15
|
||||
; GCN-NEXT: v_readlane_b32 s5, v43, 16
|
||||
; GCN-NEXT: v_readlane_b32 s49, v43, 14
|
||||
; GCN-NEXT: v_readlane_b32 s48, v43, 13
|
||||
; GCN-NEXT: v_readlane_b32 s47, v43, 12
|
||||
; GCN-NEXT: v_readlane_b32 s46, v43, 11
|
||||
; GCN-NEXT: v_readlane_b32 s45, v43, 10
|
||||
; GCN-NEXT: v_readlane_b32 s44, v43, 9
|
||||
; GCN-NEXT: v_readlane_b32 s43, v43, 8
|
||||
; GCN-NEXT: v_readlane_b32 s42, v43, 7
|
||||
; GCN-NEXT: v_readlane_b32 s41, v43, 6
|
||||
; GCN-NEXT: v_readlane_b32 s40, v43, 5
|
||||
; GCN-NEXT: v_readlane_b32 s39, v43, 4
|
||||
; GCN-NEXT: v_readlane_b32 s38, v43, 3
|
||||
; GCN-NEXT: v_readlane_b32 s36, v43, 2
|
||||
; GCN-NEXT: v_readlane_b32 s35, v43, 1
|
||||
; GCN-NEXT: v_readlane_b32 s34, v43, 0
|
||||
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; GCN-NEXT: v_readlane_b32 s33, v43, 17
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
call void %fptr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
|
||||
; call void %fptr(i32 123)
|
||||
; ret void
|
||||
; }
|
||||
define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
|
||||
; GCN-LABEL: test_indirect_call_vgpr_ptr_arg:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: v_writelane_b32 v43, s33, 17
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: v_writelane_b32 v43, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v43, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v43, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v43, s38, 3
|
||||
; GCN-NEXT: v_writelane_b32 v43, s39, 4
|
||||
; GCN-NEXT: v_writelane_b32 v43, s40, 5
|
||||
; GCN-NEXT: v_writelane_b32 v43, s41, 6
|
||||
; GCN-NEXT: v_writelane_b32 v43, s42, 7
|
||||
; GCN-NEXT: v_writelane_b32 v43, s43, 8
|
||||
; GCN-NEXT: v_writelane_b32 v43, s44, 9
|
||||
; GCN-NEXT: v_writelane_b32 v43, s45, 10
|
||||
; GCN-NEXT: v_writelane_b32 v43, s46, 11
|
||||
; GCN-NEXT: v_writelane_b32 v43, s47, 12
|
||||
; GCN-NEXT: v_writelane_b32 v43, s48, 13
|
||||
; GCN-NEXT: v_writelane_b32 v43, s49, 14
|
||||
; GCN-NEXT: v_writelane_b32 v43, s30, 15
|
||||
; GCN-NEXT: v_writelane_b32 v43, s31, 16
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, v31
|
||||
; GCN-NEXT: s_mov_b32 s34, s14
|
||||
; GCN-NEXT: s_mov_b32 s35, s13
|
||||
; GCN-NEXT: s_mov_b32 s36, s12
|
||||
; GCN-NEXT: s_mov_b64 s[38:39], s[10:11]
|
||||
; GCN-NEXT: s_mov_b64 s[40:41], s[8:9]
|
||||
; GCN-NEXT: s_mov_b64 s[42:43], s[6:7]
|
||||
; GCN-NEXT: s_mov_b64 s[44:45], s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v42, v1
|
||||
; GCN-NEXT: v_mov_b32_e32 v41, v0
|
||||
; GCN-NEXT: s_mov_b64 s[46:47], exec
|
||||
; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s16, v41
|
||||
; GCN-NEXT: v_readfirstlane_b32 s17, v42
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[44:45]
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[42:43]
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[40:41]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[38:39]
|
||||
; GCN-NEXT: s_mov_b32 s12, s36
|
||||
; GCN-NEXT: s_mov_b32 s13, s35
|
||||
; GCN-NEXT: s_mov_b32 s14, s34
|
||||
; GCN-NEXT: v_mov_b32_e32 v31, v40
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[48:49]
|
||||
; GCN-NEXT: s_cbranch_execnz BB3_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
; GCN-NEXT: s_mov_b64 exec, s[46:47]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v43, 15
|
||||
; GCN-NEXT: v_readlane_b32 s5, v43, 16
|
||||
; GCN-NEXT: v_readlane_b32 s49, v43, 14
|
||||
; GCN-NEXT: v_readlane_b32 s48, v43, 13
|
||||
; GCN-NEXT: v_readlane_b32 s47, v43, 12
|
||||
; GCN-NEXT: v_readlane_b32 s46, v43, 11
|
||||
; GCN-NEXT: v_readlane_b32 s45, v43, 10
|
||||
; GCN-NEXT: v_readlane_b32 s44, v43, 9
|
||||
; GCN-NEXT: v_readlane_b32 s43, v43, 8
|
||||
; GCN-NEXT: v_readlane_b32 s42, v43, 7
|
||||
; GCN-NEXT: v_readlane_b32 s41, v43, 6
|
||||
; GCN-NEXT: v_readlane_b32 s40, v43, 5
|
||||
; GCN-NEXT: v_readlane_b32 s39, v43, 4
|
||||
; GCN-NEXT: v_readlane_b32 s38, v43, 3
|
||||
; GCN-NEXT: v_readlane_b32 s36, v43, 2
|
||||
; GCN-NEXT: v_readlane_b32 s35, v43, 1
|
||||
; GCN-NEXT: v_readlane_b32 s34, v43, 0
|
||||
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; GCN-NEXT: v_readlane_b32 s33, v43, 17
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
call void %fptr(i32 123)
|
||||
ret void
|
||||
}
|
||||
|
||||
define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
|
||||
; GCN-LABEL: test_indirect_call_vgpr_ptr_ret:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: v_writelane_b32 v43, s33, 17
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: v_writelane_b32 v43, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v43, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v43, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v43, s38, 3
|
||||
; GCN-NEXT: v_writelane_b32 v43, s39, 4
|
||||
; GCN-NEXT: v_writelane_b32 v43, s40, 5
|
||||
; GCN-NEXT: v_writelane_b32 v43, s41, 6
|
||||
; GCN-NEXT: v_writelane_b32 v43, s42, 7
|
||||
; GCN-NEXT: v_writelane_b32 v43, s43, 8
|
||||
; GCN-NEXT: v_writelane_b32 v43, s44, 9
|
||||
; GCN-NEXT: v_writelane_b32 v43, s45, 10
|
||||
; GCN-NEXT: v_writelane_b32 v43, s46, 11
|
||||
; GCN-NEXT: v_writelane_b32 v43, s47, 12
|
||||
; GCN-NEXT: v_writelane_b32 v43, s48, 13
|
||||
; GCN-NEXT: v_writelane_b32 v43, s49, 14
|
||||
; GCN-NEXT: v_writelane_b32 v43, s30, 15
|
||||
; GCN-NEXT: v_writelane_b32 v43, s31, 16
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, v31
|
||||
; GCN-NEXT: s_mov_b32 s34, s14
|
||||
; GCN-NEXT: s_mov_b32 s35, s13
|
||||
; GCN-NEXT: s_mov_b32 s36, s12
|
||||
; GCN-NEXT: s_mov_b64 s[38:39], s[10:11]
|
||||
; GCN-NEXT: s_mov_b64 s[40:41], s[8:9]
|
||||
; GCN-NEXT: s_mov_b64 s[42:43], s[6:7]
|
||||
; GCN-NEXT: s_mov_b64 s[44:45], s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v42, v1
|
||||
; GCN-NEXT: v_mov_b32_e32 v41, v0
|
||||
; GCN-NEXT: s_mov_b64 s[46:47], exec
|
||||
; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s16, v41
|
||||
; GCN-NEXT: v_readfirstlane_b32 s17, v42
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[44:45]
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[42:43]
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[40:41]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[38:39]
|
||||
; GCN-NEXT: s_mov_b32 s12, s36
|
||||
; GCN-NEXT: s_mov_b32 s13, s35
|
||||
; GCN-NEXT: s_mov_b32 s14, s34
|
||||
; GCN-NEXT: v_mov_b32_e32 v31, v40
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[48:49]
|
||||
; GCN-NEXT: s_cbranch_execnz BB4_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
; GCN-NEXT: s_mov_b64 exec, s[46:47]
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0
|
||||
; GCN-NEXT: v_readlane_b32 s4, v43, 15
|
||||
; GCN-NEXT: v_readlane_b32 s5, v43, 16
|
||||
; GCN-NEXT: v_readlane_b32 s49, v43, 14
|
||||
; GCN-NEXT: v_readlane_b32 s48, v43, 13
|
||||
; GCN-NEXT: v_readlane_b32 s47, v43, 12
|
||||
; GCN-NEXT: v_readlane_b32 s46, v43, 11
|
||||
; GCN-NEXT: v_readlane_b32 s45, v43, 10
|
||||
; GCN-NEXT: v_readlane_b32 s44, v43, 9
|
||||
; GCN-NEXT: v_readlane_b32 s43, v43, 8
|
||||
; GCN-NEXT: v_readlane_b32 s42, v43, 7
|
||||
; GCN-NEXT: v_readlane_b32 s41, v43, 6
|
||||
; GCN-NEXT: v_readlane_b32 s40, v43, 5
|
||||
; GCN-NEXT: v_readlane_b32 s39, v43, 4
|
||||
; GCN-NEXT: v_readlane_b32 s38, v43, 3
|
||||
; GCN-NEXT: v_readlane_b32 s36, v43, 2
|
||||
; GCN-NEXT: v_readlane_b32 s35, v43, 1
|
||||
; GCN-NEXT: v_readlane_b32 s34, v43, 0
|
||||
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; GCN-NEXT: v_readlane_b32 s33, v43, 17
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
%a = call i32 %fptr()
|
||||
%b = add i32 %a, 1
|
||||
ret i32 %b
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue