forked from OSchip/llvm-project
AMDGPU/GlobalISel: Fold wave address into mubuf addressing modes
This commit is contained in:
parent
2d670de84c
commit
045be6ff36
|
@ -3905,20 +3905,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
|
|||
return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
|
||||
}
|
||||
|
||||
// Return the wave level SGPR base address if this is a wave address.
|
||||
static Register getWaveAddress(const MachineInstr *Def) {
|
||||
return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
|
||||
? Def->getOperand(1).getReg()
|
||||
: Register();
|
||||
}
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
|
||||
MachineOperand &Root) const {
|
||||
MachineInstr *MI = Root.getParent();
|
||||
MachineBasicBlock *MBB = MI->getParent();
|
||||
Register Reg = Root.getReg();
|
||||
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
const MachineInstr *Def = MRI->getVRegDef(Reg);
|
||||
if (Register WaveBase = getWaveAddress(Def)) {
|
||||
return {{
|
||||
[=](MachineInstrBuilder &MIB) { // rsrc
|
||||
MIB.addReg(Info->getScratchRSrcReg());
|
||||
},
|
||||
[=](MachineInstrBuilder &MIB) { // soffset
|
||||
MIB.addReg(WaveBase);
|
||||
},
|
||||
[=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
|
||||
}};
|
||||
}
|
||||
|
||||
int64_t Offset = 0;
|
||||
|
||||
// FIXME: Copy check is a hack
|
||||
Register BasePtr;
|
||||
if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
|
||||
if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
|
||||
return {};
|
||||
const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
|
||||
Register WaveBase = getWaveAddress(BasePtrDef);
|
||||
if (!WaveBase)
|
||||
return {};
|
||||
|
||||
return {{
|
||||
[=](MachineInstrBuilder &MIB) { // rsrc
|
||||
MIB.addReg(Info->getScratchRSrcReg());
|
||||
},
|
||||
[=](MachineInstrBuilder &MIB) { // soffset
|
||||
MIB.addReg(WaveBase);
|
||||
},
|
||||
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
|
||||
}};
|
||||
}
|
||||
|
||||
if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
|
||||
!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
|
||||
return {};
|
||||
|
||||
const MachineFunction *MF = MBB->getParent();
|
||||
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
return {{
|
||||
[=](MachineInstrBuilder &MIB) { // rsrc
|
||||
MIB.addReg(Info->getScratchRSrcReg());
|
||||
|
|
|
@ -14,21 +14,20 @@ define amdgpu_kernel void @kernel_caller_stack() {
|
|||
; MUBUF: ; %bb.0:
|
||||
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
||||
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; MUBUF-NEXT: s_mov_b32 s32, 0
|
||||
; MUBUF-NEXT: s_add_u32 s0, s0, s7
|
||||
; MUBUF-NEXT: s_mov_b32 s32, 0
|
||||
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
|
||||
; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 9
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 10
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 11
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 12
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 11
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
|
||||
; MUBUF-NEXT: s_getpc_b64 s[4:5]
|
||||
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
|
||||
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; MUBUF-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -112,42 +111,41 @@ define amdgpu_kernel void @kernel_caller_byval() {
|
|||
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
|
||||
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
|
||||
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
|
||||
; MUBUF-NEXT: v_lshrrev_b32_e64 v16, 6, s32
|
||||
; MUBUF-NEXT: s_getpc_b64 s[4:5]
|
||||
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v0, v16, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v16, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen offset:8
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v3, v16, s[0:3], 0 offen offset:12
|
||||
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen offset:16
|
||||
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen offset:20
|
||||
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v6, v16, s[0:3], 0 offen offset:24
|
||||
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v7, v16, s[0:3], 0 offen offset:28
|
||||
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v8, v16, s[0:3], 0 offen offset:32
|
||||
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v9, v16, s[0:3], 0 offen offset:36
|
||||
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen offset:40
|
||||
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen offset:44
|
||||
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen offset:48
|
||||
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen offset:52
|
||||
; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen offset:56
|
||||
; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(15)
|
||||
; MUBUF-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen offset:60
|
||||
; MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60
|
||||
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; MUBUF-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -244,20 +242,19 @@ define void @func_caller_stack() {
|
|||
; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; MUBUF-NEXT: s_mov_b32 s33, s32
|
||||
; MUBUF-NEXT: s_addk_i32 s32, 0x400
|
||||
; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 9
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 10
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 11
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 11
|
||||
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 12
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
|
||||
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; MUBUF-NEXT: s_getpc_b64 s[4:5]
|
||||
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
|
||||
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; MUBUF-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; MUBUF-NEXT: v_readlane_b32 s5, v40, 1
|
||||
|
@ -317,65 +314,64 @@ define void @func_caller_byval([16 x i32] addrspace(5)* %argptr) {
|
|||
; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; MUBUF-NEXT: s_mov_b32 s33, s32
|
||||
; MUBUF-NEXT: s_addk_i32 s32, 0x400
|
||||
; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s32
|
||||
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; MUBUF-NEXT: s_getpc_b64 s[4:5]
|
||||
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
|
||||
; MUBUF-NEXT: s_nop 0
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:8
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:12
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
|
||||
; MUBUF-NEXT: s_nop 0
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:16
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:20
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
|
||||
; MUBUF-NEXT: s_nop 0
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:24
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:28
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
|
||||
; MUBUF-NEXT: s_nop 0
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:32
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:36
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
|
||||
; MUBUF-NEXT: s_nop 0
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:40
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:44
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
|
||||
; MUBUF-NEXT: s_nop 0
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:48
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:48
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:52
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
|
||||
; MUBUF-NEXT: s_nop 0
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:56
|
||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:60
|
||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
|
||||
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; MUBUF-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; MUBUF-NEXT: v_readlane_b32 s5, v40, 1
|
||||
|
|
|
@ -581,3 +581,144 @@ body: |
|
|||
G_STORE %1, %0 :: (store (s8), align 1, addrspace 5)
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
name: function_store_private_s32_to_4_wave_address
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
machineFunctionInfo:
|
||||
isEntryFunction: false
|
||||
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
stackPtrOffsetReg: $sgpr32
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address
|
||||
; GFX6: liveins: $vgpr0, $vgpr1
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
|
||||
; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address
|
||||
; GFX9: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
|
||||
G_STORE %0, %1 :: (store (s32), align 4, addrspace 5)
|
||||
|
||||
...
|
||||
|
||||
# The constant offset is defined in an SGPR and reaches the G_PTR_ADD through a cross-bank COPY to a VGPR.
|
||||
---
|
||||
name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
machineFunctionInfo:
|
||||
isEntryFunction: false
|
||||
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
stackPtrOffsetReg: $sgpr32
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
|
||||
; GFX6: liveins: $vgpr0, $vgpr1
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
|
||||
; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
|
||||
; GFX9: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
|
||||
%2:sgpr(s32) = G_CONSTANT i32 4095
|
||||
%3:vgpr(s32) = COPY %2
|
||||
%4:vgpr(p5) = G_PTR_ADD %1, %3
|
||||
G_STORE %0, %4 :: (store (s32), align 4, addrspace 5)
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
name: function_store_private_s32_to_4_wave_address_offset_4095
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
machineFunctionInfo:
|
||||
isEntryFunction: false
|
||||
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
stackPtrOffsetReg: $sgpr32
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
|
||||
; GFX6: liveins: $vgpr0, $vgpr1
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
|
||||
; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
|
||||
; GFX6-NEXT: %3:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[V_MOV_B32_e32_]], 0, implicit $exec
|
||||
; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], %3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
|
||||
; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
|
||||
; GFX9: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
|
||||
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_LSHRREV_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
|
||||
%2:vgpr(s32) = G_CONSTANT i32 4095
|
||||
%3:vgpr(p5) = G_PTR_ADD %1, %2
|
||||
G_STORE %0, %3 :: (store (s32), align 4, addrspace 5)
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
machineFunctionInfo:
|
||||
isEntryFunction: false
|
||||
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
stackPtrOffsetReg: $sgpr32
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
|
||||
; GFX6: liveins: $vgpr0, $vgpr1
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
|
||||
; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX6-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
|
||||
; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], %4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
|
||||
; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
|
||||
; GFX9: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
|
||||
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
|
||||
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
|
||||
%2:sgpr(s32) = G_CONSTANT i32 4096
|
||||
%3:vgpr(s32) = COPY %2
|
||||
%4:vgpr(p5) = G_PTR_ADD %1, %3
|
||||
G_STORE %0, %4 :: (store (s32), align 4, addrspace 5)
|
||||
|
||||
...
|
||||
|
|
|
@ -144,8 +144,7 @@ attributes #0 = { nounwind }
|
|||
|
||||
; GCN: amdpal.pipelines:
|
||||
; GCN-NEXT: - .registers:
|
||||
; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
|
||||
; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
|
||||
; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
|
||||
; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
|
||||
; GCN-NEXT: .shader_functions:
|
||||
; GCN-NEXT: dynamic_stack:
|
||||
|
@ -187,15 +186,13 @@ attributes #0 = { nounwind }
|
|||
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
|
||||
; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
|
||||
; GISEL-NEXT: .vgpr_count: 0x34{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x2a{{$}}
|
||||
; GCN-NEXT: no_stack_indirect_call:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
|
||||
; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
|
||||
; GISEL-NEXT: .vgpr_count: 0x34{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x2a{{$}}
|
||||
; GCN-NEXT: simple_lds:
|
||||
; GCN-NEXT: .lds_size: 0x100{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x20{{$}}
|
||||
|
@ -227,8 +224,7 @@ attributes #0 = { nounwind }
|
|||
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
|
||||
; SDAG-NEXT: .vgpr_count: 0x2b{{$}}
|
||||
; GISEL-NEXT: .vgpr_count: 0x34{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x2b{{$}}
|
||||
; GCN-NEXT: simple_stack_recurse:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x26{{$}}
|
||||
|
|
Loading…
Reference in New Issue