[AMDGPU] Add AV class spill pseudo instructions

While enabling vector superclasses with D109301,
AV spills were lowered into VGPR spills by
introducing appropriate copies. This ended up
adding two instructions per spill (a copy + a
VGPR spill pseudo) and caused an incorrect live
range update in the inline spiller.

This patch adds pseudo instructions for all AV
spill sizes from 32b to 1024b and lowers them
the same way as all other spills.
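
For illustration, a minimal MIR-style sketch of
the change for a 64b AV spill. The virtual
register names here are hypothetical; the operand
lists follow the spill pseudos as they appear in
the tests below.

Before (a copy + a VGPR spill pseudo per
save/restore):
  %tmp:vreg_64 = COPY %av
  SI_SPILL_V64_SAVE %tmp, %stack.0, $sgpr32, 0, implicit $exec
  %tmp2:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec
  %av2:av_64 = COPY %tmp2

After (a single pseudo per save/restore):
  SI_SPILL_AV64_SAVE %av, %stack.0, $sgpr32, 0, implicit $exec
  %av2:av_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec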

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D115439
Commit: cf58b9ce98 (parent d3cd0635e1)
Author: Christudasan Devadasan
Date: 2021-12-09 02:55:21 -05:00
8 changed files with 3898 additions and 54 deletions


@@ -1417,6 +1417,33 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
  }
}

static unsigned getAVSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_AV32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_AV64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_AV96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_AV128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_AV160_SAVE;
  case 24:
    return AMDGPU::SI_SPILL_AV192_SAVE;
  case 28:
    return AMDGPU::SI_SPILL_AV224_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_AV256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_AV512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_AV1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      Register SrcReg, bool isKill,
@@ -1463,21 +1490,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
    return;
  }
-  unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize)
-                                       : getVGPRSpillSaveOpcode(SpillSize);
+  unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize)
+                    : RI.isAGPRClass(RC)      ? getAGPRSpillSaveOpcode(SpillSize)
+                                              : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();
-  if (RI.isVectorSuperClass(RC)) {
-    // Convert an AV spill into a VGPR spill. Introduce a copy from AV to an
-    // equivalent VGPR register beforehand. Regalloc might want to introduce
-    // AV spills only to be relevant until rewriter at which they become
-    // either spills of VGPRs or AGPRs.
-    Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
-    BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg)
-        .addReg(SrcReg, RegState::Kill);
-    SrcReg = TmpVReg;
-  }
  BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
@@ -1567,6 +1584,33 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  }
}

static unsigned getAVSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_AV32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_AV64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_AV96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_AV128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_AV160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_AV192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_AV224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_AV256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_AV512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_AV1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register DestReg, int FrameIndex,
@@ -1609,26 +1653,15 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
    return;
  }
-  unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
-                                       : getVGPRSpillRestoreOpcode(SpillSize);
-  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
-  Register TmpReg = DestReg;
-  if (IsVectorSuperClass) {
-    // For AV classes, insert the spill restore to a VGPR followed by a copy
-    // into an equivalent AV register.
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
-  }
+  unsigned Opcode = RI.isVectorSuperClass(RC)
+                        ? getAVSpillRestoreOpcode(SpillSize)
+                        : RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
+                                             : getVGPRSpillRestoreOpcode(SpillSize);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
-      .addFrameIndex(FrameIndex)           // vaddr
-      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
-      .addImm(0)                           // offset
-      .addMemOperand(MMO);
-  if (IsVectorSuperClass)
-    BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg)
-        .addReg(DestReg, RegState::Kill);
+      .addFrameIndex(FrameIndex)           // vaddr
+      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+      .addImm(0)                           // offset
+      .addMemOperand(MMO);
}
void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
@@ -4581,8 +4614,9 @@ static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
                                          unsigned RCID,
                                          bool IsAllocatable) {
  if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
-      (TID.mayLoad() || TID.mayStore() ||
-       (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
+      (((TID.mayLoad() || TID.mayStore()) &&
+        !(TID.TSFlags & SIInstrFlags::VGPRSpill)) ||
+       (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
    switch (RCID) {
    case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
    case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
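
As a reading aid, the reworked condition above can be paraphrased as the
sketch below. This is not code from the patch, and the intent stated in the
comment is inferred from the commit message:

  // Narrow an AV_* class to the equivalent VGPR class for instructions that
  // really access memory (or are DS/MIMG), but skip the VGPR spill pseudos:
  // they must keep the AV superclass so spill lowering can still choose
  // either VGPRs or AGPRs for them.
  bool AdjustToVGPR = ((TID.mayLoad() || TID.mayStore()) &&
                       !(TID.TSFlags & SIInstrFlags::VGPRSpill)) ||
                      (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG));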


@@ -761,6 +761,17 @@ defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>;
defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>;
defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>;
defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>;
defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
(ins si_ga:$ptr_lo, si_ga:$ptr_hi),


@@ -912,6 +912,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V1024_RESTORE:
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A1024_RESTORE:
case AMDGPU::SI_SPILL_AV1024_SAVE:
case AMDGPU::SI_SPILL_AV1024_RESTORE:
return 32;
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S512_RESTORE:
@@ -919,6 +921,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V512_RESTORE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A512_RESTORE:
case AMDGPU::SI_SPILL_AV512_SAVE:
case AMDGPU::SI_SPILL_AV512_RESTORE:
return 16;
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S256_RESTORE:
@@ -926,6 +930,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_A256_SAVE:
case AMDGPU::SI_SPILL_A256_RESTORE:
case AMDGPU::SI_SPILL_AV256_SAVE:
case AMDGPU::SI_SPILL_AV256_RESTORE:
return 8;
case AMDGPU::SI_SPILL_S224_SAVE:
case AMDGPU::SI_SPILL_S224_RESTORE:
@@ -933,6 +939,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V224_RESTORE:
case AMDGPU::SI_SPILL_A224_SAVE:
case AMDGPU::SI_SPILL_A224_RESTORE:
case AMDGPU::SI_SPILL_AV224_SAVE:
case AMDGPU::SI_SPILL_AV224_RESTORE:
return 7;
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S192_RESTORE:
@@ -940,6 +948,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V192_RESTORE:
case AMDGPU::SI_SPILL_A192_SAVE:
case AMDGPU::SI_SPILL_A192_RESTORE:
case AMDGPU::SI_SPILL_AV192_SAVE:
case AMDGPU::SI_SPILL_AV192_RESTORE:
return 6;
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S160_RESTORE:
@@ -947,6 +957,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V160_RESTORE:
case AMDGPU::SI_SPILL_A160_SAVE:
case AMDGPU::SI_SPILL_A160_RESTORE:
case AMDGPU::SI_SPILL_AV160_SAVE:
case AMDGPU::SI_SPILL_AV160_RESTORE:
return 5;
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S128_RESTORE:
@@ -954,6 +966,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_A128_SAVE:
case AMDGPU::SI_SPILL_A128_RESTORE:
case AMDGPU::SI_SPILL_AV128_SAVE:
case AMDGPU::SI_SPILL_AV128_RESTORE:
return 4;
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S96_RESTORE:
@@ -961,6 +975,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_A96_SAVE:
case AMDGPU::SI_SPILL_A96_RESTORE:
case AMDGPU::SI_SPILL_AV96_SAVE:
case AMDGPU::SI_SPILL_AV96_RESTORE:
return 3;
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S64_RESTORE:
@@ -968,6 +984,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_A64_SAVE:
case AMDGPU::SI_SPILL_A64_RESTORE:
case AMDGPU::SI_SPILL_AV64_SAVE:
case AMDGPU::SI_SPILL_AV64_RESTORE:
return 2;
case AMDGPU::SI_SPILL_S32_SAVE:
case AMDGPU::SI_SPILL_S32_RESTORE:
@@ -975,6 +993,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_A32_SAVE:
case AMDGPU::SI_SPILL_A32_RESTORE:
case AMDGPU::SI_SPILL_AV32_SAVE:
case AMDGPU::SI_SPILL_AV32_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -1815,7 +1835,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_A128_SAVE:
case AMDGPU::SI_SPILL_A96_SAVE:
case AMDGPU::SI_SPILL_A64_SAVE:
-case AMDGPU::SI_SPILL_A32_SAVE: {
+case AMDGPU::SI_SPILL_A32_SAVE:
+case AMDGPU::SI_SPILL_AV1024_SAVE:
+case AMDGPU::SI_SPILL_AV512_SAVE:
+case AMDGPU::SI_SPILL_AV256_SAVE:
+case AMDGPU::SI_SPILL_AV224_SAVE:
+case AMDGPU::SI_SPILL_AV192_SAVE:
+case AMDGPU::SI_SPILL_AV160_SAVE:
+case AMDGPU::SI_SPILL_AV128_SAVE:
+case AMDGPU::SI_SPILL_AV96_SAVE:
+case AMDGPU::SI_SPILL_AV64_SAVE:
+case AMDGPU::SI_SPILL_AV32_SAVE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
@@ -1851,7 +1881,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_A224_RESTORE:
case AMDGPU::SI_SPILL_A256_RESTORE:
case AMDGPU::SI_SPILL_A512_RESTORE:
-case AMDGPU::SI_SPILL_A1024_RESTORE: {
+case AMDGPU::SI_SPILL_A1024_RESTORE:
+case AMDGPU::SI_SPILL_AV32_RESTORE:
+case AMDGPU::SI_SPILL_AV64_RESTORE:
+case AMDGPU::SI_SPILL_AV96_RESTORE:
+case AMDGPU::SI_SPILL_AV128_RESTORE:
+case AMDGPU::SI_SPILL_AV160_RESTORE:
+case AMDGPU::SI_SPILL_AV192_RESTORE:
+case AMDGPU::SI_SPILL_AV224_RESTORE:
+case AMDGPU::SI_SPILL_AV256_RESTORE:
+case AMDGPU::SI_SPILL_AV512_RESTORE:
+case AMDGPU::SI_SPILL_AV1024_RESTORE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==


@@ -0,0 +1,84 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -start-before=greedy,1 -stop-after=prologepilog -verify-machineinstrs -verify-regalloc -o - %s | FileCheck --check-prefixes=GCN %s
# The VGPR pair spilled and restored around the callsite is used in the next basic block.
#
# The AMDGPU spill hooks storeRegToStackSlot/loadRegFromStackSlot lower register spills via
# spill pseudos so that each spill remains a single instruction; the inline spiller updates
# the LiveIntervals incorrectly when a spill is lowered into multiple instructions.
# AV spills were earlier handled by converting them into equivalent VGPR spills with appropriate
# copies. The two instructions involved (a copy + a VGPR spill pseudo) introduced an incorrect
# live range that caused a crash during RA. This is fixed by introducing AV* spill pseudos that
# keep each spill a single instruction, and this test now compiles successfully.
---
name: test_av_spill_cross_bb_usage
tracksRegLiveness: true
stack:
- { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
machineFunctionInfo:
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
stackPtrOffsetReg: '$sgpr32'
body: |
; GCN-LABEL: name: test_av_spill_cross_bb_usage
; GCN: bb.0:
; GCN: S_BRANCH %bb.1
; GCN: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr10:0x0000000000000003, $vgpr11:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr14_vgpr15:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F
; GCN-NEXT: {{ $}}
; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15, implicit $vgpr14_vgpr15 :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, implicit $exec, implicit killed $vgpr14_vgpr15 :: (store (s32) into %stack.1 + 4, addrspace 5)
; GCN-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_highregs, implicit-def dead $vgpr0
; GCN-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1 + 4, addrspace 5)
; GCN: bb.2:
; GCN-NEXT: liveins: $vgpr40, $vgpr14_vgpr15:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 0, implicit $exec
; GCN-NEXT: FLAT_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr43_vgpr44, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr14_vgpr15, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; GCN: S_SETPC_B64_return undef $sgpr30_sgpr31
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr40, $sgpr30, $sgpr31, $sgpr30_sgpr31
undef %0.sub1:vreg_64 = COPY $vgpr15
%0.sub0:vreg_64 = COPY $vgpr14
undef %1.sub1:vreg_64 = COPY $vgpr13
%1.sub0:vreg_64 = COPY $vgpr12
undef %2.sub1:vreg_64 = COPY $vgpr11
%2.sub0:vreg_64 = COPY $vgpr10
undef %3.sub1:vreg_64 = COPY $vgpr9
%3.sub0:vreg_64 = COPY $vgpr8
undef %4.sub1:vreg_64 = COPY $vgpr7
%4.sub0:vreg_64 = COPY $vgpr6
undef %5.sub1:vreg_64 = COPY $vgpr5
%5.sub0:vreg_64 = COPY $vgpr4
undef %6.sub1:vreg_64 = COPY $vgpr3
%6.sub0:vreg_64 = COPY $vgpr2
undef %7.sub1:vreg_64 = COPY $vgpr1
%7.sub0:vreg_64 = COPY $vgpr0
S_CBRANCH_SCC1 %bb.2, implicit undef $scc
S_BRANCH %bb.1
bb.1:
liveins: $vgpr40, $sgpr30, $sgpr31, $sgpr30_sgpr31
ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
renamable $sgpr16_sgpr17 = IMPLICIT_DEF
$vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
$vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit killed $sgpr30_sgpr31
dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_highregs, implicit-def dead $vgpr0
%8:vreg_64 = nofpexcept V_FMA_F64_e64 0, %7, 0, %6, 0, %5, 0, 0, implicit $mode, implicit $exec
ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
FLAT_STORE_DWORDX2 %4, %8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
FLAT_STORE_DWORDX2 %2, %3, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
bb.2:
liveins: $vgpr40
%9:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
FLAT_STORE_DWORDX2 undef %10:vreg_64, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
FLAT_STORE_DWORDX2 %9, %0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
S_SETPC_B64_return undef $sgpr30_sgpr31
...


@@ -32,11 +32,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def [[VREG_64:%[0-9]+]]
; REGALLOC-GFX90A: SI_SPILL_V64_SAVE [[VREG_64]], %stack.0
; REGALLOC-GFX90A: [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64
-; REGALLOC-GFX90A: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64_align2 = SI_SPILL_V64_RESTORE %stack.0
-; REGALLOC-GFX90A: [[COPY_AV64:%[0-9]+]]:av_64_align2 = COPY [[SI_SPILL_V64_RESTORE]]
-; REGALLOC-GFX90A: GLOBAL_STORE_DWORDX2 undef %15:vreg_64_align2, [[COPY_AV64]]
-; REGALLOC-GFX90A-NOT: %{{[0-9]+}}:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_A128]]
-; REGALLOC-GFX90A: GLOBAL_STORE_DWORDX4 undef %17:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_A128]]
+; REGALLOC-GFX90A: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0
+; REGALLOC-GFX90A: GLOBAL_STORE_DWORDX2 undef %{{[0-9]+}}:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]]
+; REGALLOC-GFX90A: GLOBAL_STORE_DWORDX4 undef %{{[0-9]+}}:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_A128]]
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):

File diff suppressed because it is too large.


@@ -116,17 +116,23 @@ use:
; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
; GFX90A-DAG: v_accvgpr_read_b32 v4, a2 ; Reload Reuse
; GFX90A-DAG: v_accvgpr_read_b32 v3, a3 ; Reload Reuse
; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; GCN-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
; GFX90A-DAG: buffer_load_dword a0, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword a1, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-DAG: v_accvgpr_write_b32 a2, v4 ; Reload Reuse
; GFX90A-DAG: v_accvgpr_write_b32 a3, v3 ; Reload Reuse
; GFX90A: global_store_dwordx4 v[0:1], a[0:3], off
; GCN: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
; GCN: ScratchSize: 20
define amdgpu_kernel void @max_5regs_used_8a(<4 x float> addrspace(1)* %arg) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()


@@ -4,12 +4,10 @@
define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
; GCN-LABEL: name: test_spill_av_class
; GCN: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %21.sub0
-; GCN-NEXT: undef %23.sub0:av_64 = COPY %21.sub0
-; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY %23
-; GCN-NEXT: SI_SPILL_V64_SAVE [[COPY1]], %stack.0, $sgpr32, 0, implicit $exec
-; GCN: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec
-; GCN-NEXT: [[COPY3:%[0-9]+]]:av_64 = COPY [[SI_SPILL_V64_RESTORE]]
-; GCN-NEXT: undef %22.sub0:vreg_64 = COPY [[COPY3]].sub0
+; GCN-NEXT: undef [[AV_REG:%[0-9]+]].sub0:av_64 = COPY %{{[0-9]+}}.sub0
+; GCN-NEXT: SI_SPILL_AV64_SAVE [[AV_REG]], %stack.0, $sgpr32, 0, implicit $exec
+; GCN: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec
+; GCN-NEXT: undef %22.sub0:vreg_64 = COPY [[SI_SPILL_AV64_RESTORE]].sub0
%v0 = call i32 asm sideeffect "; def $0", "=v"()
%tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
%mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)