[AMDGPU] Implement copyPhysReg for 16 bit subregs

Differential Revision: https://reviews.llvm.org/D74937
This commit is contained in:
Stanislav Mekhanoshin 2020-02-28 15:48:46 -08:00
parent 5fee925beb
commit 96e51ed005
3 changed files with 275 additions and 0 deletions

View File

@ -679,6 +679,74 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return; return;
} }
// Copy between 16-bit halves (lo16/hi16) of 32-bit VGPRs. The copy is expanded
// into instructions operating on the enclosing VGPR_32 super-registers.
if (RC == &AMDGPU::VGPR_LO16RegClass || RC == &AMDGPU::VGPR_HI16RegClass) {
assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
AMDGPU::VGPR_HI16RegClass.contains(SrcReg));
// Expansion table. Columns "d" and "s" sketch the contents of the 32-bit
// destination and source registers (upper half written first); each case
// takes two rows: the state before the first instruction, and the state
// before the second one. "x" marks bits that do not matter.
// d s
// l -> l : hhhhxxxx : xxxxllll -> v_alignbyte_b32 d, s, d, 2
// llllhhhh : xxxxllll -> v_alignbyte_b32 d, d, d, 2
// l -> h : xxxxllll : xxxxhhhh -> v_lshlrev_b32 d, 16, d
// llll0000 : xxxxhhhh -> v_alignbyte_b32 d, s, d, 2
// h -> l : hhhhxxxx : llllxxxx -> v_lshrrev_b32 d, 16, d
// 0000hhhh : llllxxxx -> v_alignbyte_b32 d, d, s, 2
// h -> h : xxxxllll : hhhhxxxx -> v_alignbyte_b32 d, d, s, 2
// llllhhhh : hhhhxxxx -> v_alignbyte_b32 d, d, d, 2
// Record which half is addressed on each side before the registers are
// replaced by their 32-bit super-registers below.
bool DstLow = RC == &AMDGPU::VGPR_LO16RegClass;
bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg);
// From here on DestReg/SrcReg name the VGPR_32 registers containing the
// original 16-bit sub-registers.
DestReg = RI.getMatchingSuperReg(DestReg,
DstLow ? AMDGPU::lo16 : AMDGPU::hi16,
&AMDGPU::VGPR_32RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg,
SrcLow ? AMDGPU::lo16 : AMDGPU::hi16,
&AMDGPU::VGPR_32RegClass);
// Both halves live in the same 32-bit register.
if (DestReg == SrcReg) {
// l -> h : v_pk_add_u16 v1, v1, 0 op_sel_hi:[0,0]
// h -> l : v_pk_add_u16 v1, v1, 0 op_sel:[1,0] op_sel_hi:[1,0]
// Same register and same half: the copy is a no-op, emit nothing.
if (DstLow == SrcLow)
return;
// A single packed add of zero moves one half into the other; the first
// immediate carries the op_sel bits for src0 (set only for h -> l).
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_ADD_U16), DestReg)
.addImm(DstLow ? SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1 : 0)
.addReg(DestReg, RegState::Undef)
.addImm(0) // src1_mod
.addImm(0) // src1
// Remaining operands are all zero - presumably the trailing V_PK_ADD_U16
// modifier operands; TODO confirm against the instruction definition.
.addImm(0)
.addImm(0)
.addImm(0)
.addImm(0)
.addImm(0);
return;
}
// Different 32-bit registers: two instructions are emitted. The second one is
// built first, at MI, so that the first one can be inserted right before it.
// Last instruction first:
auto Last = BuildMI(MBB, MI, DL, get(AMDGPU::V_ALIGNBYTE_B32), DestReg)
.addReg((SrcLow && !DstLow) ? SrcReg : DestReg,
(SrcLow && !DstLow) ? getKillRegState(KillSrc) : 0)
.addReg((!SrcLow && DstLow) ? SrcReg : DestReg,
(!SrcLow && DstLow) ? getKillRegState(KillSrc) : 0)
.addImm(2);
// NOTE(review): the table above pairs l -> h with v_lshlrev and h -> l with
// v_lshrrev, but this selection picks V_LSHRREV when SrcLow (l -> h) and
// V_LSHLREV otherwise (h -> l). Table and code disagree - confirm which one is
// intended.
unsigned OpcFirst = (DstLow == SrcLow) ? AMDGPU::V_ALIGNBYTE_B32
: SrcLow ? AMDGPU::V_LSHRREV_B32_e32
: AMDGPU::V_LSHLREV_B32_e32;
auto First = BuildMI(MBB, &*Last, DL, get(OpcFirst), DestReg);
if (DstLow == SrcLow) { // alignbyte
// Same half on both sides: the source register is read here, so the kill
// flag (if any) is attached to this instruction; DestReg is read as undef.
First.addReg(SrcLow ? SrcReg : DestReg,
SrcLow ? getKillRegState(KillSrc) : RegState::Undef)
.addReg(SrcLow ? DestReg : SrcReg,
SrcLow ? RegState::Undef :getKillRegState(KillSrc))
.addImm(2);
} else {
// Cross-half copy: shift DestReg by 16 bits in place; the source register
// is only read by the Last instruction built above.
First.addImm(16)
.addReg(DestReg, RegState::Undef);
}
return;
}
unsigned EltSize = 4; unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32; unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) { if (RI.isSGPRClass(RC)) {

View File

@ -1279,6 +1279,8 @@ StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
const TargetRegisterClass * const TargetRegisterClass *
SIRegisterInfo::getPhysRegClass(MCRegister Reg) const { SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
static const TargetRegisterClass *const BaseClasses[] = { static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_LO16RegClass,
&AMDGPU::VGPR_HI16RegClass,
&AMDGPU::VGPR_32RegClass, &AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass, &AMDGPU::SReg_32RegClass,
&AMDGPU::AGPR_32RegClass, &AMDGPU::AGPR_32RegClass,
@ -1318,6 +1320,9 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC); unsigned Size = getRegSizeInBits(*RC);
switch (Size) { switch (Size) {
case 16:
return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
case 32: case 32:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
case 64: case 64:

View File

@ -0,0 +1,202 @@
# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
# GCN-LABEL: {{^}}lo_to_lo:
# GCN: v_alignbyte_b32 v1, v0, v1, 2
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
name: lo_to_lo
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr1_lo16 = COPY $vgpr0_lo16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}lo_to_hi:
# GCN: v_lshrrev_b32_e32 v1, 16, v1
# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
name: lo_to_hi
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr1_hi16 = COPY killed $vgpr0_lo16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}hi_to_lo:
# GCN: v_lshlrev_b32_e32 v1, 16, v1
# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
name: hi_to_lo
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr1_lo16 = COPY $vgpr0_hi16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}hi_to_hi:
# GCN: v_alignbyte_b32 v1, v1, v0, 2
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
name: hi_to_hi
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr1_hi16 = COPY $vgpr0_hi16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}lo_to_lo_samereg:
# GCN: s_waitcnt
# GCN-NEXT: s_endpgm
name: lo_to_lo_samereg
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr0_lo16 = COPY $vgpr0_lo16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}lo_to_hi_samereg:
# GCN: v_pk_add_u16 v0, v0, 0 op_sel_hi:[0,0]
name: lo_to_hi_samereg
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr0_hi16 = COPY $vgpr0_lo16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}hi_to_lo_samereg:
# GCN: v_pk_add_u16 v0, v0, 0 op_sel:[1,0] op_sel_hi:[1,0]
name: hi_to_lo_samereg
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr0_lo16 = COPY killed $vgpr0_hi16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}hi_to_hi_samereg:
# GCN: s_waitcnt
# GCN-NEXT: s_endpgm
name: hi_to_hi_samereg
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr0_hi16 = COPY killed $vgpr0_hi16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}lo_to_lo_def_livein:
# GCN: v_alignbyte_b32 v1, v0, v1, 2
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
name: lo_to_lo_def_livein
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
$vgpr1 = IMPLICIT_DEF
$vgpr1_lo16 = COPY $vgpr0_lo16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}lo_to_hi_def_livein:
# GCN: v_lshrrev_b32_e32 v1, 16, v1
# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
name: lo_to_hi_def_livein
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
$vgpr1 = IMPLICIT_DEF
$vgpr1_hi16 = COPY $vgpr0_lo16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}hi_to_lo_def_livein:
# GCN: v_lshlrev_b32_e32 v1, 16, v1
# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
name: hi_to_lo_def_livein
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
$vgpr1 = IMPLICIT_DEF
$vgpr1_lo16 = COPY killed $vgpr0_hi16
S_ENDPGM 0
...
# GCN-LABEL: {{^}}hi_to_hi_def_livein:
# GCN: v_alignbyte_b32 v1, v1, v0, 2
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
name: hi_to_hi_def_livein
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
$vgpr1 = IMPLICIT_DEF
$vgpr1_hi16 = COPY $vgpr0_hi16
S_ENDPGM 0
...
# TODO: This can be coalesced into a VGPR_32 copy
# GCN-LABEL: {{^}}lo_to_lo_hi_to_hi:
# GCN: v_alignbyte_b32 v1, v0, v1, 2
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
# GCN-NEXT: v_mov_b32_e32 v2, v1
# GCN-NEXT: s_endpgm
name: lo_to_lo_hi_to_hi
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr1_lo16 = COPY $vgpr0_lo16
$vgpr1_hi16 = COPY $vgpr0_hi16
$vgpr2 = COPY killed $vgpr1
S_ENDPGM 0
...
# GCN-LABEL: {{^}}lo_to_hi_hi_to_lo:
# GCN: v_lshlrev_b32_e32 v1, 16, v1
# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
# GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
# GCN-NEXT: v_mov_b32_e32 v2, v1
# GCN-NEXT: s_endpgm
name: lo_to_hi_hi_to_lo
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = IMPLICIT_DEF
$vgpr1_lo16 = COPY $vgpr0_hi16
$vgpr1_hi16 = COPY $vgpr0_lo16
$vgpr2 = COPY killed $vgpr1
S_ENDPGM 0
...
# NB: copy of undef just killed instead of expansion
# GCN-LABEL: {{^}}lo_to_lo_undef:
# GCN: s_waitcnt
# GCN-NEXT: v_mov_b32_e32 v2, v1
# GCN-NEXT: s_endpgm
name: lo_to_lo_undef
tracksRegLiveness: true
body: |
bb.0:
$vgpr1_lo16 = COPY undef $vgpr0_lo16
$vgpr2 = COPY killed $vgpr1
S_ENDPGM 0
...