forked from OSchip/llvm-project
[AMDGPU] Use SDWA for 16 bit subreg copy
This simplifies the logic and allows it to be used on GFX8. Differential Revision: https://reviews.llvm.org/D78150
This commit is contained in:
parent
fc4e954ed5
commit
fde2aefa22
|
@ -683,16 +683,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|||
assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
|
||||
AMDGPU::VGPR_HI16RegClass.contains(SrcReg));
|
||||
|
||||
// d s
|
||||
// l -> l : hhhhxxxx : xxxxllll -> v_alignbyte_b32 d, s, d, 2
|
||||
// llllhhhh : xxxxllll -> v_alignbyte_b32 d, d, d, 2
|
||||
// l -> h : xxxxllll : xxxxhhhh -> v_lshlrev_b32 d, 16, d
|
||||
// llll0000 : xxxxhhhh -> v_alignbyte_b32 d, s, d, 2
|
||||
// h -> l : hhhhxxxx : llllxxxx -> v_lshrrev_b32 d, 16, d
|
||||
// 0000hhhh : llllxxxx -> v_alignbyte_b32 d, d, s, 2
|
||||
// h -> h : xxxxllll : hhhhxxxx -> v_alignbyte_b32 d, d, s, 2
|
||||
// llllhhhh : hhhhxxxx -> v_alignbyte_b32 d, d, d, 2
|
||||
|
||||
bool DstLow = RC == &AMDGPU::VGPR_LO16RegClass;
|
||||
bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg);
|
||||
DestReg = RI.getMatchingSuperReg(DestReg,
|
||||
|
@ -702,49 +692,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|||
SrcLow ? AMDGPU::lo16 : AMDGPU::hi16,
|
||||
&AMDGPU::VGPR_32RegClass);
|
||||
|
||||
if (DestReg == SrcReg) {
|
||||
// l -> h : v_pk_add_u16 v1, v1, 0 op_sel_hi:[0,0]
|
||||
// h -> l : v_pk_add_u16 v1, v1, 0 op_sel:[1,0] op_sel_hi:[1,0]
|
||||
if (DstLow == SrcLow)
|
||||
return;
|
||||
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_ADD_U16), DestReg)
|
||||
.addImm(DstLow ? SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1 : 0)
|
||||
.addReg(DestReg, RegState::Undef)
|
||||
.addImm(0) // src1_mod
|
||||
.addImm(0) // src1
|
||||
.addImm(0)
|
||||
.addImm(0)
|
||||
.addImm(0)
|
||||
.addImm(0)
|
||||
.addImm(0);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Last instruction first:
|
||||
auto Last = BuildMI(MBB, MI, DL, get(AMDGPU::V_ALIGNBYTE_B32), DestReg)
|
||||
.addReg((SrcLow && !DstLow) ? SrcReg : DestReg,
|
||||
(SrcLow && !DstLow) ? getKillRegState(KillSrc) : 0)
|
||||
.addReg((!SrcLow && DstLow) ? SrcReg : DestReg,
|
||||
(!SrcLow && DstLow) ? getKillRegState(KillSrc) : 0)
|
||||
.addImm(2);
|
||||
|
||||
unsigned OpcFirst = (DstLow == SrcLow) ? AMDGPU::V_ALIGNBYTE_B32
|
||||
: SrcLow ? AMDGPU::V_LSHRREV_B32_e32
|
||||
: AMDGPU::V_LSHLREV_B32_e32;
|
||||
auto First = BuildMI(MBB, &*Last, DL, get(OpcFirst), DestReg);
|
||||
if (DstLow == SrcLow) { // alignbyte
|
||||
First
|
||||
.addReg(SrcLow ? SrcReg : DestReg,
|
||||
SrcLow ? getKillRegState(KillSrc) : unsigned(RegState::Undef))
|
||||
.addReg(SrcLow ? DestReg : SrcReg,
|
||||
SrcLow ? unsigned(RegState::Undef) : getKillRegState(KillSrc))
|
||||
.addImm(2);
|
||||
} else {
|
||||
First.addImm(16)
|
||||
.addReg(DestReg, RegState::Undef);
|
||||
}
|
||||
|
||||
auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), DestReg)
|
||||
.addImm(0) // src0_modifiers
|
||||
.addReg(SrcReg)
|
||||
.addImm(0) // clamp
|
||||
.addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
|
||||
: AMDGPU::SDWA::SdwaSel::WORD_1)
|
||||
.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
|
||||
.addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
|
||||
: AMDGPU::SDWA::SdwaSel::WORD_1)
|
||||
.addReg(DestReg, RegState::Implicit | RegState::Undef);
|
||||
// First implicit operand is $exec.
|
||||
MIB->tieOperands(0, MIB->getNumOperands() - 1);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
# RUN: llc -march=amdgcn -mcpu=gfx802 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1010 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
# GCN-LABEL: {{^}}lo_to_lo:
|
||||
# GCN: v_alignbyte_b32 v1, v0, v1, 2
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
name: lo_to_lo
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -13,8 +14,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}lo_to_hi:
|
||||
# GCN: v_lshrrev_b32_e32 v1, 16, v1
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
name: lo_to_hi
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -25,8 +25,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}hi_to_lo:
|
||||
# GCN: v_lshlrev_b32_e32 v1, 16, v1
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
|
||||
name: hi_to_lo
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -37,8 +36,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}hi_to_hi:
|
||||
# GCN: v_alignbyte_b32 v1, v1, v0, 2
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
|
||||
name: hi_to_hi
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -50,6 +48,7 @@ body: |
|
|||
|
||||
# GCN-LABEL: {{^}}lo_to_lo_samereg:
|
||||
# GCN: s_waitcnt
|
||||
# GFX10-NEXT: s_waitcnt_vscnt
|
||||
# GCN-NEXT: s_endpgm
|
||||
name: lo_to_lo_samereg
|
||||
tracksRegLiveness: true
|
||||
|
@ -61,7 +60,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}lo_to_hi_samereg:
|
||||
# GCN: v_pk_add_u16 v0, v0, 0 op_sel_hi:[0,0]
|
||||
# GCN: v_mov_b32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
name: lo_to_hi_samereg
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -72,7 +71,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}hi_to_lo_samereg:
|
||||
# GCN: v_pk_add_u16 v0, v0, 0 op_sel:[1,0] op_sel_hi:[1,0]
|
||||
# GCN: v_mov_b32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
|
||||
name: hi_to_lo_samereg
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -84,6 +83,7 @@ body: |
|
|||
|
||||
# GCN-LABEL: {{^}}hi_to_hi_samereg:
|
||||
# GCN: s_waitcnt
|
||||
# GFX10-NEXT: s_waitcnt_vscnt
|
||||
# GCN-NEXT: s_endpgm
|
||||
name: hi_to_hi_samereg
|
||||
tracksRegLiveness: true
|
||||
|
@ -95,8 +95,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}lo_to_lo_def_livein:
|
||||
# GCN: v_alignbyte_b32 v1, v0, v1, 2
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
name: lo_to_lo_def_livein
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -109,8 +108,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}lo_to_hi_def_livein:
|
||||
# GCN: v_lshrrev_b32_e32 v1, 16, v1
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
name: lo_to_hi_def_livein
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -123,8 +121,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}hi_to_lo_def_livein:
|
||||
# GCN: v_lshlrev_b32_e32 v1, 16, v1
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
|
||||
name: hi_to_lo_def_livein
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -137,8 +134,7 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}hi_to_hi_def_livein:
|
||||
# GCN: v_alignbyte_b32 v1, v1, v0, 2
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
|
||||
name: hi_to_hi_def_livein
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
@ -152,10 +148,8 @@ body: |
|
|||
|
||||
# TODO: This can be coalesced into a VGPR_32 copy
|
||||
# GCN-LABEL: {{^}}lo_to_lo_hi_to_hi:
|
||||
# GCN: v_alignbyte_b32 v1, v0, v1, 2
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
# GCN-NEXT: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
|
||||
# GCN-NEXT: v_mov_b32_e32 v2, v1
|
||||
# GCN-NEXT: s_endpgm
|
||||
name: lo_to_lo_hi_to_hi
|
||||
|
@ -170,10 +164,8 @@ body: |
|
|||
...
|
||||
|
||||
# GCN-LABEL: {{^}}lo_to_hi_hi_to_lo:
|
||||
# GCN: v_lshlrev_b32_e32 v1, 16, v1
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
|
||||
# GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
|
||||
# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
|
||||
# GCN-NEXT: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
|
||||
# GCN-NEXT: v_mov_b32_e32 v2, v1
|
||||
# GCN-NEXT: s_endpgm
|
||||
name: lo_to_hi_hi_to_lo
|
||||
|
@ -190,6 +182,7 @@ body: |
|
|||
# NB: a copy of an undef value is simply killed instead of being expanded
|
||||
# GCN-LABEL: {{^}}lo_to_lo_undef:
|
||||
# GCN: s_waitcnt
|
||||
# GFX10-NEXT: s_waitcnt_vscnt
|
||||
# GCN-NEXT: v_mov_b32_e32 v2, v1
|
||||
# GCN-NEXT: s_endpgm
|
||||
name: lo_to_lo_undef
|
||||
|
|
Loading…
Reference in New Issue