[AMDGPU] gfx11 CodeGen for new DPP instructions

Modifies the GCNDPPCombine pass to enable DPP formation for the new DPP
instruction in gfx11, namely VOP3 encoded instructions with DPP and VOPC
with DPP.

Depends on D128656

Reviewed By: #amdgpu, rampitec

Differential Revision: https://reviews.llvm.org/D128682
This commit is contained in:
Joe Nash 2022-06-27 13:20:21 -04:00
parent cc6462a475
commit 0483c91eee
12 changed files with 1092 additions and 50 deletions

View File

@ -143,13 +143,20 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
}
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
auto DPP32 = AMDGPU::getDPPOp32(Op);
int DPP32 = AMDGPU::getDPPOp32(Op);
if (IsShrinkable) {
assert(DPP32 == -1);
auto E32 = AMDGPU::getVOPe32(Op);
int E32 = AMDGPU::getVOPe32(Op);
DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
}
return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32;
if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
return DPP32;
int DPP64 = -1;
if (ST->hasVOP3DPP())
DPP64 = AMDGPU::getDPPOp64(Op);
if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
return DPP64;
return -1;
}
// tracks the register operand definition and returns:
@ -188,6 +195,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
bool HasVOP3DPP = ST->hasVOP3DPP();
auto OrigOp = OrigMI.getOpcode();
auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
if (DPPOp == -1) {
@ -201,10 +209,18 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
bool Fail = false;
do {
auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
assert(Dst);
DPPInst.add(*Dst);
int NumOperands = 1;
int NumOperands = 0;
if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
DPPInst.add(*Dst);
++NumOperands;
}
if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
DPPInst.add(*SDst);
++NumOperands;
}
// If we shrunk a 64bit vop3b to 32bits, just ignore the sdst
}
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
if (OldIdx != -1) {
@ -230,7 +246,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
AMDGPU::OpName::src0_modifiers)) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src0_modifiers));
assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
assert(HasVOP3DPP ||
(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
DPPInst.addImm(Mod0->getImm());
++NumOperands;
} else if (AMDGPU::getNamedOperandIdx(DPPOp,
@ -253,7 +270,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
AMDGPU::OpName::src1_modifiers)) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src1_modifiers));
assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
assert(HasVOP3DPP ||
(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
DPPInst.addImm(Mod1->getImm());
++NumOperands;
} else if (AMDGPU::getNamedOperandIdx(DPPOp,
@ -261,7 +279,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.addImm(0);
++NumOperands;
}
if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
if (Src1) {
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
Fail = true;
@ -270,8 +289,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.add(*Src1);
++NumOperands;
}
if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
if (auto *Mod2 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) {
assert(NumOperands ==
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
assert(HasVOP3DPP ||
(0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
DPPInst.addImm(Mod2->getImm());
++NumOperands;
}
auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
if (Src2) {
if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
@ -279,8 +307,62 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
break;
}
DPPInst.add(*Src2);
++NumOperands;
}
if (HasVOP3DPP) {
auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
if (ClampOpr &&
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::clamp) != -1) {
DPPInst.addImm(ClampOpr->getImm());
}
auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
if (VdstInOpr &&
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::vdst_in) != -1) {
DPPInst.add(*VdstInOpr);
}
auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
if (OmodOpr &&
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::omod) != -1) {
DPPInst.addImm(OmodOpr->getImm());
}
// Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to
// all 1.
if (auto *OpSelOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
auto OpSel = OpSelOpr->getImm();
if (OpSel != 0) {
LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n");
Fail = true;
break;
}
if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel) != -1)
DPPInst.addImm(OpSel);
}
if (auto *OpSelHiOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
auto OpSelHi = OpSelHiOpr->getImm();
// Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
// the bitmask for 3 op_sel_hi bits set
assert(Src2 && "Expected vop3p with 3 operands");
if (OpSelHi != 7) {
LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n");
Fail = true;
break;
}
if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel_hi) != -1)
DPPInst.addImm(OpSelHi);
}
auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
if (NegOpr &&
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_lo) != -1) {
DPPInst.addImm(NegOpr->getImm());
}
auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
if (NegHiOpr &&
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_hi) != -1) {
DPPInst.addImm(NegHiOpr->getImm());
}
}
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
@ -531,8 +613,16 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
}
bool IsShrinkable = isShrinkable(OrigMI);
if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
if (!(IsShrinkable ||
((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
TII->isVOP3(OrigOp)) &&
ST->hasVOP3DPP()) ||
TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n");
break;
}
if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n");
break;
}
@ -543,9 +633,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
break;
}
auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
assert(Src0 && "Src1 without Src0?");
if (Src1 && Src1->isIdenticalTo(*Src0)) {
assert(Src1->isReg());
if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
(Src2 && Src2->isIdenticalTo(*Src0)))) ||
(Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
(Src2 && Src2->isIdenticalTo(*Src1))))) {
LLVM_DEBUG(
dbgs()
<< " " << OrigMI

View File

@ -1241,6 +1241,9 @@ namespace AMDGPU {
LLVM_READONLY
int getDPPOp32(uint16_t Opcode);
LLVM_READONLY
int getDPPOp64(uint16_t Opcode);
LLVM_READONLY
int getBasicFromSDWAOp(uint16_t Opcode);

View File

@ -2194,21 +2194,21 @@ class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp,
"$sdst",
"$vdst"),
""); // use $sdst for VOPC
string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
string isrc1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1",
" $src1,"));
string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
string src0nomods = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
string src1nomods = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1",
" $src1,"));
string src2nomods = !if(!eq(NumSrcArgs, 3), " $src2", "");
string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string fsrc1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
" $src1_modifiers,"));
string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
string src0mods = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1mods = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
" $src1_modifiers,"));
string src2mods = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
string src0 = !if(Src0HasMods, fsrc0, isrc0);
string src1 = !if(Src1HasMods, fsrc1, isrc1);
string src2 = !if(Src2HasMods, fsrc2, isrc2);
string src0 = !if(Src0HasMods, src0mods, src0nomods);
string src1 = !if(Src1HasMods, src1mods, src1nomods);
string src2 = !if(Src2HasMods, src2mods, src2nomods);
string opsel = !if(HasOpSel, "$op_sel", "");
string 3PMods = !if(IsVOP3P,
!if(HasOpSel, "$op_sel_hi", "")
@ -2559,8 +2559,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
// the asm operand name via this HasModifiers flag
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp,
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods,
HasSrc2FloatMods, DstVT >.ret;
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
HasModifiers, DstVT>.ret;
field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret;
field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret;
field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret;
@ -2800,6 +2800,14 @@ def getDPPOp32 : InstrMapping {
let ValueCols = [["DPP"]];
}
def getDPPOp64 : InstrMapping {
let FilterClass = "VOP";
let RowFields = ["OpName"];
let ColFields = ["AsmVariantName"];
let KeyCol = ["VOP3"];
let ValueCols = [["VOP3_DPP"]];
}
// Maps an commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "Commutable_REV";

View File

@ -1215,7 +1215,9 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
let HasModifiers =
!if (Features.IsMAI, 0,
!or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
}
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> {

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s
; FIXME: Merge with DAG test
@ -29,6 +30,18 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x02,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: dpp_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8]
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01]
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0
store i32 %tmp0, i32 addrspace(1)* %out
ret void
@ -58,6 +71,18 @@ define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) {
; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: mov_dpp64_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8]
; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00]
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX11-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0
store i64 %tmp0, i64 addrspace(1)* %out
ret void

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
; GFX8-LABEL: dpp_test:
@ -29,6 +30,19 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: dpp_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
store i32 %tmp0, i32 addrspace(1)* %out
ret void
@ -65,6 +79,20 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp64_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
%load = load i64, i64 addrspace(1)* %gep

View File

@ -1,9 +1,10 @@
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
; GCN-LABEL: {{^}}dpp64_ceil:
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
; DPP64: v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) {
@ -19,7 +20,7 @@ define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) {
}
; GCN-LABEL: {{^}}dpp64_rcp:
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
; DPP64: v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp64_rcp(i64 addrspace(1)* %arg, i64 %in1) {
@ -50,12 +51,12 @@ define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(i64 addrspace(1)* %arg, i64
}
; GCN-LABEL: {{^}}dpp64_div:
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX10-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GCN: v_div_scale_f64
; GCN: v_rcp_f64_e32
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX10PLUS-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GCN: v_div_scale_f64
; GCN: v_rcp_f64_e32
define amdgpu_kernel void @dpp64_div(i64 addrspace(1)* %arg, i64 %in1) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id

View File

@ -1,8 +1,9 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_dword [[V:v[0-9]+]],
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GCN: v_add_{{(nc_)?}}u32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_add(i32 addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -15,7 +16,7 @@ define amdgpu_kernel void @dpp_add(i32 addrspace(1)* %arg) {
}
; GCN-LABEL: {{^}}dpp_ceil:
; GCN: global_load_dword [[V:v[0-9]+]],
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GCN: v_ceil_f32_dpp [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_ceil(i32 addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -30,7 +31,7 @@ define amdgpu_kernel void @dpp_ceil(i32 addrspace(1)* %arg) {
}
; GCN-LABEL: {{^}}dpp_fadd:
; GCN: global_load_dword [[V:v[0-9]+]],
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GCN: v_add_f32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_fadd(i32 addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -0,0 +1,810 @@
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
---
# GCN-label: name: vop3
# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
name: vop3
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr2
%3:vgpr_32 = IMPLICIT_DEF
%4:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec
%5:sreg_32_xm0_xexec = IMPLICIT_DEF
%6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64 %4, %1, %5, 1, implicit $exec
%8:vgpr_32 = V_CVT_PK_U8_F32_e64 4, %4, 2, %2, 2, %1, 1, implicit $mode, implicit $exec
; should not be combined because src2 literal is illegal
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
; should not be combined because src1 imm is illegal
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
...
# Regression test for src_modifiers on base u16 opcode
# GCN-label: name: vop3_u16
# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 4, %5, 8, %5, 0, 0, 1, 15, 15, 1, implicit $exec
name: vop3_u16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr2
%3:vgpr_32 = IMPLICIT_DEF
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec
%6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec
%7:vgpr_32 = V_ADD_NC_U16_e64 4, %6, 8, %5, 0, 0, implicit $exec
...
name: vop3p
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; GCN-LABEL: name: vop3p
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
; GCN: [[V_DOT2_F32_F16_:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp]], 0, [[COPY]], 0, [[COPY2]], 0, 5, 0, 0, 0, implicit $mode, implicit $exec
; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
; GCN: [[V_DOT2_F32_F16_1:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, 4, 0, 0, implicit $mode, implicit $exec
; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 13, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec
; GCN: [[V_FMA_MIX_F32_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
; GCN: [[V_FMA_MIXLO_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 0, [[COPY2]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
; GCN: [[V_FMA_MIXHI_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, [[COPY]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr2
%3:vgpr_32 = IMPLICIT_DEF
; this should not be combined because op_sel is not zero
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%5:vgpr_32 = V_DOT2_F32_F16 0, %4, 0, %0, 0, %2, 0, 5, 0, 0, 0, implicit $mode, implicit $exec
; this should not be combined because op_sel_hi is not all set
%6:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%7:vgpr_32 = V_DOT2_F32_F16 0, %6, 0, %0, 0, %2, 0, 0, 4, 0, 0, implicit $mode, implicit $exec
%8:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 13, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec
%10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%11:vgpr_32 = V_FMA_MIX_F32 8, %10, 8, %0, 8, %2, 1, 0, 7, implicit $mode, implicit $exec
%12:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%13:vgpr_32 = V_FMA_MIXLO_F16 8, %12, 8, %0, 8, %2, 0, %2, 0, 7, implicit $mode, implicit $exec
%14:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%15:vgpr_32 = V_FMA_MIXHI_F16 8, %14, 8, %0, 8, %2, 1, %0, 0, 7, implicit $mode, implicit $exec
...
# when the DPP source isn't a src0 operand the operation should be commuted if possible
# GCN-LABEL: name: dpp_commute_shrink
# GCN: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
# GCN: %7:vgpr_32 = V_AND_B32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec
# GCN: %10:vgpr_32 = V_MAX_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
# GCN: %13:vgpr_32 = V_MIN_I32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec
# GCN: %16:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
name: dpp_commute_shrink
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
%4:vgpr_32 = V_MUL_U32_U24_e64 %1, %3, 0, implicit $exec
%5:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
%6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 14, 0, implicit $exec
%7:vgpr_32 = V_AND_B32_e64 %1, %6, implicit $exec
%8:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
%9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 0, implicit $exec
%10:vgpr_32 = V_MAX_I32_e64 %1, %9, implicit $exec
%11:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
%12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 15, 14, 0, implicit $exec
%13:vgpr_32 = V_MIN_I32_e64 %1, %12, implicit $exec
%14:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%15:vgpr_32 = V_MOV_B32_dpp %14, %0, 1, 14, 15, 0, implicit $exec
%16:vgpr_32 = V_SUB_U32_e64 %1, %15, 0, implicit $exec
...
# do not combine, dpp arg used twice
# GCN-label: name: dpp_arg_twice
# GCN: %4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec
# GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec
# GCN: %8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec
name: dpp_arg_twice
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
%4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec
%5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
%6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec
%7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
%8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec
...
# when the dpp source isn't a src0 operand the operation should be commuted if possible
# GCN-label: name: dpp_commute_e64
# GCN: %4:vgpr_32 = V_MUL_U32_U24_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec
# GCN: %7:vgpr_32 = V_FMA_F32_e64_dpp %5, 2, %0, 1, %1, 2, %1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %10:vgpr_32 = V_SUBREV_U32_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec
# GCN: %13:vgpr_32, %14:sreg_32_xm0_xexec = V_ADD_CO_U32_e64_dpp %1, %0, %1, 0, 1, 14, 15, 0, implicit $exec
# GCN: %17:vgpr_32, %18:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 5, %16, 0, implicit $exec
name: dpp_commute_e64
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
%4:vgpr_32 = V_MUL_U32_U24_e64 %1, %3, 1, implicit $exec
%5:vgpr_32 = IMPLICIT_DEF
%6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 15, 1, implicit $exec
%7:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %6, 2, %1, 1, 2, implicit $mode, implicit $exec
%8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 0, implicit $exec
%10:vgpr_32 = V_SUB_U32_e64 %1, %9, 1, implicit $exec
%11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 14, 15, 0, implicit $exec
%13:vgpr_32, %14:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %1, %12, 0, implicit $exec
; this cannot be combined because immediate as src0 isn't commutable
%15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%16:vgpr_32 = V_MOV_B32_dpp %15, %0, 1, 14, 15, 0, implicit $exec
%17:vgpr_32, %18:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 5, %16, 0, implicit $exec
...
---
# check for floating point modifiers
# GCN-LABEL: name: add_f32_e64
# GCN: %4:vgpr_32 = V_ADD_F32_e64_dpp %2, 0, %1, 0, %0, 0, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %8:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %10:vgpr_32 = V_ADD_F32_e64_dpp %2, 4, %1, 8, %0, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
name: add_f32_e64
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = IMPLICIT_DEF
; this should be combined as e64
%3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
%4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $mode, implicit $exec
; this should be combined and shrunk as all modifiers are default
%5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
%6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $mode, implicit $exec
; this should be combined and shrunk as modifiers other than abs|neg are default
%7:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
%8:vgpr_32 = V_ADD_F32_e64 1, %7, 2, %0, 0, 0, implicit $mode, implicit $exec
; this should be combined as e64
%9:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
%10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $mode, implicit $exec
...
# check for e64 modifiers
# GCN-LABEL: name: add_u32_e64
# GCN: %4:vgpr_32 = V_ADD_U32_dpp %2, %0, %1, 1, 15, 15, 1, implicit $exec
# GCN: %6:vgpr_32 = V_ADD_U32_e64_dpp %2, %0, %1, 1, 1, 15, 15, 1, implicit $exec
name: add_u32_e64
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = IMPLICIT_DEF
; this should be combined and shrunk as all modifiers are default
%3:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec
%4:vgpr_32 = V_ADD_U32_e64 %3, %1, 0, implicit $exec
; this should be combined as _e64
%5:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec
%6:vgpr_32 = V_ADD_U32_e64 %5, %1, 1, implicit $exec
...
# tests on sequences of dpp consumers
# GCN-LABEL: name: dpp_seq
# GCN: %4:vgpr_32 = V_ADD_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
# GCN: %5:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
# GCN: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
# broken sequence:
# GCN: %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
name: dpp_seq
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
%4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
%5:vgpr_32 = V_SUB_U32_e32 %1, %3, implicit $exec
%6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
%7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
%8:vgpr_32 = V_ADD_U32_e32 %7, %1, implicit $exec
; this breaks the sequence
%9:vgpr_32 = V_SUB_U32_e32 5, %7, implicit $exec
...
# tests on sequences of dpp consumers followed by control flow
# GCN-LABEL: name: dpp_seq_cf
# GCN: %4:vgpr_32 = V_ADD_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
# GCN: %5:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
# GCN: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
name: dpp_seq_cf
tracksRegLiveness: true
body: |
bb.0:
successors: %bb.1, %bb.2
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
%4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
%5:vgpr_32 = V_SUB_U32_e32 %1, %3, implicit $exec
%6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
%7:sreg_32 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
%8:sreg_32 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.2
bb.2:
SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
...
# GCN-LABEL: name: old_in_diff_bb
# GCN: %4:vgpr_32 = V_ADD_U32_dpp %0, %1, %0, 1, 1, 1, 0, implicit $exec
name: old_in_diff_bb
tracksRegLiveness: true
body: |
bb.0:
successors: %bb.1
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
S_BRANCH %bb.1
bb.1:
%3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
%4:vgpr_32 = V_ADD_U32_e32 %3, %0, implicit $exec
...
# old reg def is in diff BB but bound_ctrl:1 - can combine
# GCN-LABEL: name: old_in_diff_bb_bctrl_zero
# GCN: %4:vgpr_32 = V_ADD_U32_dpp {{%[0-9]}}, %0, %1, 1, 15, 15, 1, implicit $exec
name: old_in_diff_bb_bctrl_zero
tracksRegLiveness: true
body: |
bb.0:
successors: %bb.1
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
S_BRANCH %bb.1
bb.1:
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
%4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
...
# EXEC mask changed between def and use - cannot combine
# GCN-LABEL: name: exec_changed
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
name: exec_changed
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
%4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
%5:sreg_64 = COPY $exec, implicit-def $exec
%6:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
...
# test if $old definition is correctly tracked through subreg manipulation pseudos
# GCN-LABEL: name: mul_old_subreg
# GCN: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
name: mul_old_subreg
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vreg_64 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4
%6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec
%7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec
...
# GCN-LABEL: name: add_old_subreg
# GCN: %5:vgpr_32 = V_ADD_U32_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
name: add_old_subreg
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vreg_64 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted
%4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
%5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
...
# GCN-LABEL: name: add_old_subreg_undef
# GCN: %5:vgpr_32 = V_ADD_U32_dpp undef %3.sub1, %1, %0.sub1, 1, 15, 15, 1, implicit $exec
name: add_old_subreg_undef
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vreg_64 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef
%4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 15, 15, 1, implicit $exec
%5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
...
# Test instruction which does not have modifiers in VOP1 form but does in DPP form.
# GCN-LABEL: name: dpp_vop1
# GCN: %3:vgpr_32 = V_CEIL_F32_dpp %0, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
name: dpp_vop1
tracksRegLiveness: true
body: |
bb.0:
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = V_MOV_B32_dpp %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $mode, implicit $exec
...
# Test instruction which does not have modifiers in VOP2 form but does in DPP form.
# GCN-LABEL: name: dpp_min
# GCN: %3:vgpr_32 = V_MIN_F32_dpp %0, 0, undef %2:vgpr_32, 0, undef %4:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
name: dpp_min
tracksRegLiveness: true
body: |
bb.0:
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = V_MOV_B32_dpp %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
%4:vgpr_32 = V_MIN_F32_e32 %2, undef %3:vgpr_32, implicit $mode, implicit $exec
...
# Test an undef old operand
# GCN-LABEL: name: dpp_undef_old
# GCN: %3:vgpr_32 = V_CEIL_F32_dpp undef %1:vgpr_32, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
name: dpp_undef_old
tracksRegLiveness: true
body: |
bb.0:
%2:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $mode, implicit $exec
...
# Do not combine a dpp mov which writes a physreg.
# GCN-LABEL: name: phys_dpp_mov_dst
# GCN: $vgpr0 = V_MOV_B32_dpp undef %0:vgpr_32, undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec
# GCN: %2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $mode, implicit $exec
name: phys_dpp_mov_dst
tracksRegLiveness: true
body: |
bb.0:
$vgpr0 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
%2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $mode, implicit $exec
...
# Do not combine a dpp mov which reads a physreg.
# GCN-LABEL: name: phys_dpp_mov_old_src
# GCN: %0:vgpr_32 = V_MOV_B32_dpp undef $vgpr0, undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec
# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $mode, implicit $exec
name: phys_dpp_mov_old_src
tracksRegLiveness: true
body: |
bb.0:
%1:vgpr_32 = V_MOV_B32_dpp undef $vgpr0, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
%2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $mode, implicit $exec
...
# Do not combine a dpp mov which reads a physreg.
# GCN-LABEL: name: phys_dpp_mov_src
# GCN: %0:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef $vgpr0, 1, 15, 15, 1, implicit $exec
# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $mode, implicit $exec
name: phys_dpp_mov_src
tracksRegLiveness: true
body: |
bb.0:
%1:vgpr_32 = V_MOV_B32_dpp undef %0:vgpr_32, undef $vgpr0, 1, 15, 15, 1, implicit $exec
%2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dpp_reg_sequence_both_combined
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
# GCN: %9:vgpr_32 = IMPLICIT_DEF
# GCN: %8:vgpr_32 = IMPLICIT_DEF
# GCN: %6:vgpr_32 = V_ADD_U32_dpp %9, %1.sub0, %2, 1, 15, 15, 1, implicit $exec
# GCN: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
name: dpp_reg_sequence_both_combined
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: dpp_reg_sequence_first_combined
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
# GCN: %8:vgpr_32 = IMPLICIT_DEF
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
# GCN: %5:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, %4, %subreg.sub1
# GCN: %6:vgpr_32 = V_ADD_U32_dpp %8, %1.sub0, %2, 1, 15, 15, 1, implicit $exec
# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
name: dpp_reg_sequence_first_combined
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: dpp_reg_sequence_second_combined
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec
# GCN: %8:vgpr_32 = IMPLICIT_DEF
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, undef %4:vgpr_32, %subreg.sub1
# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec
# GCN: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
name: dpp_reg_sequence_second_combined
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: dpp_reg_sequence_none_combined
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec
# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
name: dpp_reg_sequence_none_combined
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: dpp_reg_sequence_exec_changed
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
# GCN: S_BRANCH %bb.1
# GCN: bb.1:
# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec
# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
name: dpp_reg_sequence_exec_changed
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
S_BRANCH %bb.1
bb.1:
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: dpp_reg_sequence_subreg
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
# GCN: %6:vreg_64 = REG_SEQUENCE %5.sub0, %subreg.sub0, %5.sub1, %subreg.sub1
# GCN: %7:vgpr_32 = V_ADD_U32_e32 %6.sub0, %2, implicit $exec
# GCN: %8:vgpr_32 = V_ADDC_U32_e32 %6.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
name: dpp_reg_sequence_subreg
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%8:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%5:vreg_64 = REG_SEQUENCE %4.sub0, %subreg.sub0, %4.sub1, %subreg.sub1
%6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %8, implicit $exec
%7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %8, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: dpp_reg_sequence_src2_reject
#GCN: %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
#GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
#GCN: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
#GCN: %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
#GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub0, 1, 2, implicit $mode, implicit $exec
#GCN: %7:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec
name: dpp_reg_sequence_src2_reject
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
; use of dpp arg as src2, reject
%6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub0, 1, 2, implicit $mode, implicit $exec
; cannot commute src0 and src2, and %4.sub0 already rejected, reject
%7:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dpp_reg_sequence_src2
#GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
#GCN: %4:vreg_64 = REG_SEQUENCE undef %2:vgpr_32, %subreg.sub0, %3, %subreg.sub1
#GCN: %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
#GCN: %6:vgpr_32 = V_FMA_F32_e64_dpp %8, 2, %1.sub0, 2, %5, 2, %4.sub1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec
name: dpp_reg_sequence_src2
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
%6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dpp64_add64_impdef
# GCN: %3:vgpr_32 = V_ADD_U32_dpp %1.sub0, %0.sub0, undef %4:vgpr_32, 1, 15, 15, 1, implicit $exec
# GCN: %5:vgpr_32 = V_ADDC_U32_dpp %1.sub1, %0.sub1, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
name: dpp64_add64_impdef
tracksRegLiveness: true
body: |
bb.0:
%0:vreg_64 = IMPLICIT_DEF
%1:vreg_64 = IMPLICIT_DEF
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 1, 15, 15, 1, implicit $exec
%5:vgpr_32 = V_ADD_U32_e32 %2.sub0, undef %4:vgpr_32, implicit $exec
%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: dpp64_add64_undef
# GCN: %3:vgpr_32 = V_ADD_U32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit $exec
# GCN: %5:vgpr_32 = V_ADDC_U32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
name: dpp64_add64_undef
tracksRegLiveness: true
body: |
bb.0:
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
%5:vgpr_32 = V_ADD_U32_e32 %2.sub0, undef %4:vgpr_32, implicit $exec
%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: cndmask_with_src2
# GCN: %5:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %1, %4, implicit $exec
# GCN: %8:vgpr_32 = V_CNDMASK_B32_e64_dpp %2, 4, %1, 0, %1, %7, 1, 15, 15, 1, implicit $exec
name: cndmask_with_src2
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
%4:sreg_32_xm0_xexec = IMPLICIT_DEF
%5:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %1, %4, implicit $exec
; src2 is legal for _e64
%6:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 15, 15, 1, implicit $exec
%7:sreg_32_xm0_xexec = IMPLICIT_DEF
%8:vgpr_32 = V_CNDMASK_B32_e64 4, %6, 0, %1, %7, implicit $exec
...
---
# Make sure flags aren't dropped
# GCN-LABEL: name: flags_add_f32_e64
# GCN: %4:vgpr_32 = nnan nofpexcept V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
name: flags_add_f32_e64
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
%4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0, implicit %4
...
# GCN-LABEL: name: dont_combine_more_than_one_operand
# GCN: %3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
name: dont_combine_more_than_one_operand
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dont_combine_more_than_one_operand_dpp_reg_sequence
# GCN: %5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec
# GCN: %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec
name: dont_combine_more_than_one_operand_dpp_reg_sequence
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec
%6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec
...

View File

@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s
; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
; FIXME: The register allocator / scheduler should be able to avoid these hazards.

View File

@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
; RUN: llc -march=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s
; GCN-LABEL: {{^}}dpp_test:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
@ -26,7 +27,7 @@ define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in
; GCN-LABEL: {{^}}dpp_test1:
; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX10,GFX11: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
@ -51,7 +52,7 @@ bb:
}
; GCN-LABEL: {{^}}update_dpp64_test:
; GCN: load_dwordx2 v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) {
@ -65,12 +66,13 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047
; GCN-DAG: load_dwordx2 v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dpp64_imm_old_test(i64 addrspace(1)* %arg, i64 %in2) {

View File

@ -0,0 +1,68 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
---
name: vopc
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; GCN-LABEL: name: vopc
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN: V_CMP_LT_F32_e32_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
; GCN: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
; GCN: V_CMPX_EQ_I16_e32 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec
; GCN: V_CMP_CLASS_F16_e32_dpp [[DEF]], 0, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_GE_F16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_e64_dpp [[DEF]], 1, [[COPY1]], 0, [[COPY]], 1, 1, 15, 15, 1, implicit $mode, implicit $exec
; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
; GCN: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp1]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
; GCN: V_CMP_CLASS_F32_e32_dpp [[DEF]], 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
; GCN: V_CMP_NGE_F16_e32_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
; GCN: [[V_CMP_NGE_F16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_e64_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
; GCN: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F16_e64_dpp]], 10101, implicit-def $scc
; GCN: V_CMP_GT_I32_e32_dpp [[DEF]], [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr2
%3:vgpr_32 = IMPLICIT_DEF
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
V_CMP_LT_F32_e32 %4, %0, implicit-def $vcc, implicit $mode, implicit $exec
; unsafe to combine cmpx
%5:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
V_CMPX_EQ_I16_e32 %5, %0, implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec
%6:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
V_CMP_CLASS_F16_e32 %6, %0, implicit-def $vcc, implicit $mode, implicit $exec
%7:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%8:sgpr_32 = V_CMP_GE_F16_e64 1, %7, 0, %0, 1, implicit $mode, implicit $exec
; unsafe to combine cmpx
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
V_CMPX_GT_U32_nosdst_e64 %9, %0, implicit-def $exec, implicit $mode, implicit $exec
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%12:sgpr_32 = V_CMP_CLASS_F32_e64 2, %11, %0, implicit $mode, implicit $exec
; shrink
%13:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%14:sgpr_32 = V_CMP_NGE_F16_e64 0, %13, 0, %0, 0, implicit $mode, implicit $exec
; do not shrink, sdst used
%15:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%16:sgpr_32 = V_CMP_NGE_F16_e64 0, %15, 0, %0, 0, implicit $mode, implicit $exec
%17:sgpr_32 = S_AND_B32 %16, 10101, implicit-def $scc
; commute
%18:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
V_CMP_LT_I32_e32 %0, %18, implicit-def $vcc, implicit $exec
...