forked from OSchip/llvm-project
[AMDGPU] gfx11 CodeGen for new DPP instructions
Modifies the GCNDPPCombine pass to enable DPP formation for the new DPP instructions in gfx11, namely VOP3-encoded instructions with DPP and VOPC with DPP. Depends on D128656. Reviewed By: #amdgpu, rampitec. Differential Revision: https://reviews.llvm.org/D128682
This commit is contained in:
parent
cc6462a475
commit
0483c91eee
|
@ -143,13 +143,20 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
|
|||
}
|
||||
|
||||
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
|
||||
auto DPP32 = AMDGPU::getDPPOp32(Op);
|
||||
int DPP32 = AMDGPU::getDPPOp32(Op);
|
||||
if (IsShrinkable) {
|
||||
assert(DPP32 == -1);
|
||||
auto E32 = AMDGPU::getVOPe32(Op);
|
||||
int E32 = AMDGPU::getVOPe32(Op);
|
||||
DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
|
||||
}
|
||||
return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32;
|
||||
if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
|
||||
return DPP32;
|
||||
int DPP64 = -1;
|
||||
if (ST->hasVOP3DPP())
|
||||
DPP64 = AMDGPU::getDPPOp64(Op);
|
||||
if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
|
||||
return DPP64;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// tracks the register operand definition and returns:
|
||||
|
@ -188,6 +195,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
|
|||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
|
||||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
|
||||
|
||||
bool HasVOP3DPP = ST->hasVOP3DPP();
|
||||
auto OrigOp = OrigMI.getOpcode();
|
||||
auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
|
||||
if (DPPOp == -1) {
|
||||
|
@ -201,10 +209,18 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
|
|||
|
||||
bool Fail = false;
|
||||
do {
|
||||
auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
|
||||
assert(Dst);
|
||||
DPPInst.add(*Dst);
|
||||
int NumOperands = 1;
|
||||
int NumOperands = 0;
|
||||
if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
|
||||
DPPInst.add(*Dst);
|
||||
++NumOperands;
|
||||
}
|
||||
if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
|
||||
if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
|
||||
DPPInst.add(*SDst);
|
||||
++NumOperands;
|
||||
}
|
||||
// If we shrunk a 64bit vop3b to 32bits, just ignore the sdst
|
||||
}
|
||||
|
||||
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
|
||||
if (OldIdx != -1) {
|
||||
|
@ -230,7 +246,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
|
|||
AMDGPU::OpName::src0_modifiers)) {
|
||||
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
|
||||
AMDGPU::OpName::src0_modifiers));
|
||||
assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
|
||||
assert(HasVOP3DPP ||
|
||||
(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
|
||||
DPPInst.addImm(Mod0->getImm());
|
||||
++NumOperands;
|
||||
} else if (AMDGPU::getNamedOperandIdx(DPPOp,
|
||||
|
@ -253,7 +270,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
|
|||
AMDGPU::OpName::src1_modifiers)) {
|
||||
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
|
||||
AMDGPU::OpName::src1_modifiers));
|
||||
assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
|
||||
assert(HasVOP3DPP ||
|
||||
(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
|
||||
DPPInst.addImm(Mod1->getImm());
|
||||
++NumOperands;
|
||||
} else if (AMDGPU::getNamedOperandIdx(DPPOp,
|
||||
|
@ -261,7 +279,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
|
|||
DPPInst.addImm(0);
|
||||
++NumOperands;
|
||||
}
|
||||
if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
|
||||
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
|
||||
if (Src1) {
|
||||
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
|
||||
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
|
||||
Fail = true;
|
||||
|
@ -270,8 +289,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
|
|||
DPPInst.add(*Src1);
|
||||
++NumOperands;
|
||||
}
|
||||
|
||||
if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
|
||||
if (auto *Mod2 =
|
||||
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) {
|
||||
assert(NumOperands ==
|
||||
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
|
||||
assert(HasVOP3DPP ||
|
||||
(0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
|
||||
DPPInst.addImm(Mod2->getImm());
|
||||
++NumOperands;
|
||||
}
|
||||
auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
|
||||
if (Src2) {
|
||||
if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
|
||||
!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
|
||||
LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
|
||||
|
@ -279,8 +307,62 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
|
|||
break;
|
||||
}
|
||||
DPPInst.add(*Src2);
|
||||
++NumOperands;
|
||||
}
|
||||
if (HasVOP3DPP) {
|
||||
auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
|
||||
if (ClampOpr &&
|
||||
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::clamp) != -1) {
|
||||
DPPInst.addImm(ClampOpr->getImm());
|
||||
}
|
||||
auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
|
||||
if (VdstInOpr &&
|
||||
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::vdst_in) != -1) {
|
||||
DPPInst.add(*VdstInOpr);
|
||||
}
|
||||
auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
|
||||
if (OmodOpr &&
|
||||
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::omod) != -1) {
|
||||
DPPInst.addImm(OmodOpr->getImm());
|
||||
}
|
||||
// Validate that OP_SEL is set to all 0 and that OP_SEL_HI is set to
|
||||
// all 1.
|
||||
if (auto *OpSelOpr =
|
||||
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
|
||||
auto OpSel = OpSelOpr->getImm();
|
||||
if (OpSel != 0) {
|
||||
LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n");
|
||||
Fail = true;
|
||||
break;
|
||||
}
|
||||
if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel) != -1)
|
||||
DPPInst.addImm(OpSel);
|
||||
}
|
||||
if (auto *OpSelHiOpr =
|
||||
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
|
||||
auto OpSelHi = OpSelHiOpr->getImm();
|
||||
// Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
|
||||
// the bitmask for 3 op_sel_hi bits set
|
||||
assert(Src2 && "Expected vop3p with 3 operands");
|
||||
if (OpSelHi != 7) {
|
||||
LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n");
|
||||
Fail = true;
|
||||
break;
|
||||
}
|
||||
if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel_hi) != -1)
|
||||
DPPInst.addImm(OpSelHi);
|
||||
}
|
||||
auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
|
||||
if (NegOpr &&
|
||||
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_lo) != -1) {
|
||||
DPPInst.addImm(NegOpr->getImm());
|
||||
}
|
||||
auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
|
||||
if (NegHiOpr &&
|
||||
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_hi) != -1) {
|
||||
DPPInst.addImm(NegHiOpr->getImm());
|
||||
}
|
||||
}
|
||||
|
||||
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
|
||||
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
|
||||
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
|
||||
|
@ -531,8 +613,16 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
|
|||
}
|
||||
|
||||
bool IsShrinkable = isShrinkable(OrigMI);
|
||||
if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
|
||||
LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
|
||||
if (!(IsShrinkable ||
|
||||
((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
|
||||
TII->isVOP3(OrigOp)) &&
|
||||
ST->hasVOP3DPP()) ||
|
||||
TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
|
||||
LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n");
|
||||
break;
|
||||
}
|
||||
if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
|
||||
LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n");
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -543,9 +633,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
|
|||
break;
|
||||
}
|
||||
|
||||
auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
|
||||
assert(Src0 && "Src1 without Src0?");
|
||||
if (Src1 && Src1->isIdenticalTo(*Src0)) {
|
||||
assert(Src1->isReg());
|
||||
if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
|
||||
(Src2 && Src2->isIdenticalTo(*Src0)))) ||
|
||||
(Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
|
||||
(Src2 && Src2->isIdenticalTo(*Src1))))) {
|
||||
LLVM_DEBUG(
|
||||
dbgs()
|
||||
<< " " << OrigMI
|
||||
|
|
|
@ -1241,6 +1241,9 @@ namespace AMDGPU {
|
|||
LLVM_READONLY
|
||||
int getDPPOp32(uint16_t Opcode);
|
||||
|
||||
LLVM_READONLY
|
||||
int getDPPOp64(uint16_t Opcode);
|
||||
|
||||
LLVM_READONLY
|
||||
int getBasicFromSDWAOp(uint16_t Opcode);
|
||||
|
||||
|
|
|
@ -2194,21 +2194,21 @@ class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp,
|
|||
"$sdst",
|
||||
"$vdst"),
|
||||
""); // use $sdst for VOPC
|
||||
string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
|
||||
string isrc1 = !if(!eq(NumSrcArgs, 1), "",
|
||||
!if(!eq(NumSrcArgs, 2), " $src1",
|
||||
" $src1,"));
|
||||
string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
|
||||
string src0nomods = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
|
||||
string src1nomods = !if(!eq(NumSrcArgs, 1), "",
|
||||
!if(!eq(NumSrcArgs, 2), " $src1",
|
||||
" $src1,"));
|
||||
string src2nomods = !if(!eq(NumSrcArgs, 3), " $src2", "");
|
||||
|
||||
string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
|
||||
string fsrc1 = !if(!eq(NumSrcArgs, 1), "",
|
||||
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
|
||||
" $src1_modifiers,"));
|
||||
string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
|
||||
string src0mods = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
|
||||
string src1mods = !if(!eq(NumSrcArgs, 1), "",
|
||||
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
|
||||
" $src1_modifiers,"));
|
||||
string src2mods = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
|
||||
|
||||
string src0 = !if(Src0HasMods, fsrc0, isrc0);
|
||||
string src1 = !if(Src1HasMods, fsrc1, isrc1);
|
||||
string src2 = !if(Src2HasMods, fsrc2, isrc2);
|
||||
string src0 = !if(Src0HasMods, src0mods, src0nomods);
|
||||
string src1 = !if(Src1HasMods, src1mods, src1nomods);
|
||||
string src2 = !if(Src2HasMods, src2mods, src2nomods);
|
||||
string opsel = !if(HasOpSel, "$op_sel", "");
|
||||
string 3PMods = !if(IsVOP3P,
|
||||
!if(HasOpSel, "$op_sel_hi", "")
|
||||
|
@ -2559,8 +2559,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
|
|||
// the asm operand name via this HasModifiers flag
|
||||
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
|
||||
field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp,
|
||||
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods,
|
||||
HasSrc2FloatMods, DstVT >.ret;
|
||||
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
|
||||
HasModifiers, DstVT>.ret;
|
||||
field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret;
|
||||
field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret;
|
||||
field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret;
|
||||
|
@ -2800,6 +2800,14 @@ def getDPPOp32 : InstrMapping {
|
|||
let ValueCols = [["DPP"]];
|
||||
}
|
||||
|
||||
// Maps a VOP3 opcode to its VOP3_DPP counterpart: among instructions of
// class VOP that share the same OpName, the entry whose AsmVariantName is
// "VOP3" is the key and the entry whose AsmVariantName is "VOP3_DPP" is the
// mapped value.
def getDPPOp64 : InstrMapping {
|
||||
let FilterClass = "VOP";
|
||||
let RowFields = ["OpName"];
|
||||
let ColFields = ["AsmVariantName"];
|
||||
let KeyCol = ["VOP3"];
|
||||
let ValueCols = [["VOP3_DPP"]];
|
||||
}
|
||||
|
||||
// Maps a commuted opcode to its original version
|
||||
def getCommuteOrig : InstrMapping {
|
||||
let FilterClass = "Commutable_REV";
|
||||
|
|
|
@ -1215,7 +1215,9 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
|
|||
let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
|
||||
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
|
||||
|
||||
let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
|
||||
let HasModifiers =
|
||||
!if (Features.IsMAI, 0,
|
||||
!or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
|
||||
}
|
||||
|
||||
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> {
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s
|
||||
|
||||
; FIXME: Merge with DAG test
|
||||
|
||||
|
@ -29,6 +30,18 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
|
|||
; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x02,0x00]
|
||||
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
|
||||
;
|
||||
; GFX11-LABEL: dpp_test:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
|
||||
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8]
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8]
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
|
||||
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01]
|
||||
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00]
|
||||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
|
||||
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
|
||||
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0
|
||||
store i32 %tmp0, i32 addrspace(1)* %out
|
||||
ret void
|
||||
|
@ -58,6 +71,18 @@ define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) {
|
|||
; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
|
||||
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
|
||||
;
|
||||
; GFX11-LABEL: mov_dpp64_test:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00]
|
||||
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
|
||||
; GFX11-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00]
|
||||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
|
||||
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
|
||||
%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0
|
||||
store i64 %tmp0, i64 addrspace(1)* %out
|
||||
ret void
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
|
||||
|
||||
define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
|
||||
; GFX8-LABEL: dpp_test:
|
||||
|
@ -29,6 +30,19 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
|
|||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: dpp_test:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
||||
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX11-NEXT: s_endpgm
|
||||
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
|
||||
store i32 %tmp0, i32 addrspace(1)* %out
|
||||
ret void
|
||||
|
@ -65,6 +79,20 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
|
|||
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
|
||||
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: update_dpp64_test:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
|
||||
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
|
||||
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
|
||||
; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
|
||||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX11-NEXT: s_endpgm
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
|
||||
%load = load i64, i64 addrspace(1)* %gep
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
|
||||
|
||||
; GCN-LABEL: {{^}}dpp64_ceil:
|
||||
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
|
||||
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
|
||||
; DPP64: v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) {
|
||||
|
@ -19,7 +20,7 @@ define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dpp64_rcp:
|
||||
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
|
||||
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
|
||||
; DPP64: v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
define amdgpu_kernel void @dpp64_rcp(i64 addrspace(1)* %arg, i64 %in1) {
|
||||
|
@ -50,12 +51,12 @@ define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(i64 addrspace(1)* %arg, i64
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dpp64_div:
|
||||
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
|
||||
; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
; GFX10-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
; GCN: v_div_scale_f64
|
||||
; GCN: v_rcp_f64_e32
|
||||
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
|
||||
; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
; GFX10PLUS-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
; GCN: v_div_scale_f64
|
||||
; GCN: v_rcp_f64_e32
|
||||
define amdgpu_kernel void @dpp64_div(i64 addrspace(1)* %arg, i64 %in1) {
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
|
||||
|
||||
; GCN-LABEL: {{^}}dpp_add:
|
||||
; GCN: global_load_dword [[V:v[0-9]+]],
|
||||
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
|
||||
; GCN: v_add_{{(nc_)?}}u32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
define amdgpu_kernel void @dpp_add(i32 addrspace(1)* %arg) {
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -15,7 +16,7 @@ define amdgpu_kernel void @dpp_add(i32 addrspace(1)* %arg) {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dpp_ceil:
|
||||
; GCN: global_load_dword [[V:v[0-9]+]],
|
||||
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
|
||||
; GCN: v_ceil_f32_dpp [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
define amdgpu_kernel void @dpp_ceil(i32 addrspace(1)* %arg) {
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -30,7 +31,7 @@ define amdgpu_kernel void @dpp_ceil(i32 addrspace(1)* %arg) {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dpp_fadd:
|
||||
; GCN: global_load_dword [[V:v[0-9]+]],
|
||||
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
|
||||
; GCN: v_add_f32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
|
||||
define amdgpu_kernel void @dpp_fadd(i32 addrspace(1)* %arg) {
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
|
|
@ -0,0 +1,810 @@
|
|||
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
|
||||
|
||||
---
|
||||
|
||||
# GCN-LABEL: name: vop3
|
||||
# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
|
||||
# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
|
||||
name: vop3
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = COPY $vgpr2
|
||||
%3:vgpr_32 = IMPLICIT_DEF
|
||||
%4:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec
|
||||
|
||||
%5:sreg_32_xm0_xexec = IMPLICIT_DEF
|
||||
%6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64 %4, %1, %5, 1, implicit $exec
|
||||
|
||||
%8:vgpr_32 = V_CVT_PK_U8_F32_e64 4, %4, 2, %2, 2, %1, 1, implicit $mode, implicit $exec
|
||||
|
||||
; should not be combined because src2 literal is illegal
|
||||
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
|
||||
|
||||
; should not be combined because src1 imm is illegal
|
||||
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# Regression test for src_modifiers on base u16 opcode
|
||||
# GCN-LABEL: name: vop3_u16
|
||||
# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 4, %5, 8, %5, 0, 0, 1, 15, 15, 1, implicit $exec
|
||||
name: vop3_u16
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = COPY $vgpr2
|
||||
%3:vgpr_32 = IMPLICIT_DEF
|
||||
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec
|
||||
%6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec
|
||||
%7:vgpr_32 = V_ADD_NC_U16_e64 4, %6, 8, %5, 0, 0, implicit $exec
|
||||
...
|
||||
|
||||
name: vop3p
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
|
||||
; GCN-LABEL: name: vop3p
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; GCN: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
|
||||
; GCN: [[V_DOT2_F32_F16_:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp]], 0, [[COPY]], 0, [[COPY2]], 0, 5, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
|
||||
; GCN: [[V_DOT2_F32_F16_1:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, 4, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 13, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
; GCN: [[V_FMA_MIX_F32_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
; GCN: [[V_FMA_MIXLO_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 0, [[COPY2]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
; GCN: [[V_FMA_MIXHI_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, [[COPY]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = COPY $vgpr2
|
||||
%3:vgpr_32 = IMPLICIT_DEF
|
||||
|
||||
; this should not be combined because op_sel is not zero
|
||||
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%5:vgpr_32 = V_DOT2_F32_F16 0, %4, 0, %0, 0, %2, 0, 5, 0, 0, 0, implicit $mode, implicit $exec
|
||||
|
||||
; this should not be combined because op_sel_hi is not all set
|
||||
%6:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%7:vgpr_32 = V_DOT2_F32_F16 0, %6, 0, %0, 0, %2, 0, 0, 4, 0, 0, implicit $mode, implicit $exec
|
||||
|
||||
%8:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 13, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec
|
||||
|
||||
%10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%11:vgpr_32 = V_FMA_MIX_F32 8, %10, 8, %0, 8, %2, 1, 0, 7, implicit $mode, implicit $exec
|
||||
|
||||
%12:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%13:vgpr_32 = V_FMA_MIXLO_F16 8, %12, 8, %0, 8, %2, 0, %2, 0, 7, implicit $mode, implicit $exec
|
||||
|
||||
%14:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%15:vgpr_32 = V_FMA_MIXHI_F16 8, %14, 8, %0, 8, %2, 1, %0, 0, 7, implicit $mode, implicit $exec
|
||||
|
||||
...
|
||||
|
||||
# when the DPP source isn't a src0 operand the operation should be commuted if possible
|
||||
# GCN-LABEL: name: dpp_commute_shrink
|
||||
# GCN: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %7:vgpr_32 = V_AND_B32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec
|
||||
# GCN: %10:vgpr_32 = V_MAX_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %13:vgpr_32 = V_MIN_I32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec
|
||||
# GCN: %16:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
name: dpp_commute_shrink
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
|
||||
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
|
||||
%4:vgpr_32 = V_MUL_U32_U24_e64 %1, %3, 0, implicit $exec
|
||||
|
||||
%5:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
|
||||
%6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 14, 0, implicit $exec
|
||||
%7:vgpr_32 = V_AND_B32_e64 %1, %6, implicit $exec
|
||||
|
||||
%8:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
|
||||
%9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 0, implicit $exec
|
||||
%10:vgpr_32 = V_MAX_I32_e64 %1, %9, implicit $exec
|
||||
|
||||
%11:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
|
||||
%12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 15, 14, 0, implicit $exec
|
||||
%13:vgpr_32 = V_MIN_I32_e64 %1, %12, implicit $exec
|
||||
|
||||
%14:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%15:vgpr_32 = V_MOV_B32_dpp %14, %0, 1, 14, 15, 0, implicit $exec
|
||||
%16:vgpr_32 = V_SUB_U32_e64 %1, %15, 0, implicit $exec
|
||||
|
||||
...
|
||||
|
||||
# do not combine, dpp arg used twice
|
||||
# GCN-LABEL: name: dpp_arg_twice
|
||||
# GCN: %4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec
|
||||
# GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec
|
||||
# GCN: %8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec
|
||||
name: dpp_arg_twice
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = IMPLICIT_DEF
|
||||
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
%4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec
|
||||
|
||||
%5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
%6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec
|
||||
|
||||
%7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
%8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec
|
||||
|
||||
...
|
||||
|
||||
# when the dpp source isn't a src0 operand the operation should be commuted if possible
|
||||
# GCN-LABEL: name: dpp_commute_e64
|
||||
# GCN: %4:vgpr_32 = V_MUL_U32_U24_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %7:vgpr_32 = V_FMA_F32_e64_dpp %5, 2, %0, 1, %1, 2, %1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
# GCN: %10:vgpr_32 = V_SUBREV_U32_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %13:vgpr_32, %14:sreg_32_xm0_xexec = V_ADD_CO_U32_e64_dpp %1, %0, %1, 0, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %17:vgpr_32, %18:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 5, %16, 0, implicit $exec
|
||||
name: dpp_commute_e64
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
|
||||
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
|
||||
%4:vgpr_32 = V_MUL_U32_U24_e64 %1, %3, 1, implicit $exec
|
||||
|
||||
%5:vgpr_32 = IMPLICIT_DEF
|
||||
%6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 15, 1, implicit $exec
|
||||
%7:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %6, 2, %1, 1, 2, implicit $mode, implicit $exec
|
||||
|
||||
%8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 0, implicit $exec
|
||||
%10:vgpr_32 = V_SUB_U32_e64 %1, %9, 1, implicit $exec
|
||||
|
||||
%11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 14, 15, 0, implicit $exec
|
||||
%13:vgpr_32, %14:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %1, %12, 0, implicit $exec
|
||||
|
||||
; this cannot be combined because immediate as src0 isn't commutable
|
||||
%15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%16:vgpr_32 = V_MOV_B32_dpp %15, %0, 1, 14, 15, 0, implicit $exec
|
||||
%17:vgpr_32, %18:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 5, %16, 0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
|
||||
# check for floating point modifiers
|
||||
# GCN-LABEL: name: add_f32_e64
|
||||
# GCN: %4:vgpr_32 = V_ADD_F32_e64_dpp %2, 0, %1, 0, %0, 0, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
# GCN: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
# GCN: %8:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
# GCN: %10:vgpr_32 = V_ADD_F32_e64_dpp %2, 4, %1, 8, %0, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
|
||||
name: add_f32_e64
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = IMPLICIT_DEF
|
||||
|
||||
; this should be combined as e64
|
||||
%3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $mode, implicit $exec
|
||||
|
||||
; this should be combined and shrunk as all modifiers are default
|
||||
%5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
|
||||
%6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $mode, implicit $exec
|
||||
|
||||
; this should be combined and shrunk as modifiers other than abs|neg are default
|
||||
%7:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
|
||||
%8:vgpr_32 = V_ADD_F32_e64 1, %7, 2, %0, 0, 0, implicit $mode, implicit $exec
|
||||
|
||||
; this should be combined as e64
|
||||
%9:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
|
||||
%10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# check for e64 modifiers
|
||||
# GCN-LABEL: name: add_u32_e64
|
||||
# GCN: %4:vgpr_32 = V_ADD_U32_dpp %2, %0, %1, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %6:vgpr_32 = V_ADD_U32_e64_dpp %2, %0, %1, 1, 1, 15, 15, 1, implicit $exec
|
||||
|
||||
name: add_u32_e64
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = IMPLICIT_DEF
|
||||
|
||||
; this should be combined and shrunk as all modifiers are default
|
||||
%3:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
%4:vgpr_32 = V_ADD_U32_e64 %3, %1, 0, implicit $exec
|
||||
|
||||
; this should be combined as _e64
|
||||
%5:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
%6:vgpr_32 = V_ADD_U32_e64 %5, %1, 1, implicit $exec
|
||||
...
|
||||
|
||||
# tests on sequences of dpp consumers
|
||||
# GCN-LABEL: name: dpp_seq
|
||||
# GCN: %4:vgpr_32 = V_ADD_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %5:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
# broken sequence:
|
||||
# GCN: %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
|
||||
|
||||
name: dpp_seq
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
|
||||
%4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
|
||||
%5:vgpr_32 = V_SUB_U32_e32 %1, %3, implicit $exec
|
||||
%6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
|
||||
|
||||
%7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
|
||||
%8:vgpr_32 = V_ADD_U32_e32 %7, %1, implicit $exec
|
||||
; this breaks the sequence
|
||||
%9:vgpr_32 = V_SUB_U32_e32 5, %7, implicit $exec
|
||||
...
|
||||
|
||||
# tests on sequences of dpp consumers followed by control flow
|
||||
# GCN-LABEL: name: dpp_seq_cf
|
||||
# GCN: %4:vgpr_32 = V_ADD_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %5:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
# GCN: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
|
||||
|
||||
name: dpp_seq_cf
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
successors: %bb.1, %bb.2
|
||||
liveins: $vgpr0, $vgpr1
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
|
||||
%4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
|
||||
%5:vgpr_32 = V_SUB_U32_e32 %1, %3, implicit $exec
|
||||
%6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
|
||||
|
||||
%7:sreg_32 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
|
||||
%8:sreg_32 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2
|
||||
|
||||
bb.2:
|
||||
SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: old_in_diff_bb
|
||||
# GCN: %4:vgpr_32 = V_ADD_U32_dpp %0, %1, %0, 1, 1, 1, 0, implicit $exec
|
||||
|
||||
name: old_in_diff_bb
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
|
||||
%4:vgpr_32 = V_ADD_U32_e32 %3, %0, implicit $exec
|
||||
...
|
||||
|
||||
# old reg def is in diff BB but bound_ctrl:1 - can combine
|
||||
# GCN-LABEL: name: old_in_diff_bb_bctrl_zero
|
||||
# GCN: %4:vgpr_32 = V_ADD_U32_dpp {{%[0-9]}}, %0, %1, 1, 15, 15, 1, implicit $exec
|
||||
|
||||
name: old_in_diff_bb_bctrl_zero
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
%4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
|
||||
...
|
||||
|
||||
# EXEC mask changed between def and use - cannot combine
|
||||
# GCN-LABEL: name: exec_changed
|
||||
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
|
||||
name: exec_changed
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
%4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
|
||||
%5:sreg_64 = COPY $exec, implicit-def $exec
|
||||
%6:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec
|
||||
...
|
||||
|
||||
# test if $old definition is correctly tracked through subreg manipulation pseudos
|
||||
|
||||
# GCN-LABEL: name: mul_old_subreg
|
||||
# GCN: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
|
||||
|
||||
name: mul_old_subreg
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4
|
||||
%6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec
|
||||
%7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: add_old_subreg
|
||||
# GCN: %5:vgpr_32 = V_ADD_U32_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
|
||||
|
||||
name: add_old_subreg
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted
|
||||
%4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
|
||||
%5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: add_old_subreg_undef
|
||||
# GCN: %5:vgpr_32 = V_ADD_U32_dpp undef %3.sub1, %1, %0.sub1, 1, 15, 15, 1, implicit $exec
|
||||
|
||||
name: add_old_subreg_undef
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef
|
||||
%4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 15, 15, 1, implicit $exec
|
||||
%5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
|
||||
...
|
||||
|
||||
# Test instruction which does not have modifiers in VOP1 form but does in DPP form.
|
||||
# GCN-LABEL: name: dpp_vop1
|
||||
# GCN: %3:vgpr_32 = V_CEIL_F32_dpp %0, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
name: dpp_vop1
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%1:vgpr_32 = IMPLICIT_DEF
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# Test instruction which does not have modifiers in VOP2 form but does in DPP form.
|
||||
# GCN-LABEL: name: dpp_min
|
||||
# GCN: %3:vgpr_32 = V_MIN_F32_dpp %0, 0, undef %2:vgpr_32, 0, undef %4:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
name: dpp_min
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%1:vgpr_32 = IMPLICIT_DEF
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
%4:vgpr_32 = V_MIN_F32_e32 %2, undef %3:vgpr_32, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# Test an undef old operand
|
||||
# GCN-LABEL: name: dpp_undef_old
|
||||
# GCN: %3:vgpr_32 = V_CEIL_F32_dpp undef %1:vgpr_32, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
name: dpp_undef_old
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%2:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# Do not combine a dpp mov which writes a physreg.
|
||||
# GCN-LABEL: name: phys_dpp_mov_dst
|
||||
# GCN: $vgpr0 = V_MOV_B32_dpp undef %0:vgpr_32, undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $mode, implicit $exec
|
||||
name: phys_dpp_mov_dst
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
$vgpr0 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
%2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# Do not combine a dpp mov which reads a physreg as its old operand.
|
||||
# GCN-LABEL: name: phys_dpp_mov_old_src
|
||||
# GCN: %0:vgpr_32 = V_MOV_B32_dpp undef $vgpr0, undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $mode, implicit $exec
|
||||
name: phys_dpp_mov_old_src
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%1:vgpr_32 = V_MOV_B32_dpp undef $vgpr0, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
%2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# Do not combine a dpp mov which reads a physreg as its src operand.
|
||||
# GCN-LABEL: name: phys_dpp_mov_src
|
||||
# GCN: %0:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef $vgpr0, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $mode, implicit $exec
|
||||
name: phys_dpp_mov_src
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%1:vgpr_32 = V_MOV_B32_dpp undef %0:vgpr_32, undef $vgpr0, 1, 15, 15, 1, implicit $exec
|
||||
%2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp_reg_sequence_both_combined
|
||||
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
# GCN: %9:vgpr_32 = IMPLICIT_DEF
|
||||
# GCN: %8:vgpr_32 = IMPLICIT_DEF
|
||||
# GCN: %6:vgpr_32 = V_ADD_U32_dpp %9, %1.sub0, %2, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dpp_reg_sequence_both_combined
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
|
||||
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp_reg_sequence_first_combined
|
||||
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
# GCN: %8:vgpr_32 = IMPLICIT_DEF
|
||||
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
|
||||
# GCN: %5:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, %4, %subreg.sub1
|
||||
# GCN: %6:vgpr_32 = V_ADD_U32_dpp %8, %1.sub0, %2, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dpp_reg_sequence_first_combined
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
|
||||
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp_reg_sequence_second_combined
|
||||
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec
|
||||
# GCN: %8:vgpr_32 = IMPLICIT_DEF
|
||||
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, undef %4:vgpr_32, %subreg.sub1
|
||||
# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec
|
||||
# GCN: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dpp_reg_sequence_second_combined
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
|
||||
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp_reg_sequence_none_combined
|
||||
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec
|
||||
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
|
||||
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
|
||||
# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec
|
||||
# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dpp_reg_sequence_none_combined
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
|
||||
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp_reg_sequence_exec_changed
|
||||
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
|
||||
# GCN: S_BRANCH %bb.1
|
||||
# GCN: bb.1:
|
||||
# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec
|
||||
# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dpp_reg_sequence_exec_changed
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
|
||||
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp_reg_sequence_subreg
|
||||
# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
|
||||
# GCN: %6:vreg_64 = REG_SEQUENCE %5.sub0, %subreg.sub0, %5.sub1, %subreg.sub1
|
||||
# GCN: %7:vgpr_32 = V_ADD_U32_e32 %6.sub0, %2, implicit $exec
|
||||
# GCN: %8:vgpr_32 = V_ADDC_U32_e32 %6.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dpp_reg_sequence_subreg
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%8:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%5:vreg_64 = REG_SEQUENCE %4.sub0, %subreg.sub0, %4.sub1, %subreg.sub1
|
||||
%6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %8, implicit $exec
|
||||
%7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %8, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp_reg_sequence_src2_reject
|
||||
#GCN: %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
#GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
#GCN: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
#GCN: %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
#GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub0, 1, 2, implicit $mode, implicit $exec
|
||||
#GCN: %7:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec
|
||||
name: dpp_reg_sequence_src2_reject
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
; use of dpp arg as src2, reject
|
||||
%6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub0, 1, 2, implicit $mode, implicit $exec
|
||||
; cannot commute src0 and src2, and %4.sub0 already rejected, reject
|
||||
%7:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp_reg_sequence_src2
|
||||
#GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
#GCN: %4:vreg_64 = REG_SEQUENCE undef %2:vgpr_32, %subreg.sub0, %3, %subreg.sub1
|
||||
#GCN: %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
#GCN: %6:vgpr_32 = V_FMA_F32_e64_dpp %8, 2, %1.sub0, 2, %5, 2, %4.sub1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
name: dpp_reg_sequence_src2
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
|
||||
%6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp64_add64_impdef
|
||||
# GCN: %3:vgpr_32 = V_ADD_U32_dpp %1.sub0, %0.sub0, undef %4:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %5:vgpr_32 = V_ADDC_U32_dpp %1.sub1, %0.sub1, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dpp64_add64_impdef
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%0:vreg_64 = IMPLICIT_DEF
|
||||
%1:vreg_64 = IMPLICIT_DEF
|
||||
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 1, 15, 15, 1, implicit $exec
|
||||
%5:vgpr_32 = V_ADD_U32_e32 %2.sub0, undef %4:vgpr_32, implicit $exec
|
||||
%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dpp64_add64_undef
|
||||
# GCN: %3:vgpr_32 = V_ADD_U32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit $exec
|
||||
# GCN: %5:vgpr_32 = V_ADDC_U32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dpp64_add64_undef
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
|
||||
%5:vgpr_32 = V_ADD_U32_e32 %2.sub0, undef %4:vgpr_32, implicit $exec
|
||||
%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
|
||||
# GCN-LABEL: name: cndmask_with_src2
|
||||
# GCN: %5:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %1, %4, implicit $exec
|
||||
# GCN: %8:vgpr_32 = V_CNDMASK_B32_e64_dpp %2, 4, %1, 0, %1, %7, 1, 15, 15, 1, implicit $exec
|
||||
name: cndmask_with_src2
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = IMPLICIT_DEF
|
||||
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
|
||||
%4:sreg_32_xm0_xexec = IMPLICIT_DEF
|
||||
%5:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %1, %4, implicit $exec
|
||||
|
||||
; src2 is legal for _e64
|
||||
%6:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 15, 15, 1, implicit $exec
|
||||
%7:sreg_32_xm0_xexec = IMPLICIT_DEF
|
||||
%8:vgpr_32 = V_CNDMASK_B32_e64 4, %6, 0, %1, %7, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
|
||||
# Make sure flags aren't dropped
|
||||
# GCN-LABEL: name: flags_add_f32_e64
|
||||
# GCN: %4:vgpr_32 = nnan nofpexcept V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
name: flags_add_f32_e64
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = IMPLICIT_DEF
|
||||
|
||||
%3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec
|
||||
S_ENDPGM 0, implicit %4
|
||||
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dont_combine_more_than_one_operand
|
||||
# GCN: %3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
|
||||
name: dont_combine_more_than_one_operand
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
|
||||
...
|
||||
|
||||
# GCN-LABEL: name: dont_combine_more_than_one_operand_dpp_reg_sequence
|
||||
# GCN: %5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec
|
||||
# GCN: %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
name: dont_combine_more_than_one_operand_dpp_reg_sequence
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
%0:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%1:vreg_64 = COPY $vgpr2_vgpr3
|
||||
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
|
||||
%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec
|
||||
%6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
|
@ -1,6 +1,7 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s
|
||||
; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s
|
||||
|
||||
; FIXME: The register allocator / scheduler should be able to avoid these hazards.
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s
|
||||
|
||||
; GCN-LABEL: {{^}}dpp_test:
|
||||
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
|
||||
|
@ -26,7 +27,7 @@ define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in
|
|||
|
||||
|
||||
; GCN-LABEL: {{^}}dpp_test1:
|
||||
; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX11: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
||||
|
@ -51,7 +52,7 @@ bb:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}update_dpp64_test:
|
||||
; GCN: load_dwordx2 v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
|
||||
; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
|
||||
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
||||
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
||||
define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) {
|
||||
|
@ -65,12 +66,13 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
|
|||
|
||||
; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
|
||||
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
|
||||
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
|
||||
; GFX8-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
; GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
|
||||
; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
|
||||
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
|
||||
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047
|
||||
; GCN-DAG: load_dwordx2 v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
|
||||
; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
|
||||
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
||||
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
||||
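; GFX8-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GFX10-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}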
|
||||
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
||||
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
||||
define amdgpu_kernel void @update_dpp64_imm_old_test(i64 addrspace(1)* %arg, i64 %in2) {
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
|
||||
|
||||
---
|
||||
|
||||
# Test function for the gcn-dpp-combine pass on VOPC compares (see the RUN line).
# Each case in the body pairs a V_MOV_B32_dpp with a compare that reads its
# result; the GCN check lines record whether the mov is expected to be folded
# into a *_dpp compare or left in place.
name: vopc
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
|
||||
; GCN-LABEL: name: vopc
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; GCN: V_CMP_LT_F32_e32_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
|
||||
; GCN: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
|
||||
; GCN: V_CMPX_EQ_I16_e32 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec
|
||||
; GCN: V_CMP_CLASS_F16_e32_dpp [[DEF]], 0, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
|
||||
; GCN: [[V_CMP_GE_F16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_e64_dpp [[DEF]], 1, [[COPY1]], 0, [[COPY]], 1, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
|
||||
; GCN: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp1]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
|
||||
; GCN: V_CMP_CLASS_F32_e32_dpp [[DEF]], 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
|
||||
; GCN: V_CMP_NGE_F16_e32_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
|
||||
; GCN: [[V_CMP_NGE_F16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_e64_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
|
||||
; GCN: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F16_e64_dpp]], 10101, implicit-def $scc
|
||||
; GCN: V_CMP_GT_I32_e32_dpp [[DEF]], [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = COPY $vgpr2
|
||||
%3:vgpr_32 = IMPLICIT_DEF
|
||||
|
||||
; e32 compare using the DPP mov result as its first source: expected to fold into V_CMP_LT_F32_e32_dpp
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
V_CMP_LT_F32_e32 %4, %0, implicit-def $vcc, implicit $mode, implicit $exec
|
||||
|
||||
; unsafe to combine cmpx (it also defines $exec): the V_MOV_B32_dpp is expected to stay
|
||||
%5:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
V_CMPX_EQ_I16_e32 %5, %0, implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec
|
||||
|
||||
; e32 class compare: expected to fold into V_CMP_CLASS_F16_e32_dpp
%6:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
V_CMP_CLASS_F16_e32 %6, %0, implicit-def $vcc, implicit $mode, implicit $exec
|
||||
|
||||
; e64 compare with non-zero modifier operands (presumably src0 modifiers and clamp - TODO confirm):
; expected to become V_CMP_GE_F16_e64_dpp with those operands kept
%7:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%8:sgpr_32 = V_CMP_GE_F16_e64 1, %7, 0, %0, 1, implicit $mode, implicit $exec
|
||||
|
||||
; unsafe to combine cmpx (e64 form without sdst, defines $exec): the V_MOV_B32_dpp is expected to stay
|
||||
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
V_CMPX_GT_U32_nosdst_e64 %9, %0, implicit-def $exec, implicit $mode, implicit $exec
|
||||
|
||||
; e64 class compare whose result %12 has no use in this function: expected to become the e32 form V_CMP_CLASS_F32_e32_dpp
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%12:sgpr_32 = V_CMP_CLASS_F32_e64 2, %11, %0, implicit $mode, implicit $exec
|
||||
|
||||
; shrink: %14 has no use, so the e64 compare is expected to become V_CMP_NGE_F16_e32_dpp
|
||||
%13:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%14:sgpr_32 = V_CMP_NGE_F16_e64 0, %13, 0, %0, 0, implicit $mode, implicit $exec
|
||||
|
||||
; do not shrink, sdst used: %16 feeds S_AND_B32, so the expected result is V_CMP_NGE_F16_e64_dpp
|
||||
%15:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
%16:sgpr_32 = V_CMP_NGE_F16_e64 0, %15, 0, %0, 0, implicit $mode, implicit $exec
|
||||
%17:sgpr_32 = S_AND_B32 %16, 10101, implicit-def $scc
|
||||
|
||||
; commute: the DPP mov result is the second source; the expected V_CMP_GT_I32_e32_dpp has the sources swapped
|
||||
%18:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
|
||||
V_CMP_LT_I32_e32 %0, %18, implicit-def $vcc, implicit $exec
|
||||
|
||||
...
|
Loading…
Reference in New Issue