forked from OSchip/llvm-project
[AMDGPU] gfx11 WMMA instruction support
gfx11 introduces new WMMA (Wave Matrix Multiply-accumulate) instructions. Reviewed By: arsenm, #amdgpu Differential Revision: https://reviews.llvm.org/D128756
This commit is contained in:
parent
a19c213212
commit
4874838a63
|
@ -1981,6 +1981,56 @@ def int_amdgcn_ds_sub_gs_reg_rtn :
|
||||||
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
|
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||||
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;
|
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;
|
||||||
|
|
||||||
|
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
|
||||||
|
//
|
||||||
|
// These operations perform a matrix multiplication and accumulation of
|
||||||
|
// the form: D = A * B + C .
|
||||||
|
|
||||||
|
// Basic WMMA intrinsic shape: %D = wmma(%A, %B, %C).
//   AB - type shared by the %A and %B operands.
//   CD - result type; %C is tied to it through LLVMMatchType<0>.
class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
  Intrinsic<[CD],                    // %D
            [AB,                     // %A
             AB,                     // %B
             LLVMMatchType<0>],      // %C
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
|
||||||
|
|
||||||
|
// WMMA intrinsic shape with a trailing i1 %high operand:
// %D = wmma(%A, %B, %C, %high).
//   AB - type shared by the %A and %B operands.
//   CD - result type; %C is tied to it through LLVMMatchType<0>.
// %high (argument index 3) must be an immediate.
class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
  Intrinsic<[CD],                    // %D
            [AB,                     // %A
             AB,                     // %B
             LLVMMatchType<0>,       // %C
             llvm_i1_ty],            // %high
            [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
|
||||||
|
|
||||||
|
// WMMA intrinsic shape for the iu8/iu4 variants:
// %D = wmma(%A_sign, %A, %B_sign, %B, %C, %clamp).
//   AB - type shared by the %A and %B operands.
//   CD - result type; %C is tied to it through LLVMMatchType<0>.
// The three i1 operands (argument indices 0, 2 and 5) must be immediates.
class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
  Intrinsic<[CD],                    // %D
            [llvm_i1_ty,             // %A_sign
             AB,                     // %A
             llvm_i1_ty,             // %B_sign
             AB,                     // %B
             LLVMMatchType<0>,       // %C
             llvm_i1_ty],            // %clamp
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
|
||||||
|
|
||||||
|
// The result/accumulator type of every WMMA intrinsic is overloaded
// (llvm_anyfloat_ty / llvm_anyint_ty); the A/B operand type is fixed per
// intrinsic.

// Plain form: (%A, %B, %C).
def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v8f32_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v8i32_ty, llvm_anyfloat_ty>;

// OPSEL form: (%A, %B, %C, %high).
def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v8f32_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v8i32_ty, llvm_anyint_ty>;

// IU form: (%A_sign, %A, %B_sign, %B, %C, %clamp).
def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
|
||||||
|
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// Deep learning intrinsics.
|
// Deep learning intrinsics.
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
|
@ -55,6 +55,10 @@ def gi_dotiuvop3pmods :
|
||||||
GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,
|
GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,
|
||||||
GIComplexPatternEquiv<DotIUVOP3PMods>;
|
GIComplexPatternEquiv<DotIUVOP3PMods>;
|
||||||
|
|
||||||
|
// GlobalISel equivalent of the WMMAOpSelVOP3PMods complex pattern; operands
// are rendered by AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods.
def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
                            GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
|
||||||
|
|
||||||
def gi_vop3opselmods :
|
def gi_vop3opselmods :
|
||||||
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
|
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
|
||||||
GIComplexPatternEquiv<VOP3OpSelMods>;
|
GIComplexPatternEquiv<VOP3OpSelMods>;
|
||||||
|
|
|
@ -2782,6 +2782,20 @@ bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Turn the constant i1 operand \p In into a source-modifier immediate.
///
/// \p In must be a ConstantSDNode holding a 1-bit value. The resulting
/// modifier always has OP_SEL_1 set; OP_SEL_0 is added when the constant
/// is 1.
///
/// \param In   The constant i1 operand being matched.
/// \param Src  Receives the i32 target constant holding the modifier bits.
/// \returns Always true.
bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
                                                  SDValue &Src) const {
  const auto *HighSel = cast<ConstantSDNode>(In);
  const APInt &HighVal = HighSel->getAPIntValue();
  assert(HighVal.getBitWidth() == 1 && "expected i1 value");

  // OP_SEL_1 is unconditional; the i1 value only controls OP_SEL_0.
  const bool SelectHigh = HighVal.getZExtValue() == 1;
  const unsigned ModBits =
      SelectHigh ? (SISrcMods::OP_SEL_1 | SISrcMods::OP_SEL_0)
                 : SISrcMods::OP_SEL_1;

  Src = CurDAG->getTargetConstant(ModBits, SDLoc(In), MVT::i32);
  return true;
}
|
||||||
|
|
||||||
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
|
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
|
||||||
SDValue &SrcMods) const {
|
SDValue &SrcMods) const {
|
||||||
Src = In;
|
Src = In;
|
||||||
|
|
|
@ -231,6 +231,7 @@ private:
|
||||||
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||||
|
|
||||||
bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;
|
bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;
|
||||||
|
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
|
||||||
|
|
||||||
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||||
|
|
||||||
|
|
|
@ -3733,6 +3733,20 @@ AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
|
||||||
}};
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// GlobalISel renderer for the i1 immediate matched by
/// gi_wmmaopselvop3pmods.
///
/// \p Root must be an immediate equal to 0 or -1. The rendered modifier
/// always has OP_SEL_1 set; OP_SEL_0 is added when the immediate is
/// non-zero.
///
/// \returns A single renderer that appends the modifier immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
    MachineOperand &Root) const {
  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
         "expected i1 value");

  // OP_SEL_1 is unconditional; the i1 value only controls OP_SEL_0.
  const bool SelectHigh = Root.getImm() != 0;
  const unsigned SrcMods =
      SelectHigh ? (SISrcMods::OP_SEL_1 | SISrcMods::OP_SEL_0)
                 : SISrcMods::OP_SEL_1;

  return {{
      [SrcMods](MachineInstrBuilder &MIB) { MIB.addImm(SrcMods); } // src_mods
  }};
}
|
||||||
|
|
||||||
InstructionSelector::ComplexRendererFns
|
InstructionSelector::ComplexRendererFns
|
||||||
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
|
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
|
||||||
Register Src;
|
Register Src;
|
||||||
|
|
|
@ -189,6 +189,9 @@ private:
|
||||||
InstructionSelector::ComplexRendererFns
|
InstructionSelector::ComplexRendererFns
|
||||||
selectDotIUVOP3PMods(MachineOperand &Root) const;
|
selectDotIUVOP3PMods(MachineOperand &Root) const;
|
||||||
|
|
||||||
|
InstructionSelector::ComplexRendererFns
|
||||||
|
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
|
||||||
|
|
||||||
InstructionSelector::ComplexRendererFns
|
InstructionSelector::ComplexRendererFns
|
||||||
selectVOP3OpSelMods(MachineOperand &Root) const;
|
selectVOP3OpSelMods(MachineOperand &Root) const;
|
||||||
|
|
||||||
|
|
|
@ -4271,6 +4271,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||||
case Intrinsic::amdgcn_fdot2_f32_bf16:
|
case Intrinsic::amdgcn_fdot2_f32_bf16:
|
||||||
case Intrinsic::amdgcn_sudot4:
|
case Intrinsic::amdgcn_sudot4:
|
||||||
case Intrinsic::amdgcn_sudot8:
|
case Intrinsic::amdgcn_sudot8:
|
||||||
|
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
|
||||||
|
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
|
||||||
|
case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
|
||||||
|
case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
|
||||||
|
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
|
||||||
|
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
|
||||||
return getDefaultMappingVOP(MI);
|
return getDefaultMappingVOP(MI);
|
||||||
case Intrinsic::amdgcn_sbfe:
|
case Intrinsic::amdgcn_sbfe:
|
||||||
case Intrinsic::amdgcn_ubfe:
|
case Intrinsic::amdgcn_ubfe:
|
||||||
|
|
|
@ -575,6 +575,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
|
||||||
if (Res) break;
|
if (Res) break;
|
||||||
|
|
||||||
Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address);
|
Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address);
|
||||||
|
if (Res)
|
||||||
|
break;
|
||||||
|
|
||||||
|
Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address);
|
||||||
} while (false);
|
} while (false);
|
||||||
|
|
||||||
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
|
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
|
||||||
|
|
|
@ -1082,6 +1082,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
|
||||||
}
|
}
|
||||||
fixVALUPartialForwardingHazard(MI);
|
fixVALUPartialForwardingHazard(MI);
|
||||||
fixVALUTransUseHazard(MI);
|
fixVALUTransUseHazard(MI);
|
||||||
|
fixWMMAHazards(MI);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
|
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
|
||||||
|
@ -1673,6 +1674,67 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Insert a V_NOP_e32 before a WMMA instruction that depends on the result
/// of an earlier WMMA instruction.
///
/// A hazard exists when the destination of a previous WMMA overlaps
///   - src0 or src1 of \p MI, or
///   - a register src2 of \p MI, unless both instructions map to the same MC
///     opcode and src2 of \p MI carries neither NEG nor NEG_HI.
///
/// \param MI  Instruction to inspect; anything that is not WMMA is ignored.
/// \returns true if a V_NOP_e32 was inserted in front of \p MI.
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI](const MachineInstr &Prev) {
    if (!SIInstrInfo::isWMMA(Prev))
      return false;

    const Register PrevDst =
        TII->getNamedOperand(Prev, AMDGPU::OpName::vdst)->getReg();

    // Src0 or Src1 of the current wmma instruction overlaps with the dest of
    // the previous wmma.
    const Register CurA =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    if (TRI->regsOverlap(PrevDst, CurA))
      return true;

    const Register CurB =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
    if (TRI->regsOverlap(PrevDst, CurB))
      return true;

    // Src2 of the current wmma instruction overlaps with the dest of the
    // previous wmma. A non-register (or NoRegister) src2 cannot overlap.
    const MachineOperand *CurC =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
    if (!CurC->isReg() || CurC->getReg() == AMDGPU::NoRegister)
      return false;
    if (!TRI->regsOverlap(PrevDst, CurC->getReg()))
      return false;

    const MachineOperand *CurCMods =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
    const bool HasNegOnC =
        (CurCMods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) != 0;

    // Exception: there is no hazard if the wmma instructions are of the same
    // type and there is no input modifier on src2 of the current instruction.
    return HasNegOnC || TII->pseudoToMCOpcode(Prev.getOpcode()) !=
                            TII->pseudoToMCOpcode(MI->getOpcode());
  };

  // Any VALU instruction ends the search.
  auto IsExpiredFn = [](const MachineInstr &Candidate, int) {
    return SIInstrInfo::isVALU(Candidate);
  };

  // int max is treated as "no hazardous instruction found".
  const int WaitStates = ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
  if (WaitStates == std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}
|
||||||
|
|
||||||
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
|
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
|
||||||
int NSAtoVMEMWaitStates = 1;
|
int NSAtoVMEMWaitStates = 1;
|
||||||
|
|
||||||
|
|
|
@ -100,6 +100,7 @@ private:
|
||||||
bool fixLdsDirectVMEMHazard(MachineInstr *MI);
|
bool fixLdsDirectVMEMHazard(MachineInstr *MI);
|
||||||
bool fixVALUPartialForwardingHazard(MachineInstr *MI);
|
bool fixVALUPartialForwardingHazard(MachineInstr *MI);
|
||||||
bool fixVALUTransUseHazard(MachineInstr *MI);
|
bool fixVALUTransUseHazard(MachineInstr *MI);
|
||||||
|
bool fixWMMAHazards(MachineInstr *MI);
|
||||||
|
|
||||||
int checkMAIHazards(MachineInstr *MI);
|
int checkMAIHazards(MachineInstr *MI);
|
||||||
int checkMAIHazards908(MachineInstr *MI);
|
int checkMAIHazards908(MachineInstr *MI);
|
||||||
|
|
|
@ -126,7 +126,10 @@ enum : uint64_t {
|
||||||
IsAtomicNoRet = UINT64_C(1) << 57,
|
IsAtomicNoRet = UINT64_C(1) << 57,
|
||||||
|
|
||||||
// Atomic with return.
|
// Atomic with return.
|
||||||
IsAtomicRet = UINT64_C(1) << 58
|
IsAtomicRet = UINT64_C(1) << 58,
|
||||||
|
|
||||||
|
// Is a WMMA instruction.
|
||||||
|
IsWMMA = UINT64_C(1) << 59,
|
||||||
};
|
};
|
||||||
|
|
||||||
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
|
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
|
||||||
|
|
|
@ -147,6 +147,9 @@ class InstSI <dag outs, dag ins, string asm = "",
|
||||||
// Atomic with return.
|
// Atomic with return.
|
||||||
field bit IsAtomicRet = 0;
|
field bit IsAtomicRet = 0;
|
||||||
|
|
||||||
|
// This bit indicates that this is one of WMMA instructions.
|
||||||
|
field bit IsWMMA = 0;
|
||||||
|
|
||||||
// These need to be kept in sync with the enum in SIInstrFlags.
|
// These need to be kept in sync with the enum in SIInstrFlags.
|
||||||
let TSFlags{0} = SALU;
|
let TSFlags{0} = SALU;
|
||||||
let TSFlags{1} = VALU;
|
let TSFlags{1} = VALU;
|
||||||
|
@ -224,6 +227,8 @@ class InstSI <dag outs, dag ins, string asm = "",
|
||||||
|
|
||||||
let TSFlags{58} = IsAtomicRet;
|
let TSFlags{58} = IsAtomicRet;
|
||||||
|
|
||||||
|
let TSFlags{59} = IsWMMA;
|
||||||
|
|
||||||
let SchedRW = [Write32Bit];
|
let SchedRW = [Write32Bit];
|
||||||
|
|
||||||
let AsmVariantName = AMDGPUAsmVariants.Default;
|
let AsmVariantName = AMDGPUAsmVariants.Default;
|
||||||
|
|
|
@ -3255,6 +3255,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
|
||||||
return MIB;
|
return MIB;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (SIInstrInfo::isWMMA(MI)) {
|
||||||
|
unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
|
||||||
|
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
|
||||||
|
.setMIFlags(MI.getFlags());
|
||||||
|
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
|
||||||
|
MIB->addOperand(MI.getOperand(I));
|
||||||
|
|
||||||
|
updateLiveVariables(LV, MI, *MIB);
|
||||||
|
if (LIS)
|
||||||
|
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
|
||||||
|
|
||||||
|
return MIB;
|
||||||
|
}
|
||||||
|
|
||||||
// Handle MAC/FMAC.
|
// Handle MAC/FMAC.
|
||||||
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
|
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
|
||||||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
|
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
|
||||||
|
|
|
@ -673,6 +673,14 @@ public:
|
||||||
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
|
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool isWMMA(const MachineInstr &MI) {
|
||||||
|
return MI.getDesc().TSFlags & SIInstrFlags::IsWMMA;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// \returns true if the descriptor for \p Opcode has the
/// SIInstrFlags::IsWMMA bit set in its TSFlags.
bool isWMMA(uint16_t Opcode) const {
  const auto Flags = get(Opcode).TSFlags;
  return (Flags & SIInstrFlags::IsWMMA) != 0;
}
|
||||||
|
|
||||||
bool isDOT(uint16_t Opcode) const {
|
bool isDOT(uint16_t Opcode) const {
|
||||||
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
|
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1497,6 +1497,7 @@ def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
|
||||||
|
|
||||||
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
|
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
|
||||||
def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;
|
def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;
|
||||||
|
// Complex pattern yielding one result operand; matched by the C++ selector
// AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods.
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
|
||||||
|
|
||||||
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
|
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
|
||||||
|
|
||||||
|
@ -2475,6 +2476,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
|
||||||
field bit IsVOP3P = 0;
|
field bit IsVOP3P = 0;
|
||||||
field bit IsDOT = 0;
|
field bit IsDOT = 0;
|
||||||
field bit IsSingle = 0;
|
field bit IsSingle = 0;
|
||||||
|
field bit IsWMMA = 0;
|
||||||
|
|
||||||
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
|
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
|
||||||
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
|
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
|
||||||
|
|
|
@ -1076,6 +1076,18 @@ def VRegSrc_32 : RegisterOperand<VGPR_32> {
|
||||||
let DecoderMethod = "DecodeVS_32RegisterClass";
|
let DecoderMethod = "DecodeVS_32RegisterClass";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Register operand over VReg_64, decoded with decodeOperand_VReg_64.
let DecoderMethod = "decodeOperand_VReg_64" in
def VRegSrc_64 : RegisterOperand<VReg_64>;
|
||||||
|
|
||||||
|
// Register operand over VReg_128, decoded with decodeOperand_VReg_128.
let DecoderMethod = "decodeOperand_VReg_128" in
def VRegSrc_128 : RegisterOperand<VReg_128>;
|
||||||
|
|
||||||
|
// Register operand over VReg_256, decoded with decodeOperand_VReg_256.
let DecoderMethod = "decodeOperand_VReg_256" in
def VRegSrc_256 : RegisterOperand<VReg_256>;
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// VGPRSrc_*
|
// VGPRSrc_*
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
|
@ -293,6 +293,10 @@ struct VOPC64DPPInfo {
|
||||||
#define GET_VOPC64DPPTable_IMPL
|
#define GET_VOPC64DPPTable_IMPL
|
||||||
#define GET_VOPC64DPP8Table_DECL
|
#define GET_VOPC64DPP8Table_DECL
|
||||||
#define GET_VOPC64DPP8Table_IMPL
|
#define GET_VOPC64DPP8Table_IMPL
|
||||||
|
#define GET_WMMAOpcode2AddrMappingTable_DECL
|
||||||
|
#define GET_WMMAOpcode2AddrMappingTable_IMPL
|
||||||
|
#define GET_WMMAOpcode3AddrMappingTable_DECL
|
||||||
|
#define GET_WMMAOpcode3AddrMappingTable_IMPL
|
||||||
#include "AMDGPUGenSearchableTables.inc"
|
#include "AMDGPUGenSearchableTables.inc"
|
||||||
|
|
||||||
int getMTBUFBaseOpcode(unsigned Opc) {
|
int getMTBUFBaseOpcode(unsigned Opc) {
|
||||||
|
@ -394,6 +398,16 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
|
||||||
return Info ? Info->is_gfx940_xdl : false;
|
return Info ? Info->is_gfx940_xdl : false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
|
||||||
|
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
|
||||||
|
return Info ? Info->Opcode3Addr : ~0u;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) {
|
||||||
|
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom3AddrOpcode(Opc);
|
||||||
|
return Info ? Info->Opcode2Addr : ~0u;
|
||||||
|
}
|
||||||
|
|
||||||
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
|
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
|
||||||
// header files, so we need to wrap it in a function that takes unsigned
|
// header files, so we need to wrap it in a function that takes unsigned
|
||||||
// instead.
|
// instead.
|
||||||
|
|
|
@ -368,6 +368,11 @@ struct MIMGG16MappingInfo {
|
||||||
LLVM_READONLY
|
LLVM_READONLY
|
||||||
const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
|
const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
|
||||||
|
|
||||||
|
// Row type of the TableGen-generated WMMA opcode mapping tables: pairs the
// two-address form of a WMMA pseudo with its three-address form.
struct WMMAOpcodeMappingInfo {
|
||||||
|
// Opcode of the two-address pseudo.
unsigned Opcode2Addr;
|
||||||
|
// Opcode of the matching three-address pseudo.
unsigned Opcode3Addr;
|
||||||
|
};
|
||||||
|
|
||||||
LLVM_READONLY
|
LLVM_READONLY
|
||||||
const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
|
const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
|
||||||
|
|
||||||
|
@ -477,6 +482,12 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
|
||||||
LLVM_READONLY
|
LLVM_READONLY
|
||||||
int getMCOpcode(uint16_t Opcode, unsigned Gen);
|
int getMCOpcode(uint16_t Opcode, unsigned Gen);
|
||||||
|
|
||||||
|
// Returns the three-address opcode paired with the two-address WMMA opcode
// Opc, or ~0u when Opc has no mapping entry.
LLVM_READONLY
|
||||||
|
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
|
||||||
|
|
||||||
|
// Returns the two-address opcode paired with the three-address WMMA opcode
// Opc, or ~0u when Opc has no mapping entry.
LLVM_READONLY
|
||||||
|
unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc);
|
||||||
|
|
||||||
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
|
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
|
||||||
const MCSubtargetInfo *STI);
|
const MCSubtargetInfo *STI);
|
||||||
|
|
||||||
|
|
|
@ -677,6 +677,159 @@ let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1, isReMaterializable
|
||||||
def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
|
def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
|
||||||
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
|
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
|
||||||
|
|
||||||
|
// VOP3P profile specialised for WMMA pseudos.
//   P          - base profile supplying the operand value types.
//   Suffix     - "_w32" selects the 256-bit dst/src2 classes, anything else
//                the 128-bit ones.
//   _Src01RC64 - register operand used for both src0 and src1.
//   _HasClamp  - whether the instruction has a clamp operand.
//   _HasOpSel  - whether the instruction has op_sel.
class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64,
                     bit _HasClamp, bit _HasOpSel>
    : VOP3P_Profile<P> {
  let IsWMMA = 1;

  // Destination and accumulator register classes depend on the wave size.
  let DstRC    = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128);
  let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32);

  // A and B share one register operand class.
  let Src0RC64 = _Src01RC64;
  let Src1RC64 = _Src01RC64;

  let HasClamp = _HasClamp;
  let HasOpSel = _HasOpSel;
}
|
||||||
|
|
||||||
|
// Profiles are named VOP_<dst>_<src0>_<src1>_<src2>.

// Profiles with 8-element dst/src2 (used by the "_w32" WMMA variants).
def VOP_V8F32_V8F32_V8F32_V8F32 : VOPProfile<[v8f32, v8f32, v8f32, v8f32]>;
def VOP_V8F32_V8I32_V8I32_V8F32 : VOPProfile<[v8f32, v8i32, v8i32, v8f32]>;
def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile<[v8i32, v4i32, v4i32, v8i32]>;
def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile<[v8i32, v2i32, v2i32, v8i32]>;
def VOP_V8I32_V8I32_V8I32_V8I32 : VOPProfile<[v8i32, v8i32, v8i32, v8i32]>;

// Profiles with 4-element dst/src2 (used by the "_w64" WMMA variants).
def VOP_V4F32_V8F32_V8F32_V4F32 : VOPProfile<[v4f32, v8f32, v8f32, v4f32]>;
def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile<[v4f32, v8i32, v8i32, v4f32]>;
def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile<[v4i32, v4i32, v4i32, v4i32]>;
def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile<[v4i32, v2i32, v2i32, v4i32]>;
def VOP_V4I32_V8I32_V8I32_V4I32 : VOPProfile<[v4i32, v8i32, v8i32, v4i32]>;
|
||||||
|
|
||||||
|
// Two-bit descriptor of a WMMA variant:
//   bit 0 - the variant has a clamp operand,
//   bit 1 - the variant uses op_sel.
class WMMAType<bits<2> val> {
  bit hasClamp = val{0};
  bit hasOpsel = val{1};
}
|
||||||
|
|
||||||
|
def WMMARegular : WMMAType<0b00>; // neither clamp nor op_sel
def WMMAUIClamp : WMMAType<0b01>; // clamp only
def WMMAOpSel   : WMMAType<0b10>; // op_sel only
|
||||||
|
|
||||||
|
// Selection pattern for the plain (%A, %B, %C) WMMA intrinsics: each source
// is matched through VOP3PMods, which yields the value together with its
// modifier operand.
class WMMARegularPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
  GCNPat<
    (P.DstVT (node
      (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)),
      (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
      (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))
    )),
    (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0,
                   i32:$src1_modifiers, P.Src1VT:$src1,
                   $src2_modifiers, P.Src2VT:$src2))
  >;
|
||||||
|
|
||||||
|
// Selection pattern for the OPSEL (%A, %B, %C, %high) WMMA intrinsics.
// The trailing operand is turned into src2_modifiers by WMMAOpSelVOP3PMods;
// src0 and src1 get the fixed modifier value 8.
// NOTE(review): 8 is presumably SISrcMods::OP_SEL_1 - confirm.
class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
  GCNPat<
    (P.DstVT (node
      (P.Src0VT P.Src0VT:$src0),
      (P.Src1VT P.Src1VT:$src1),
      (P.Src2VT P.Src2VT:$src2),
      (WMMAOpSelVOP3PMods i32:$src2_modifiers)
    )),
    (P.DstVT (Inst (i32 8), P.Src0VT:$src0,
                   (i32 8), P.Src1VT:$src1,
                   i32:$src2_modifiers, P.Src2VT:$src2))
  >;
|
||||||
|
|
||||||
|
// Selection pattern for the IU (%A_sign, %A, %B_sign, %B, %C, %clamp) WMMA
// intrinsics. The sign operands are turned into src0/src1 modifiers by
// DotIUVOP3PMods; src2 gets the fixed modifier value 8 and the clamp
// immediate is passed through.
// NOTE(review): 8 is presumably SISrcMods::OP_SEL_1 - confirm.
class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
  GCNPat<
    (P.DstVT (node
      (DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
      (DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
      (P.Src2VT P.Src2VT:$src2),
      (i1 timm:$clamp)
    )),
    (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0,
                   i32:$src1_modifiers, P.Src1VT:$src1,
                   (i32 8), P.Src2VT:$src2,
                   i1:$clamp))
  >;
|
||||||
|
|
||||||
|
// One row of the WMMA opcode mapping tables: pairs a two-address pseudo with
// its three-address counterpart.
class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
  Instruction Opcode2Addr = TwoAddr;
  Instruction Opcode3Addr = ThreeAddr;
  // Declared without a value; not listed in the mapping table's Fields.
  Predicate WaveSizePredicate;
}
|
||||||
|
|
||||||
|
// Enum type used for the opcode columns of the WMMA mapping tables; built
// from the records of class VOP3P_Pseudo.
def WMMAOpcode : GenericEnum { let FilterClass = "VOP3P_Pseudo"; }
|
||||||
|
|
||||||
|
// Common description of the two searchable WMMA opcode tables: rows come
// from WMMAOpcodeMapping records and are emitted as WMMAOpcodeMappingInfo.
class WMMAMappingTable : GenericTable {
  let FilterClass = "WMMAOpcodeMapping";
  let CppTypeName = "WMMAOpcodeMappingInfo";

  // Both columns are typed as WMMAOpcode.
  let Fields = ["Opcode2Addr", "Opcode3Addr"];
  string TypeOf_Opcode2Addr = "WMMAOpcode";
  string TypeOf_Opcode3Addr = "WMMAOpcode";
}
|
||||||
|
|
||||||
|
// Table keyed by the two-address opcode; the generated lookup function is
// getWMMAMappingInfoFrom2AddrOpcode.
def WMMAOpcode2AddrMappingTable : WMMAMappingTable {
  let PrimaryKeyName = "getWMMAMappingInfoFrom2AddrOpcode";
  let PrimaryKey = ["Opcode2Addr"];
}
|
||||||
|
|
||||||
|
// Table keyed by the three-address opcode; the generated lookup function is
// getWMMAMappingInfoFrom3AddrOpcode.
def WMMAOpcode3AddrMappingTable : WMMAMappingTable {
  let PrimaryKeyName = "getWMMAMappingInfoFrom3AddrOpcode";
  let PrimaryKey = ["Opcode3Addr"];
}
|
||||||
|
|
||||||
|
// The WMMA instruction has extra constraints:
|
||||||
|
// Matrices A and B cannot overlap with D. C cannot partially overlap with D,
|
||||||
|
// but it is OK for them to be the same (which is a typical case).
|
||||||
|
//
|
||||||
|
// We implement it as follows:
|
||||||
|
// 1) Map the intrinsic to the pseudo where D is tied to C ($vdst = $src2).
|
||||||
|
// 2) The TwoAddressInstruction pass checks if src2 is live and, if that is the case,
|
||||||
|
// it converts the default pseudo to the pseudo where src2 is not the same as vdst.
|
||||||
|
// 3) @earlyclobber on the destination satisfies the constraint during RA.
|
||||||
|
|
||||||
|
// Defines the pseudos, the opcode-mapping record and the selection pattern
// for one WMMA instruction at one wave size.
//   Suffix     - "_w32" or "_w64"; any other value defines no pseudos.
//   Instr      - assembly mnemonic.
//   P          - base VOP profile with the operand value types.
//   node       - intrinsic matched by the selection pattern.
//   _Src01RC64 - register operand used for src0 and src1.
//   Type       - WMMAType choosing which selection pattern to instantiate.
//
// Two pseudos are produced per wave size:
//   _twoaddr_*   - $vdst tied to $src2 and early-clobber; flagged as
//                  convertible to three-address form.
//   _threeaddr_* - early-clobber only.
// The selection pattern always targets the two-address pseudo.
multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type> {

  defvar TwoAddrConstraints = "@earlyclobber $vdst,$vdst = $src2";
  defvar ThreeAddrConstraints = "@earlyclobber $vdst";

  defvar InstProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;

  if !eq(Suffix, "_w32") then {
    let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
      let Constraints = TwoAddrConstraints, isConvertibleToThreeAddress = 1 in
      def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, InstProfile>;

      let Constraints = ThreeAddrConstraints, SchedRW = [Write32Bit, Write32Bit] in
      def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, InstProfile>;
    }
    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32),
                            !cast<Instruction>(NAME # _threeaddr_w32)>;
  } else if !eq(Suffix, "_w64") then {
    let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
      let Constraints = TwoAddrConstraints, isConvertibleToThreeAddress = 1 in
      def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, InstProfile>;

      let Constraints = ThreeAddrConstraints, SchedRW = [Write32Bit, Write32Bit] in
      def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, InstProfile>;
    }
    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64),
                            !cast<Instruction>(NAME # _threeaddr_w64)>;
  }

  // Pick the selection pattern that matches the intrinsic's operand list.
  if !eq(Type, WMMAOpSel) then {
    def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
  } else if !eq(Type, WMMAUIClamp) then {
    def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
  } else {
    def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
  }
}
|
||||||
|
|
||||||
|
// WMMA pseudos for wave32: 8-element dst/src2 profiles, "_w32" suffix.
let WaveSizePredicate = isWave32 in {
  defm V_WMMA_F32_16X16X16_F16 :
    WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32,
             int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
  defm V_WMMA_F32_16X16X16_BF16 :
    WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V8I32_V8I32_V8F32,
             int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
  defm V_WMMA_F16_16X16X16_F16 :
    WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32,
             int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
  defm V_WMMA_BF16_16X16X16_BF16 :
    WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V8I32_V8I32_V8I32_V8I32,
             int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
  defm V_WMMA_I32_16X16X16_IU8 :
    WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32,
             int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
  defm V_WMMA_I32_16X16X16_IU4 :
    WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32,
             int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
}
|
||||||
|
|
||||||
|
// WMMA pseudos for wave64: 4-element dst/src2 profiles, "_w64" suffix.
let WaveSizePredicate = isWave64 in {
  defm V_WMMA_F32_16X16X16_F16 :
    WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32,
             int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
  defm V_WMMA_F32_16X16X16_BF16 :
    WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V8I32_V8I32_V4F32,
             int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
  defm V_WMMA_F16_16X16X16_F16 :
    WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32,
             int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
  defm V_WMMA_BF16_16X16X16_BF16 :
    WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V4I32_V8I32_V8I32_V4I32,
             int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
  defm V_WMMA_I32_16X16X16_IU8 :
    WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32,
             int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
  defm V_WMMA_I32_16X16X16_IU4 :
    WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32,
             int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
}
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// Begin Real Encodings
|
// Begin Real Encodings
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
@ -747,6 +900,22 @@ defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>;
|
||||||
defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>;
|
defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>;
|
||||||
defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>;
|
defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>;
|
||||||
|
|
||||||
|
// Real gfx11 encodings for the two-address WMMA pseudos. Both wave sizes use
// the same opcode `op`, so the wave32 form is placed in the "GFX11" decoder
// namespace and the wave64 form in a separate "WMMAGFX11" namespace.
multiclass VOP3P_Real_WMMA<bits<7> op> {
  let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in
  defm _twoaddr_w32 : VOP3P_Real_gfx11<op>;

  let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX11" in
  defm _twoaddr_w64 : VOP3P_Real_gfx11<op>;
}
|
||||||
|
|
||||||
|
// gfx11 opcode assignments for the WMMA instructions.
defm V_WMMA_F32_16X16X16_F16   : VOP3P_Real_WMMA<0x040>;
defm V_WMMA_F32_16X16X16_BF16  : VOP3P_Real_WMMA<0x041>;
defm V_WMMA_F16_16X16X16_F16   : VOP3P_Real_WMMA<0x042>;
defm V_WMMA_BF16_16X16X16_BF16 : VOP3P_Real_WMMA<0x043>;
defm V_WMMA_I32_16X16X16_IU8   : VOP3P_Real_WMMA<0x044>;
defm V_WMMA_I32_16X16X16_IU4   : VOP3P_Real_WMMA<0x045>;
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// GFX8 (VI)
|
// GFX8 (VI)
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
|
@ -102,6 +102,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
|
||||||
let VOP3_OPSEL = isVop3OpSel;
|
let VOP3_OPSEL = isVop3OpSel;
|
||||||
let IsPacked = P.IsPacked;
|
let IsPacked = P.IsPacked;
|
||||||
let IsMAI = P.IsMAI;
|
let IsMAI = P.IsMAI;
|
||||||
|
let IsWMMA = P.IsWMMA;
|
||||||
|
|
||||||
let AsmOperands = !if(isVop3OpSel,
|
let AsmOperands = !if(isVop3OpSel,
|
||||||
P.AsmVOP3OpSel,
|
P.AsmVOP3OpSel,
|
||||||
|
@ -187,7 +188,11 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
|
||||||
// XXX - Is there any reason to distinguish this from regular VOP3
|
// XXX - Is there any reason to distinguish this from regular VOP3
|
||||||
// here?
|
// here?
|
||||||
class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
|
class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
|
||||||
VOP3_Real<ps, EncodingFamily, asm_name>;
|
VOP3_Real<ps, EncodingFamily, asm_name> {
|
||||||
|
|
||||||
|
// The v_wmma pseudos have extra constraints that we do not want to impose on the real instruction.
|
||||||
|
let Constraints = !if(!eq(!substr(ps.Mnemonic,0,6), "v_wmma"), "", ps.Constraints);
|
||||||
|
}
|
||||||
|
|
||||||
class VOP3a<VOPProfile P> : Enc64 {
|
class VOP3a<VOPProfile P> : Enc64 {
|
||||||
bits<4> src0_modifiers;
|
bits<4> src0_modifiers;
|
||||||
|
|
|
@ -0,0 +1,331 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
|
||||||
|
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
|
||||||
|
declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.f16
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_f32_16x16x16_f16:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.bf16
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f16.16x16x16.f16
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu4
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,287 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
|
||||||
|
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
|
||||||
|
declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
|
||||||
|
declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
|
||||||
|
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
|
||||||
|
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.f16
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_f32_16x16x16_f16:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
|
||||||
|
store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.bf16
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
|
||||||
|
store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f16.16x16x16.f16
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
|
||||||
|
store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
|
||||||
|
store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu4
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 0, 0): the checks expect no neg_lo or clamp modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 1, 0): the checks expect neg_lo:[0,1,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 0, 0): the checks expect neg_lo:[1,0,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 1, 0): the checks expect neg_lo:[1,1,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 0, 1): the checks expect clamp and no neg_lo modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 1, 1): the checks expect neg_lo:[0,1,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 0, 1): the checks expect neg_lo:[1,0,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 1, 1): the checks expect neg_lo:[1,1,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,331 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
|
||||||
|
|
||||||
|
; Declarations of the WMMA intrinsics exercised below. In this wave32 file the
; C operand and the result are 8-element vectors (<8 x float> or <8 x i32>);
; the i1 operands are all immargs.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
|
||||||
|
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
|
||||||
|
declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.f16
|
||||||
|
|
||||||
|
; No immediate operands: the checks expect v_wmma_f32_16x16x16_f16 with no modifiers, then two b128 stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_f32_16x16x16_f16:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.bf16
|
||||||
|
|
||||||
|
; No immediate operands: the checks expect v_wmma_f32_16x16x16_bf16 with no modifiers, then two b128 stores of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f16.16x16x16.f16
|
||||||
|
|
||||||
|
; Trailing i1 immarg = 0: the checks expect v_wmma_f16_16x16x16_f16 with no op_sel modifier.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Trailing i1 immarg = 1: the checks expect op_sel:[0,0,1] on v_wmma_f16_16x16x16_f16.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
|
||||||
|
|
||||||
|
; Trailing i1 immarg = 0: the checks expect v_wmma_bf16_16x16x16_bf16 with no op_sel modifier.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Trailing i1 immarg = 1: the checks expect op_sel:[0,0,1] on v_wmma_bf16_16x16x16_bf16.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 0, 0): the checks expect no neg_lo or clamp modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 1, 0): the checks expect neg_lo:[0,1,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 0, 0): the checks expect neg_lo:[1,0,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 1, 0): the checks expect neg_lo:[1,1,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 0, 1): the checks expect clamp and no neg_lo modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 1, 1): the checks expect neg_lo:[0,1,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 0, 1): the checks expect neg_lo:[1,0,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 1, 1): the checks expect neg_lo:[1,1,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu4
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 0, 0): the checks expect no neg_lo or clamp modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 1, 0): the checks expect neg_lo:[0,1,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 0, 0): the checks expect neg_lo:[1,0,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 1, 0): the checks expect neg_lo:[1,1,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 0, 1): the checks expect clamp and no neg_lo modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 1, 1): the checks expect neg_lo:[0,1,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 0, 1): the checks expect neg_lo:[1,0,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 1, 1): the checks expect neg_lo:[1,1,0] and clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
|
||||||
|
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,287 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
|
||||||
|
|
||||||
|
; Declarations of the WMMA intrinsics exercised below. In this wave64 file the
; C operand and the result are 4-element vectors (<4 x float> or <4 x i32>);
; the i1 operands are all immargs.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
|
||||||
|
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
|
||||||
|
declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
|
||||||
|
declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
|
||||||
|
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
|
||||||
|
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.f16
|
||||||
|
|
||||||
|
; No immediate operands: the checks expect v_wmma_f32_16x16x16_f16 with no modifiers, then one b128 store of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_f32_16x16x16_f16:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
|
||||||
|
store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.bf16
|
||||||
|
|
||||||
|
; No immediate operands: the checks expect v_wmma_f32_16x16x16_bf16 with no modifiers, then one b128 store of the result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
|
||||||
|
store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f16.16x16x16.f16
|
||||||
|
|
||||||
|
; Trailing i1 immarg = 0: the checks expect v_wmma_f16_16x16x16_f16 with no op_sel modifier.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
|
||||||
|
store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Trailing i1 immarg = 1: the checks expect op_sel:[0,0,1] on v_wmma_f16_16x16x16_f16.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
|
||||||
|
store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
|
||||||
|
|
||||||
|
; Trailing i1 immarg = 0: the checks expect v_wmma_bf16_16x16x16_bf16 with no op_sel modifier.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Trailing i1 immarg = 1: the checks expect op_sel:[0,0,1] on v_wmma_bf16_16x16x16_bf16.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 0, 0): the checks expect no neg_lo or clamp modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (0, 1, 0): the checks expect neg_lo:[0,1,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 0, 0): the checks expect neg_lo:[1,0,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Sign/clamp immargs (A, B, clamp) = (1, 1, 0): the checks expect neg_lo:[1,1,0] and no clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu8 WMMA called with A_sign=0, B_sign=0, clamp=1: the checks expect only the clamp modifier and no neg_lo.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu8 WMMA called with A_sign=0, B_sign=1, clamp=1: the checks expect neg_lo:[0,1,0] followed by clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu8 WMMA called with A_sign=1, B_sign=0, clamp=1: the checks expect neg_lo:[1,0,0] followed by clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu8 WMMA called with A_sign=1, B_sign=1, clamp=1: the checks expect neg_lo:[1,1,0] followed by clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu4
|
||||||
|
|
||||||
|
; iu4 WMMA (A and B are <2 x i32>) called with A_sign=0, B_sign=0, clamp=0: the checks expect no neg_lo and no clamp modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu4 WMMA called with A_sign=0, B_sign=1, clamp=0: the checks expect neg_lo:[0,1,0] and no clamp modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu4 WMMA called with A_sign=1, B_sign=0, clamp=0: the checks expect neg_lo:[1,0,0] and no clamp modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu4 WMMA called with A_sign=1, B_sign=1, clamp=0: the checks expect neg_lo:[1,1,0] and no clamp modifier.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu4 WMMA called with A_sign=0, B_sign=0, clamp=1: the checks expect only the clamp modifier and no neg_lo.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu4 WMMA called with A_sign=0, B_sign=1, clamp=1: the checks expect neg_lo:[0,1,0] followed by clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu4 WMMA called with A_sign=1, B_sign=0, clamp=1: the checks expect neg_lo:[1,0,0] followed by clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; iu4 WMMA called with A_sign=1, B_sign=1, clamp=1: the checks expect neg_lo:[1,1,0] followed by clamp.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,136 @@
|
||||||
|
# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -start-after postrapseudos -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
|
||||||
|
|
||||||
|
# The F32_16X16X16_F16 threeaddr_w32 pseudo should print as v_wmma_f32_16x16x16_f16, with destination v[34:41] separate from the C operand v[16:23].
# GCN-LABEL: test_V_WMMA_F32_16X16X16_F16_threeaddr_w32:
|
||||||
|
# GCN: v_wmma_f32_16x16x16_f16 v[34:41], v[0:7], v[8:15], v[16:23]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_F32_16X16X16_F16_threeaddr_w32
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
# The F32_16X16X16_BF16 threeaddr_w32 pseudo should print as v_wmma_f32_16x16x16_bf16, with destination v[34:41] separate from the C operand v[16:23].
# GCN-LABEL: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w32:
|
||||||
|
# GCN: v_wmma_f32_16x16x16_bf16 v[34:41], v[0:7], v[8:15], v[16:23]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w32
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
# The F16_16X16X16_F16 threeaddr_w32 pseudo should print as v_wmma_f16_16x16x16_f16, with destination v[34:41] separate from the C operand v[16:23].
# GCN-LABEL: test_V_WMMA_F16_16X16X16_F16_threeaddr_w32:
|
||||||
|
# GCN: v_wmma_f16_16x16x16_f16 v[34:41], v[0:7], v[8:15], v[16:23]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_F16_16X16X16_F16_threeaddr_w32
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X16_F16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
# The BF16_16X16X16_BF16 threeaddr_w32 pseudo should print as v_wmma_bf16_16x16x16_bf16, with destination v[34:41] separate from the C operand v[16:23].
# GCN-LABEL: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32:
|
||||||
|
# GCN: v_wmma_bf16_16x16x16_bf16 v[34:41], v[0:7], v[8:15], v[16:23]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
# The I32_16X16X16_IU8 threeaddr_w32 pseudo should print as v_wmma_i32_16x16x16_iu8, with destination v[26:33] separate from the C operand v[8:15].
# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w32:
|
||||||
|
# GCN: v_wmma_i32_16x16x16_iu8 v[26:33], v[0:3], v[4:7], v[8:15]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w32
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
|
||||||
|
# The I32_16X16X16_IU4 threeaddr_w32 pseudo should print as v_wmma_i32_16x16x16_iu4, with destination v[26:33] separate from the C operand v[8:15].
# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w32:
|
||||||
|
# GCN: v_wmma_i32_16x16x16_iu4 v[26:33], v[0:1], v[2:3], v[8:15]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w32
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
|
||||||
|
# The F32_16X16X16_F16 threeaddr_w64 pseudo should print as v_wmma_f32_16x16x16_f16, with destination v[34:37] separate from the C operand v[16:19].
# GCN-LABEL: test_V_WMMA_F32_16X16X16_F16_threeaddr_w64:
|
||||||
|
# GCN: v_wmma_f32_16x16x16_f16 v[34:37], v[0:7], v[8:15], v[16:19]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_F32_16X16X16_F16_threeaddr_w64
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
# The F32_16X16X16_BF16 threeaddr_w64 pseudo should print as v_wmma_f32_16x16x16_bf16, with destination v[34:37] separate from the C operand v[16:19].
# GCN-LABEL: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w64:
|
||||||
|
# GCN: v_wmma_f32_16x16x16_bf16 v[34:37], v[0:7], v[8:15], v[16:19]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_F32_16X16X16_BF16_threeaddr_w64
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
# The F16_16X16X16_F16 threeaddr_w64 pseudo should print as v_wmma_f16_16x16x16_f16, with destination v[34:37] separate from the C operand v[16:19].
# GCN-LABEL: test_V_WMMA_F16_16X16X16_F16_threeaddr_w64:
|
||||||
|
# GCN: v_wmma_f16_16x16x16_f16 v[34:37], v[0:7], v[8:15], v[16:19]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_F16_16X16X16_F16_threeaddr_w64
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_F16_16X16X16_F16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
# The BF16_16X16X16_BF16 threeaddr_w64 pseudo should print as v_wmma_bf16_16x16x16_bf16, with destination v[34:37] separate from the C operand v[16:19].
# GCN-LABEL: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w64:
|
||||||
|
# GCN: v_wmma_bf16_16x16x16_bf16 v[34:37], v[0:7], v[8:15], v[16:19]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w64
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
# The I32_16X16X16_IU8 threeaddr_w64 pseudo should print as v_wmma_i32_16x16x16_iu8, with destination v[26:29] separate from the C operand v[8:11].
# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w64:
|
||||||
|
# GCN: v_wmma_i32_16x16x16_iu8 v[26:29], v[0:3], v[4:7], v[8:11]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_I32_16X16X16_IU8_threeaddr_w64
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $exec
|
||||||
|
---
|
||||||
|
|
||||||
|
|
||||||
|
# The I32_16X16X16_IU4 threeaddr_w64 pseudo should print as v_wmma_i32_16x16x16_iu4, with destination v[26:29] separate from the C operand v[8:11].
# GCN-LABEL: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w64:
|
||||||
|
# GCN: v_wmma_i32_16x16x16_iu4 v[26:29], v[0:1], v[2:3], v[8:11]
|
||||||
|
---
|
||||||
|
name: test_V_WMMA_I32_16X16X16_IU4_threeaddr_w64
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33
|
||||||
|
early-clobber renamable $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,217 @@
|
||||||
|
# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_F32_16X16X16_F16_twoaddr_w32 is expected to be replaced by V_WMMA_F32_16X16X16_F16_threeaddr_w32 with the same operand list.
# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32
|
||||||
|
# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_256 }
|
||||||
|
- { id: 1, class: vreg_256 }
|
||||||
|
- { id: 2, class: vreg_256 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_F32_16X16X16_BF16_twoaddr_w32 is expected to be replaced by V_WMMA_F32_16X16X16_BF16_threeaddr_w32 with the same operand list.
# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32
|
||||||
|
# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_256 }
|
||||||
|
- { id: 1, class: vreg_256 }
|
||||||
|
- { id: 2, class: vreg_256 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_F16_16X16X16_F16_twoaddr_w32 is expected to be replaced by V_WMMA_F16_16X16X16_F16_threeaddr_w32 with the same operand list.
# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32
|
||||||
|
# GCN: early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_256 }
|
||||||
|
- { id: 1, class: vreg_256 }
|
||||||
|
- { id: 2, class: vreg_256 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_BF16_16X16X16_BF16_twoaddr_w32 is expected to be replaced by V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 with the same operand list.
# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32
|
||||||
|
# GCN: early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_256 }
|
||||||
|
- { id: 1, class: vreg_256 }
|
||||||
|
- { id: 2, class: vreg_256 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_I32_16X16X16_IU8_twoaddr_w32 is expected to be replaced by V_WMMA_I32_16X16X16_IU8_threeaddr_w32 with the same operand list.
# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32
|
||||||
|
# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_256 }
|
||||||
|
- { id: 1, class: vreg_128 }
|
||||||
|
- { id: 2, class: vreg_256 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_256, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_I32_16X16X16_IU4_twoaddr_w32 is expected to be replaced by V_WMMA_I32_16X16X16_IU4_threeaddr_w32 with the same operand list.
# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32
|
||||||
|
# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_256 }
|
||||||
|
- { id: 1, class: vreg_64 }
|
||||||
|
- { id: 2, class: vreg_256 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_twoaddr_w32 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_256, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_F32_16X16X16_F16_twoaddr_w64 is expected to be replaced by V_WMMA_F32_16X16X16_F16_threeaddr_w64 with the same operand list.
# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64
|
||||||
|
# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_128 }
|
||||||
|
- { id: 1, class: vreg_256 }
|
||||||
|
- { id: 2, class: vreg_128 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_F32_16X16X16_BF16_twoaddr_w64 is expected to be replaced by V_WMMA_F32_16X16X16_BF16_threeaddr_w64 with the same operand list.
# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64
|
||||||
|
# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_128 }
|
||||||
|
- { id: 1, class: vreg_256 }
|
||||||
|
- { id: 2, class: vreg_128 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_F16_16X16X16_F16_twoaddr_w64 is expected to be replaced by V_WMMA_F16_16X16X16_F16_threeaddr_w64 with the same operand list.
# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64
|
||||||
|
# GCN: early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_128 }
|
||||||
|
- { id: 1, class: vreg_256 }
|
||||||
|
- { id: 2, class: vreg_128 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_BF16_16X16X16_BF16_twoaddr_w64 is expected to be replaced by V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 with the same operand list.
# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64
|
||||||
|
# GCN: early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_128 }
|
||||||
|
- { id: 1, class: vreg_256 }
|
||||||
|
- { id: 2, class: vreg_128 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_I32_16X16X16_IU8_twoaddr_w64 is expected to be replaced by V_WMMA_I32_16X16X16_IU8_threeaddr_w64 with the same operand list.
# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64
|
||||||
|
# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_128 }
|
||||||
|
- { id: 1, class: vreg_128 }
|
||||||
|
- { id: 2, class: vreg_128 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_twoaddr_w64 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_128, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
# After the twoaddressinstruction pass, V_WMMA_I32_16X16X16_IU4_twoaddr_w64 is expected to be replaced by V_WMMA_I32_16X16X16_IU4_threeaddr_w64 with the same operand list.
# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64
|
||||||
|
# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
---
|
||||||
|
name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64
|
||||||
|
registers:
|
||||||
|
- { id: 0, class: vreg_128 }
|
||||||
|
- { id: 1, class: vreg_64 }
|
||||||
|
- { id: 2, class: vreg_128 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
|
||||||
|
%0 = IMPLICIT_DEF
|
||||||
|
%1 = IMPLICIT_DEF
|
||||||
|
early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_twoaddr_w64 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_128, 0, 0, 0, implicit $exec
|
||||||
|
|
||||||
|
...
|
|
@ -0,0 +1,159 @@
|
||||||
|
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||||
|
# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
|
||||||
|
|
||||||
|
---
# Two WMMAs back to back: the first writes $vgpr16..$vgpr23 and the second
# reads that same tuple as its first register source (A). The GCN-NEXT lines
# expect a V_NOP_e32 between the two instructions.
|
||||||
|
name: back_to_back_WMMA1_D_overlaps_WMMA2_A
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_A
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
---
# Two WMMAs back to back: the first writes $vgpr16..$vgpr23 and the second
# reads that same tuple as its second register source (B). The GCN-NEXT lines
# expect a V_NOP_e32 between the two instructions.
|
||||||
|
name: back_to_back_WMMA1_D_overlaps_WMMA2_B
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_B
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
---
# Same D-to-A overlap as the back-to-back case, but a V_MOV_B32_e32 to $vgpr40
# sits between the two WMMAs. The expected output is identical to the input:
# no V_NOP_e32 appears in the GCN-NEXT lines.
|
||||||
|
name: valu_inbetween_WMMA1_D_overlaps_WMMA2_A
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_A
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
$vgpr40 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
---
# Same D-to-B overlap as the back-to-back case, but a V_MOV_B32_e32 to $vgpr40
# sits between the two WMMAs. The expected output is identical to the input:
# no V_NOP_e32 appears in the GCN-NEXT lines.
# NOTE(review): the name starts with "value_" while the sibling tests use
# "valu_" - looks like a typo; confirm before renaming (the GCN-LABEL line
# would have to change with it).
|
||||||
|
name: value_inbetween_WMMA1_D_overlaps_WMMA2_B
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: value_inbetween_WMMA1_D_overlaps_WMMA2_B
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
$vgpr40 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
---
# Two WMMAs back to back with different opcodes (F16 then BF16): the second
# uses $vgpr16..$vgpr23, written by the first, as both its destination and its
# third register source (C). The GCN-NEXT lines expect a V_NOP_e32 in between.
|
||||||
|
name: back_to_back_WMMA1_D_overlaps_WMMA2_C
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
---
# Same D-to-C overlap as the back-to-back F16/BF16 case, but a V_MOV_B32_e32 to
# $vgpr40 sits between the two WMMAs. The expected output is identical to the
# input: no V_NOP_e32 appears in the GCN-NEXT lines.
|
||||||
|
name: valu_inbetween_WMMA1_D_overlaps_WMMA2_C
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_C
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
$vgpr40 = V_MOV_B32_e32 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
---
# Two identical twoaddr F16 opcodes back to back: the second uses
# $vgpr16..$vgpr23, written by the first, as its destination and third register
# source (C), with every source-modifier operand left at 8. The expected output
# is identical to the input: no V_NOP_e32 appears in the GCN-NEXT line.
|
||||||
|
name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_no_imod
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_no_imod
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
---
# A twoaddr F16 WMMA followed by the threeaddr F16 variant: the second reads
# $vgpr16..$vgpr23, written by the first, as its third register source (C) but
# writes a separate tuple ($vgpr40..$vgpr47). All source-modifier operands are
# 8. The expected output is identical to the input: no V_NOP_e32 appears.
|
||||||
|
name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_real_instruction_no_imod
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_real_instruction_no_imod
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
---
# Two identical twoaddr F16 opcodes back to back with the D-to-C overlap, but
# the second instruction's third source-modifier operand is 11 instead of 8
# (presumably extra neg/neg_hi bits on C - confirm against SISrcMods). The
# GCN-NEXT lines expect a V_NOP_e32 between the two instructions.
|
||||||
|
name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_imod
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_imod
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
|
||||||
|
---
# Different opcodes back to back (F16 then BF16) with the D-to-C overlap on
# $vgpr16..$vgpr23 and every source-modifier operand left at 8. The GCN-NEXT
# lines expect a V_NOP_e32 between the two instructions.
|
||||||
|
name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_no_imod
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_no_imod
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
||||||
|
|
||||||
|
|
||||||
|
---
# Different opcodes back to back (F16 then BF16) with the D-to-C overlap on
# $vgpr16..$vgpr23, and the second instruction's third source-modifier operand
# set to 11 instead of 8 (presumably extra neg/neg_hi bits on C - confirm
# against SISrcMods). The GCN-NEXT lines expect a V_NOP_e32 in between.
|
||||||
|
name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_imod
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
|
||||||
|
; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_imod
|
||||||
|
; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
; GCN-NEXT: V_NOP_e32 implicit $exec
|
||||||
|
; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
|
||||||
|
...
|
||||||
|
---
|
|
@ -0,0 +1,473 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
|
||||||
|
|
||||||
|
; Declarations of the WMMA intrinsics exercised below. The f16.16x16x16.f16 and
; bf16.16x16x16.bf16 forms take a trailing i1 immarg; the iu8/iu4 forms take an
; i1 immarg before each of A and B plus a trailing i1 immarg after C.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
|
||||||
|
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
|
||||||
|
declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
|
||||||
|
|
||||||
|
; The tests demonstrate that the following WMMA register constraints are satisfied.
|
||||||
|
;
|
||||||
|
; v_wmma D, A, B, C
|
||||||
|
; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).
|
||||||
|
;
|
||||||
|
; In each test,
|
||||||
|
; - first wmma instruction: the dest register D is different than all the sources
|
||||||
|
; - second wmma instruction: the dest register D and src2 (C) are the same
|
||||||
|
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.f16
;
; Calls the intrinsic twice with the same %C: once as (%A, %B, %C) and once as
; (%B, %B, %C), storing each result. The W32 checks expect the first result in
; v[28:35], a tuple none of the sources use, and the second result written
; over C in v[16:23].
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
|
||||||
|
; W32-LABEL: test_wmma_f32_16x16x16_f16:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
|
||||||
|
%res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.bf16
;
; Calls the intrinsic twice with the same %C: once as (%A, %B, %C) and once as
; (%B, %B, %C), storing each result. The W32 checks expect the first result in
; v[28:35], a tuple none of the sources use, and the second result written
; over C in v[16:23].
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
|
||||||
|
; W32-LABEL: test_wmma_f32_16x16x16_bf16:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
|
||||||
|
%res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x float> %C)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f16.16x16x16.f16
;
; "lo" variant: both calls pass i1 0 as the trailing immarg, and the expected
; instructions carry no op_sel suffix. The calls are (%A, %B, %C) and
; (%B, %B, %C); the W32 checks expect the first result in v[28:35] and the
; second written over C in v[16:23].
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
|
||||||
|
; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
|
||||||
|
%res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 0)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; "hi" variant of the f16.16x16x16.f16 test: both calls pass i1 1 as the
; trailing immarg, and the expected instructions carry op_sel:[0,0,1]. The
; calls are (%A, %B, %C) and (%B, %B, %C); the W32 checks expect the first
; result in v[28:35] and the second written over C in v[16:23].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
|
||||||
|
; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
|
||||||
|
%res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 1)
|
||||||
|
store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
|
||||||
|
store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
;
; "lo" variant: both calls pass i1 0 as the trailing immarg, and the expected
; instructions carry no op_sel suffix. The calls are (%A, %B, %C) and
; (%B, %B, %C); the W32 checks expect the first result in v[28:35] and the
; second written over C in v[16:23].
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
|
||||||
|
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
%res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 0)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; "hi" variant of the bf16.16x16x16.bf16 test: both calls pass i1 1 as the
; trailing immarg, and the expected instructions carry op_sel:[0,0,1]. The
; calls are (%A, %B, %C) and (%B, %B, %C); the W32 checks expect the first
; result in v[28:35] and the second written over C in v[16:23].
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
|
||||||
|
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
|
||||||
|
; W32: ; %bb.0: ; %bb
|
||||||
|
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
|
||||||
|
; W32-NEXT: s_clause 0x1
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
|
||||||
|
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
|
||||||
|
; W32-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
%res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 1)
|
||||||
|
store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
|
||||||
|
store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=0, clamp=0.
; The checks expect no neg_lo or clamp modifier; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=1, clamp=0.
; The checks expect neg_lo:[0,1,0] and no clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=1, B_sign=0, clamp=0.
; The checks expect neg_lo:[1,0,0] and no clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=1, B_sign=1, clamp=0.
; The checks expect neg_lo:[1,1,0] and no clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=0, clamp=1.
; The checks expect the clamp modifier and no neg_lo; the second instruction
; writes its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=1, clamp=1.
; The checks expect neg_lo:[0,1,0] plus clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=1, B_sign=0, clamp=1.
; The checks expect neg_lo:[1,0,0] plus clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=1, B_sign=1, clamp=1.
; The checks expect neg_lo:[1,1,0] plus clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu4
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu4 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=0, clamp=0.
; The checks expect no neg_lo or clamp modifier; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu4 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=1, clamp=0.
; The checks expect neg_lo:[0,1,0] and no clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu4 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=1, B_sign=0, clamp=0.
; The checks expect neg_lo:[1,0,0] and no clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu4 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=1, B_sign=1, clamp=0.
; The checks expect neg_lo:[1,1,0] and no clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu4 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=0, clamp=1.
; The checks expect the clamp modifier and no neg_lo; the second instruction
; writes its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu4 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=1, clamp=1.
; The checks expect neg_lo:[0,1,0] plus clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu4 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=1, B_sign=0, clamp=1.
; The checks expect neg_lo:[1,0,0] plus clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu4 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=1, B_sign=1, clamp=1.
; The checks expect neg_lo:[1,1,0] plus clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}
|
||||||
|
|
|
@ -0,0 +1,385 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
|
||||||
|
|
||||||
|
; Declarations of the WMMA intrinsics exercised by the tests in this file.
; In every declaration here the accumulator/result (C/D) type is a 4-element
; vector, and each i1 operand is marked immarg.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
|
||||||
|
|
||||||
|
; The tests demonstrate that the following WMMA register constraints are satisfied.
;
; v_wmma D, A, B, C
; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).
;
; In each test,
; - first wmma instruction: the dest register D is different from all the sources
; - second wmma instruction: the dest register D and src2 (C) are the same
|
||||||
|
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.f16
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.f32.16x16x16.f16 twice, as (%A, %B, %C) and as
; (%B, %B, %C), and stores the results to %out and %out2.
; The checks expect the first instruction to write a fresh register range and
; the second to write its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_f32_16x16x16_f16:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C)
  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
  ret void
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f32.16x16x16.bf16
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.f32.16x16x16.bf16 twice, as (%A, %B, %C) and as
; (%B, %B, %C), and stores the results to %out and %out2.
; The checks expect the first instruction to write a fresh register range and
; the second to write its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x float> %C)
  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
  ret void
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.f16.16x16x16.f16
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.f16.16x16x16.f16 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with the trailing immarg i1 set to 0.
; The checks expect no op_sel modifier; the second instruction writes its
; result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 0)
  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.f16.16x16x16.f16 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with the trailing immarg i1 set to 1.
; The checks expect op_sel:[0,0,1] on both instructions; the second one writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 1)
  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
  ret void
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.bf16.16x16x16.bf16 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with the trailing immarg i1 set to 0.
; The checks expect no op_sel modifier; the second instruction writes its
; result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.bf16.16x16x16.bf16 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with the trailing immarg i1 set to 1.
; The checks expect op_sel:[0,0,1] on both instructions; the second one writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=0, clamp=0.
; The checks expect no neg_lo or clamp modifier; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}
|
||||||
|
|
||||||
|
|
||||||
|
; Calls @llvm.amdgcn.wmma.i32.16x16x16.iu8 twice, as (%A, %B, %C) and as
; (%B, %B, %C), with immediates A_sign=0, B_sign=1, clamp=0.
; The checks expect neg_lo:[0,1,0] and no clamp; the second instruction writes
; its result over the src2 (C) registers.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
|
||||||
|
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
|
||||||
|
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
|
||||||
|
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
|
||||||
|
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
|
||||||
|
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
|
||||||
|
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; @llvm.amdgcn.wmma.i32.16x16x16.iu4
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7]
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||||
|
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||||
|
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||||
|
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||||
|
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||||
|
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||||
|
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||||
|
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
|
||||||
|
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
|
||||||
|
; W64: ; %bb.0: ; %bb
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
|
||||||
|
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
|
||||||
|
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
|
||||||
|
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
|
||||||
|
; W64-NEXT: s_endpgm
|
||||||
|
bb:
|
||||||
|
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
%res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
|
||||||
|
store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
|
||||||
|
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,461 @@
|
||||||
|
// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s
|
||||||
|
// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s
|
||||||
|
// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
|
||||||
|
// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
|
||||||
|
|
||||||
|
//
|
||||||
|
// Test v_wmma_f32_16x16x16_f16
|
||||||
|
//
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], 1.0, v[8:15], v[16:23]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], 1.0, v[8:15], v[16:19]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], 1.0, v[16:23]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], 1.0, v[16:19]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0
|
||||||
|
// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0
|
||||||
|
// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] clamp
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] clamp
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
//
|
||||||
|
// Test v_wmma_f32_16x16x16_bf16
|
||||||
|
//
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], 1.0, v[8:15], v[16:23]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], 1.0, v[16:23]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], 1.0, v[16:19]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0
|
||||||
|
// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0
|
||||||
|
// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] clamp
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] clamp
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
//
|
||||||
|
// Test v_wmma_f16_16x16x16_f16
|
||||||
|
//
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], 1.0, v[8:15], v[16:23]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], 1.0, v[8:15], v[16:19]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], 1.0, v[16:23]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], 1.0, v[16:19]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0
|
||||||
|
// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0
|
||||||
|
// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] clamp
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] clamp
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
//
|
||||||
|
// Test v_wmma_bf16_16x16x16_bf16
|
||||||
|
//
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
|
||||||
|
// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
|
||||||
|
// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], 1.0, v[8:15], v[16:23]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], 1.0, v[16:23]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], 1.0, v[16:19]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0
|
||||||
|
// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0
|
||||||
|
// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] clamp
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] clamp
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
//
|
||||||
|
// Test v_wmma_i32_16x16x16_iu8
|
||||||
|
//
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15]
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:15], 1, v[4:7], v[8:15]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:11], 1, v[4:7], v[8:11]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], 1, v[8:15]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], 1, v[8:11]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
//
|
||||||
|
// Test v_wmma_i32_16x16x16_iu4
|
||||||
|
//
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:11], 1, v[2:3], v[4:11]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:7], 1, v[2:3], v[4:7]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], 1, v[4:11]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], 1, v[4:7]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
|
||||||
|
// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
|
||||||
|
// W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c]
|
||||||
|
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
||||||
|
v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
|
||||||
|
// W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c]
|
||||||
|
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
|
||||||
|
|
|
@ -0,0 +1,157 @@
|
||||||
|
# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W32 %s
|
||||||
|
# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W64 %s
|
||||||
|
|
||||||
|
|
||||||
|
# Test v_wmma_f32_16x16x16_f16
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
0x10,0x40,0x40,0xcc,0x00,0x11,0xca,0x1b
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
0x10,0x41,0x40,0xcc,0x00,0x11,0x42,0x3c
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
0x10,0x42,0x40,0xcc,0x00,0x11,0x42,0x5c
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
0x10,0x43,0x40,0xcc,0x00,0x11,0x42,0x7c
|
||||||
|
|
||||||
|
|
||||||
|
# Test v_wmma_f32_16x16x16_bf16
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
0x10,0x40,0x41,0xcc,0x00,0x11,0x42,0x1c
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
0x10,0x40,0x41,0xcc,0x00,0x11,0xca,0x1b
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
0x10,0x41,0x41,0xcc,0x00,0x11,0x42,0x3c
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
0x10,0x42,0x41,0xcc,0x00,0x11,0x42,0x5c
|
||||||
|
|
||||||
|
# W32: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
# W64: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
0x10,0x43,0x41,0xcc,0x00,0x11,0x42,0x7c
|
||||||
|
|
||||||
|
|
||||||
|
# Test v_wmma_f16_16x16x16_f16
|
||||||
|
|
||||||
|
# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
0x10,0x40,0x42,0xcc,0x00,0x11,0x42,0x1c
|
||||||
|
|
||||||
|
# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
0x10,0x40,0x42,0xcc,0x00,0x11,0xca,0x1b
|
||||||
|
|
||||||
|
# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
0x10,0x60,0x42,0xcc,0x00,0x11,0x42,0x1c
|
||||||
|
|
||||||
|
# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
0x10,0x41,0x42,0xcc,0x00,0x11,0x42,0x3c
|
||||||
|
|
||||||
|
# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
0x10,0x42,0x42,0xcc,0x00,0x11,0x42,0x5c
|
||||||
|
|
||||||
|
# W32: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
# W64: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
0x10,0x43,0x42,0xcc,0x00,0x11,0x42,0x7c
|
||||||
|
|
||||||
|
|
||||||
|
# Test v_wmma_bf16_16x16x16_bf16
|
||||||
|
|
||||||
|
# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
0x10,0x40,0x43,0xcc,0x00,0x11,0x42,0x1c
|
||||||
|
|
||||||
|
# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b]
|
||||||
|
0x10,0x40,0x43,0xcc,0x00,0x11,0xca,0x1b
|
||||||
|
|
||||||
|
# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; encoding: [0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c]
|
||||||
|
0x10,0x60,0x43,0xcc,0x00,0x11,0x42,0x1c
|
||||||
|
|
||||||
|
# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c]
|
||||||
|
0x10,0x41,0x43,0xcc,0x00,0x11,0x42,0x3c
|
||||||
|
|
||||||
|
# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c]
|
||||||
|
0x10,0x42,0x43,0xcc,0x00,0x11,0x42,0x5c
|
||||||
|
|
||||||
|
# W32: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
# W64: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c]
|
||||||
|
0x10,0x43,0x43,0xcc,0x00,0x11,0x42,0x7c
|
||||||
|
|
||||||
|
|
||||||
|
# Test v_wmma_i32_16x16x16_iu8
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c]
|
||||||
|
0x08,0x40,0x44,0xcc,0x00,0x09,0x22,0x1c
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
|
||||||
|
0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c]
|
||||||
|
0x08,0x41,0x44,0xcc,0x00,0x09,0x22,0x3c
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c]
|
||||||
|
0x08,0x42,0x44,0xcc,0x00,0x09,0x22,0x5c
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c]
|
||||||
|
0x08,0x43,0x44,0xcc,0x00,0x09,0x22,0x7c
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; encoding: [0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c]
|
||||||
|
0x08,0xc0,0x44,0xcc,0x00,0x09,0x22,0x1c
|
||||||
|
|
||||||
|
|
||||||
|
# Test v_wmma_i32_16x16x16_iu4
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c]
|
||||||
|
0x04,0x40,0x45,0xcc,0x00,0x05,0x12,0x1c
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a]
|
||||||
|
0x04,0x40,0x45,0xcc,0x00,0x05,0x06,0x1a
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c]
|
||||||
|
0x04,0x41,0x45,0xcc,0x00,0x05,0x12,0x3c
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c]
|
||||||
|
0x04,0x42,0x45,0xcc,0x00,0x05,0x12,0x5c
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] neg_hi:[1,1,0] ; encoding: [0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c]
|
||||||
|
0x04,0x43,0x45,0xcc,0x00,0x05,0x12,0x7c
|
||||||
|
|
||||||
|
# W32: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c]
|
||||||
|
# W64: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; encoding: [0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c]
|
||||||
|
0x04,0xc0,0x45,0xcc,0x00,0x05,0x12,0x1c
|
||||||
|
|
Loading…
Reference in New Issue