[AMDGPU] gfx11 new dot instruction codegen support

Reviewed By: rampitec, #amdgpu

Differential Revision: https://reviews.llvm.org/D127904
This commit is contained in:
Joe Nash 2022-06-15 14:03:51 -04:00
parent 7e681ef35e
commit 2d43de13df
20 changed files with 825 additions and 79 deletions

View File

@ -222,12 +222,17 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "gfx9
//===----------------------------------------------------------------------===//
TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot7-insts")
TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot8-insts")
TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot8-insts")
TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot8-insts")
TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts")
TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts")
TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts")
TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot7-insts")
TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts")
TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
//===----------------------------------------------------------------------===//
// GFX10+ only builtins.

View File

@ -8,29 +8,45 @@ typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void builtins_amdgcn_dl_insts_err(
global float *fOut, global int *siOut, global uint *uiOut,
half2 v2hA, half2 v2hB, float fC,
short2 v2ssA, short2 v2ssB, int siA, int siB, int siC,
ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC) {
fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}}
fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}}
global short *sOut, global int *iOut, global half *hOut,
half2 v2hA, half2 v2hB, float fC, half hC,
short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC,
ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC,
int A, int B, int C) {
fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}}
fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}}
siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC); // expected-error {{'__builtin_amdgcn_fdot2_f16_f16' needs target feature dot8-insts}}
uiOut[0] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, false); // expected-error {{'__builtin_amdgcn_udot2' needs target feature dot2-insts}}
uiOut[1] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, true); // expected-error {{'__builtin_amdgcn_udot2' needs target feature dot2-insts}}
sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC); // expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature dot8-insts}}
siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}}
siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}}
fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot8-insts}}
fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot8-insts}}
uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}}
uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}}
siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}}
siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}}
uiOut[0] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, false); // expected-error {{'__builtin_amdgcn_udot2' needs target feature dot2-insts}}
uiOut[1] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, true); // expected-error {{'__builtin_amdgcn_udot2' needs target feature dot2-insts}}
uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}}
uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}}
siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}}
siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}}
uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}}
uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}}
iOut[0] = __builtin_amdgcn_sudot4(true, A, false, B, C, false); // expected-error {{'__builtin_amdgcn_sudot4' needs target feature dot8-insts}}
iOut[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true); // expected-error {{'__builtin_amdgcn_sudot4' needs target feature dot8-insts}}
siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}}
siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}}
uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}}
uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}}
iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
}

View File

@ -0,0 +1,64 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -emit-llvm -o - %s | FileCheck %s
typedef unsigned int uint;
typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
// CHECK-LABEL: @builtins_amdgcn_dl_insts
// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false)
// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true)
// CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC)
// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC)
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false)
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true)
// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 false)
// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 true)
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 false)
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 true)
// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 false)
// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 true)
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
// CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 false)
// CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 true)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Positive codegen test kernel: each statement below calls one dot-product
// builtin and stores its result, and the FileCheck expectations that precede
// this kernel list the matching intrinsic calls in this same order, so the
// statement order and the parameter names must not change.
// Builtins that take a trailing clamp flag are called once with false and once
// with true; the sudot builtins additionally vary their two sign flags.
// NOTE(review): the "_err" suffix looks carried over from the error test; the
// label pattern above still matches because it is a prefix of this name.
kernel void builtins_amdgcn_dl_insts_err(
global float *fOut, global int *siOut, global uint *uiOut,
global short *sOut, global int *iOut, global half *hOut,
half2 v2hA, half2 v2hB, float fC, half hC,
short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC,
ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC,
int A, int B, int C) {
// Floating-point two-element dot products (f16 and bf16 inputs).
fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false);
fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);
hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC);
sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC);
fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true);
// Four-element integer dot products; sudot4 takes a sign flag before each operand.
siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false);
siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true);
uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false);
uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true);
iOut[0] = __builtin_amdgcn_sudot4(true, A, false, B, C, false);
iOut[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true);
// Eight-element integer dot products; sudot8 takes a sign flag before each operand.
siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false);
siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true);
uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false);
uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true);
iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);
iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true);
}

View File

@ -1954,6 +1954,49 @@ def int_amdgcn_fdot2 :
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
>;
// llvm.amdgcn.fdot2.f16.f16: two-element f16 dot product plus an f16 addend.
//   f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
  GCCBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
  Intrinsic<[llvm_half_ty],                                // %r
            [llvm_v2f16_ty, llvm_v2f16_ty, llvm_half_ty],  // %a, %b, %c
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
// llvm.amdgcn.fdot2.bf16.bf16: two-element bf16 dot product plus a bf16 addend.
// The bf16 values are declared with i16 / v2i16 IR types.
//   bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_bf16_bf16 :
  GCCBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
  Intrinsic<[llvm_i16_ty],                                // %r
            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_i16_ty],  // %a, %b, %c
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
// llvm.amdgcn.fdot2.f32.bf16: two-element bf16 dot product plus an f32 addend.
// The bf16 inputs are declared with the v2i16 IR type; %clamp is marked ImmArg.
//   f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16 :
  GCCBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
  Intrinsic<[llvm_float_ty],                        // %r
            [llvm_v2i16_ty, llvm_v2i16_ty,          // %a, %b
             llvm_float_ty, llvm_i1_ty],            // %c, %clamp
            [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
@ -2014,6 +2057,27 @@ def int_amdgcn_udot4 :
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
>;
// llvm.amdgcn.sudot4: four-element 8-bit dot product plus an i32 addend, where
// each packed operand is independently read as signed or unsigned.
//   i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a,
//                               i1 %b_sign, v4i8 (as i32) %b,
//                               i32 %c, i1 %clamp)
// A sign flag of 1 treats that input as signed, 0 treats it as unsigned:
//   a[i in 0..3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
//   b[i in 0..3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// %a_sign, %b_sign and %clamp are marked ImmArg.
def int_amdgcn_sudot4 :
  GCCBuiltin<"__builtin_amdgcn_sudot4">,
  Intrinsic<[llvm_i32_ty],                          // %r
            [llvm_i1_ty, llvm_i32_ty,               // %a_sign, %a
             llvm_i1_ty, llvm_i32_ty,               // %b_sign, %b
             llvm_i32_ty, llvm_i1_ty],              // %c, %clamp
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
@ -2046,6 +2110,28 @@ def int_amdgcn_udot8 :
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
>;
// llvm.amdgcn.sudot8: eight-element 4-bit dot product plus an i32 addend, where
// each packed operand is independently read as signed or unsigned.
//   i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a,
//                               i1 %b_sign, v8i4 (as i32) %b,
//                               i32 %c, i1 %clamp)
// A sign flag of 1 treats that input as signed, 0 treats it as unsigned:
//   a[i in 0..7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
//   b[i in 0..7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// %a_sign, %b_sign and %clamp are marked ImmArg.
def int_amdgcn_sudot8 :
  GCCBuiltin<"__builtin_amdgcn_sudot8">,
  Intrinsic<[llvm_i32_ty],                          // %r
            [llvm_i1_ty, llvm_i32_ty,               // %a_sign, %a
             llvm_i1_ty, llvm_i32_ty,               // %b_sign, %b
             llvm_i32_ty, llvm_i1_ty],              // %c, %clamp
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
//===----------------------------------------------------------------------===//
// gfx908 intrinsics
// ===----------------------------------------------------------------------===//

View File

@ -51,6 +51,10 @@ def gi_vop3pmodsdot :
GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
GIComplexPatternEquiv<VOP3PModsDOT>;
// GlobalISel counterpart of the DotIUVOP3PMods complex pattern: an s32 operand
// matcher implemented by the selector method named "selectDotIUVOP3PMods".
def gi_dotiuvop3pmods :
GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,
GIComplexPatternEquiv<DotIUVOP3PMods>;
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;

View File

@ -2741,6 +2741,21 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods, true);
}
/// Turn the literal i1 sign flag of a dot IU intrinsic into the source
/// modifiers for the operand that follows it.
///
/// \p In must be a 1-bit constant. The resulting i32 target constant is
/// written to \p Src; the function always returns true.
bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
  const ConstantSDNode *SignImm = cast<ConstantSDNode>(In);
  const APInt &SignBits = SignImm->getAPIntValue();
  assert(SignBits.getBitWidth() == 1 && "expected i1 value");

  // A flag of 1 promotes the packed values to signed, which is expressed by
  // toggling NEG on top of the default OP_SEL_1; 0 leaves them unsigned.
  const bool IsSigned = SignBits.getZExtValue() == 1;
  unsigned SrcMods = SISrcMods::OP_SEL_1;
  if (IsSigned)
    SrcMods ^= SISrcMods::NEG;

  Src = CurDAG->getTargetConstant(SrcMods, SDLoc(In), MVT::i32);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;

View File

@ -225,6 +225,8 @@ private:
bool IsDOT = false) const;
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
// Complex-pattern selector for the i1 sign literal of a dot IU operand: reads
// the constant in In and returns the source-modifier immediate in Src.
bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

View File

@ -3670,6 +3670,21 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
}};
}
/// Build the renderer that emits the source modifiers for a dot IU operand.
///
/// \p Root is the literal i1 sign flag from the intrinsic, held as an
/// immediate sign-extended to int64_t, so the only legal values are -1 and 0.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
         "expected i1 value");

  // 1 (seen here as -1) promotes the packed values to signed, expressed by
  // toggling NEG on top of the default OP_SEL_1; 0 treats them as unsigned.
  const bool IsSigned = Root.getImm() == -1;
  unsigned SrcMods = SISrcMods::OP_SEL_1;
  if (IsSigned)
    SrcMods ^= SISrcMods::NEG;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addImm(SrcMods); } // src_mods
  }};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
Register Src;

View File

@ -185,6 +185,9 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3PModsDOT(MachineOperand &Root) const;
// Returns a renderer that adds the source-modifier immediate derived from the
// i1 sign literal (held in Root as an immediate) of a dot IU operand.
InstructionSelector::ComplexRendererFns
selectDotIUVOP3PMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;

View File

@ -4253,6 +4253,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_udot4:
case Intrinsic::amdgcn_sdot8:
case Intrinsic::amdgcn_udot8:
case Intrinsic::amdgcn_fdot2_bf16_bf16:
case Intrinsic::amdgcn_fdot2_f16_f16:
case Intrinsic::amdgcn_fdot2_f32_bf16:
case Intrinsic::amdgcn_sudot4:
case Intrinsic::amdgcn_sudot8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:

View File

@ -724,8 +724,8 @@ let SubtargetPredicate = isGFX11Plus in {
} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = HasDot8Insts in {
defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>>;
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>>;
defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>;
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
}
//===----------------------------------------------------------------------===//

View File

@ -359,7 +359,7 @@ let SubtargetPredicate = HasDot8Insts in {
defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16",
VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
null_frag, 1>;
int_amdgcn_fdot2_f32_bf16, 1>;
} // End SubtargetPredicate = HasDot8Insts
@ -381,8 +381,8 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
}
let SubtargetPredicate = HasDot8Insts in {
defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", null_frag>;
defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", null_frag>;
defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;
} // End SubtargetPredicate = HasDot8Insts
def : UDot2Pat<V_DOT2_U32_U16>;

View File

@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2:
@ -10,12 +11,12 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
ret float %r
}
@ -27,12 +28,12 @@ define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_clamp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2_clamp:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true)
ret float %r
}
@ -44,12 +45,12 @@ define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_neg_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2_neg_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
ret float %r
@ -62,12 +63,12 @@ define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_neg_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2_neg_b:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.b = fneg <2 x half> %b
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
ret float %r
@ -80,12 +81,12 @@ define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_neg_a_neg_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2_neg_a_neg_b:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %b
%neg.b = fneg <2 x half> %b
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false)
@ -100,13 +101,13 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_neg_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2_neg_c:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
ret float %r
@ -119,12 +120,12 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
; GFX906-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_inline_literal_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2_inline_literal_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
ret float %ret
}
@ -136,12 +137,12 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_inline_literal_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2_inline_literal_b:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
ret float %ret
}
@ -153,12 +154,12 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_inline_literal_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_fdot2_inline_literal_c:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false)
ret float %ret
}

View File

@ -0,0 +1,102 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
define i32 @test_llvm_amdgcn_sudot4_uu(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x1c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
define i32 @test_llvm_amdgcn_sudot4_us(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_us:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x5c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
define i32 @test_llvm_amdgcn_sudot4_su(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_su:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x3c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
define i32 @test_llvm_amdgcn_sudot4_ss(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x7c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
define i32 @test_llvm_amdgcn_sudot4_uu_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x1c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
define i32 @test_llvm_amdgcn_sudot4_us_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_us_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x5c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
define i32 @test_llvm_amdgcn_sudot4_su_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_su_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x3c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
define i32 @test_llvm_amdgcn_sudot4_ss_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x7c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
ret i32 %ret
}

View File

@ -0,0 +1,102 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; Intrinsic exercised by every test in this file. The three i1 operands
; (%asign, %bsign, %clamp) are always passed as constants by the calls below.
declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
; sudot8 with asign=0, bsign=0, clamp=0: the expected v_dot8_i32_iu4 has no
; neg_lo modifier and no clamp.
define i32 @test_llvm_amdgcn_sudot8_uu(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x1c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot8 with asign=0, bsign=1, clamp=0: the expected v_dot8_i32_iu4 carries
; neg_lo:[0,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_us(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_us:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x5c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot8 with asign=1, bsign=0, clamp=0: the expected v_dot8_i32_iu4 carries
; neg_lo:[1,0,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_su(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_su:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x3c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot8 with asign=1, bsign=1, clamp=0: the expected v_dot8_i32_iu4 carries
; neg_lo:[1,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_ss(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x7c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot8 with asign=0, bsign=0, clamp=1: the expected v_dot8_i32_iu4 has no
; neg_lo modifier but carries clamp.
define i32 @test_llvm_amdgcn_sudot8_uu_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x1c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot8 with asign=0, bsign=1, clamp=1: the expected v_dot8_i32_iu4 carries
; neg_lo:[0,1,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot8_us_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_us_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x5c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot8 with asign=1, bsign=0, clamp=1: the expected v_dot8_i32_iu4 carries
; neg_lo:[1,0,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot8_su_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_su_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x3c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot8 with asign=1, bsign=1, clamp=1: the expected v_dot8_i32_iu4 carries
; neg_lo:[1,1,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot8_ss_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x7c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
ret i32 %ret
}

View File

@ -0,0 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; Intrinsic under test: takes two <2 x i16> operands and an i16 operand and
; returns i16. The test below expects it to select to v_dot2_bf16_bf16.
declare i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a, <2 x i16> %b, i16 %c)
; Loads %a, %b (<2 x i16>) and %c (i16) from their addrspace(1) pointers, calls
; llvm.amdgcn.fdot2.bf16.bf16 on the loaded values and stores the i16 result
; through %r. The expected code contains a single v_dot2_bf16_bf16.
;
; The kernel was previously named test_llvm_amdgcn_fdot2_f16_f16 (copied from
; the f16 test); it is named after the bf16 intrinsic it actually exercises.
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
i16 addrspace(1)* %r,
<2 x i16> addrspace(1)* %a,
<2 x i16> addrspace(1)* %b,
i16 addrspace(1)* %c) {
entry:
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
%c.val = load i16, i16 addrspace(1)* %c
%r.val = call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a.val, <2 x i16> %b.val, i16 %c.val)
store i16 %r.val, i16 addrspace(1)* %r
ret void
}

View File

@ -0,0 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; Intrinsic under test: takes two <2 x half> operands and a half operand and
; returns half. The test below expects it to select to v_dot2_f16_f16.
declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
; Loads %a, %b (<2 x half>) and %c (half) from their addrspace(1) pointers,
; calls llvm.amdgcn.fdot2.f16.f16 on the loaded values and stores the half
; result through %r. The expected code contains a single v_dot2_f16_f16.
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
half addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
half addrspace(1)* %c) {
entry:
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
%b.val = load <2 x half>, <2 x half> addrspace(1)* %b
%c.val = load half, half addrspace(1)* %c
%r.val = call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a.val, <2 x half> %b.val, half %c.val)
store half %r.val, half addrspace(1)* %r
ret void
}

View File

@ -0,0 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; Intrinsic under test: two <2 x i16> operands, a float operand and a trailing
; i1 %clamp, returning float. The tests below pass %clamp as constant 1 and 0.
declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a, <2 x i16> %b, float %c, i1 %clamp)
; Loads %a, %b (<2 x i16>) and %c (float) from their addrspace(1) pointers,
; calls llvm.amdgcn.fdot2.f32.bf16 with clamp=1 and stores the float result
; through %r. The expected v_dot2_f32_bf16 carries the clamp modifier.
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
float addrspace(1)* %r,
<2 x i16> addrspace(1)* %a,
<2 x i16> addrspace(1)* %b,
float addrspace(1)* %c) {
entry:
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
%c.val = load float, float addrspace(1)* %c
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 1)
store float %r.val, float addrspace(1)* %r
ret void
}
; Same shape as the clamp test, but calls llvm.amdgcn.fdot2.f32.bf16 with
; clamp=0. The expected v_dot2_f32_bf16 has no clamp modifier.
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
float addrspace(1)* %r,
<2 x i16> addrspace(1)* %a,
<2 x i16> addrspace(1)* %b,
float addrspace(1)* %c) {
entry:
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
%c.val = load float, float addrspace(1)* %c
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 0)
store float %r.val, float addrspace(1)* %r
ret void
}

View File

@ -0,0 +1,102 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; Intrinsic exercised by every test in this file. The three i1 operands
; (%asign, %bsign, %clamp) are always passed as constants by the calls below.
declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
; sudot4 with asign=0, bsign=0, clamp=0: the expected v_dot4_i32_iu8 has no
; neg_lo modifier and no clamp.
define i32 @test_llvm_amdgcn_sudot4_uu(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x1c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot4 with asign=0, bsign=1, clamp=0: the expected v_dot4_i32_iu8 carries
; neg_lo:[0,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot4_us(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_us:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x5c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot4 with asign=1, bsign=0, clamp=0: the expected v_dot4_i32_iu8 carries
; neg_lo:[1,0,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot4_su(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_su:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x3c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot4 with asign=1, bsign=1, clamp=0: the expected v_dot4_i32_iu8 carries
; neg_lo:[1,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot4_ss(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x7c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot4 with asign=0, bsign=0, clamp=1: the expected v_dot4_i32_iu8 has no
; neg_lo modifier but carries clamp.
define i32 @test_llvm_amdgcn_sudot4_uu_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x1c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot4 with asign=0, bsign=1, clamp=1: the expected v_dot4_i32_iu8 carries
; neg_lo:[0,1,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot4_us_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_us_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x5c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot4 with asign=1, bsign=0, clamp=1: the expected v_dot4_i32_iu8 carries
; neg_lo:[1,0,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot4_su_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_su_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x3c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot4 with asign=1, bsign=1, clamp=1: the expected v_dot4_i32_iu8 carries
; neg_lo:[1,1,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot4_ss_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x7c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
ret i32 %ret
}

View File

@ -0,0 +1,102 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; Intrinsic exercised by every test in this file. The three i1 operands
; (%asign, %bsign, %clamp) are always passed as constants by the calls below.
declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
; sudot8 with asign=0, bsign=0, clamp=0: the expected v_dot8_i32_iu4 has no
; neg_lo modifier and no clamp.
define i32 @test_llvm_amdgcn_sudot8_uu(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x1c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot8 with asign=0, bsign=1, clamp=0: the expected v_dot8_i32_iu4 carries
; neg_lo:[0,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_us(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_us:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x5c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot8 with asign=1, bsign=0, clamp=0: the expected v_dot8_i32_iu4 carries
; neg_lo:[1,0,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_su(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_su:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x3c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot8 with asign=1, bsign=1, clamp=0: the expected v_dot8_i32_iu4 carries
; neg_lo:[1,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_ss(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x7c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
ret i32 %ret
}
; sudot8 with asign=0, bsign=0, clamp=1: the expected v_dot8_i32_iu4 has no
; neg_lo modifier but carries clamp.
define i32 @test_llvm_amdgcn_sudot8_uu_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x1c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot8 with asign=0, bsign=1, clamp=1: the expected v_dot8_i32_iu4 carries
; neg_lo:[0,1,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot8_us_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_us_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x5c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot8 with asign=1, bsign=0, clamp=1: the expected v_dot8_i32_iu4 carries
; neg_lo:[1,0,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot8_su_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_su_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x3c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
ret i32 %ret
}
; sudot8 with asign=1, bsign=1, clamp=1: the expected v_dot8_i32_iu4 carries
; neg_lo:[1,1,0] and the clamp modifier.
define i32 @test_llvm_amdgcn_sudot8_ss_clamp(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x7c]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
entry:
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
ret i32 %ret
}