forked from OSchip/llvm-project
[AMDGPU] gfx11 new dot instruction codegen support
Reviewed By: rampitec, #amdgpu Differential Revision: https://reviews.llvm.org/D127904
This commit is contained in:
parent
7e681ef35e
commit
2d43de13df
|
@ -222,12 +222,17 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "gfx9
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot7-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot7-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GFX10+ only builtins.
|
||||
|
|
|
@ -8,14 +8,24 @@ typedef half __attribute__((ext_vector_type(2))) half2;
|
|||
typedef short __attribute__((ext_vector_type(2))) short2;
|
||||
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
kernel void builtins_amdgcn_dl_insts_err(
|
||||
global float *fOut, global int *siOut, global uint *uiOut,
|
||||
half2 v2hA, half2 v2hB, float fC,
|
||||
short2 v2ssA, short2 v2ssB, int siA, int siB, int siC,
|
||||
ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC) {
|
||||
global short *sOut, global int *iOut, global half *hOut,
|
||||
half2 v2hA, half2 v2hB, float fC, half hC,
|
||||
short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC,
|
||||
ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC,
|
||||
int A, int B, int C) {
|
||||
fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}}
|
||||
fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}}
|
||||
|
||||
hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC); // expected-error {{'__builtin_amdgcn_fdot2_f16_f16' needs target feature dot8-insts}}
|
||||
|
||||
sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC); // expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature dot8-insts}}
|
||||
|
||||
fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot8-insts}}
|
||||
fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot8-insts}}
|
||||
|
||||
siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
|
||||
siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
|
||||
|
||||
|
@ -28,9 +38,15 @@ kernel void builtins_amdgcn_dl_insts_err(
|
|||
uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}}
|
||||
uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}}
|
||||
|
||||
iOut[0] = __builtin_amdgcn_sudot4(true, A, false, B, C, false); // expected-error {{'__builtin_amdgcn_sudot4' needs target feature dot8-insts}}
|
||||
iOut[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true); // expected-error {{'__builtin_amdgcn_sudot4' needs target feature dot8-insts}}
|
||||
|
||||
siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}}
|
||||
siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}}
|
||||
|
||||
uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}}
|
||||
uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}}
|
||||
|
||||
iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
|
||||
iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
// REQUIRES: amdgpu-registered-target
|
||||
|
||||
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -emit-llvm -o - %s | FileCheck %s
|
||||
|
||||
typedef unsigned int uint;
|
||||
typedef half __attribute__((ext_vector_type(2))) half2;
|
||||
typedef short __attribute__((ext_vector_type(2))) short2;
|
||||
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
|
||||
|
||||
// CHECK-LABEL: @builtins_amdgcn_dl_insts
|
||||
// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false)
|
||||
// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true)
|
||||
// CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC)
|
||||
// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC)
|
||||
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false)
|
||||
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true)
|
||||
// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 false)
|
||||
// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 true)
|
||||
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
|
||||
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
|
||||
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 false)
|
||||
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 true)
|
||||
// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 false)
|
||||
// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 true)
|
||||
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
|
||||
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
|
||||
// CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 false)
|
||||
// CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 true)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
// Calls every dot-product builtin exercised by this test. The CHECK lines
// above match the emitted intrinsic calls in this exact order, so the order of
// the statements below must not change.
kernel void builtins_amdgcn_dl_insts_err(
    global float *fOut, global int *siOut, global uint *uiOut,
    global short *sOut, global int *iOut, global half *hOut,
    half2 v2hA, half2 v2hB, float fC, half hC,
    short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC,
    ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC,
    int A, int B, int C) {
  // fdot2: trailing immediate flag false, then true.
  fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false);
  fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);

  // fdot2 variants with a 16-bit result (no trailing flag).
  hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC);
  sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC);

  // fdot2 with bf16 inputs and a float result: flag false, then true.
  fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
  fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true);

  // Four-element integer dot products.
  siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false);
  siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true);
  uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false);
  uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true);

  // sudot4: the immediates in front of A and B are the per-operand sign flags.
  iOut[0] = __builtin_amdgcn_sudot4(true, A, false, B, C, false);
  iOut[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true);

  // Eight-element integer dot products.
  siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false);
  siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true);
  uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false);
  uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true);

  // sudot8: same operand layout as sudot4.
  iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);
  iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true);
}
|
|
@ -1954,6 +1954,49 @@ def int_amdgcn_fdot2 :
|
|||
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
|
||||
>;
|
||||
|
||||
// Two-element f16 dot product with an f16 addend and an f16 result.
//   f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16
    : GCCBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
      Intrinsic<[llvm_half_ty],                               // %r
                [llvm_v2f16_ty, llvm_v2f16_ty, llvm_half_ty], // %a, %b, %c
                [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
|
||||
|
||||
// Two-element bf16 dot product with a bf16 addend and a bf16 result. The bf16
// values are declared with i16 / v2i16 types in the signature below.
//   bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_bf16_bf16
    : GCCBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
      Intrinsic<[llvm_i16_ty],                               // %r
                [llvm_v2i16_ty, llvm_v2i16_ty, llvm_i16_ty], // %a, %b, %c
                [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
|
||||
|
||||
// Two-element bf16 dot product with an f32 addend and an f32 result. The bf16
// vectors are declared as v2i16; %clamp (argument 3) must be an immediate.
//   f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16
    : GCCBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
      Intrinsic<[llvm_float_ty],                 // %r
                [llvm_v2i16_ty, llvm_v2i16_ty,   // %a, %b
                 llvm_float_ty,                  // %c
                 llvm_i1_ty],                    // %clamp
                [IntrNoMem, IntrSpeculatable, IntrWillReturn,
                 ImmArg<ArgIndex<3>>]>;
|
||||
|
||||
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
|
||||
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
|
||||
def int_amdgcn_sdot2 :
|
||||
|
@ -2014,6 +2057,27 @@ def int_amdgcn_udot4 :
|
|||
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
|
||||
>;
|
||||
|
||||
// Dot product of four packed 8-bit elements with per-operand signedness.
//   i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a,
//                               i1 %b_sign, v4i8 (as i32) %b,
//                               i32 %c, i1 %clamp)
// Each *_sign flag selects how that operand's elements are read:
// signed (_sign = 1) or unsigned (_sign = 0).
//   a[i in 0..3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
//   b[i in 0..3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] + %c
// %a_sign, %b_sign and %clamp (arguments 0, 2 and 5) must be immediates.
def int_amdgcn_sudot4
    : GCCBuiltin<"__builtin_amdgcn_sudot4">,
      Intrinsic<[llvm_i32_ty],             // %r
                [llvm_i1_ty, llvm_i32_ty,  // %a_sign, %a
                 llvm_i1_ty, llvm_i32_ty,  // %b_sign, %b
                 llvm_i32_ty,              // %c
                 llvm_i1_ty],              // %clamp
                [IntrNoMem, IntrSpeculatable, IntrWillReturn,
                 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
|
||||
|
||||
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
|
||||
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
|
||||
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
|
||||
|
@ -2046,6 +2110,28 @@ def int_amdgcn_udot8 :
|
|||
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
|
||||
>;
|
||||
|
||||
// Dot product of eight packed 4-bit elements with per-operand signedness.
//   i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a,
//                               i1 %b_sign, v8i4 (as i32) %b,
//                               i32 %c, i1 %clamp)
// Each *_sign flag selects how that operand's elements are read:
// signed (_sign = 1) or unsigned (_sign = 0).
//   a[i in 0..7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
//   b[i in 0..7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] +
//        a[4] * b[4] + a[5] * b[5] + a[6] * b[6] + a[7] * b[7] + %c
// %a_sign, %b_sign and %clamp (arguments 0, 2 and 5) must be immediates.
def int_amdgcn_sudot8
    : GCCBuiltin<"__builtin_amdgcn_sudot8">,
      Intrinsic<[llvm_i32_ty],             // %r
                [llvm_i1_ty, llvm_i32_ty,  // %a_sign, %a
                 llvm_i1_ty, llvm_i32_ty,  // %b_sign, %b
                 llvm_i32_ty,              // %c
                 llvm_i1_ty],              // %clamp
                [IntrNoMem, IntrSpeculatable, IntrWillReturn,
                 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// gfx908 intrinsics
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -51,6 +51,10 @@ def gi_vop3pmodsdot :
|
|||
GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
|
||||
GIComplexPatternEquiv<VOP3PModsDOT>;
|
||||
|
||||
// GlobalISel equivalent of the DotIUVOP3PMods complex pattern; the s32 operand
// is matched by the selector function named "selectDotIUVOP3PMods".
def gi_dotiuvop3pmods
    : GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,
      GIComplexPatternEquiv<DotIUVOP3PMods>;
|
||||
|
||||
def gi_vop3opselmods :
|
||||
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
|
||||
GIComplexPatternEquiv<VOP3OpSelMods>;
|
||||
|
|
|
@ -2741,6 +2741,21 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
|
|||
return SelectVOP3PMods(In, Src, SrcMods, true);
|
||||
}
|
||||
|
||||
// Converts the literal i1 sign flag carried by the intrinsic into the source
// modifiers for the operand that follows it.
//
// \param In  the i1 flag; it must be a ConstantSDNode (cast<> enforces this).
// \param Src receives the modifiers as an i32 target constant.
// \returns true unconditionally.
bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
  const ConstantSDNode *SignFlag = cast<ConstantSDNode>(In);
  assert(SignFlag->getAPIntValue().getBitWidth() == 1 && "expected i1 value");

  // A flag of 1 promotes the packed values to signed, 0 treats them as
  // unsigned.
  const bool IsSigned = SignFlag->getAPIntValue().getZExtValue() == 1;

  // OP_SEL_1 is always part of the modifiers; NEG is toggled on for signed.
  unsigned ModBits = SISrcMods::OP_SEL_1;
  if (IsSigned)
    ModBits ^= SISrcMods::NEG;

  Src = CurDAG->getTargetConstant(ModBits, SDLoc(In), MVT::i32);
  return true;
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
|
||||
SDValue &SrcMods) const {
|
||||
Src = In;
|
||||
|
|
|
@ -225,6 +225,8 @@ private:
|
|||
bool IsDOT = false) const;
|
||||
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
|
||||
bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;
|
||||
|
||||
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
|
||||
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
|
|
|
@ -3670,6 +3670,21 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
|
|||
}};
|
||||
}
|
||||
|
||||
// Converts the literal i1 sign flag carried by the intrinsic into the source
// modifiers for the operand that follows it.
//
// \param Root the flag, held in an Imm operand as an i1 sign-extended to
//             int64_t, so the only accepted values are 0 and -1.
// \returns a single renderer that adds the modifiers as an immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
         "expected i1 value");

  // 1 (seen here as -1) promotes the packed values to signed, 0 treats them
  // as unsigned.
  const bool IsSigned = Root.getImm() == -1;

  // OP_SEL_1 is always part of the modifiers; NEG is toggled on for signed.
  unsigned SrcMods = SISrcMods::OP_SEL_1;
  if (IsSigned)
    SrcMods ^= SISrcMods::NEG;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addImm(SrcMods); } // src_mods
  }};
}
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
|
||||
Register Src;
|
||||
|
|
|
@ -185,6 +185,9 @@ private:
|
|||
InstructionSelector::ComplexRendererFns
|
||||
selectVOP3PModsDOT(MachineOperand &Root) const;
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectDotIUVOP3PMods(MachineOperand &Root) const;
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectVOP3OpSelMods(MachineOperand &Root) const;
|
||||
|
||||
|
|
|
@ -4253,6 +4253,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
|||
case Intrinsic::amdgcn_udot4:
|
||||
case Intrinsic::amdgcn_sdot8:
|
||||
case Intrinsic::amdgcn_udot8:
|
||||
case Intrinsic::amdgcn_fdot2_bf16_bf16:
|
||||
case Intrinsic::amdgcn_fdot2_f16_f16:
|
||||
case Intrinsic::amdgcn_fdot2_f32_bf16:
|
||||
case Intrinsic::amdgcn_sudot4:
|
||||
case Intrinsic::amdgcn_sudot8:
|
||||
return getDefaultMappingVOP(MI);
|
||||
case Intrinsic::amdgcn_sbfe:
|
||||
case Intrinsic::amdgcn_ubfe:
|
||||
|
|
|
@ -724,8 +724,8 @@ let SubtargetPredicate = isGFX11Plus in {
|
|||
} // End SubtargetPredicate = isGFX11Plus
|
||||
|
||||
let SubtargetPredicate = HasDot8Insts in {
|
||||
defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>>;
|
||||
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>>;
|
||||
defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>;
|
||||
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -359,7 +359,7 @@ let SubtargetPredicate = HasDot8Insts in {
|
|||
|
||||
defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16",
|
||||
VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
|
||||
null_frag, 1>;
|
||||
int_amdgcn_fdot2_f32_bf16, 1>;
|
||||
|
||||
} // End SubtargetPredicate = HasDot8Insts
|
||||
|
||||
|
@ -381,8 +381,8 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
|
|||
}
|
||||
|
||||
let SubtargetPredicate = HasDot8Insts in {
|
||||
defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", null_frag>;
|
||||
defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", null_frag>;
|
||||
defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
|
||||
defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;
|
||||
} // End SubtargetPredicate = HasDot8Insts
|
||||
|
||||
def : UDot2Pat<V_DOT2_U32_U16>;
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
|
||||
|
||||
define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
|
||||
; GFX906-LABEL: v_fdot2:
|
||||
|
@ -10,12 +11,12 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
|
||||
ret float %r
|
||||
}
|
||||
|
@ -27,12 +28,12 @@ define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2_clamp:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2_clamp:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true)
|
||||
ret float %r
|
||||
}
|
||||
|
@ -44,12 +45,12 @@ define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2_neg_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2_neg_a:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
|
||||
ret float %r
|
||||
|
@ -62,12 +63,12 @@ define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2_neg_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2_neg_b:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.b = fneg <2 x half> %b
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
|
||||
ret float %r
|
||||
|
@ -80,12 +81,12 @@ define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2_neg_a_neg_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2_neg_a_neg_b:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %b
|
||||
%neg.b = fneg <2 x half> %b
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false)
|
||||
|
@ -100,13 +101,13 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2_neg_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2_neg_c:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.c = fneg float %c
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
|
||||
ret float %r
|
||||
|
@ -119,12 +120,12 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2_inline_literal_a:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2_inline_literal_a:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
|
||||
ret float %ret
|
||||
}
|
||||
|
@ -136,12 +137,12 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2_inline_literal_b:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2_inline_literal_b:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
|
||||
ret float %ret
|
||||
}
|
||||
|
@ -153,12 +154,12 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
|
|||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fdot2_inline_literal_c:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX10PLUS-LABEL: v_fdot2_inline_literal_c:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false)
|
||||
ret float %ret
|
||||
}
|
||||
|
|
|
@ -0,0 +1,102 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
|
||||
declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
|
||||
|
||||
define i32 @test_llvm_amdgcn_sudot4_uu(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x1c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_llvm_amdgcn_sudot4_us(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_us:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x5c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_llvm_amdgcn_sudot4_su(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_su:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x3c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_llvm_amdgcn_sudot4_ss(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x7c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
|
||||
define i32 @test_llvm_amdgcn_sudot4_uu_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x1c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_llvm_amdgcn_sudot4_us_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_us_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x5c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_llvm_amdgcn_sudot4_su_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_su_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x3c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @test_llvm_amdgcn_sudot4_ss_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x7c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
|
||||
declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=0, bsign=0, clamp=0.
; The checks expect a plain v_dot8_i32_iu4 (no neg_lo, no clamp).
define i32 @test_llvm_amdgcn_sudot8_uu(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x1c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=0, bsign=1, clamp=0.
; The checks expect v_dot8_i32_iu4 with neg_lo:[0,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_us(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_us:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x5c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=1, bsign=0, clamp=0.
; The checks expect v_dot8_i32_iu4 with neg_lo:[1,0,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_su(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_su:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x3c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=1, bsign=1, clamp=0.
; The checks expect v_dot8_i32_iu4 with neg_lo:[1,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_ss(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x7c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=0, bsign=0, clamp=1.
; The checks expect v_dot8_i32_iu4 with clamp and no neg_lo.
define i32 @test_llvm_amdgcn_sudot8_uu_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x1c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=0, bsign=1, clamp=1.
; The checks expect v_dot8_i32_iu4 with neg_lo:[0,1,0] and clamp.
define i32 @test_llvm_amdgcn_sudot8_us_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_us_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x5c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=1, bsign=0, clamp=1.
; The checks expect v_dot8_i32_iu4 with neg_lo:[1,0,0] and clamp.
define i32 @test_llvm_amdgcn_sudot8_su_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_su_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x3c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=1, bsign=1, clamp=1.
; The checks expect v_dot8_i32_iu4 with neg_lo:[1,1,0] and clamp.
define i32 @test_llvm_amdgcn_sudot8_ss_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x7c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
|
||||
declare i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a, <2 x i16> %b, i16 %c)
|
||||
|
||||
; Exercises llvm.amdgcn.fdot2.bf16.bf16: loads the <2 x i16> operands %a and %b
; and the i16 accumulator %c through their addrspace(1) pointers, calls the
; intrinsic, and stores the i16 result through %r. The checks expect
; v_dot2_bf16_bf16. The kernel is named after the bf16 intrinsic it tests so
; that its label is not confused with the fdot2.f16.f16 test.
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
    i16 addrspace(1)* %r,
    <2 x i16> addrspace(1)* %a,
    <2 x i16> addrspace(1)* %b,
    i16 addrspace(1)* %c) {
entry:
  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
  %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
  %c.val = load i16, i16 addrspace(1)* %c
  %r.val = call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a.val, <2 x i16> %b.val, i16 %c.val)
  store i16 %r.val, i16 addrspace(1)* %r
  ret void
}
|
|
@ -0,0 +1,31 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
|
||||
declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
|
||||
|
||||
; Exercises llvm.amdgcn.fdot2.f16.f16: loads the <2 x half> operands %a and %b
; and the half accumulator %c through their addrspace(1) pointers, calls the
; intrinsic, and stores the half result through %r. The checks expect
; v_dot2_f16_f16.
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
|
||||
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
|
||||
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
|
||||
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
half addrspace(1)* %r,
|
||||
<2 x half> addrspace(1)* %a,
|
||||
<2 x half> addrspace(1)* %b,
|
||||
half addrspace(1)* %c) {
|
||||
entry:
|
||||
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
|
||||
%b.val = load <2 x half>, <2 x half> addrspace(1)* %b
|
||||
%c.val = load half, half addrspace(1)* %c
|
||||
%r.val = call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a.val, <2 x half> %b.val, half %c.val)
|
||||
store half %r.val, half addrspace(1)* %r
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
|
||||
declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a, <2 x i16> %b, float %c, i1 %clamp)
|
||||
|
||||
; Exercises llvm.amdgcn.fdot2.f32.bf16 with the clamp operand set to 1: loads
; the <2 x i16> operands %a and %b and the float accumulator %c through their
; addrspace(1) pointers, calls the intrinsic, and stores the float result
; through %r. The checks expect v_dot2_f32_bf16 with the clamp modifier.
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
|
||||
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
|
||||
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
float addrspace(1)* %r,
|
||||
<2 x i16> addrspace(1)* %a,
|
||||
<2 x i16> addrspace(1)* %b,
|
||||
float addrspace(1)* %c) {
|
||||
entry:
|
||||
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
|
||||
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
|
||||
%c.val = load float, float addrspace(1)* %c
|
||||
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 1)
|
||||
store float %r.val, float addrspace(1)* %r
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Exercises llvm.amdgcn.fdot2.f32.bf16 with the clamp operand set to 0: loads
; the <2 x i16> operands %a and %b and the float accumulator %c through their
; addrspace(1) pointers, calls the intrinsic, and stores the float result
; through %r. The checks expect v_dot2_f32_bf16 without a clamp modifier.
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
|
||||
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
|
||||
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
float addrspace(1)* %r,
|
||||
<2 x i16> addrspace(1)* %a,
|
||||
<2 x i16> addrspace(1)* %b,
|
||||
float addrspace(1)* %c) {
|
||||
entry:
|
||||
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
|
||||
%b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
|
||||
%c.val = load float, float addrspace(1)* %c
|
||||
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 0)
|
||||
store float %r.val, float addrspace(1)* %r
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
|
||||
declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
|
||||
|
||||
; Exercises llvm.amdgcn.sudot4 with asign=0, bsign=0, clamp=0.
; The checks expect a plain v_dot4_i32_iu8 (no neg_lo, no clamp).
define i32 @test_llvm_amdgcn_sudot4_uu(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x1c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot4 with asign=0, bsign=1, clamp=0.
; The checks expect v_dot4_i32_iu8 with neg_lo:[0,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot4_us(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_us:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x5c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot4 with asign=1, bsign=0, clamp=0.
; The checks expect v_dot4_i32_iu8 with neg_lo:[1,0,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot4_su(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_su:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x3c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot4 with asign=1, bsign=1, clamp=0.
; The checks expect v_dot4_i32_iu8 with neg_lo:[1,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot4_ss(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x7c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
|
||||
; Exercises llvm.amdgcn.sudot4 with asign=0, bsign=0, clamp=1.
; The checks expect v_dot4_i32_iu8 with clamp and no neg_lo.
define i32 @test_llvm_amdgcn_sudot4_uu_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x1c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot4 with asign=0, bsign=1, clamp=1.
; The checks expect v_dot4_i32_iu8 with neg_lo:[0,1,0] and clamp.
define i32 @test_llvm_amdgcn_sudot4_us_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_us_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x5c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot4 with asign=1, bsign=0, clamp=1.
; The checks expect v_dot4_i32_iu8 with neg_lo:[1,0,0] and clamp.
define i32 @test_llvm_amdgcn_sudot4_su_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_su_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x3c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot4 with asign=1, bsign=1, clamp=1.
; The checks expect v_dot4_i32_iu8 with neg_lo:[1,1,0] and clamp.
define i32 @test_llvm_amdgcn_sudot4_ss_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x7c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
|
||||
|
||||
declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp)
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=0, bsign=0, clamp=0.
; The checks expect a plain v_dot8_i32_iu4 (no neg_lo, no clamp).
define i32 @test_llvm_amdgcn_sudot8_uu(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x1c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=0, bsign=1, clamp=0.
; The checks expect v_dot8_i32_iu4 with neg_lo:[0,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_us(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_us:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x5c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=1, bsign=0, clamp=0.
; The checks expect v_dot8_i32_iu4 with neg_lo:[1,0,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_su(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_su:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x3c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=1, bsign=1, clamp=0.
; The checks expect v_dot8_i32_iu4 with neg_lo:[1,1,0] and no clamp.
define i32 @test_llvm_amdgcn_sudot8_ss(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x7c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=0, bsign=0, clamp=1.
; The checks expect v_dot8_i32_iu4 with clamp and no neg_lo.
define i32 @test_llvm_amdgcn_sudot8_uu_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x1c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=0, bsign=1, clamp=1.
; The checks expect v_dot8_i32_iu4 with neg_lo:[0,1,0] and clamp.
define i32 @test_llvm_amdgcn_sudot8_us_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_us_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x5c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=1, bsign=0, clamp=1.
; The checks expect v_dot8_i32_iu4 with neg_lo:[1,0,0] and clamp.
define i32 @test_llvm_amdgcn_sudot8_su_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_su_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x3c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Exercises llvm.amdgcn.sudot8 with asign=1, bsign=1, clamp=1.
; The checks expect v_dot8_i32_iu4 with neg_lo:[1,1,0] and clamp.
define i32 @test_llvm_amdgcn_sudot8_ss_clamp(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss_clamp:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
|
||||
; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x7c]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
|
||||
entry:
|
||||
%ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
|
||||
ret i32 %ret
|
||||
}
|
Loading…
Reference in New Issue