forked from OSchip/llvm-project
[X86] Support __tile_stream_loadd intrinsic for new AMX interface
Adding support for __tile_stream_loadd intrinsic. Reviewed By: LuoYuanke Differential Revision: https://reviews.llvm.org/D103784
This commit is contained in:
parent
f0a68bbc96
commit
56d5c46b49
|
@ -103,6 +103,7 @@ TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")
|
||||||
// AMX internal builtin
|
// AMX internal builtin
|
||||||
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
|
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
|
||||||
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
|
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
|
||||||
|
TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
|
||||||
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
|
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
|
||||||
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
|
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
|
||||||
TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
|
TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
|
||||||
|
|
|
@ -239,6 +239,14 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
|
||||||
(__SIZE_TYPE__)(stride));
|
(__SIZE_TYPE__)(stride));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
return __builtin_ia32_tileloaddt164_internal(m, n, base,
|
||||||
|
(__SIZE_TYPE__)(stride));
|
||||||
|
}
|
||||||
|
|
||||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
|
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
@ -311,6 +319,27 @@ static void __tile_loadd(__tile1024i *dst, const void *base,
|
||||||
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
|
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Load tile rows from memory specifieid by "base" address and "stride" into
|
||||||
|
/// destination tile "dst". This intrinsic provides a hint to the implementation
|
||||||
|
/// that the data will likely not be reused in the near future and the data
|
||||||
|
/// caching can be optimized accordingly.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// A destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param base
|
||||||
|
/// A pointer to base address.
|
||||||
|
/// \param stride
|
||||||
|
/// The stride between the rows' data to be loaded in memory.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static void __tile_stream_loadd(__tile1024i *dst, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
|
||||||
|
}
|
||||||
|
|
||||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||||
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||||
|
|
|
@ -39,6 +39,14 @@ void test_tile_loadd(short row, short col) {
|
||||||
__tile_loadd(&a, buf, STRIDE);
|
__tile_loadd(&a, buf, STRIDE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void test_tile_stream_loadd(short row, short col) {
|
||||||
|
//CHECK-LABEL: @test_tile_stream_loadd
|
||||||
|
//CHECK: call x86_amx @llvm.x86.tileloaddt164.internal
|
||||||
|
//CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
|
||||||
|
__tile1024i a = {row, col};
|
||||||
|
__tile_stream_loadd(&a, buf, STRIDE);
|
||||||
|
}
|
||||||
|
|
||||||
void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) {
|
void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) {
|
||||||
//CHECK-LABEL: @test_tile_dpbssd
|
//CHECK-LABEL: @test_tile_dpbssd
|
||||||
//CHECK: call x86_amx @llvm.x86.tdpbssd.internal
|
//CHECK: call x86_amx @llvm.x86.tdpbssd.internal
|
||||||
|
|
|
@ -5050,6 +5050,11 @@ let TargetPrefix = "x86" in {
|
||||||
Intrinsic<[llvm_x86amx_ty],
|
Intrinsic<[llvm_x86amx_ty],
|
||||||
[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
|
[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
|
||||||
[]>;
|
[]>;
|
||||||
|
def int_x86_tileloaddt164_internal :
|
||||||
|
GCCBuiltin<"__builtin_ia32_tileloaddt164_internal">,
|
||||||
|
Intrinsic<[llvm_x86amx_ty],
|
||||||
|
[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
|
||||||
|
[]>;
|
||||||
def int_x86_tdpbssd_internal :
|
def int_x86_tdpbssd_internal :
|
||||||
GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
|
GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
|
||||||
Intrinsic<[llvm_x86amx_ty],
|
Intrinsic<[llvm_x86amx_ty],
|
||||||
|
|
|
@ -554,10 +554,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
|
||||||
MI.setDesc(TII->get(X86::LDTILECFG));
|
MI.setDesc(TII->get(X86::LDTILECFG));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
case X86::PTILELOADDV: {
|
case X86::PTILELOADDV:
|
||||||
|
case X86::PTILELOADDT1V: {
|
||||||
for (unsigned i = 2; i > 0; --i)
|
for (unsigned i = 2; i > 0; --i)
|
||||||
MI.RemoveOperand(i);
|
MI.RemoveOperand(i);
|
||||||
MI.setDesc(TII->get(X86::TILELOADD));
|
unsigned Opc =
|
||||||
|
Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1;
|
||||||
|
MI.setDesc(TII->get(Opc));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
case X86::PTDPBSSDV:
|
case X86::PTDPBSSDV:
|
||||||
|
|
|
@ -122,7 +122,8 @@ static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
|
bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
|
||||||
return MI.getOpcode() == X86::PTILELOADDV;
|
return MI.getOpcode() == X86::PTILELOADDV ||
|
||||||
|
MI.getOpcode() == X86::PTILELOADDT1V;
|
||||||
}
|
}
|
||||||
bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
|
bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
|
||||||
return MI.getOpcode() == X86::PTILESTOREDV;
|
return MI.getOpcode() == X86::PTILESTOREDV;
|
||||||
|
|
|
@ -4617,10 +4617,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||||
ReplaceNode(Node, Res);
|
ReplaceNode(Node, Res);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
case Intrinsic::x86_tileloadd64_internal: {
|
case Intrinsic::x86_tileloadd64_internal:
|
||||||
|
case Intrinsic::x86_tileloaddt164_internal: {
|
||||||
if (!Subtarget->hasAMXTILE())
|
if (!Subtarget->hasAMXTILE())
|
||||||
break;
|
break;
|
||||||
unsigned Opc = X86::PTILELOADDV;
|
unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
|
||||||
|
? X86::PTILELOADDV
|
||||||
|
: X86::PTILELOADDT1V;
|
||||||
// _tile_loadd_internal(row, col, buf, STRIDE)
|
// _tile_loadd_internal(row, col, buf, STRIDE)
|
||||||
SDValue Base = Node->getOperand(4);
|
SDValue Base = Node->getOperand(4);
|
||||||
SDValue Scale = getI8Imm(1, dl);
|
SDValue Scale = getI8Imm(1, dl);
|
||||||
|
|
|
@ -53,6 +53,9 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
|
||||||
def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
|
def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
|
||||||
GR16:$src2,
|
GR16:$src2,
|
||||||
opaquemem:$src3), []>;
|
opaquemem:$src3), []>;
|
||||||
|
def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
|
||||||
|
GR16:$src2,
|
||||||
|
opaquemem:$src3), []>;
|
||||||
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
|
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
|
||||||
GR16:$src2, opaquemem:$src3,
|
GR16:$src2, opaquemem:$src3,
|
||||||
TILE:$src4), []>;
|
TILE:$src4), []>;
|
||||||
|
|
|
@ -121,6 +121,7 @@ std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II,
|
||||||
default:
|
default:
|
||||||
llvm_unreachable("Expect amx intrinsics");
|
llvm_unreachable("Expect amx intrinsics");
|
||||||
case Intrinsic::x86_tileloadd64_internal:
|
case Intrinsic::x86_tileloadd64_internal:
|
||||||
|
case Intrinsic::x86_tileloaddt164_internal:
|
||||||
case Intrinsic::x86_tilestored64_internal: {
|
case Intrinsic::x86_tilestored64_internal: {
|
||||||
Row = II->getArgOperand(0);
|
Row = II->getArgOperand(0);
|
||||||
Col = II->getArgOperand(1);
|
Col = II->getArgOperand(1);
|
||||||
|
|
|
@ -65,7 +65,8 @@ static bool isAMXIntrinsic(IntrinsicInst *II) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool isTileLoad(IntrinsicInst *II) {
|
static bool isTileLoad(IntrinsicInst *II) {
|
||||||
return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal;
|
return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal ||
|
||||||
|
II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool isTileStore(IntrinsicInst *II) {
|
static bool isTileStore(IntrinsicInst *II) {
|
||||||
|
|
|
@ -892,6 +892,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
|
||||||
}
|
}
|
||||||
// We only collect the tile shape that is defined.
|
// We only collect the tile shape that is defined.
|
||||||
case X86::PTILELOADDV:
|
case X86::PTILELOADDV:
|
||||||
|
case X86::PTILELOADDT1V:
|
||||||
case X86::PTDPBSSDV:
|
case X86::PTDPBSSDV:
|
||||||
case X86::PTDPBSUDV:
|
case X86::PTDPBSUDV:
|
||||||
case X86::PTDPBUSDV:
|
case X86::PTDPBUSDV:
|
||||||
|
|
|
@ -23,6 +23,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
|
||||||
; CHECK-NEXT: tdpbusd %tmm2, %tmm1, %tmm0
|
; CHECK-NEXT: tdpbusd %tmm2, %tmm1, %tmm0
|
||||||
; CHECK-NEXT: tdpbuud %tmm2, %tmm1, %tmm0
|
; CHECK-NEXT: tdpbuud %tmm2, %tmm1, %tmm0
|
||||||
; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0
|
; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0
|
||||||
|
; CHECK-NEXT: tileloaddt1 (%rsi,%rdx), %tmm1
|
||||||
; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
|
; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
|
||||||
; CHECK-NEXT: tilerelease
|
; CHECK-NEXT: tilerelease
|
||||||
; CHECK-NEXT: vzeroupper
|
; CHECK-NEXT: vzeroupper
|
||||||
|
@ -35,6 +36,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
|
||||||
%d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
|
%d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
|
||||||
%d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
|
%d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
|
||||||
%d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
|
%d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
|
||||||
|
%e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride)
|
||||||
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
|
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
|
||||||
|
|
||||||
ret void
|
ret void
|
||||||
|
@ -42,6 +44,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
|
||||||
|
|
||||||
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
|
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
|
||||||
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
||||||
|
declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
|
||||||
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||||
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||||
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||||
|
|
Loading…
Reference in New Issue