diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 57bf1b477d10..ce2b1decdf6c 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -103,6 +103,7 @@ TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr") // AMX internal builtin TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index 6dc0c1f031c4..ec601a58e7c3 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -239,6 +239,14 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, (__SIZE_TYPE__)(stride)); } +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddt164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} + /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, @@ -311,6 +319,27 @@ static void __tile_loadd(__tile1024i *dst, const void *base, dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); } +/// Load tile rows from memory specified by "base" address and "stride" into +/// destination tile "dst". 
This intrinsic provides a hint to the implementation +/// that the data will likely not be reused in the near future and the data +/// caching can be optimized accordingly. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADDT1 instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TILE +static void __tile_stream_loadd(__tile1024i *dst, const void *base, + __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); +} + /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c index 3bfe887c0445..fda6d6e8ee4f 100644 --- a/clang/test/CodeGen/X86/amx_api.c +++ b/clang/test/CodeGen/X86/amx_api.c @@ -39,6 +39,14 @@ void test_tile_loadd(short row, short col) { __tile_loadd(&a, buf, STRIDE); } +void test_tile_stream_loadd(short row, short col) { + //CHECK-LABEL: @test_tile_stream_loadd + //CHECK: call x86_amx @llvm.x86.tileloaddt164.internal + //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32> + __tile1024i a = {row, col}; + __tile_stream_loadd(&a, buf, STRIDE); +} + void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbssd //CHECK: call x86_amx @llvm.x86.tdpbssd.internal diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index aa38fd3ca803..5848356b5b1a 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5050,6 +5050,11 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], []>; 
+ def int_x86_tileloaddt164_internal : + GCCBuiltin<"__builtin_ia32_tileloaddt164_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; def int_x86_tdpbssd_internal : GCCBuiltin<"__builtin_ia32_tdpbssd_internal">, Intrinsic<[llvm_x86amx_ty], diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 88e34911a7df..4add8d30e010 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -554,10 +554,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(X86::LDTILECFG)); return true; } - case X86::PTILELOADDV: { + case X86::PTILELOADDV: + case X86::PTILELOADDT1V: { for (unsigned i = 2; i > 0; --i) MI.RemoveOperand(i); - MI.setDesc(TII->get(X86::TILELOADD)); + unsigned Opc = + Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1; + MI.setDesc(TII->get(Opc)); return true; } case X86::PTDPBSSDV: diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index baf45917d320..7031bd40215d 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -122,7 +122,8 @@ static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) { } bool X86FastTileConfig::isTileLoad(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILELOADDV; + return MI.getOpcode() == X86::PTILELOADDV || + MI.getOpcode() == X86::PTILELOADDT1V; } bool X86FastTileConfig::isTileStore(MachineInstr &MI) { return MI.getOpcode() == X86::PTILESTOREDV; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d692fa0fa492..0a5bae3cf3c4 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4617,10 +4617,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Res); return; } - case Intrinsic::x86_tileloadd64_internal: { + case 
Intrinsic::x86_tileloadd64_internal: + case Intrinsic::x86_tileloaddt164_internal: { if (!Subtarget->hasAMXTILE()) break; - unsigned Opc = X86::PTILELOADDV; + unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal + ? X86::PTILELOADDV + : X86::PTILELOADDT1V; // _tile_loadd_internal(row, col, buf, STRIDE) SDValue Base = Node->getOperand(4); SDValue Scale = getI8Imm(1, dl); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index b83856cae723..d825981a6b36 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -53,6 +53,9 @@ let Predicates = [HasAMXTILE, In64BitMode] in { def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4), []>; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index f0c35fa37a46..4ba44ccb6c16 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -121,6 +121,7 @@ std::pair X86LowerAMXType::getShape(IntrinsicInst *II, default: llvm_unreachable("Expect amx intrinsics"); case Intrinsic::x86_tileloadd64_internal: + case Intrinsic::x86_tileloaddt164_internal: case Intrinsic::x86_tilestored64_internal: { Row = II->getArgOperand(0); Col = II->getArgOperand(1); diff --git a/llvm/lib/Target/X86/X86PreAMXConfig.cpp b/llvm/lib/Target/X86/X86PreAMXConfig.cpp index fad5c73bc92d..d9c6d08ada73 100644 --- a/llvm/lib/Target/X86/X86PreAMXConfig.cpp +++ b/llvm/lib/Target/X86/X86PreAMXConfig.cpp @@ -65,7 +65,8 @@ static bool isAMXIntrinsic(IntrinsicInst *II) { } static bool isTileLoad(IntrinsicInst *II) { - return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal; + return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal || + II->getIntrinsicID() 
== Intrinsic::x86_tileloaddt164_internal; } static bool isTileStore(IntrinsicInst *II) { diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 619ae0e4e0a4..c4748423baea 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -892,6 +892,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, } // We only collect the tile shape that is defined. case X86::PTILELOADDV: + case X86::PTILELOADDT1V: case X86::PTDPBSSDV: case X86::PTDPBSUDV: case X86::PTDPBUSDV: diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll index 095eb8e6ea8d..a9dd9191245d 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -23,6 +23,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-NEXT: tdpbusd %tmm2, %tmm1, %tmm0 ; CHECK-NEXT: tdpbuud %tmm2, %tmm1, %tmm0 ; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: tileloaddt1 (%rsi,%rdx), %tmm1 ; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper @@ -35,6 +36,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b) + %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride) call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4) ret void @@ -42,6 +44,7 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { declare x86_amx @llvm.x86.tilezero.internal(i16, i16) declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx 
@llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64) declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)