forked from OSchip/llvm-project
[X86] Fix tile config register spill issue.
This is an optimized approach for D94155. The previous code modeled each AMX instruction as a user of the tile config register. That model has a problem when the tile config register is spilled: across a function call, an ldtilecfg instruction may be inserted for each AMX instruction that uses the tile config register, which clobbers all tile data registers. To fix this issue, we remove the tile config register model. Instead, we analyze the AMX instructions between one call and the next, and insert ldtilecfg after the first call if we find any AMX instructions. Reviewed By: LuoYuanke Differential Revision: https://reviews.llvm.org/D95136
This commit is contained in:
parent
c32f399802
commit
a5d9e0c79b
|
@ -461,25 +461,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
|
|||
case TargetOpcode::ICALL_BRANCH_FUNNEL:
|
||||
ExpandICallBranchFunnel(&MBB, MBBI);
|
||||
return true;
|
||||
case X86::PLDTILECFG: {
|
||||
MI.RemoveOperand(0);
|
||||
MI.setDesc(TII->get(X86::LDTILECFG));
|
||||
return true;
|
||||
}
|
||||
case X86::PSTTILECFG: {
|
||||
MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
|
||||
MI.setDesc(TII->get(X86::STTILECFG));
|
||||
return true;
|
||||
}
|
||||
case X86::PTILELOADDV: {
|
||||
MI.RemoveOperand(8); // Remove $tmmcfg
|
||||
for (unsigned i = 2; i > 0; --i)
|
||||
MI.RemoveOperand(i);
|
||||
MI.setDesc(TII->get(X86::TILELOADD));
|
||||
return true;
|
||||
}
|
||||
case X86::PTDPBSSDV: {
|
||||
MI.RemoveOperand(7); // Remove $tmmcfg
|
||||
MI.untieRegOperand(4);
|
||||
for (unsigned i = 3; i > 0; --i)
|
||||
MI.RemoveOperand(i);
|
||||
|
@ -488,14 +476,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
|
|||
return true;
|
||||
}
|
||||
case X86::PTILESTOREDV: {
|
||||
MI.RemoveOperand(8); // Remove $tmmcfg
|
||||
for (int i = 1; i >= 0; --i)
|
||||
MI.RemoveOperand(i);
|
||||
MI.setDesc(TII->get(X86::TILESTORED));
|
||||
return true;
|
||||
}
|
||||
case X86::PTILEZEROV: {
|
||||
for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
|
||||
for (int i = 2; i > 0; --i) // Remove row, col
|
||||
MI.RemoveOperand(i);
|
||||
MI.setDesc(TII->get(X86::TILEZERO));
|
||||
return true;
|
||||
|
|
|
@ -2094,8 +2094,12 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
|
||||
// Emit tilerelease for AMX kernel.
|
||||
const MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
if (!MRI.reg_nodbg_empty(X86::TMMCFG))
|
||||
BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
|
||||
const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
|
||||
for (unsigned I = 0; I < RC->getNumRegs(); I++)
|
||||
if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) {
|
||||
BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
|
||||
|
|
|
@ -4607,7 +4607,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
SDValue Index = Node->getOperand(5);
|
||||
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
|
||||
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
|
||||
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
|
||||
SDValue Chain = Node->getOperand(0);
|
||||
MachineSDNode *CNode;
|
||||
SDValue Ops[] = {Node->getOperand(2),
|
||||
|
@ -4617,7 +4616,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
Index,
|
||||
Disp,
|
||||
Segment,
|
||||
CFG,
|
||||
Chain};
|
||||
CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
||||
ReplaceNode(Node, CNode);
|
||||
|
@ -4628,14 +4626,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
break;
|
||||
SDValue Chain = Node->getOperand(0);
|
||||
unsigned Opc = X86::PTDPBSSDV;
|
||||
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
|
||||
SDValue Ops[] = {Node->getOperand(2),
|
||||
Node->getOperand(3),
|
||||
Node->getOperand(4),
|
||||
Node->getOperand(5),
|
||||
Node->getOperand(6),
|
||||
Node->getOperand(7),
|
||||
CFG,
|
||||
Chain};
|
||||
MachineSDNode *CNode =
|
||||
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
||||
|
@ -4647,8 +4643,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
break;
|
||||
unsigned Opc = X86::PTILEZEROV;
|
||||
SDValue Chain = Node->getOperand(0);
|
||||
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
|
||||
SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
|
||||
SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain};
|
||||
MachineSDNode *CNode =
|
||||
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
||||
ReplaceNode(Node, CNode);
|
||||
|
@ -4719,7 +4714,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
SDValue Index = Node->getOperand(5);
|
||||
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
|
||||
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
|
||||
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
|
||||
SDValue Chain = Node->getOperand(0);
|
||||
MachineSDNode *CNode;
|
||||
SDValue Ops[] = {Node->getOperand(2),
|
||||
|
@ -4730,7 +4724,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
|||
Disp,
|
||||
Segment,
|
||||
Node->getOperand(6),
|
||||
CFG,
|
||||
Chain};
|
||||
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
|
||||
ReplaceNode(Node, CNode);
|
||||
|
|
|
@ -48,23 +48,14 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
|
|||
VEX, T8XD;
|
||||
|
||||
// Pseudo instruction for RA.
|
||||
let hasSideEffects = 1, mayLoad = 1,
|
||||
Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
|
||||
def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
|
||||
|
||||
let hasSideEffects = 1, mayStore = 1 in
|
||||
def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
|
||||
|
||||
def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
||||
GR16:$src2,
|
||||
opaquemem:$src3,
|
||||
TILECFG:$cfg), []>;
|
||||
opaquemem:$src3), []>;
|
||||
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
|
||||
GR16:$src2, opaquemem:$src3,
|
||||
TILE:$src4, TILECFG:$cfg), []>;
|
||||
TILE:$src4), []>;
|
||||
def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
||||
GR16:$src2,
|
||||
TILECFG:$cfg), []>;
|
||||
GR16:$src2), []>;
|
||||
|
||||
let usesCustomInserter = 1 in {
|
||||
// Pseudo instructions, using immediates instead of tile registers.
|
||||
|
@ -104,7 +95,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
|
|||
let Constraints = "$src4 = $dst" in
|
||||
def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
||||
GR16:$src2, GR16:$src3, TILE:$src4,
|
||||
TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
|
||||
TILE:$src5, TILE:$src6), []>;
|
||||
|
||||
let usesCustomInserter = 1 in {
|
||||
// Pseudo instructions, using immediates instead of tile registers.
|
||||
|
|
|
@ -3808,10 +3808,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
|
|||
MachineOperand &MO = NewMI->getOperand(2);
|
||||
MO.setReg(VirtReg);
|
||||
MO.setIsKill(true);
|
||||
} else if (RC->getID() == X86::TILECFGRegClassID) {
|
||||
unsigned Opc = X86::PSTTILECFG;
|
||||
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
|
||||
.addReg(SrcReg, getKillRegState(isKill));
|
||||
} else {
|
||||
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
|
||||
bool isAligned =
|
||||
|
@ -3840,10 +3836,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
|
|||
MachineOperand &MO = NewMI->getOperand(3);
|
||||
MO.setReg(VirtReg);
|
||||
MO.setIsKill(true);
|
||||
} else if (RC->getID() == X86::TILECFGRegClassID) {
|
||||
unsigned Opc = X86::PLDTILECFG;
|
||||
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
|
||||
FrameIdx);
|
||||
} else {
|
||||
const MachineFunction &MF = *MBB.getParent();
|
||||
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
|
||||
|
@ -6789,7 +6781,7 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
|
|||
// ENDBR instructions should not be scheduled around.
|
||||
unsigned Opcode = MI.getOpcode();
|
||||
if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
|
||||
Opcode == X86::PLDTILECFG)
|
||||
Opcode == X86::LDTILECFG)
|
||||
return true;
|
||||
|
||||
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
|
||||
|
|
|
@ -98,10 +98,9 @@ void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
|
|||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
|
||||
static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
|
||||
const TargetInstrInfo *TII,
|
||||
MachineRegisterInfo *MRI,
|
||||
const X86Subtarget *ST) {
|
||||
static void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
|
||||
const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
|
||||
const X86Subtarget *ST) {
|
||||
auto *MBB = MI->getParent();
|
||||
|
||||
// FIXME: AMX should assume AVX512 enabled.
|
||||
|
@ -117,12 +116,8 @@ static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
|
|||
}
|
||||
|
||||
// build pseudo ldtilecfg
|
||||
Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
|
||||
|
||||
addFrameReference(
|
||||
BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
|
||||
|
||||
return VReg;
|
||||
addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
|
||||
FrameIdx);
|
||||
}
|
||||
|
||||
static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
|
||||
|
@ -219,26 +214,98 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() {
|
|||
return &*MII;
|
||||
}
|
||||
|
||||
static void addTileCFGUse(MachineFunction &MF, Register CFG) {
|
||||
for (MachineBasicBlock &MBB : MF) {
|
||||
/// Returns true when the instruction at \p MII has one of the AMX pseudo
/// opcodes recognized by this pass: PTILELOADDV, PTILESTOREDV, PTDPBSSDV or
/// PTILEZEROV. Any other opcode yields false.
static bool isAMXInstruction(MachineBasicBlock::iterator MII) {
  const unsigned Opc = MII->getOpcode();
  return Opc == X86::PTILELOADDV || Opc == X86::PTILESTOREDV ||
         Opc == X86::PTDPBSSDV || Opc == X86::PTILEZEROV;
}
|
||||
|
||||
// Traverse the basic block.
|
||||
for (MachineInstr &MI : MBB) {
|
||||
unsigned Opcode = MI.getOpcode();
|
||||
switch (Opcode) {
|
||||
default:
|
||||
break;
|
||||
case X86::PTILELOADDV:
|
||||
case X86::PTILESTOREDV:
|
||||
case X86::PTDPBSSDV:
|
||||
case X86::PTILEZEROV:
|
||||
unsigned NumOperands = MI.getNumOperands();
|
||||
MI.RemoveOperand(NumOperands - 1);
|
||||
MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
|
||||
break;
|
||||
struct BBInfo {
|
||||
bool HasAMX = false;
|
||||
bool HasCallBeforeAMX = false;
|
||||
bool HasAMXBeforeCallInSuccs = false;
|
||||
MachineInstr *LastCall = nullptr;
|
||||
|
||||
BBInfo() = default;
|
||||
BBInfo(SmallSet<MachineInstr *, 8> &CfgNeedInsert, MachineBasicBlock *MBB,
|
||||
MachineInstr *MI = nullptr) {
|
||||
MachineBasicBlock::iterator MII = MI ? MI->getIterator() : MBB->begin();
|
||||
for (auto E = MBB->end(); MII != E; ++MII) {
|
||||
if (isAMXInstruction(MII)) {
|
||||
HasAMX = true;
|
||||
if (LastCall)
|
||||
CfgNeedInsert.insert(LastCall);
|
||||
} else if (MII->isCall()) {
|
||||
LastCall = &*MII;
|
||||
if (!HasAMX)
|
||||
HasCallBeforeAMX = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// Inserts an LDTILECFG after every call that is followed by an AMX
/// instruction, either later in the same block or in a block reachable
/// through successors without an intervening call.
///
/// \param MI  Instruction the scan starts from; only the part of its parent
///            block from \p MI onwards is examined, plus every block
///            reachable from it through successor edges.
/// \param FI  Frame index used as the memory operand of each inserted
///            LDTILECFG.
/// \param TII Supplies the LDTILECFG instruction description.
/// \param TRI Supplies the register count and the TILE register class.
static void reloadTileConfig(MachineInstr *MI, int FI,
                             const TargetInstrInfo *TII,
                             const TargetRegisterInfo *TRI) {
  SmallSet<MachineInstr *, 8> CfgNeedInsert;
  SmallVector<MachineBasicBlock *, 8> WorkList;
  DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;

  // Step 1: build a BBInfo for MI's block (from MI onwards) and for every
  // block reachable from it. Constructing a BBInfo already records the calls
  // that are followed by an AMX instruction within the same block.
  MachineBasicBlock *MBB = MI->getParent();
  BBVisitedInfo[MBB] = BBInfo(CfgNeedInsert, MBB, MI);

  WorkList.push_back(MBB);
  while (!WorkList.empty()) {
    MBB = WorkList.pop_back_val();
    for (MachineBasicBlock *Succ : MBB->successors()) {
      if (BBVisitedInfo.count(Succ))
        continue;
      BBVisitedInfo[Succ] = BBInfo(CfgNeedInsert, Succ);
      WorkList.push_back(Succ);
    }
  }

  // Step 2: propagate "an AMX instruction is reached before any call"
  // backwards through predecessors. No entry is added to BBVisitedInfo from
  // here on, so iterating it while updating values through find() is safe.
  WorkList.clear();
  for (const auto &Entry : BBVisitedInfo) {
    WorkList.push_back(Entry.first);
    while (!WorkList.empty()) {
      MBB = WorkList.pop_back_val();
      // Every block on the work list is a key of BBVisitedInfo.
      const BBInfo &Info = BBVisitedInfo.find(MBB)->second;
      // Nothing to propagate when a call shields the predecessors, or when no
      // AMX instruction is reachable through this block.
      if (Info.HasCallBeforeAMX ||
          (!Info.HasAMX && !Info.HasAMXBeforeCallInSuccs))
        continue;

      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        auto PredIt = BBVisitedInfo.find(Pred);
        if (PredIt == BBVisitedInfo.end() ||
            PredIt->second.HasAMXBeforeCallInSuccs)
          continue;
        BBInfo &PredInfo = PredIt->second;
        // The predecessor's last call is followed by an AMX instruction in a
        // successor, so it needs a reload as well.
        if (PredInfo.LastCall)
          CfgNeedInsert.insert(PredInfo.LastCall);
        PredInfo.HasAMXBeforeCallInSuccs = true;
        WorkList.push_back(Pred);
      }
    }
  }

  // Step 3: emit the reloads. The register class lookup does not depend on
  // the call, so it is done once.
  const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
  for (MachineInstr *Call : CfgNeedInsert) {
    // Start with every tile register set, then clear the bits named by the
    // call's register mask operands.
    // NOTE(review): the bits left set are presumably the tile registers the
    // call does not preserve — confirm against the regmask convention.
    BitVector UsableRegs(TRI->getNumRegs());
    for (unsigned J = 0; J < RC->getNumRegs(); J++)
      UsableRegs.set(X86::TMM0 + J);
    for (MachineOperand &CallMO : Call->operands()) {
      if (CallMO.isRegMask())
        UsableRegs.clearBitsInMask(CallMO.getRegMask());
    }
    // Insert the reload right after the call only if some tile register bit
    // is still set.
    if (!UsableRegs.none())
      addFrameReference(BuildMI(*Call->getParent(), ++Call->getIterator(),
                                DebugLoc(), TII->get(X86::LDTILECFG)),
                        FI);
  }
}
|
||||
|
||||
bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
|
||||
|
@ -255,8 +322,8 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
|
|||
unsigned Size = ST->getTileConfigSize();
|
||||
Align Alignment = ST->getTileConfigAlignment();
|
||||
int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
|
||||
Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
|
||||
addTileCFGUse(mf, CFG);
|
||||
buildConfigMI(MI, SS, TII, MRI, ST);
|
||||
reloadTileConfig(MI, SS, TII, TRI);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -639,8 +639,3 @@ def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
|
|||
let CopyCost = -1 in // Don't allow copying of tile registers
|
||||
def TILE : RegisterClass<"X86", [x86amx], 8192,
|
||||
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
|
||||
def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
|
||||
let CopyCost = -1; // Don't allow copying of tile config registers.
|
||||
let isAllocatable = 1;
|
||||
let Size = 512;
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "X86MachineFunctionInfo.h"
|
||||
#include "X86RegisterInfo.h"
|
||||
#include "X86Subtarget.h"
|
||||
#include "llvm/ADT/PostOrderIterator.h"
|
||||
#include "llvm/CodeGen/LiveIntervals.h"
|
||||
#include "llvm/CodeGen/MachineDominators.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
|
@ -130,13 +131,14 @@ static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
|
|||
}
|
||||
|
||||
MachineInstr *X86TileConfig::getTileConfigPoint() {
|
||||
for (MachineBasicBlock &MBB : *MF) {
|
||||
|
||||
// Traverse the basic block.
|
||||
for (MachineInstr &MI : MBB)
|
||||
MachineBasicBlock *Entry = &*MF->begin();
|
||||
ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry);
|
||||
for (MachineBasicBlock *MBB : RPOT) {
|
||||
for (MachineInstr &MI : *MBB)
|
||||
// Refer X86PreTileConfig.cpp.
|
||||
// We only support one tile config for now.
|
||||
if (MI.getOpcode() == X86::PLDTILECFG)
|
||||
// We only support one tile config for now. The other ldtilecfg
|
||||
// is for spill purpose and is dominated by the first ldtilecfg.
|
||||
if (MI.getOpcode() == X86::LDTILECFG)
|
||||
return &MI;
|
||||
}
|
||||
|
||||
|
@ -148,7 +150,7 @@ void X86TileConfig::tileConfig() {
|
|||
if (!MI)
|
||||
return;
|
||||
MachineBasicBlock *MBB = MI->getParent();
|
||||
int SS = MI->getOperand(1).getIndex();
|
||||
int SS = MI->getOperand(0).getIndex();
|
||||
BitVector PhysRegs(TRI->getNumRegs());
|
||||
|
||||
// Fill in the palette first.
|
||||
|
|
|
@ -1,10 +1,21 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
|
||||
|
||||
%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
|
||||
|
||||
@buf = dso_local global [3072 x i8] zeroinitializer, align 64
|
||||
|
||||
define internal void @foo() {
|
||||
; CHECK-LABEL: foo:
|
||||
; CHECK: # %bb.0: # %entry
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; IPRA-LABEL: foo:
|
||||
; IPRA: # %bb.0: # %entry
|
||||
; IPRA-NEXT: retq
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
||||
; CHECK-LABEL: test_api:
|
||||
; CHECK: # %bb.0:
|
||||
|
@ -25,7 +36,6 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
|||
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
|
||||
; CHECK-NEXT: movl $buf, %eax
|
||||
; CHECK-NEXT: movl $32, %r14d
|
||||
; CHECK-NEXT: movw $8, %r15w
|
||||
|
@ -36,11 +46,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
|||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
|
||||
; CHECK-NEXT: movabsq $64, %rax
|
||||
; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
|
||||
; CHECK-NEXT: xorl %eax, %eax
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movl $buf+2048, %eax
|
||||
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
|
||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
|
||||
; CHECK-NEXT: movabsq $64, %rcx
|
||||
; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
|
||||
|
@ -55,16 +64,204 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
|||
; CHECK-NEXT: popq %rbp
|
||||
; CHECK-NEXT: tilerelease
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; IPRA-LABEL: test_api:
|
||||
; IPRA: # %bb.0:
|
||||
; IPRA-NEXT: subq $72, %rsp
|
||||
; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||
; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movl $buf, %eax
|
||||
; IPRA-NEXT: movl $32, %ecx
|
||||
; IPRA-NEXT: movw $8, %dx
|
||||
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
|
||||
; IPRA-NEXT: movl $buf+1024, %eax
|
||||
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1
|
||||
; IPRA-NEXT: callq foo
|
||||
; IPRA-NEXT: movl $buf+2048, %eax
|
||||
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2
|
||||
; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
|
||||
; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx)
|
||||
; IPRA-NEXT: addq $72, %rsp
|
||||
; IPRA-NEXT: tilerelease
|
||||
; IPRA-NEXT: vzeroupper
|
||||
; IPRA-NEXT: retq
|
||||
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||
tail call void (...) @foo()
|
||||
call void @foo()
|
||||
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
||||
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare dso_local void @foo(...)
|
||||
define dso_local i32 @test_loop(i32 %0) nounwind {
|
||||
; CHECK-LABEL: test_loop:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: pushq %rbp
|
||||
; CHECK-NEXT: pushq %r15
|
||||
; CHECK-NEXT: pushq %r14
|
||||
; CHECK-NEXT: pushq %r13
|
||||
; CHECK-NEXT: pushq %r12
|
||||
; CHECK-NEXT: pushq %rbx
|
||||
; CHECK-NEXT: subq $3016, %rsp # imm = 0xBC8
|
||||
; CHECK-NEXT: movl %edi, %r14d
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: testl %r14d, %r14d
|
||||
; CHECK-NEXT: jg .LBB2_4
|
||||
; CHECK-NEXT: # %bb.1: # %.preheader
|
||||
; CHECK-NEXT: movl $7, %ebp
|
||||
; CHECK-NEXT: movl $buf, %r15d
|
||||
; CHECK-NEXT: movl $32, %r12d
|
||||
; CHECK-NEXT: movw $8, %bx
|
||||
; CHECK-NEXT: movl $buf+2048, %r13d
|
||||
; CHECK-NEXT: .p2align 4, 0x90
|
||||
; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0
|
||||
; CHECK-NEXT: movabsq $64, %rax
|
||||
; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movabsq $64, %rax
|
||||
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
|
||||
; CHECK-NEXT: tilestored %tmm0, (%r13,%r12)
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: decl %ebp
|
||||
; CHECK-NEXT: cmpl $7, %ebp
|
||||
; CHECK-NEXT: jne .LBB2_2
|
||||
; CHECK-NEXT: # %bb.3:
|
||||
; CHECK-NEXT: cmpl $3, %r14d
|
||||
; CHECK-NEXT: jne .LBB2_4
|
||||
; CHECK-NEXT: # %bb.6:
|
||||
; CHECK-NEXT: testl %ebp, %ebp
|
||||
; CHECK-NEXT: jne .LBB2_5
|
||||
; CHECK-NEXT: # %bb.7:
|
||||
; CHECK-NEXT: incl %r14d
|
||||
; CHECK-NEXT: jmp .LBB2_8
|
||||
; CHECK-NEXT: .LBB2_4:
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movl $32, %eax
|
||||
; CHECK-NEXT: movl $buf+1024, %ecx
|
||||
; CHECK-NEXT: movw $8, %dx
|
||||
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0
|
||||
; CHECK-NEXT: tilestored %tmm0, (%rcx,%rax)
|
||||
; CHECK-NEXT: .LBB2_5:
|
||||
; CHECK-NEXT: decl %r14d
|
||||
; CHECK-NEXT: .LBB2_8:
|
||||
; CHECK-NEXT: movl %r14d, %eax
|
||||
; CHECK-NEXT: addq $3016, %rsp # imm = 0xBC8
|
||||
; CHECK-NEXT: popq %rbx
|
||||
; CHECK-NEXT: popq %r12
|
||||
; CHECK-NEXT: popq %r13
|
||||
; CHECK-NEXT: popq %r14
|
||||
; CHECK-NEXT: popq %r15
|
||||
; CHECK-NEXT: popq %rbp
|
||||
; CHECK-NEXT: tilerelease
|
||||
; CHECK-NEXT: retq
|
||||
;
|
||||
; IPRA-LABEL: test_loop:
|
||||
; IPRA: # %bb.0:
|
||||
; IPRA-NEXT: subq $72, %rsp
|
||||
; IPRA-NEXT: movl %edi, %eax
|
||||
; IPRA-NEXT: callq foo
|
||||
; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||
; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; IPRA-NEXT: testl %edi, %edi
|
||||
; IPRA-NEXT: jg .LBB2_4
|
||||
; IPRA-NEXT: # %bb.1: # %.preheader
|
||||
; IPRA-NEXT: movl $7, %ecx
|
||||
; IPRA-NEXT: movl $buf, %r8d
|
||||
; IPRA-NEXT: movl $32, %esi
|
||||
; IPRA-NEXT: movw $8, %di
|
||||
; IPRA-NEXT: movl $buf+2048, %edx
|
||||
; IPRA-NEXT: .p2align 4, 0x90
|
||||
; IPRA-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
|
||||
; IPRA-NEXT: tileloadd (%r8,%rsi), %tmm0
|
||||
; IPRA-NEXT: callq foo
|
||||
; IPRA-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||
; IPRA-NEXT: callq foo
|
||||
; IPRA-NEXT: decl %ecx
|
||||
; IPRA-NEXT: cmpl $7, %ecx
|
||||
; IPRA-NEXT: jne .LBB2_2
|
||||
; IPRA-NEXT: # %bb.3:
|
||||
; IPRA-NEXT: cmpl $3, %eax
|
||||
; IPRA-NEXT: jne .LBB2_4
|
||||
; IPRA-NEXT: # %bb.6:
|
||||
; IPRA-NEXT: testl %ecx, %ecx
|
||||
; IPRA-NEXT: jne .LBB2_5
|
||||
; IPRA-NEXT: # %bb.7:
|
||||
; IPRA-NEXT: incl %eax
|
||||
; IPRA-NEXT: jmp .LBB2_8
|
||||
; IPRA-NEXT: .LBB2_4:
|
||||
; IPRA-NEXT: callq foo
|
||||
; IPRA-NEXT: movl $32, %ecx
|
||||
; IPRA-NEXT: movl $buf+1024, %edx
|
||||
; IPRA-NEXT: movw $8, %si
|
||||
; IPRA-NEXT: tileloadd (%rdx,%rcx), %tmm0
|
||||
; IPRA-NEXT: tilestored %tmm0, (%rdx,%rcx)
|
||||
; IPRA-NEXT: .LBB2_5:
|
||||
; IPRA-NEXT: decl %eax
|
||||
; IPRA-NEXT: .LBB2_8:
|
||||
; IPRA-NEXT: addq $72, %rsp
|
||||
; IPRA-NEXT: tilerelease
|
||||
; IPRA-NEXT: vzeroupper
|
||||
; IPRA-NEXT: retq
|
||||
call void @foo()
|
||||
br label %2
|
||||
2:
|
||||
%3 = icmp sgt i32 %0, 0
|
||||
br i1 %3, label %11, label %6
|
||||
4:
|
||||
%5 = icmp eq i32 %0, 3
|
||||
br i1 %5, label %13, label %11
|
||||
6:
|
||||
%7 = phi i32 [ %9, %6 ], [ 0, %2 ]
|
||||
%8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
call void @foo()
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %8)
|
||||
call void @foo()
|
||||
%9 = add i32 %7, 1
|
||||
%10 = icmp eq i32 %9, 0
|
||||
br i1 %10, label %4, label %6
|
||||
11:
|
||||
call void @foo()
|
||||
%12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32, x86_amx %12)
|
||||
br label %17
|
||||
13:
|
||||
%14 = icmp eq i32 %9, 7
|
||||
br i1 %14, label %15, label %17
|
||||
15:
|
||||
%16 = add i32 %0, 1
|
||||
br label %19
|
||||
17:
|
||||
%18 = sub i32 %0, 1
|
||||
br label %19
|
||||
19:
|
||||
%20 = phi i32 [ %16, %15 ], [ %18, %17 ]
|
||||
ret i32 %20
|
||||
}
|
||||
|
||||
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
||||
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||
|
|
|
@ -5,6 +5,7 @@ define void @test_amx() {
|
|||
; CHECK-LABEL: test_amx:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3
|
||||
; CHECK-NEXT: tilerelease
|
||||
; CHECK-NEXT: retq
|
||||
call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
|
||||
ret void
|
||||
|
|
|
@ -0,0 +1,123 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
|
||||
@buf = dso_local global [3072 x i8] zeroinitializer, align 16
|
||||
|
||||
define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
|
||||
; CHECK-LABEL: test1:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||
; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movl $buf, %eax
|
||||
; CHECK-NEXT: movl $32, %ecx
|
||||
; CHECK-NEXT: movw $8, %dx
|
||||
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0
|
||||
; CHECK-NEXT: movl $buf+1024, %eax
|
||||
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1
|
||||
; CHECK-NEXT: movl $buf+2048, %eax
|
||||
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
|
||||
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
|
||||
; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx)
|
||||
; CHECK-NEXT: tilerelease
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: jmp foo # TAILCALL
|
||||
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
||||
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
|
||||
tail call void @foo()
|
||||
ret void
|
||||
}
|
||||
|
||||
define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
|
||||
; CHECK-LABEL: test2:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: pushq %rbp
|
||||
; CHECK-NEXT: pushq %rbx
|
||||
; CHECK-NEXT: subq $72, %rsp
|
||||
; CHECK-NEXT: movl %esi, %ebx
|
||||
; CHECK-NEXT: movl %edi, %ebp
|
||||
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: xorl %eax, %eax
|
||||
; CHECK-NEXT: testb %al, %al
|
||||
; CHECK-NEXT: jne .LBB1_3
|
||||
; CHECK-NEXT: # %bb.1: # %if.true
|
||||
; CHECK-NEXT: movw $8, %ax
|
||||
; CHECK-NEXT: tilezero %tmm0
|
||||
; CHECK-NEXT: movl $32, %ecx
|
||||
; CHECK-NEXT: movl $buf+1024, %edx
|
||||
; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1
|
||||
; CHECK-NEXT: movl $buf+2048, %edx
|
||||
; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2
|
||||
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
|
||||
; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx)
|
||||
; CHECK-NEXT: jmp .LBB1_2
|
||||
; CHECK-NEXT: .LBB1_3: # %if.false
|
||||
; CHECK-NEXT: movl $buf, %eax
|
||||
; CHECK-NEXT: movl $32, %ecx
|
||||
; CHECK-NEXT: movw $8, %dx
|
||||
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3
|
||||
; CHECK-NEXT: movl $buf+1024, %eax
|
||||
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4
|
||||
; CHECK-NEXT: movl $buf+2048, %eax
|
||||
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
|
||||
; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3
|
||||
; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx)
|
||||
; CHECK-NEXT: .LBB1_2: # %if.true
|
||||
; CHECK-NEXT: addq $72, %rsp
|
||||
; CHECK-NEXT: popq %rbx
|
||||
; CHECK-NEXT: popq %rbp
|
||||
; CHECK-NEXT: tilerelease
|
||||
; CHECK-NEXT: retq
|
||||
call void @foo()
|
||||
br i1 undef, label %if.true, label %if.false
|
||||
|
||||
if.true:
|
||||
%t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
|
||||
%t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||
%t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
||||
%t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
|
||||
br label %exit
|
||||
|
||||
if.false:
|
||||
%t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||
%t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||
%t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
||||
%t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
|
||||
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare dso_local void @foo() nounwind
|
||||
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
|
||||
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
||||
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
|
|
@ -36,11 +36,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
|||
; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5
|
||||
; CHECK-NEXT: xorl %eax, %eax
|
||||
; CHECK-NEXT: testb %al, %al
|
||||
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
|
||||
; CHECK-NEXT: movl $buf, %eax
|
||||
; CHECK-NEXT: movw $8, %cx
|
||||
; CHECK-NEXT: jne .LBB0_2
|
||||
; CHECK-NEXT: # %bb.1: # %if.true
|
||||
; CHECK-NEXT: movl $buf, %eax
|
||||
; CHECK-NEXT: movw $8, %cx
|
||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
|
||||
; CHECK-NEXT: movl $buf+1024, %eax
|
||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
|
||||
|
@ -52,11 +51,13 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
|||
; CHECK-NEXT: xorl %eax, %eax
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movabsq $64, %rax
|
||||
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
|
||||
; CHECK-NEXT: jmp .LBB0_3
|
||||
; CHECK-NEXT: .LBB0_2: # %if.false
|
||||
; CHECK-NEXT: movl $buf, %eax
|
||||
; CHECK-NEXT: movw $8, %cx
|
||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
|
||||
; CHECK-NEXT: movl $buf+1024, %eax
|
||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3
|
||||
|
@ -68,7 +69,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
|||
; CHECK-NEXT: xorl %eax, %eax
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movabsq $64, %rax
|
||||
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
|
||||
; CHECK-NEXT: tilestored %tmm6, (%r15,%r14)
|
||||
|
@ -139,7 +140,6 @@ define dso_local void @test3(i8 *%buf) nounwind {
|
|||
; CHECK-NEXT: movq %rdi, %rbx
|
||||
; CHECK-NEXT: movl $32, %r14d
|
||||
; CHECK-NEXT: xorl %ebp, %ebp
|
||||
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
|
||||
; CHECK-NEXT: .p2align 4, 0x90
|
||||
; CHECK-NEXT: .LBB1_2: # %loop.header
|
||||
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
|
@ -149,7 +149,7 @@ define dso_local void @test3(i8 *%buf) nounwind {
|
|||
; CHECK-NEXT: xorl %eax, %eax
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: callq foo
|
||||
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
|
||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: tilezero %tmm0
|
||||
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
|
||||
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
|
||||
|
|
Loading…
Reference in New Issue