forked from OSchip/llvm-project
[X86] Fix tile config register spill issue.
This is an optimized approach for D94155. The previous code modeled each AMX instruction as a user of the tile config register. That model has a problem when the tile config register is spilled: across a function call, an ldtilecfg instruction may be inserted at each AMX instruction that uses the tile config register, which causes all tile data registers to be clobbered. To fix this issue, we remove the tile config register from the model. Instead, we analyze the AMX instructions between one call and the next, and insert ldtilecfg after the first call if we find any AMX instructions. Reviewed By: LuoYuanke Differential Revision: https://reviews.llvm.org/D95136
This commit is contained in:
parent
c32f399802
commit
a5d9e0c79b
|
@ -461,25 +461,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
|
||||||
case TargetOpcode::ICALL_BRANCH_FUNNEL:
|
case TargetOpcode::ICALL_BRANCH_FUNNEL:
|
||||||
ExpandICallBranchFunnel(&MBB, MBBI);
|
ExpandICallBranchFunnel(&MBB, MBBI);
|
||||||
return true;
|
return true;
|
||||||
case X86::PLDTILECFG: {
|
|
||||||
MI.RemoveOperand(0);
|
|
||||||
MI.setDesc(TII->get(X86::LDTILECFG));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
case X86::PSTTILECFG: {
|
|
||||||
MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
|
|
||||||
MI.setDesc(TII->get(X86::STTILECFG));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
case X86::PTILELOADDV: {
|
case X86::PTILELOADDV: {
|
||||||
MI.RemoveOperand(8); // Remove $tmmcfg
|
|
||||||
for (unsigned i = 2; i > 0; --i)
|
for (unsigned i = 2; i > 0; --i)
|
||||||
MI.RemoveOperand(i);
|
MI.RemoveOperand(i);
|
||||||
MI.setDesc(TII->get(X86::TILELOADD));
|
MI.setDesc(TII->get(X86::TILELOADD));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
case X86::PTDPBSSDV: {
|
case X86::PTDPBSSDV: {
|
||||||
MI.RemoveOperand(7); // Remove $tmmcfg
|
|
||||||
MI.untieRegOperand(4);
|
MI.untieRegOperand(4);
|
||||||
for (unsigned i = 3; i > 0; --i)
|
for (unsigned i = 3; i > 0; --i)
|
||||||
MI.RemoveOperand(i);
|
MI.RemoveOperand(i);
|
||||||
|
@ -488,14 +476,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
case X86::PTILESTOREDV: {
|
case X86::PTILESTOREDV: {
|
||||||
MI.RemoveOperand(8); // Remove $tmmcfg
|
|
||||||
for (int i = 1; i >= 0; --i)
|
for (int i = 1; i >= 0; --i)
|
||||||
MI.RemoveOperand(i);
|
MI.RemoveOperand(i);
|
||||||
MI.setDesc(TII->get(X86::TILESTORED));
|
MI.setDesc(TII->get(X86::TILESTORED));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
case X86::PTILEZEROV: {
|
case X86::PTILEZEROV: {
|
||||||
for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
|
for (int i = 2; i > 0; --i) // Remove row, col
|
||||||
MI.RemoveOperand(i);
|
MI.RemoveOperand(i);
|
||||||
MI.setDesc(TII->get(X86::TILEZERO));
|
MI.setDesc(TII->get(X86::TILEZERO));
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -2094,8 +2094,12 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
|
||||||
|
|
||||||
// Emit tilerelease for AMX kernel.
|
// Emit tilerelease for AMX kernel.
|
||||||
const MachineRegisterInfo &MRI = MF.getRegInfo();
|
const MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||||
if (!MRI.reg_nodbg_empty(X86::TMMCFG))
|
const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
|
||||||
BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
|
for (unsigned I = 0; I < RC->getNumRegs(); I++)
|
||||||
|
if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) {
|
||||||
|
BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
|
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
|
||||||
|
|
|
@ -4607,7 +4607,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||||
SDValue Index = Node->getOperand(5);
|
SDValue Index = Node->getOperand(5);
|
||||||
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
|
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
|
||||||
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
|
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
|
||||||
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
|
|
||||||
SDValue Chain = Node->getOperand(0);
|
SDValue Chain = Node->getOperand(0);
|
||||||
MachineSDNode *CNode;
|
MachineSDNode *CNode;
|
||||||
SDValue Ops[] = {Node->getOperand(2),
|
SDValue Ops[] = {Node->getOperand(2),
|
||||||
|
@ -4617,7 +4616,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||||
Index,
|
Index,
|
||||||
Disp,
|
Disp,
|
||||||
Segment,
|
Segment,
|
||||||
CFG,
|
|
||||||
Chain};
|
Chain};
|
||||||
CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
||||||
ReplaceNode(Node, CNode);
|
ReplaceNode(Node, CNode);
|
||||||
|
@ -4628,14 +4626,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||||
break;
|
break;
|
||||||
SDValue Chain = Node->getOperand(0);
|
SDValue Chain = Node->getOperand(0);
|
||||||
unsigned Opc = X86::PTDPBSSDV;
|
unsigned Opc = X86::PTDPBSSDV;
|
||||||
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
|
|
||||||
SDValue Ops[] = {Node->getOperand(2),
|
SDValue Ops[] = {Node->getOperand(2),
|
||||||
Node->getOperand(3),
|
Node->getOperand(3),
|
||||||
Node->getOperand(4),
|
Node->getOperand(4),
|
||||||
Node->getOperand(5),
|
Node->getOperand(5),
|
||||||
Node->getOperand(6),
|
Node->getOperand(6),
|
||||||
Node->getOperand(7),
|
Node->getOperand(7),
|
||||||
CFG,
|
|
||||||
Chain};
|
Chain};
|
||||||
MachineSDNode *CNode =
|
MachineSDNode *CNode =
|
||||||
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
||||||
|
@ -4647,8 +4643,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||||
break;
|
break;
|
||||||
unsigned Opc = X86::PTILEZEROV;
|
unsigned Opc = X86::PTILEZEROV;
|
||||||
SDValue Chain = Node->getOperand(0);
|
SDValue Chain = Node->getOperand(0);
|
||||||
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
|
SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain};
|
||||||
SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
|
|
||||||
MachineSDNode *CNode =
|
MachineSDNode *CNode =
|
||||||
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
|
||||||
ReplaceNode(Node, CNode);
|
ReplaceNode(Node, CNode);
|
||||||
|
@ -4719,7 +4714,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||||
SDValue Index = Node->getOperand(5);
|
SDValue Index = Node->getOperand(5);
|
||||||
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
|
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
|
||||||
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
|
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
|
||||||
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
|
|
||||||
SDValue Chain = Node->getOperand(0);
|
SDValue Chain = Node->getOperand(0);
|
||||||
MachineSDNode *CNode;
|
MachineSDNode *CNode;
|
||||||
SDValue Ops[] = {Node->getOperand(2),
|
SDValue Ops[] = {Node->getOperand(2),
|
||||||
|
@ -4730,7 +4724,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||||
Disp,
|
Disp,
|
||||||
Segment,
|
Segment,
|
||||||
Node->getOperand(6),
|
Node->getOperand(6),
|
||||||
CFG,
|
|
||||||
Chain};
|
Chain};
|
||||||
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
|
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
|
||||||
ReplaceNode(Node, CNode);
|
ReplaceNode(Node, CNode);
|
||||||
|
|
|
@ -48,23 +48,14 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
|
||||||
VEX, T8XD;
|
VEX, T8XD;
|
||||||
|
|
||||||
// Pseduo instruction for RA.
|
// Pseduo instruction for RA.
|
||||||
let hasSideEffects = 1, mayLoad = 1,
|
|
||||||
Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
|
|
||||||
def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
|
|
||||||
|
|
||||||
let hasSideEffects = 1, mayStore = 1 in
|
|
||||||
def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
|
|
||||||
|
|
||||||
def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
||||||
GR16:$src2,
|
GR16:$src2,
|
||||||
opaquemem:$src3,
|
opaquemem:$src3), []>;
|
||||||
TILECFG:$cfg), []>;
|
|
||||||
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
|
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
|
||||||
GR16:$src2, opaquemem:$src3,
|
GR16:$src2, opaquemem:$src3,
|
||||||
TILE:$src4, TILECFG:$cfg), []>;
|
TILE:$src4), []>;
|
||||||
def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
||||||
GR16:$src2,
|
GR16:$src2), []>;
|
||||||
TILECFG:$cfg), []>;
|
|
||||||
|
|
||||||
let usesCustomInserter = 1 in {
|
let usesCustomInserter = 1 in {
|
||||||
// Pseudo instructions, using immediates instead of tile registers.
|
// Pseudo instructions, using immediates instead of tile registers.
|
||||||
|
@ -104,7 +95,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
|
||||||
let Constraints = "$src4 = $dst" in
|
let Constraints = "$src4 = $dst" in
|
||||||
def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
|
||||||
GR16:$src2, GR16:$src3, TILE:$src4,
|
GR16:$src2, GR16:$src3, TILE:$src4,
|
||||||
TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
|
TILE:$src5, TILE:$src6), []>;
|
||||||
|
|
||||||
let usesCustomInserter = 1 in {
|
let usesCustomInserter = 1 in {
|
||||||
// Pseudo instructions, using immediates instead of tile registers.
|
// Pseudo instructions, using immediates instead of tile registers.
|
||||||
|
|
|
@ -3808,10 +3808,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
|
||||||
MachineOperand &MO = NewMI->getOperand(2);
|
MachineOperand &MO = NewMI->getOperand(2);
|
||||||
MO.setReg(VirtReg);
|
MO.setReg(VirtReg);
|
||||||
MO.setIsKill(true);
|
MO.setIsKill(true);
|
||||||
} else if (RC->getID() == X86::TILECFGRegClassID) {
|
|
||||||
unsigned Opc = X86::PSTTILECFG;
|
|
||||||
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
|
|
||||||
.addReg(SrcReg, getKillRegState(isKill));
|
|
||||||
} else {
|
} else {
|
||||||
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
|
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
|
||||||
bool isAligned =
|
bool isAligned =
|
||||||
|
@ -3840,10 +3836,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
|
||||||
MachineOperand &MO = NewMI->getOperand(3);
|
MachineOperand &MO = NewMI->getOperand(3);
|
||||||
MO.setReg(VirtReg);
|
MO.setReg(VirtReg);
|
||||||
MO.setIsKill(true);
|
MO.setIsKill(true);
|
||||||
} else if (RC->getID() == X86::TILECFGRegClassID) {
|
|
||||||
unsigned Opc = X86::PLDTILECFG;
|
|
||||||
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
|
|
||||||
FrameIdx);
|
|
||||||
} else {
|
} else {
|
||||||
const MachineFunction &MF = *MBB.getParent();
|
const MachineFunction &MF = *MBB.getParent();
|
||||||
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
|
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
|
||||||
|
@ -6789,7 +6781,7 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
|
||||||
// ENDBR instructions should not be scheduled around.
|
// ENDBR instructions should not be scheduled around.
|
||||||
unsigned Opcode = MI.getOpcode();
|
unsigned Opcode = MI.getOpcode();
|
||||||
if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
|
if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
|
||||||
Opcode == X86::PLDTILECFG)
|
Opcode == X86::LDTILECFG)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
|
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
|
||||||
|
|
|
@ -98,10 +98,9 @@ void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
|
||||||
MachineFunctionPass::getAnalysisUsage(AU);
|
MachineFunctionPass::getAnalysisUsage(AU);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
|
static void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
|
||||||
const TargetInstrInfo *TII,
|
const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
|
||||||
MachineRegisterInfo *MRI,
|
const X86Subtarget *ST) {
|
||||||
const X86Subtarget *ST) {
|
|
||||||
auto *MBB = MI->getParent();
|
auto *MBB = MI->getParent();
|
||||||
|
|
||||||
// FIXME: AMX should assume AVX512 enabled.
|
// FIXME: AMX should assume AVX512 enabled.
|
||||||
|
@ -117,12 +116,8 @@ static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
|
||||||
}
|
}
|
||||||
|
|
||||||
// build psuedo ldtilecfg
|
// build psuedo ldtilecfg
|
||||||
Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
|
addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
|
||||||
|
FrameIdx);
|
||||||
addFrameReference(
|
|
||||||
BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
|
|
||||||
|
|
||||||
return VReg;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
|
static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
|
||||||
|
@ -219,26 +214,98 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() {
|
||||||
return &*MII;
|
return &*MII;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void addTileCFGUse(MachineFunction &MF, Register CFG) {
|
static bool isAMXInstruction(MachineBasicBlock::iterator MII) {
|
||||||
for (MachineBasicBlock &MBB : MF) {
|
switch (MII->getOpcode()) {
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
case X86::PTILELOADDV:
|
||||||
|
case X86::PTILESTOREDV:
|
||||||
|
case X86::PTDPBSSDV:
|
||||||
|
case X86::PTILEZEROV:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Traverse the basic block.
|
struct BBInfo {
|
||||||
for (MachineInstr &MI : MBB) {
|
bool HasAMX = false;
|
||||||
unsigned Opcode = MI.getOpcode();
|
bool HasCallBeforeAMX = false;
|
||||||
switch (Opcode) {
|
bool HasAMXBeforeCallInSuccs = false;
|
||||||
default:
|
MachineInstr *LastCall = nullptr;
|
||||||
break;
|
|
||||||
case X86::PTILELOADDV:
|
BBInfo() = default;
|
||||||
case X86::PTILESTOREDV:
|
BBInfo(SmallSet<MachineInstr *, 8> &CfgNeedInsert, MachineBasicBlock *MBB,
|
||||||
case X86::PTDPBSSDV:
|
MachineInstr *MI = nullptr) {
|
||||||
case X86::PTILEZEROV:
|
MachineBasicBlock::iterator MII = MI ? MI->getIterator() : MBB->begin();
|
||||||
unsigned NumOperands = MI.getNumOperands();
|
for (auto E = MBB->end(); MII != E; ++MII) {
|
||||||
MI.RemoveOperand(NumOperands - 1);
|
if (isAMXInstruction(MII)) {
|
||||||
MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
|
HasAMX = true;
|
||||||
break;
|
if (LastCall)
|
||||||
|
CfgNeedInsert.insert(LastCall);
|
||||||
|
} else if (MII->isCall()) {
|
||||||
|
LastCall = &*MII;
|
||||||
|
if (!HasAMX)
|
||||||
|
HasCallBeforeAMX = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void reloadTileConfig(MachineInstr *MI, int FI,
|
||||||
|
const TargetInstrInfo *TII,
|
||||||
|
const TargetRegisterInfo *TRI) {
|
||||||
|
SmallSet<MachineInstr *, 8> CfgNeedInsert;
|
||||||
|
SmallVector<MachineBasicBlock *, 8> WorkList;
|
||||||
|
DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;
|
||||||
|
|
||||||
|
MachineBasicBlock *MBB = MI->getParent();
|
||||||
|
BBVisitedInfo[MBB] = BBInfo(CfgNeedInsert, MBB, MI);
|
||||||
|
|
||||||
|
WorkList.push_back(MBB);
|
||||||
|
while (!WorkList.empty()) {
|
||||||
|
MBB = WorkList.pop_back_val();
|
||||||
|
for (auto I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
|
||||||
|
if (!BBVisitedInfo.count(*I)) {
|
||||||
|
BBVisitedInfo[*I] = BBInfo(CfgNeedInsert, *I);
|
||||||
|
WorkList.push_back(*I);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
WorkList.clear();
|
||||||
|
for (auto I : BBVisitedInfo) {
|
||||||
|
WorkList.push_back(I.first);
|
||||||
|
while (!WorkList.empty()) {
|
||||||
|
MBB = WorkList.pop_back_val();
|
||||||
|
if (BBVisitedInfo[MBB].HasCallBeforeAMX ||
|
||||||
|
(!BBVisitedInfo[MBB].HasAMX &&
|
||||||
|
!BBVisitedInfo[MBB].HasAMXBeforeCallInSuccs))
|
||||||
|
continue;
|
||||||
|
for (auto I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
|
||||||
|
if (!BBVisitedInfo.count(*I) ||
|
||||||
|
BBVisitedInfo[*I].HasAMXBeforeCallInSuccs)
|
||||||
|
continue;
|
||||||
|
if (BBVisitedInfo[*I].LastCall)
|
||||||
|
CfgNeedInsert.insert(BBVisitedInfo[*I].LastCall);
|
||||||
|
BBVisitedInfo[*I].HasAMXBeforeCallInSuccs = true;
|
||||||
|
WorkList.push_back(*I);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto *I : CfgNeedInsert) {
|
||||||
|
BitVector UsableRegs(TRI->getNumRegs());
|
||||||
|
const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
|
||||||
|
for (unsigned J = 0; J < RC->getNumRegs(); J++)
|
||||||
|
UsableRegs.set(X86::TMM0 + J);
|
||||||
|
for (MachineOperand &CallMO : I->operands()) {
|
||||||
|
if (CallMO.isRegMask())
|
||||||
|
UsableRegs.clearBitsInMask(CallMO.getRegMask());
|
||||||
|
}
|
||||||
|
if (!UsableRegs.none())
|
||||||
|
addFrameReference(BuildMI(*I->getParent(), ++I->getIterator(), DebugLoc(),
|
||||||
|
TII->get(X86::LDTILECFG)),
|
||||||
|
FI);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
|
bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
|
||||||
|
@ -255,8 +322,8 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
|
||||||
unsigned Size = ST->getTileConfigSize();
|
unsigned Size = ST->getTileConfigSize();
|
||||||
Align Alignment = ST->getTileConfigAlignment();
|
Align Alignment = ST->getTileConfigAlignment();
|
||||||
int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
|
int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
|
||||||
Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
|
buildConfigMI(MI, SS, TII, MRI, ST);
|
||||||
addTileCFGUse(mf, CFG);
|
reloadTileConfig(MI, SS, TII, TRI);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -639,8 +639,3 @@ def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
|
||||||
let CopyCost = -1 in // Don't allow copying of tile registers
|
let CopyCost = -1 in // Don't allow copying of tile registers
|
||||||
def TILE : RegisterClass<"X86", [x86amx], 8192,
|
def TILE : RegisterClass<"X86", [x86amx], 8192,
|
||||||
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
|
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
|
||||||
def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
|
|
||||||
let CopyCost = -1; // Don't allow copying of tile config registers.
|
|
||||||
let isAllocatable = 1;
|
|
||||||
let Size = 512;
|
|
||||||
}
|
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
#include "X86MachineFunctionInfo.h"
|
#include "X86MachineFunctionInfo.h"
|
||||||
#include "X86RegisterInfo.h"
|
#include "X86RegisterInfo.h"
|
||||||
#include "X86Subtarget.h"
|
#include "X86Subtarget.h"
|
||||||
|
#include "llvm/ADT/PostOrderIterator.h"
|
||||||
#include "llvm/CodeGen/LiveIntervals.h"
|
#include "llvm/CodeGen/LiveIntervals.h"
|
||||||
#include "llvm/CodeGen/MachineDominators.h"
|
#include "llvm/CodeGen/MachineDominators.h"
|
||||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||||
|
@ -130,13 +131,14 @@ static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
|
||||||
}
|
}
|
||||||
|
|
||||||
MachineInstr *X86TileConfig::getTileConfigPoint() {
|
MachineInstr *X86TileConfig::getTileConfigPoint() {
|
||||||
for (MachineBasicBlock &MBB : *MF) {
|
MachineBasicBlock *Entry = &*MF->begin();
|
||||||
|
ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry);
|
||||||
// Traverse the basic block.
|
for (MachineBasicBlock *MBB : RPOT) {
|
||||||
for (MachineInstr &MI : MBB)
|
for (MachineInstr &MI : *MBB)
|
||||||
// Refer X86PreTileConfig.cpp.
|
// Refer X86PreTileConfig.cpp.
|
||||||
// We only support one tile config for now.
|
// We only support one tile config for now. The other ldtilecfg
|
||||||
if (MI.getOpcode() == X86::PLDTILECFG)
|
// is for spill purpose and is dominated by the first ldtilecfg.
|
||||||
|
if (MI.getOpcode() == X86::LDTILECFG)
|
||||||
return &MI;
|
return &MI;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -148,7 +150,7 @@ void X86TileConfig::tileConfig() {
|
||||||
if (!MI)
|
if (!MI)
|
||||||
return;
|
return;
|
||||||
MachineBasicBlock *MBB = MI->getParent();
|
MachineBasicBlock *MBB = MI->getParent();
|
||||||
int SS = MI->getOperand(1).getIndex();
|
int SS = MI->getOperand(0).getIndex();
|
||||||
BitVector PhysRegs(TRI->getNumRegs());
|
BitVector PhysRegs(TRI->getNumRegs());
|
||||||
|
|
||||||
// Fill in the palette first.
|
// Fill in the palette first.
|
||||||
|
|
|
@ -1,10 +1,21 @@
|
||||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
|
||||||
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
|
||||||
%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
|
|
||||||
|
|
||||||
@buf = dso_local global [3072 x i8] zeroinitializer, align 64
|
@buf = dso_local global [3072 x i8] zeroinitializer, align 64
|
||||||
|
|
||||||
|
define internal void @foo() {
|
||||||
|
; CHECK-LABEL: foo:
|
||||||
|
; CHECK: # %bb.0: # %entry
|
||||||
|
; CHECK-NEXT: retq
|
||||||
|
;
|
||||||
|
; IPRA-LABEL: foo:
|
||||||
|
; IPRA: # %bb.0: # %entry
|
||||||
|
; IPRA-NEXT: retq
|
||||||
|
entry:
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
||||||
; CHECK-LABEL: test_api:
|
; CHECK-LABEL: test_api:
|
||||||
; CHECK: # %bb.0:
|
; CHECK: # %bb.0:
|
||||||
|
@ -25,7 +36,6 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
||||||
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||||
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
||||||
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
|
|
||||||
; CHECK-NEXT: movl $buf, %eax
|
; CHECK-NEXT: movl $buf, %eax
|
||||||
; CHECK-NEXT: movl $32, %r14d
|
; CHECK-NEXT: movl $32, %r14d
|
||||||
; CHECK-NEXT: movw $8, %r15w
|
; CHECK-NEXT: movw $8, %r15w
|
||||||
|
@ -36,11 +46,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
||||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
|
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
|
||||||
; CHECK-NEXT: movabsq $64, %rax
|
; CHECK-NEXT: movabsq $64, %rax
|
||||||
; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
|
; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
|
||||||
; CHECK-NEXT: xorl %eax, %eax
|
|
||||||
; CHECK-NEXT: vzeroupper
|
; CHECK-NEXT: vzeroupper
|
||||||
; CHECK-NEXT: callq foo
|
; CHECK-NEXT: callq foo
|
||||||
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
; CHECK-NEXT: movl $buf+2048, %eax
|
; CHECK-NEXT: movl $buf+2048, %eax
|
||||||
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
|
|
||||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
|
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
|
||||||
; CHECK-NEXT: movabsq $64, %rcx
|
; CHECK-NEXT: movabsq $64, %rcx
|
||||||
; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
|
; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
|
||||||
|
@ -55,16 +64,204 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
||||||
; CHECK-NEXT: popq %rbp
|
; CHECK-NEXT: popq %rbp
|
||||||
; CHECK-NEXT: tilerelease
|
; CHECK-NEXT: tilerelease
|
||||||
; CHECK-NEXT: retq
|
; CHECK-NEXT: retq
|
||||||
|
;
|
||||||
|
; IPRA-LABEL: test_api:
|
||||||
|
; IPRA: # %bb.0:
|
||||||
|
; IPRA-NEXT: subq $72, %rsp
|
||||||
|
; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||||
|
; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movl $buf, %eax
|
||||||
|
; IPRA-NEXT: movl $32, %ecx
|
||||||
|
; IPRA-NEXT: movw $8, %dx
|
||||||
|
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
|
||||||
|
; IPRA-NEXT: movl $buf+1024, %eax
|
||||||
|
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1
|
||||||
|
; IPRA-NEXT: callq foo
|
||||||
|
; IPRA-NEXT: movl $buf+2048, %eax
|
||||||
|
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2
|
||||||
|
; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
|
||||||
|
; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx)
|
||||||
|
; IPRA-NEXT: addq $72, %rsp
|
||||||
|
; IPRA-NEXT: tilerelease
|
||||||
|
; IPRA-NEXT: vzeroupper
|
||||||
|
; IPRA-NEXT: retq
|
||||||
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||||
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||||
tail call void (...) @foo()
|
call void @foo()
|
||||||
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
||||||
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
|
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
|
||||||
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
|
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
declare dso_local void @foo(...)
|
define dso_local i32 @test_loop(i32 %0) nounwind {
|
||||||
|
; CHECK-LABEL: test_loop:
|
||||||
|
; CHECK: # %bb.0:
|
||||||
|
; CHECK-NEXT: pushq %rbp
|
||||||
|
; CHECK-NEXT: pushq %r15
|
||||||
|
; CHECK-NEXT: pushq %r14
|
||||||
|
; CHECK-NEXT: pushq %r13
|
||||||
|
; CHECK-NEXT: pushq %r12
|
||||||
|
; CHECK-NEXT: pushq %rbx
|
||||||
|
; CHECK-NEXT: subq $3016, %rsp # imm = 0xBC8
|
||||||
|
; CHECK-NEXT: movl %edi, %r14d
|
||||||
|
; CHECK-NEXT: callq foo
|
||||||
|
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||||
|
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: testl %r14d, %r14d
|
||||||
|
; CHECK-NEXT: jg .LBB2_4
|
||||||
|
; CHECK-NEXT: # %bb.1: # %.preheader
|
||||||
|
; CHECK-NEXT: movl $7, %ebp
|
||||||
|
; CHECK-NEXT: movl $buf, %r15d
|
||||||
|
; CHECK-NEXT: movl $32, %r12d
|
||||||
|
; CHECK-NEXT: movw $8, %bx
|
||||||
|
; CHECK-NEXT: movl $buf+2048, %r13d
|
||||||
|
; CHECK-NEXT: .p2align 4, 0x90
|
||||||
|
; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
|
||||||
|
; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0
|
||||||
|
; CHECK-NEXT: movabsq $64, %rax
|
||||||
|
; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
|
||||||
|
; CHECK-NEXT: vzeroupper
|
||||||
|
; CHECK-NEXT: callq foo
|
||||||
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movabsq $64, %rax
|
||||||
|
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
|
||||||
|
; CHECK-NEXT: tilestored %tmm0, (%r13,%r12)
|
||||||
|
; CHECK-NEXT: callq foo
|
||||||
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: decl %ebp
|
||||||
|
; CHECK-NEXT: cmpl $7, %ebp
|
||||||
|
; CHECK-NEXT: jne .LBB2_2
|
||||||
|
; CHECK-NEXT: # %bb.3:
|
||||||
|
; CHECK-NEXT: cmpl $3, %r14d
|
||||||
|
; CHECK-NEXT: jne .LBB2_4
|
||||||
|
; CHECK-NEXT: # %bb.6:
|
||||||
|
; CHECK-NEXT: testl %ebp, %ebp
|
||||||
|
; CHECK-NEXT: jne .LBB2_5
|
||||||
|
; CHECK-NEXT: # %bb.7:
|
||||||
|
; CHECK-NEXT: incl %r14d
|
||||||
|
; CHECK-NEXT: jmp .LBB2_8
|
||||||
|
; CHECK-NEXT: .LBB2_4:
|
||||||
|
; CHECK-NEXT: vzeroupper
|
||||||
|
; CHECK-NEXT: callq foo
|
||||||
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movl $32, %eax
|
||||||
|
; CHECK-NEXT: movl $buf+1024, %ecx
|
||||||
|
; CHECK-NEXT: movw $8, %dx
|
||||||
|
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0
|
||||||
|
; CHECK-NEXT: tilestored %tmm0, (%rcx,%rax)
|
||||||
|
; CHECK-NEXT: .LBB2_5:
|
||||||
|
; CHECK-NEXT: decl %r14d
|
||||||
|
; CHECK-NEXT: .LBB2_8:
|
||||||
|
; CHECK-NEXT: movl %r14d, %eax
|
||||||
|
; CHECK-NEXT: addq $3016, %rsp # imm = 0xBC8
|
||||||
|
; CHECK-NEXT: popq %rbx
|
||||||
|
; CHECK-NEXT: popq %r12
|
||||||
|
; CHECK-NEXT: popq %r13
|
||||||
|
; CHECK-NEXT: popq %r14
|
||||||
|
; CHECK-NEXT: popq %r15
|
||||||
|
; CHECK-NEXT: popq %rbp
|
||||||
|
; CHECK-NEXT: tilerelease
|
||||||
|
; CHECK-NEXT: retq
|
||||||
|
;
|
||||||
|
; IPRA-LABEL: test_loop:
|
||||||
|
; IPRA: # %bb.0:
|
||||||
|
; IPRA-NEXT: subq $72, %rsp
|
||||||
|
; IPRA-NEXT: movl %edi, %eax
|
||||||
|
; IPRA-NEXT: callq foo
|
||||||
|
; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||||
|
; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
|
; IPRA-NEXT: testl %edi, %edi
|
||||||
|
; IPRA-NEXT: jg .LBB2_4
|
||||||
|
; IPRA-NEXT: # %bb.1: # %.preheader
|
||||||
|
; IPRA-NEXT: movl $7, %ecx
|
||||||
|
; IPRA-NEXT: movl $buf, %r8d
|
||||||
|
; IPRA-NEXT: movl $32, %esi
|
||||||
|
; IPRA-NEXT: movw $8, %di
|
||||||
|
; IPRA-NEXT: movl $buf+2048, %edx
|
||||||
|
; IPRA-NEXT: .p2align 4, 0x90
|
||||||
|
; IPRA-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
|
||||||
|
; IPRA-NEXT: tileloadd (%r8,%rsi), %tmm0
|
||||||
|
; IPRA-NEXT: callq foo
|
||||||
|
; IPRA-NEXT: tilestored %tmm0, (%rdx,%rsi)
|
||||||
|
; IPRA-NEXT: callq foo
|
||||||
|
; IPRA-NEXT: decl %ecx
|
||||||
|
; IPRA-NEXT: cmpl $7, %ecx
|
||||||
|
; IPRA-NEXT: jne .LBB2_2
|
||||||
|
; IPRA-NEXT: # %bb.3:
|
||||||
|
; IPRA-NEXT: cmpl $3, %eax
|
||||||
|
; IPRA-NEXT: jne .LBB2_4
|
||||||
|
; IPRA-NEXT: # %bb.6:
|
||||||
|
; IPRA-NEXT: testl %ecx, %ecx
|
||||||
|
; IPRA-NEXT: jne .LBB2_5
|
||||||
|
; IPRA-NEXT: # %bb.7:
|
||||||
|
; IPRA-NEXT: incl %eax
|
||||||
|
; IPRA-NEXT: jmp .LBB2_8
|
||||||
|
; IPRA-NEXT: .LBB2_4:
|
||||||
|
; IPRA-NEXT: callq foo
|
||||||
|
; IPRA-NEXT: movl $32, %ecx
|
||||||
|
; IPRA-NEXT: movl $buf+1024, %edx
|
||||||
|
; IPRA-NEXT: movw $8, %si
|
||||||
|
; IPRA-NEXT: tileloadd (%rdx,%rcx), %tmm0
|
||||||
|
; IPRA-NEXT: tilestored %tmm0, (%rdx,%rcx)
|
||||||
|
; IPRA-NEXT: .LBB2_5:
|
||||||
|
; IPRA-NEXT: decl %eax
|
||||||
|
; IPRA-NEXT: .LBB2_8:
|
||||||
|
; IPRA-NEXT: addq $72, %rsp
|
||||||
|
; IPRA-NEXT: tilerelease
|
||||||
|
; IPRA-NEXT: vzeroupper
|
||||||
|
; IPRA-NEXT: retq
|
||||||
|
call void @foo()
|
||||||
|
br label %2
|
||||||
|
2:
|
||||||
|
%3 = icmp sgt i32 %0, 0
|
||||||
|
br i1 %3, label %11, label %6
|
||||||
|
4:
|
||||||
|
%5 = icmp eq i32 %0, 3
|
||||||
|
br i1 %5, label %13, label %11
|
||||||
|
6:
|
||||||
|
%7 = phi i32 [ %9, %6 ], [ 0, %2 ]
|
||||||
|
%8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||||
|
call void @foo()
|
||||||
|
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %8)
|
||||||
|
call void @foo()
|
||||||
|
%9 = add i32 %7, 1
|
||||||
|
%10 = icmp eq i32 %9, 0
|
||||||
|
br i1 %10, label %4, label %6
|
||||||
|
11:
|
||||||
|
call void @foo()
|
||||||
|
%12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||||
|
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32, x86_amx %12)
|
||||||
|
br label %17
|
||||||
|
13:
|
||||||
|
%14 = icmp eq i32 %9, 7
|
||||||
|
br i1 %14, label %15, label %17
|
||||||
|
15:
|
||||||
|
%16 = add i32 %0, 1
|
||||||
|
br label %19
|
||||||
|
17:
|
||||||
|
%18 = sub i32 %0, 1
|
||||||
|
br label %19
|
||||||
|
19:
|
||||||
|
%20 = phi i32 [ %16, %15 ], [ %18, %17 ]
|
||||||
|
ret i32 %20
|
||||||
|
}
|
||||||
|
|
||||||
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
||||||
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||||
|
|
|
@ -5,6 +5,7 @@ define void @test_amx() {
|
||||||
; CHECK-LABEL: test_amx:
|
; CHECK-LABEL: test_amx:
|
||||||
; CHECK: # %bb.0:
|
; CHECK: # %bb.0:
|
||||||
; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3
|
; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3
|
||||||
|
; CHECK-NEXT: tilerelease
|
||||||
; CHECK-NEXT: retq
|
; CHECK-NEXT: retq
|
||||||
call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
|
call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -0,0 +1,123 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
|
||||||
|
@buf = dso_local global [3072 x i8] zeroinitializer, align 16
|
||||||
|
|
||||||
|
define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
|
||||||
|
; CHECK-LABEL: test1:
|
||||||
|
; CHECK: # %bb.0:
|
||||||
|
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||||
|
; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movl $buf, %eax
|
||||||
|
; CHECK-NEXT: movl $32, %ecx
|
||||||
|
; CHECK-NEXT: movw $8, %dx
|
||||||
|
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0
|
||||||
|
; CHECK-NEXT: movl $buf+1024, %eax
|
||||||
|
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1
|
||||||
|
; CHECK-NEXT: movl $buf+2048, %eax
|
||||||
|
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
|
||||||
|
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
|
||||||
|
; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx)
|
||||||
|
; CHECK-NEXT: tilerelease
|
||||||
|
; CHECK-NEXT: vzeroupper
|
||||||
|
; CHECK-NEXT: jmp foo # TAILCALL
|
||||||
|
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||||
|
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||||
|
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
||||||
|
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
|
||||||
|
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
|
||||||
|
tail call void @foo()
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
|
||||||
|
; CHECK-LABEL: test2:
|
||||||
|
; CHECK: # %bb.0:
|
||||||
|
; CHECK-NEXT: pushq %rbp
|
||||||
|
; CHECK-NEXT: pushq %rbx
|
||||||
|
; CHECK-NEXT: subq $72, %rsp
|
||||||
|
; CHECK-NEXT: movl %esi, %ebx
|
||||||
|
; CHECK-NEXT: movl %edi, %ebp
|
||||||
|
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
|
||||||
|
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: vzeroupper
|
||||||
|
; CHECK-NEXT: callq foo
|
||||||
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
|
; CHECK-NEXT: xorl %eax, %eax
|
||||||
|
; CHECK-NEXT: testb %al, %al
|
||||||
|
; CHECK-NEXT: jne .LBB1_3
|
||||||
|
; CHECK-NEXT: # %bb.1: # %if.true
|
||||||
|
; CHECK-NEXT: movw $8, %ax
|
||||||
|
; CHECK-NEXT: tilezero %tmm0
|
||||||
|
; CHECK-NEXT: movl $32, %ecx
|
||||||
|
; CHECK-NEXT: movl $buf+1024, %edx
|
||||||
|
; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1
|
||||||
|
; CHECK-NEXT: movl $buf+2048, %edx
|
||||||
|
; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2
|
||||||
|
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
|
||||||
|
; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx)
|
||||||
|
; CHECK-NEXT: jmp .LBB1_2
|
||||||
|
; CHECK-NEXT: .LBB1_3: # %if.false
|
||||||
|
; CHECK-NEXT: movl $buf, %eax
|
||||||
|
; CHECK-NEXT: movl $32, %ecx
|
||||||
|
; CHECK-NEXT: movw $8, %dx
|
||||||
|
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3
|
||||||
|
; CHECK-NEXT: movl $buf+1024, %eax
|
||||||
|
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4
|
||||||
|
; CHECK-NEXT: movl $buf+2048, %eax
|
||||||
|
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
|
||||||
|
; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3
|
||||||
|
; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx)
|
||||||
|
; CHECK-NEXT: .LBB1_2: # %if.true
|
||||||
|
; CHECK-NEXT: addq $72, %rsp
|
||||||
|
; CHECK-NEXT: popq %rbx
|
||||||
|
; CHECK-NEXT: popq %rbp
|
||||||
|
; CHECK-NEXT: tilerelease
|
||||||
|
; CHECK-NEXT: retq
|
||||||
|
call void @foo()
|
||||||
|
br i1 undef, label %if.true, label %if.false
|
||||||
|
|
||||||
|
if.true:
|
||||||
|
%t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
|
||||||
|
%t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||||
|
%t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
||||||
|
%t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
|
||||||
|
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
|
||||||
|
br label %exit
|
||||||
|
|
||||||
|
if.false:
|
||||||
|
%t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
|
||||||
|
%t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
|
||||||
|
%t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
|
||||||
|
%t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
|
||||||
|
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
|
||||||
|
br label %exit
|
||||||
|
|
||||||
|
exit:
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
declare dso_local void @foo() nounwind
|
||||||
|
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
|
||||||
|
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
|
||||||
|
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
|
||||||
|
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
|
|
@ -36,11 +36,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
||||||
; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5
|
; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5
|
||||||
; CHECK-NEXT: xorl %eax, %eax
|
; CHECK-NEXT: xorl %eax, %eax
|
||||||
; CHECK-NEXT: testb %al, %al
|
; CHECK-NEXT: testb %al, %al
|
||||||
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
|
|
||||||
; CHECK-NEXT: movl $buf, %eax
|
|
||||||
; CHECK-NEXT: movw $8, %cx
|
|
||||||
; CHECK-NEXT: jne .LBB0_2
|
; CHECK-NEXT: jne .LBB0_2
|
||||||
; CHECK-NEXT: # %bb.1: # %if.true
|
; CHECK-NEXT: # %bb.1: # %if.true
|
||||||
|
; CHECK-NEXT: movl $buf, %eax
|
||||||
|
; CHECK-NEXT: movw $8, %cx
|
||||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
|
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
|
||||||
; CHECK-NEXT: movl $buf+1024, %eax
|
; CHECK-NEXT: movl $buf+1024, %eax
|
||||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
|
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
|
||||||
|
@ -52,11 +51,13 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
||||||
; CHECK-NEXT: xorl %eax, %eax
|
; CHECK-NEXT: xorl %eax, %eax
|
||||||
; CHECK-NEXT: vzeroupper
|
; CHECK-NEXT: vzeroupper
|
||||||
; CHECK-NEXT: callq foo
|
; CHECK-NEXT: callq foo
|
||||||
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
; CHECK-NEXT: movabsq $64, %rax
|
; CHECK-NEXT: movabsq $64, %rax
|
||||||
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
|
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
|
||||||
; CHECK-NEXT: jmp .LBB0_3
|
; CHECK-NEXT: jmp .LBB0_3
|
||||||
; CHECK-NEXT: .LBB0_2: # %if.false
|
; CHECK-NEXT: .LBB0_2: # %if.false
|
||||||
|
; CHECK-NEXT: movl $buf, %eax
|
||||||
|
; CHECK-NEXT: movw $8, %cx
|
||||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
|
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
|
||||||
; CHECK-NEXT: movl $buf+1024, %eax
|
; CHECK-NEXT: movl $buf+1024, %eax
|
||||||
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3
|
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3
|
||||||
|
@ -68,7 +69,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
|
||||||
; CHECK-NEXT: xorl %eax, %eax
|
; CHECK-NEXT: xorl %eax, %eax
|
||||||
; CHECK-NEXT: vzeroupper
|
; CHECK-NEXT: vzeroupper
|
||||||
; CHECK-NEXT: callq foo
|
; CHECK-NEXT: callq foo
|
||||||
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
; CHECK-NEXT: movabsq $64, %rax
|
; CHECK-NEXT: movabsq $64, %rax
|
||||||
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
|
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
|
||||||
; CHECK-NEXT: tilestored %tmm6, (%r15,%r14)
|
; CHECK-NEXT: tilestored %tmm6, (%r15,%r14)
|
||||||
|
@ -139,7 +140,6 @@ define dso_local void @test3(i8 *%buf) nounwind {
|
||||||
; CHECK-NEXT: movq %rdi, %rbx
|
; CHECK-NEXT: movq %rdi, %rbx
|
||||||
; CHECK-NEXT: movl $32, %r14d
|
; CHECK-NEXT: movl $32, %r14d
|
||||||
; CHECK-NEXT: xorl %ebp, %ebp
|
; CHECK-NEXT: xorl %ebp, %ebp
|
||||||
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
|
|
||||||
; CHECK-NEXT: .p2align 4, 0x90
|
; CHECK-NEXT: .p2align 4, 0x90
|
||||||
; CHECK-NEXT: .LBB1_2: # %loop.header
|
; CHECK-NEXT: .LBB1_2: # %loop.header
|
||||||
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
||||||
|
@ -149,7 +149,7 @@ define dso_local void @test3(i8 *%buf) nounwind {
|
||||||
; CHECK-NEXT: xorl %eax, %eax
|
; CHECK-NEXT: xorl %eax, %eax
|
||||||
; CHECK-NEXT: vzeroupper
|
; CHECK-NEXT: vzeroupper
|
||||||
; CHECK-NEXT: callq foo
|
; CHECK-NEXT: callq foo
|
||||||
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
|
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
|
||||||
; CHECK-NEXT: tilezero %tmm0
|
; CHECK-NEXT: tilezero %tmm0
|
||||||
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
|
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
|
||||||
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
|
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
|
||||||
|
|
Loading…
Reference in New Issue