[X86] Fix tile config register spill issue.

This is an optimized approach for D94155.

Previous code built a model in which the tile config register was a use
of each AMX instruction. That model had a problem with tile config
register spills: across a function call, an ldtilecfg instruction could
be inserted before each AMX instruction that uses the tile config
register, which clobbers all tile data registers.

To fix this issue, we remove the model of the tile config register.
Instead, we analyze the AMX instructions between one call and the next.
We insert an ldtilecfg after the first call if we find any AMX
instructions.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D95136
This commit is contained in:
Wang, Pengfei 2021-01-30 12:00:55 +08:00
parent c32f399802
commit a5d9e0c79b
12 changed files with 452 additions and 100 deletions

View File

@ -461,25 +461,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
case X86::PLDTILECFG: {
MI.RemoveOperand(0);
MI.setDesc(TII->get(X86::LDTILECFG));
return true;
}
case X86::PSTTILECFG: {
MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
MI.setDesc(TII->get(X86::STTILECFG));
return true;
}
case X86::PTILELOADDV: {
MI.RemoveOperand(8); // Remove $tmmcfg
for (unsigned i = 2; i > 0; --i)
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILELOADD));
return true;
}
case X86::PTDPBSSDV: {
MI.RemoveOperand(7); // Remove $tmmcfg
MI.untieRegOperand(4);
for (unsigned i = 3; i > 0; --i)
MI.RemoveOperand(i);
@ -488,14 +476,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
return true;
}
case X86::PTILESTOREDV: {
MI.RemoveOperand(8); // Remove $tmmcfg
for (int i = 1; i >= 0; --i)
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILESTORED));
return true;
}
case X86::PTILEZEROV: {
for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
for (int i = 2; i > 0; --i) // Remove row, col
MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILEZERO));
return true;

View File

@ -2094,8 +2094,12 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Emit tilerelease for AMX kernel.
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!MRI.reg_nodbg_empty(X86::TMMCFG))
const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
for (unsigned I = 0; I < RC->getNumRegs(); I++)
if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) {
BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
break;
}
}
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,

View File

@ -4607,7 +4607,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Index = Node->getOperand(5);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
SDValue Ops[] = {Node->getOperand(2),
@ -4617,7 +4616,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
Index,
Disp,
Segment,
CFG,
Chain};
CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
ReplaceNode(Node, CNode);
@ -4628,14 +4626,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
SDValue Chain = Node->getOperand(0);
unsigned Opc = X86::PTDPBSSDV;
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Ops[] = {Node->getOperand(2),
Node->getOperand(3),
Node->getOperand(4),
Node->getOperand(5),
Node->getOperand(6),
Node->getOperand(7),
CFG,
Chain};
MachineSDNode *CNode =
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
@ -4647,8 +4643,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
unsigned Opc = X86::PTILEZEROV;
SDValue Chain = Node->getOperand(0);
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain};
MachineSDNode *CNode =
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
ReplaceNode(Node, CNode);
@ -4719,7 +4714,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Index = Node->getOperand(5);
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
SDValue Ops[] = {Node->getOperand(2),
@ -4730,7 +4724,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
Disp,
Segment,
Node->getOperand(6),
CFG,
Chain};
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
ReplaceNode(Node, CNode);

View File

@ -48,23 +48,14 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
VEX, T8XD;
// Pseduo instruction for RA.
let hasSideEffects = 1, mayLoad = 1,
Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
let hasSideEffects = 1, mayStore = 1 in
def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3,
TILECFG:$cfg), []>;
opaquemem:$src3), []>;
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
GR16:$src2, opaquemem:$src3,
TILE:$src4, TILECFG:$cfg), []>;
TILE:$src4), []>;
def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2,
TILECFG:$cfg), []>;
GR16:$src2), []>;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
@ -104,7 +95,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
let Constraints = "$src4 = $dst" in
def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2, GR16:$src3, TILE:$src4,
TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
TILE:$src5, TILE:$src6), []>;
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.

View File

@ -3808,10 +3808,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(VirtReg);
MO.setIsKill(true);
} else if (RC->getID() == X86::TILECFGRegClassID) {
unsigned Opc = X86::PSTTILECFG;
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
} else {
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
@ -3840,10 +3836,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineOperand &MO = NewMI->getOperand(3);
MO.setReg(VirtReg);
MO.setIsKill(true);
} else if (RC->getID() == X86::TILECFGRegClassID) {
unsigned Opc = X86::PLDTILECFG;
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
} else {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
@ -6789,7 +6781,7 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// ENDBR instructions should not be scheduled around.
unsigned Opcode = MI.getOpcode();
if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
Opcode == X86::PLDTILECFG)
Opcode == X86::LDTILECFG)
return true;
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);

View File

@ -98,9 +98,8 @@ void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
const TargetInstrInfo *TII,
MachineRegisterInfo *MRI,
static void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
const X86Subtarget *ST) {
auto *MBB = MI->getParent();
@ -117,12 +116,8 @@ static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
}
// build psuedo ldtilecfg
Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
addFrameReference(
BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
return VReg;
addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
FrameIdx);
}
static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
@ -219,25 +214,97 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() {
return &*MII;
}
static void addTileCFGUse(MachineFunction &MF, Register CFG) {
for (MachineBasicBlock &MBB : MF) {
// Traverse the basic block.
for (MachineInstr &MI : MBB) {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
static bool isAMXInstruction(MachineBasicBlock::iterator MII) {
switch (MII->getOpcode()) {
default:
break;
return false;
case X86::PTILELOADDV:
case X86::PTILESTOREDV:
case X86::PTDPBSSDV:
case X86::PTILEZEROV:
unsigned NumOperands = MI.getNumOperands();
MI.RemoveOperand(NumOperands - 1);
MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
break;
return true;
}
}
struct BBInfo {
bool HasAMX = false;
bool HasCallBeforeAMX = false;
bool HasAMXBeforeCallInSuccs = false;
MachineInstr *LastCall = nullptr;
BBInfo() = default;
BBInfo(SmallSet<MachineInstr *, 8> &CfgNeedInsert, MachineBasicBlock *MBB,
MachineInstr *MI = nullptr) {
MachineBasicBlock::iterator MII = MI ? MI->getIterator() : MBB->begin();
for (auto E = MBB->end(); MII != E; ++MII) {
if (isAMXInstruction(MII)) {
HasAMX = true;
if (LastCall)
CfgNeedInsert.insert(LastCall);
} else if (MII->isCall()) {
LastCall = &*MII;
if (!HasAMX)
HasCallBeforeAMX = true;
}
}
}
};
static void reloadTileConfig(MachineInstr *MI, int FI,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
SmallSet<MachineInstr *, 8> CfgNeedInsert;
SmallVector<MachineBasicBlock *, 8> WorkList;
DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;
MachineBasicBlock *MBB = MI->getParent();
BBVisitedInfo[MBB] = BBInfo(CfgNeedInsert, MBB, MI);
WorkList.push_back(MBB);
while (!WorkList.empty()) {
MBB = WorkList.pop_back_val();
for (auto I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
if (!BBVisitedInfo.count(*I)) {
BBVisitedInfo[*I] = BBInfo(CfgNeedInsert, *I);
WorkList.push_back(*I);
}
}
}
WorkList.clear();
for (auto I : BBVisitedInfo) {
WorkList.push_back(I.first);
while (!WorkList.empty()) {
MBB = WorkList.pop_back_val();
if (BBVisitedInfo[MBB].HasCallBeforeAMX ||
(!BBVisitedInfo[MBB].HasAMX &&
!BBVisitedInfo[MBB].HasAMXBeforeCallInSuccs))
continue;
for (auto I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
if (!BBVisitedInfo.count(*I) ||
BBVisitedInfo[*I].HasAMXBeforeCallInSuccs)
continue;
if (BBVisitedInfo[*I].LastCall)
CfgNeedInsert.insert(BBVisitedInfo[*I].LastCall);
BBVisitedInfo[*I].HasAMXBeforeCallInSuccs = true;
WorkList.push_back(*I);
}
}
}
for (auto *I : CfgNeedInsert) {
BitVector UsableRegs(TRI->getNumRegs());
const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
for (unsigned J = 0; J < RC->getNumRegs(); J++)
UsableRegs.set(X86::TMM0 + J);
for (MachineOperand &CallMO : I->operands()) {
if (CallMO.isRegMask())
UsableRegs.clearBitsInMask(CallMO.getRegMask());
}
if (!UsableRegs.none())
addFrameReference(BuildMI(*I->getParent(), ++I->getIterator(), DebugLoc(),
TII->get(X86::LDTILECFG)),
FI);
}
}
@ -255,8 +322,8 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
unsigned Size = ST->getTileConfigSize();
Align Alignment = ST->getTileConfigAlignment();
int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
addTileCFGUse(mf, CFG);
buildConfigMI(MI, SS, TII, MRI, ST);
reloadTileConfig(MI, SS, TII, TRI);
return true;
}

View File

@ -639,8 +639,3 @@ def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [x86amx], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
let CopyCost = -1; // Don't allow copying of tile config registers.
let isAllocatable = 1;
let Size = 512;
}

View File

@ -22,6 +22,7 @@
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@ -130,13 +131,14 @@ static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
}
MachineInstr *X86TileConfig::getTileConfigPoint() {
for (MachineBasicBlock &MBB : *MF) {
// Traverse the basic block.
for (MachineInstr &MI : MBB)
MachineBasicBlock *Entry = &*MF->begin();
ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry);
for (MachineBasicBlock *MBB : RPOT) {
for (MachineInstr &MI : *MBB)
// Refer X86PreTileConfig.cpp.
// We only support one tile config for now.
if (MI.getOpcode() == X86::PLDTILECFG)
// We only support one tile config for now. The other ldtilecfg
// is for spill purpose and is dominated by the first ldtilecfg.
if (MI.getOpcode() == X86::LDTILECFG)
return &MI;
}
@ -148,7 +150,7 @@ void X86TileConfig::tileConfig() {
if (!MI)
return;
MachineBasicBlock *MBB = MI->getParent();
int SS = MI->getOperand(1).getIndex();
int SS = MI->getOperand(0).getIndex();
BitVector PhysRegs(TRI->getNumRegs());
// Fill in the palette first.

View File

@ -1,10 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
@buf = dso_local global [3072 x i8] zeroinitializer, align 64
define internal void @foo() {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
;
; IPRA-LABEL: foo:
; IPRA: # %bb.0: # %entry
; IPRA-NEXT: retq
entry:
ret void
}
define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test_api:
; CHECK: # %bb.0:
@ -25,7 +36,6 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %r14d
; CHECK-NEXT: movw $8, %r15w
@ -36,11 +46,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf+2048, %eax
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
@ -55,16 +64,204 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
;
; IPRA-LABEL: test_api:
; IPRA: # %bb.0:
; IPRA-NEXT: subq $72, %rsp
; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT: movl $buf, %eax
; IPRA-NEXT: movl $32, %ecx
; IPRA-NEXT: movw $8, %dx
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
; IPRA-NEXT: movl $buf+1024, %eax
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1
; IPRA-NEXT: callq foo
; IPRA-NEXT: movl $buf+2048, %eax
; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2
; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx)
; IPRA-NEXT: addq $72, %rsp
; IPRA-NEXT: tilerelease
; IPRA-NEXT: vzeroupper
; IPRA-NEXT: retq
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
tail call void (...) @foo()
call void @foo()
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
ret void
}
declare dso_local void @foo(...)
define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-LABEL: test_loop:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $3016, %rsp # imm = 0xBC8
; CHECK-NEXT: movl %edi, %r14d
; CHECK-NEXT: callq foo
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: testl %r14d, %r14d
; CHECK-NEXT: jg .LBB2_4
; CHECK-NEXT: # %bb.1: # %.preheader
; CHECK-NEXT: movl $7, %ebp
; CHECK-NEXT: movl $buf, %r15d
; CHECK-NEXT: movl $32, %r12d
; CHECK-NEXT: movw $8, %bx
; CHECK-NEXT: movl $buf+2048, %r13d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
; CHECK-NEXT: tilestored %tmm0, (%r13,%r12)
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: decl %ebp
; CHECK-NEXT: cmpl $7, %ebp
; CHECK-NEXT: jne .LBB2_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: cmpl $3, %r14d
; CHECK-NEXT: jne .LBB2_4
; CHECK-NEXT: # %bb.6:
; CHECK-NEXT: testl %ebp, %ebp
; CHECK-NEXT: jne .LBB2_5
; CHECK-NEXT: # %bb.7:
; CHECK-NEXT: incl %r14d
; CHECK-NEXT: jmp .LBB2_8
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $32, %eax
; CHECK-NEXT: movl $buf+1024, %ecx
; CHECK-NEXT: movw $8, %dx
; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rcx,%rax)
; CHECK-NEXT: .LBB2_5:
; CHECK-NEXT: decl %r14d
; CHECK-NEXT: .LBB2_8:
; CHECK-NEXT: movl %r14d, %eax
; CHECK-NEXT: addq $3016, %rsp # imm = 0xBC8
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
;
; IPRA-LABEL: test_loop:
; IPRA: # %bb.0:
; IPRA-NEXT: subq $72, %rsp
; IPRA-NEXT: movl %edi, %eax
; IPRA-NEXT: callq foo
; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT: testl %edi, %edi
; IPRA-NEXT: jg .LBB2_4
; IPRA-NEXT: # %bb.1: # %.preheader
; IPRA-NEXT: movl $7, %ecx
; IPRA-NEXT: movl $buf, %r8d
; IPRA-NEXT: movl $32, %esi
; IPRA-NEXT: movw $8, %di
; IPRA-NEXT: movl $buf+2048, %edx
; IPRA-NEXT: .p2align 4, 0x90
; IPRA-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
; IPRA-NEXT: tileloadd (%r8,%rsi), %tmm0
; IPRA-NEXT: callq foo
; IPRA-NEXT: tilestored %tmm0, (%rdx,%rsi)
; IPRA-NEXT: callq foo
; IPRA-NEXT: decl %ecx
; IPRA-NEXT: cmpl $7, %ecx
; IPRA-NEXT: jne .LBB2_2
; IPRA-NEXT: # %bb.3:
; IPRA-NEXT: cmpl $3, %eax
; IPRA-NEXT: jne .LBB2_4
; IPRA-NEXT: # %bb.6:
; IPRA-NEXT: testl %ecx, %ecx
; IPRA-NEXT: jne .LBB2_5
; IPRA-NEXT: # %bb.7:
; IPRA-NEXT: incl %eax
; IPRA-NEXT: jmp .LBB2_8
; IPRA-NEXT: .LBB2_4:
; IPRA-NEXT: callq foo
; IPRA-NEXT: movl $32, %ecx
; IPRA-NEXT: movl $buf+1024, %edx
; IPRA-NEXT: movw $8, %si
; IPRA-NEXT: tileloadd (%rdx,%rcx), %tmm0
; IPRA-NEXT: tilestored %tmm0, (%rdx,%rcx)
; IPRA-NEXT: .LBB2_5:
; IPRA-NEXT: decl %eax
; IPRA-NEXT: .LBB2_8:
; IPRA-NEXT: addq $72, %rsp
; IPRA-NEXT: tilerelease
; IPRA-NEXT: vzeroupper
; IPRA-NEXT: retq
call void @foo()
br label %2
2:
%3 = icmp sgt i32 %0, 0
br i1 %3, label %11, label %6
4:
%5 = icmp eq i32 %0, 3
br i1 %5, label %13, label %11
6:
%7 = phi i32 [ %9, %6 ], [ 0, %2 ]
%8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
call void @foo()
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %8)
call void @foo()
%9 = add i32 %7, 1
%10 = icmp eq i32 %9, 0
br i1 %10, label %4, label %6
11:
call void @foo()
%12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32, x86_amx %12)
br label %17
13:
%14 = icmp eq i32 %9, 7
br i1 %14, label %15, label %17
15:
%16 = add i32 %0, 1
br label %19
17:
%18 = sub i32 %0, 1
br label %19
19:
%20 = phi i32 [ %16, %15 ], [ %18, %17 ]
ret i32 %20
}
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)

View File

@ -5,6 +5,7 @@ define void @test_amx() {
; CHECK-LABEL: test_amx:
; CHECK: # %bb.0:
; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
ret void

View File

@ -0,0 +1,123 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
@buf = dso_local global [3072 x i8] zeroinitializer, align 16
define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: movw $8, %dx
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1
; CHECK-NEXT: movl $buf+2048, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: jmp foo # TAILCALL
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
tail call void @foo()
ret void
}
define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test2:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %if.true
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: movl $buf+1024, %edx
; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1
; CHECK-NEXT: movl $buf+2048, %edx
; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx)
; CHECK-NEXT: jmp .LBB1_2
; CHECK-NEXT: .LBB1_3: # %if.false
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: movw $8, %dx
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4
; CHECK-NEXT: movl $buf+2048, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3
; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx)
; CHECK-NEXT: .LBB1_2: # %if.true
; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
call void @foo()
br i1 undef, label %if.true, label %if.false
if.true:
%t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
%t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
%t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
%t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
br label %exit
if.false:
%t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
%t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
%t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
%t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
br label %exit
exit:
ret void
}
declare dso_local void @foo() nounwind
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

View File

@ -36,11 +36,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movw $8, %cx
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.1: # %if.true
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movw $8, %cx
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
@ -52,11 +51,13 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2: # %if.false
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movw $8, %cx
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3
@ -68,7 +69,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
; CHECK-NEXT: tilestored %tmm6, (%r15,%r14)
@ -139,7 +140,6 @@ define dso_local void @test3(i8 *%buf) nounwind {
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movl $32, %r14d
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@ -149,7 +149,7 @@ define dso_local void @test3(i8 *%buf) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2