[X86][AMX] Hoist ldtilecfg
The previous code inserted the first ldtilecfg at the point dominating all AMX registers' defs, which could place the ldtilecfg inside a loop. This patch instead tries to calculate the nearest points that are reachable from all shape definitions of the AMX registers, and inserts ldtilecfg there.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D99010
This commit is contained in: parent 6c0a1ed3a9 · commit 4cbaaf4a24
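To make the change easier to follow, here is a minimal standalone sketch of the reachability idea (plain C++, not the pass itself; the CFG, block numbering, and shape-def placement are invented for illustration). For every block it counts how many shape-defining blocks reach it without following loop back edges; only a block reached by all of them may host ldtilecfg, which is what keeps the configuration out of loops and ahead of every AMX instruction.

// Toy model of the insertion-point search in this patch (not LLVM code).
// Edges marked as back edges are skipped, mirroring how the pass refuses
// to propagate shape reachability around loops.
#include <cstdio>
#include <set>
#include <utility>
#include <vector>

int main() {
  // Hypothetical CFG: 0 -> 1 -> 2 -> 1 (back edge), 2 -> 3.
  std::vector<std::vector<int>> Succs = {{1}, {2}, {1, 3}, {}};
  std::set<std::pair<int, int>> BackEdges = {{2, 1}};
  std::vector<int> ShapeBBs = {0, 1}; // blocks containing shape defs

  // Reached[B] = how many shape-def blocks reach block B.
  std::vector<unsigned> Reached(Succs.size(), 0);
  for (int S : ShapeBBs) {
    std::set<int> Visited;
    std::vector<int> WorkList = {S};
    while (!WorkList.empty()) {
      int B = WorkList.back();
      WorkList.pop_back();
      ++Reached[B];
      for (int Succ : Succs[B])
        if (Visited.insert(Succ).second && !BackEdges.count({B, Succ}))
          WorkList.push_back(Succ);
    }
  }

  // A block may host ldtilecfg only if every shape block reaches it;
  // here that is blocks 1, 2 and 3 (block 0 precedes a shape def).
  for (size_t B = 0; B < Succs.size(); ++B)
    if (Reached[B] == ShapeBBs.size())
      std::printf("block %zu can host ldtilecfg\n", B);
}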
@@ -6,31 +6,20 @@
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to pre-config the shape of AMX register
/// AMX register need to be configured before use. The shape of AMX register
/// is encoded in the 1st and 2nd machine operand of AMX pseudo instructions.
/// The pldtilecfg is to config tile registers. It should dominator all AMX
/// instructions. The pldtilecfg produce a virtual cfg register and the cfg
/// register is used by all AMX instructions.
/// This pass is to find the common dominator of all AMX instructions and
/// insert the pldtilecfg instruction. Besides the cfg register that pldtilecfg
/// produces is inserted as the last operand of each AMX instruction. We use
/// this scheme to model the def-use relationship between AMX config instruction
/// and other AMX instructions. Below is an example.
/// \file Pass to pre-config the shapes of AMX registers
/// AMX register needs to be configured before use. The shapes of AMX register
/// are encoded in the 1st and 2nd machine operand of AMX pseudo instructions.
///
///                  ----B1----
///                 /          \
///                /            \
///              B2              B3
///   %1:tile = PTILELOADDV   %2:tile = PTILELOADDV
/// The instruction ldtilecfg is used to config the shapes. It must be reachable
/// for all variable shapes. ldtilecfg will be inserted more than once if we
/// cannot find a dominating point for all AMX instructions.
///
/// is transformed to
/// The configure register is caller saved according to ABI. We need to insert
/// ldtilecfg again after the call instruction if callee clobbers any AMX
/// registers.
///
///                        B1
///            %25:tilecfg = PLDTILECFG
///                 /          \
///                /            \
///   %1:tile = PTILELOADDV %25   %2:tile = PTILELOADDV %25
/// This pass calculates all points that ldtilecfg need to be inserted to and
/// insert them. It reports error if the reachability conditions aren't met.
//
//===----------------------------------------------------------------------===//

@@ -38,32 +27,107 @@
#include "X86InstrBuilder.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "tile-pre-config"
#define ASSERT_VALID_COMPARE                                                   \
  assert((!MBB || !RHS.MBB || MBB == RHS.MBB) &&                               \
         "Cannot compare between different BBs");
#define REPORT_CONFIG_FAIL                                                     \
  report_fatal_error(                                                          \
      MF.getName() +                                                           \
      ": Failed to config tile register, please define the shape earlier");

namespace {

class X86PreTileConfig : public MachineFunctionPass {
  // context
  MachineFunction *MF = nullptr;
  const X86Subtarget *ST = nullptr;
  const TargetRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  MachineDominatorTree *DomTree = nullptr;
  MachineRegisterInfo *MRI = nullptr;
struct MIRef {
  MachineInstr *MI = nullptr;
  MachineBasicBlock *MBB = nullptr;
  // A virtual position for instruction that will be inserted after MI.
  size_t Pos = 0;
  MIRef() = default;
  MIRef(MachineBasicBlock *MBB) : MBB(MBB) {
    for (auto I = MBB->begin(), E = MBB->end(); I != E && I->isPHI();
         ++I, ++Pos)
      MI = &*I;
  }
  MIRef(MachineInstr *MI, MachineBasicBlock *MBB)
      : MI(MI), MBB(MBB),
        Pos(std::distance(MBB->instr_begin(), ++MI->getIterator())) {}
  MIRef(MachineInstr *MI, MachineBasicBlock *MBB, size_t Pos)
      : MI(MI), MBB(MBB), Pos(Pos) {}
  operator bool() const { return MBB != nullptr; }
  bool operator==(const MIRef &RHS) const {
    return MI == RHS.MI && MBB == RHS.MBB;
  }
  bool operator<(const MIRef &RHS) const {
    ASSERT_VALID_COMPARE;
    return Pos < RHS.Pos;
  }
  bool operator>(const MIRef &RHS) const {
    ASSERT_VALID_COMPARE;
    return Pos > RHS.Pos;
  }
};

  MachineInstr *getTileConfigPoint();
struct BBInfo {
  MIRef FirstAMX;
  MIRef LastCall;
  MIRef LastShape;
  bool NeedTileCfgLiveIn = false;
  unsigned ShapeReachedCount = 0;
};

class X86PreTileConfig : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const MachineLoopInfo *MLI;
  SmallSet<MachineInstr *, 8> DefVisited;
  SmallSet<MachineBasicBlock *, 8> ShapeBBs;
  DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;

  /// Check if the callee will clobber AMX registers.
  bool isDestructiveCall(MachineInstr &MI, BitVector UsableRegs) {
    auto Iter = llvm::find_if(
        MI.operands(), [](MachineOperand &MO) { return MO.isRegMask(); });
    if (Iter == MI.operands_end())
      return false;
    UsableRegs.clearBitsInMask(Iter->getRegMask());
    return !UsableRegs.none();
  }

  /// Check if MI is AMX pseudo instruction.
  bool isAMXInstruction(MachineInstr &MI) {
    if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3)
      return false;
    MachineOperand &MO = MI.getOperand(0);
    // We can simply check if it is AMX instruction by its def.
    // But we should exclude old API which uses physical registers.
    if (MO.isReg() && MO.getReg().isVirtual() &&
        MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) {
      collectShapeInfo(MI);
      return true;
    }
    // PTILESTOREDV is the only exception that doesn't def a AMX register.
    return MI.getOpcode() == X86::PTILESTOREDV;
  }

  /// Check if it is an edge from loop bottom to loop head.
  bool isLoopBackEdge(MachineBasicBlock *Header, MachineBasicBlock *Bottom) {
    return MLI->isLoopHeader(Header) &&
           MLI->getLoopFor(Header)->getBottomBlock() == Bottom;
  }

  /// Collect the shape def information for later use.
  void collectShapeInfo(MachineInstr &MI);

public:
  X86PreTileConfig() : MachineFunctionPass(ID) {}
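A note on the isDestructiveCall check above: in a call's register-mask operand a set bit means the register is preserved across the call, so clearing the masked bits out of the AMX set leaves exactly the clobbered registers. Below is a standalone sketch of that logic (plain C++ over a hypothetical 8-register machine; this is not LLVM's BitVector nor the real regmask encoding):

#include <bitset>
#include <cstdio>

// A call "clobbers AMX" if at least one AMX register is NOT preserved by
// the callee, i.e. survives the clear below.
bool isDestructiveCall(std::bitset<8> AMXRegs, std::bitset<8> Preserved) {
  AMXRegs &= ~Preserved; // clearBitsInMask: drop registers the callee keeps
  return AMXRegs.any();  // anything left was clobbered by the call
}

int main() {
  std::bitset<8> AMXRegs("00001111"); // hypothetical TMM0-TMM3
  std::bitset<8> PreservesAll("11111111");
  std::bitset<8> PreservesNone("00000000");
  std::printf("%d %d\n", isDestructiveCall(AMXRegs, PreservesAll),  // 0
              isDestructiveCall(AMXRegs, PreservesNone));           // 1
}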
@@ -74,10 +138,21 @@ public:
  }

  /// X86PreTileConfig analysis usage.
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  /// Perform register allocation.
  bool runOnMachineFunction(MachineFunction &mf) override;
  /// Clear MF related structures.
  void releaseMemory() override {
    ShapeBBs.clear();
    DefVisited.clear();
    BBVisitedInfo.clear();
  }

  /// Perform ldtilecfg instructions inserting.
  bool runOnMachineFunction(MachineFunction &MF) override;

  static char ID;
};

@@ -88,284 +163,199 @@ char X86PreTileConfig::ID = 0;

INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
                      "Tile Register Pre-configure", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                    "Tile Register Pre-configure", false, false)

void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequired<MachineDominatorTree>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
  auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
    MIRef MIR(MI, MBB);
    if (BBVisitedInfo[MBB].LastShape < MIR)
      BBVisitedInfo[MBB].LastShape = MIR;
    ShapeBBs.insert(MBB);
  };

static void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
                          const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
                          const X86Subtarget *ST) {
  auto *MBB = MI->getParent();

  // Zero stack slot.
  if (ST->hasAVX512()) {
    Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
        .addReg(Zmm, RegState::Undef)
        .addReg(Zmm, RegState::Undef);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
                      FrameIdx)
        .addReg(Zmm);
  } else if (ST->hasAVX2()) {
    Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORYrr), Ymm)
        .addReg(Ymm, RegState::Undef)
        .addReg(Ymm, RegState::Undef);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSYmr)),
                      FrameIdx)
        .addReg(Ymm);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSYmr)),
                      FrameIdx, 32)
        .addReg(Ymm);
  } else {
    assert(ST->hasSSE2() && "AMX should assume SSE2 enabled");
    Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PXORrr), Xmm)
        .addReg(Xmm, RegState::Undef)
        .addReg(Xmm, RegState::Undef);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
                      FrameIdx)
        .addReg(Xmm);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
                      FrameIdx, 16)
        .addReg(Xmm);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
                      FrameIdx, 32)
        .addReg(Xmm);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
                      FrameIdx, 48)
        .addReg(Xmm);
  }

  // build psuedo ldtilecfg
  addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
                    FrameIdx);
}

static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected machine instruction on tile");
  case X86::PTILELOADDV:
  case X86::PTDPBSSDV:
  case X86::PTDPBSUDV:
  case X86::PTDPBUSDV:
  case X86::PTDPBUUDV:
  case X86::PTILEZEROV:
  case X86::PTDPBF16PSV:
    MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
    MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
    ShapeT Shape(&MO1, &MO2, MRI);
    return Shape;
  SmallVector<Register, 8> WorkList(
      {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
  while (!WorkList.empty()) {
    Register R = WorkList.pop_back_val();
    MachineInstr *DefMI = MRI->getVRegDef(R);
    MachineBasicBlock *DefMBB = DefMI->getParent();
    if (!DefMI || DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
      continue;
    if (DefMI->isPHI()) {
      for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
        if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
          RecordShape(DefMI, DefMBB); // In this case, PHI is also a shape def.
        else
          WorkList.push_back(DefMI->getOperand(I).getReg());
    } else {
      RecordShape(DefMI, DefMBB);
    }
  }
}

MachineInstr *X86PreTileConfig::getTileConfigPoint() {
  DenseMap<Register, ShapeT> PhysShapeInfo;
  MachineBasicBlock *MBB = nullptr;
  DenseSet<const MachineInstr *> MIs;
  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
    Register VirtReg = Register::index2VirtReg(i);
    if (MRI->reg_nodbg_empty(VirtReg))
      continue;
    const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
    if (RC.getID() != X86::TILERegClassID)
      continue;
bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  const TargetInstrInfo *TII = ST.getInstrInfo();
  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);

    // Find the common dominator for all MI that define tile register.
    for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
      if (MO.isUndef())
        continue;
      const auto *MI = MO.getParent();
      // PHI or IMPLICIT_DEF instructiion.
      // There must be a input tile before PHI instruction.
      if (MI->isTransient())
        continue;
      if (!MBB)
        MBB = const_cast<MachineBasicBlock *>(MI->getParent());
      MBB = DomTree->findNearestCommonDominator(
          MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
  BitVector AMXRegs(TRI->getNumRegs());
  for (unsigned I = 0; I < RC->getNumRegs(); I++)
    AMXRegs.set(X86::TMM0 + I);

      // Collect the instructions that define shape.
      ShapeT Shape = getShape(*MI, MRI);
      std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
                                                  Shape.getCol()};
      for (auto *ShapeMO : ShapeMOs) {
        Register ShapeReg = ShapeMO->getReg();
        for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
          const auto *ShapeMI = MO.getParent();
          MIs.insert(ShapeMI);
  // Iterate MF to collect information.
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  SmallSet<MIRef, 8> CfgNeedInsert;
  SmallVector<MachineBasicBlock *, 8> CfgLiveInBBs;
  for (auto &MBB : MF) {
    size_t Pos = 0;
    for (auto &MI : MBB) {
      ++Pos;
      if (isAMXInstruction(MI)) {
        // If there's call before the AMX, we need to reload tile config.
        if (BBVisitedInfo[&MBB].LastCall)
          CfgNeedInsert.insert(BBVisitedInfo[&MBB].LastCall);
        else // Otherwise, we need tile config to live in this BB.
          BBVisitedInfo[&MBB].NeedTileCfgLiveIn = true;
        // Always record the first AMX in case there's shape def after it.
        if (!BBVisitedInfo[&MBB].FirstAMX)
          BBVisitedInfo[&MBB].FirstAMX = MIRef(&MI, &MBB, Pos);
      } else if (MI.isCall() && isDestructiveCall(MI, AMXRegs)) {
        // Record the call only if the callee clobbers all AMX registers.
        BBVisitedInfo[&MBB].LastCall = MIRef(&MI, &MBB, Pos);
      }
    }
    if (BBVisitedInfo[&MBB].NeedTileCfgLiveIn) {
      if (&MBB == &MF.front())
        CfgNeedInsert.insert(MIRef(&MBB));
      else
        CfgLiveInBBs.push_back(&MBB);
    }
  }

  // Update NeedTileCfgLiveIn for predecessors.
  while (!CfgLiveInBBs.empty()) {
    MachineBasicBlock *MBB = CfgLiveInBBs.pop_back_val();
    for (auto *Pred : MBB->predecessors()) {
      if (BBVisitedInfo[Pred].LastCall) {
        CfgNeedInsert.insert(BBVisitedInfo[Pred].LastCall);
      } else if (!BBVisitedInfo[Pred].NeedTileCfgLiveIn) {
        BBVisitedInfo[Pred].NeedTileCfgLiveIn = true;
        if (Pred == &MF.front())
          CfgNeedInsert.insert(MIRef(Pred));
        else
          CfgLiveInBBs.push_back(Pred);
      }
    }
  }

  // There's no AMX instruction if we didn't find a tile config live in point.
  if (CfgNeedInsert.empty())
    return false;

  // Calculate how many times the ShapeBB can reach to this BB.
  unsigned ShapeBBNum = 0;
  for (auto *MBB : ShapeBBs) {
    SmallSet<MachineBasicBlock *, 8> VistedBB;
    SmallVector<MachineBasicBlock *, 8> WorkList({MBB});
    while (!WorkList.empty()) {
      MachineBasicBlock *MBB = WorkList.pop_back_val();
      ++BBVisitedInfo[MBB].ShapeReachedCount;
      for (auto *Succ : MBB->successors())
        if (VistedBB.insert(Succ).second && !isLoopBackEdge(Succ, MBB))
          WorkList.push_back(Succ);
    }
    ++ShapeBBNum;
  }

  DebugLoc DL;
  SmallSet<MIRef, 8> VisitedOrInserted;
  int SS = MF.getFrameInfo().CreateStackObject(
      ST.getTileConfigSize(), ST.getTileConfigAlignment(), false);

  // Try to insert for the tile config live in points.
  for (auto I : CfgNeedInsert) {
    SmallSet<MIRef, 8> InsertPoints;
    SmallVector<MIRef, 8> WorkList({I});
    while (!WorkList.empty()) {
      MIRef I = WorkList.pop_back_val();
      if (!VisitedOrInserted.count(I)) {
        if (BBVisitedInfo[I.MBB].ShapeReachedCount == ShapeBBNum) {
          // If the BB is all shapes reachable, stop sink and try to insert.
          InsertPoints.insert(I);
        } else {
          // Avoid the BB to be multi visited.
          VisitedOrInserted.insert(I);
          // We cannot sink it across any AMX instruction.
          if (BBVisitedInfo[I.MBB].FirstAMX)
            REPORT_CONFIG_FAIL;
          // Sink the inserting point along the chain with NeedTileCfgLiveIn =
          // true when MBB isn't all shapes reachable.
          for (auto *Succ : I.MBB->successors())
            if (BBVisitedInfo[Succ].NeedTileCfgLiveIn)
              WorkList.push_back(MIRef(Succ));
        }
      }
    }
  }
  if (!MBB)
    return nullptr;
  // This pass is before the pass of eliminating PHI node, so it
  // is in SSA form.
  assert(MRI->isSSA() && "Not SSA form in pre-tile config");
  // Shape def should dominate tile config MBB.
  //    def s      s1    s2
  //     / \        \   /
  //    /   \        \ /
  //  conf        s3=phi(s1,s2)
  //                   |
  //                   c
  //
  for (const auto *MI : MIs) {
    const MachineBasicBlock *ShapeMBB = MI->getParent();
    if (DomTree->dominates(ShapeMBB, MBB))
      continue;
    if (MI->isMoveImmediate())
      continue;
    report_fatal_error(MF->getName() + ": Failed to config tile register, "
                                       "please define the shape earlier");
  }

  // ldtilecfg should be inserted after the MI that define the shape.
  MachineBasicBlock::reverse_instr_iterator I, E;
  for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
    auto *MI = &*I;
    if (MIs.count(MI) && (!MI->isMoveImmediate()))
      break;
  }
  MachineBasicBlock::iterator MII;
  if (I == E)
    MII = MBB->getFirstNonPHI();
  else {
    MII = MachineBasicBlock::iterator(&*I);
    MII++;
  }
  return &*MII;
}

static bool isAMXInstruction(MachineBasicBlock::iterator MII) {
  switch (MII->getOpcode()) {
  default:
    return false;
  case X86::PTILELOADDV:
  case X86::PTILESTOREDV:
  case X86::PTDPBSSDV:
  case X86::PTDPBSUDV:
  case X86::PTDPBUSDV:
  case X86::PTDPBUUDV:
  case X86::PTILEZEROV:
  case X86::PTDPBF16PSV:
    return true;
  }
}

struct BBInfo {
  bool HasAMX = false;
  bool HasCallBeforeAMX = false;
  bool HasAMXBeforeCallInSuccs = false;
  MachineInstr *LastCall = nullptr;

  BBInfo() = default;
  BBInfo(SmallSet<MachineInstr *, 8> &CfgNeedInsert, MachineBasicBlock *MBB,
         MachineInstr *MI = nullptr) {
    MachineBasicBlock::iterator MII = MI ? MI->getIterator() : MBB->begin();
    for (auto E = MBB->end(); MII != E; ++MII) {
      if (isAMXInstruction(MII)) {
        HasAMX = true;
        if (LastCall)
          CfgNeedInsert.insert(LastCall);
      } else if (MII->isCall()) {
        LastCall = &*MII;
        if (!HasAMX)
          HasCallBeforeAMX = true;
      }
    }
  }
};

static void reloadTileConfig(MachineInstr *MI, int FI,
                             const TargetInstrInfo *TII,
                             const TargetRegisterInfo *TRI) {
  SmallSet<MachineInstr *, 8> CfgNeedInsert;
  SmallVector<MachineBasicBlock *, 8> WorkList;
  DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;

  MachineBasicBlock *MBB = MI->getParent();
  BBVisitedInfo[MBB] = BBInfo(CfgNeedInsert, MBB, MI);

  // The entry BB is special, since it always has a ldtilecfg before AMX
  // instruction. We don't need to check if its predecessor BBs have call.
  // FIXME: This case happens only when the entry BB is in a loop. We need to
  // hoist the first tile config point out of the loop in future.
  BBVisitedInfo[MBB].HasCallBeforeAMX = true;

  WorkList.push_back(MBB);
  while (!WorkList.empty()) {
    MBB = WorkList.pop_back_val();
    for (auto I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
      if (!BBVisitedInfo.count(*I)) {
        BBVisitedInfo[*I] = BBInfo(CfgNeedInsert, *I);
        WorkList.push_back(*I);
  // A given point might be forked due to shape conditions are not met.
  for (MIRef I : InsertPoints) {
    // Even MBB is all shapes reachable, we still need to check if there's
    // AMX that intersects with shapes in the same MBB.
    if (BBVisitedInfo[I.MBB].FirstAMX &&
        BBVisitedInfo[I.MBB].FirstAMX < BBVisitedInfo[I.MBB].LastShape)
      REPORT_CONFIG_FAIL;
    // Make sure we insert ldtilecfg after the last shape def in MBB.
    if (I < BBVisitedInfo[I.MBB].LastShape)
      I = BBVisitedInfo[I.MBB].LastShape;
    // There're chances the MBB is sunk more than once. Record it to avoid
    // multi insert.
    if (VisitedOrInserted.insert(I).second) {
      auto II = I.MI ? I.MI->getIterator() : I.MBB->instr_begin();
      addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::LDTILECFG)),
                        SS);
    }
  }
}

  WorkList.clear();
  for (auto I : BBVisitedInfo) {
    WorkList.push_back(I.first);
    while (!WorkList.empty()) {
      MBB = WorkList.pop_back_val();
      if (BBVisitedInfo[MBB].HasCallBeforeAMX ||
          (!BBVisitedInfo[MBB].HasAMX &&
           !BBVisitedInfo[MBB].HasAMXBeforeCallInSuccs))
        continue;
      for (auto I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
        if (!BBVisitedInfo.count(*I) ||
            BBVisitedInfo[*I].HasAMXBeforeCallInSuccs)
          continue;
        if (BBVisitedInfo[*I].LastCall)
          CfgNeedInsert.insert(BBVisitedInfo[*I].LastCall);
        BBVisitedInfo[*I].HasAMXBeforeCallInSuccs = true;
        WorkList.push_back(*I);
      }
    }
  // Zero stack slot.
  MachineBasicBlock &MBB = MF.front();
  MachineInstr *MI = &*MBB.begin();
  if (ST.hasAVX512()) {
    Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::VPXORDZrr), Zmm)
        .addReg(Zmm, RegState::Undef)
        .addReg(Zmm, RegState::Undef);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), SS)
        .addReg(Zmm);
  } else if (ST.hasAVX2()) {
    Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::VPXORYrr), Ymm)
        .addReg(Ymm, RegState::Undef)
        .addReg(Ymm, RegState::Undef);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS)
        .addReg(Ymm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS, 32)
        .addReg(Ymm);
  } else {
    assert(ST.hasSSE2() && "AMX should assume SSE2 enabled");
    Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::PXORrr), Xmm)
        .addReg(Xmm, RegState::Undef)
        .addReg(Xmm, RegState::Undef);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 16)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 32)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 48)
        .addReg(Xmm);
  }

  for (auto *I : CfgNeedInsert) {
    BitVector UsableRegs(TRI->getNumRegs());
    const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
    for (unsigned J = 0; J < RC->getNumRegs(); J++)
      UsableRegs.set(X86::TMM0 + J);
    for (MachineOperand &CallMO : I->operands()) {
      if (CallMO.isRegMask())
        UsableRegs.clearBitsInMask(CallMO.getRegMask());
    }
    if (!UsableRegs.none())
      addFrameReference(BuildMI(*I->getParent(), ++I->getIterator(), DebugLoc(),
                                TII->get(X86::LDTILECFG)),
                        FI);
  }
}

bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
  MF = &mf;
  MRI = &mf.getRegInfo();
  ST = &mf.getSubtarget<X86Subtarget>();
  TRI = ST->getRegisterInfo();
  TII = mf.getSubtarget().getInstrInfo();
  DomTree = &getAnalysis<MachineDominatorTree>();

  MachineInstr *MI = getTileConfigPoint();
  if (!MI)
    return false;
  unsigned Size = ST->getTileConfigSize();
  Align Alignment = ST->getTileConfigAlignment();
  int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
  buildConfigMI(MI, SS, TII, MRI, ST);
  reloadTileConfig(MI, SS, TII, TRI);
  return true;
}
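The movb $1 / movb / movw stores in the updated CHECK lines below fill the 64-byte buffer that ldtilecfg reads, which is why the pass zeroes the stack slot first. For reference, here is a sketch of that buffer as a C++ struct (field names are mine; the offsets follow the tile-configuration layout in Intel's documentation, matching $1 for the palette and the byte/word stores for tmm0's rows and column bytes):

#include <cstdint>

// 64-byte memory operand of LDTILECFG (Intel SDM tile-config layout).
struct TileConfig {
  uint8_t PaletteId;    // offset 0: the 'movb $1' in the tests
  uint8_t StartRow;     // offset 1: resume row for interrupted tile ops
  uint8_t Reserved[14]; // offsets 2-15: must be zero (hence the zeroing)
  uint16_t ColsB[16];   // offsets 16-47: bytes per row of tmm0..tmm15
  uint8_t Rows[16];     // offsets 48-63: rows of tmm0..tmm15
};
static_assert(sizeof(TileConfig) == 64, "ldtilecfg operand is 64 bytes");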
@@ -113,9 +113,10 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $3016, %rsp # imm = 0xBC8
; CHECK-NEXT:    movl %edi, %r14d
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
@@ -133,7 +134,6 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT:    tileloadd (%r15,%r12), %tmm0
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movabsq $64, %rax
@@ -154,7 +154,6 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT:    incl %r14d
; CHECK-NEXT:    jmp .LBB2_8
; CHECK-NEXT:  .LBB2_4:
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $32, %eax
@@ -180,13 +179,13 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; IPRA:       # %bb.0:
; IPRA-NEXT:    subq $72, %rsp
; IPRA-NEXT:    movl %edi, %eax
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; IPRA-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    testl %edi, %edi
; IPRA-NEXT:    jg .LBB2_4
; IPRA-NEXT:  # %bb.1: # %.preheader
@@ -273,26 +272,26 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $3024, %rsp # imm = 0xBD0
; CHECK-NEXT:    movl %edi, %ebx
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $buf, %r14d
; CHECK-NEXT:    movl $32, %r15d
; CHECK-NEXT:    movw $8, %bp
; CHECK-NEXT:    movl $buf+2048, %r12d
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    testl %ebx, %ebx
; CHECK-NEXT:    jle .LBB3_3
; CHECK-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    testl %ebx, %ebx
; CHECK-NEXT:    jle .LBB3_3
; CHECK-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT:    tileloadd (%r14,%r15), %tmm0
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movabsq $64, %rax
@@ -313,6 +312,12 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; IPRA-LABEL: test_loop2:
; IPRA:       # %bb.0:
; IPRA-NEXT:    subq $72, %rsp
; IPRA-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; IPRA-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movl $buf, %eax
; IPRA-NEXT:    movl $32, %ecx
; IPRA-NEXT:    movw $8, %dx
@@ -323,12 +328,6 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; IPRA-NEXT:    testl %edi, %edi
; IPRA-NEXT:    jle .LBB3_3
; IPRA-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; IPRA-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; IPRA-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm0
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    tilestored %tmm0, (%rsi,%rcx)

@@ -10,10 +10,10 @@
define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) {
; AVX512-LABEL: test_api:
; AVX512:       # %bb.0:
; AVX512-NEXT:    testl %edi, %edi
; AVX512-NEXT:    movsbl %sil, %eax
; AVX512-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    testl %edi, %edi
; AVX512-NEXT:    movsbl %sil, %eax
; AVX512-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
@@ -43,11 +43,11 @@ define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) {
;
; AVX2-LABEL: test_api:
; AVX2:       # %bb.0:
; AVX2-NEXT:    testl %edi, %edi
; AVX2-NEXT:    movsbl %sil, %eax
; AVX2-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    testl %edi, %edi
; AVX2-NEXT:    movsbl %sil, %eax
; AVX2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
@@ -77,13 +77,13 @@ define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) {
;
; SSE2-LABEL: test_api:
; SSE2:       # %bb.0:
; SSE2-NEXT:    testl %edi, %edi
; SSE2-NEXT:    movsbl %sil, %eax
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    testl %edi, %edi
; SSE2-NEXT:    movsbl %sil, %eax
; SSE2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movw %si, -{{[0-9]+}}(%rsp)

@@ -32,6 +32,7 @@ define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
  call void @llvm.dbg.value(metadata x86_amx %6, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
  tail call void @foo()
  ret void
@@ -47,6 +48,8 @@ define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT:    movl %edi, %ebp
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
@@ -59,9 +62,6 @@ define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB1_3
@@ -116,8 +116,252 @@ exit:
  ret void
}

define dso_local void @test3(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB2_2
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    jmp .LBB2_3
; CHECK-NEXT:  .LBB2_2: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:  .LBB2_3: # %exit
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br label %exit

if.false:
  %4 = sub i16 %0, 1
  br label %exit

exit:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
  ret void
}

; TODO: There's PRA Tile Register Configure bug needs to fix later.
define dso_local void @test4(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_4
; CHECK-NEXT:  .LBB3_2: # %amx2
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $32, %eax
; CHECK-NEXT:    movl $buf+1024, %ecx
; CHECK-NEXT:    tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rax)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
; CHECK-NEXT:  .LBB3_3: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_2
; CHECK-NEXT:  .LBB3_4: # %amx1
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br i1 undef, label %amx1, label %amx2

if.false:
  %4 = sub i16 %0, 1
  br i1 undef, label %amx2, label %amx1

amx1:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
  br label %exit

amx2:
  %7 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %7, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  tail call void @llvm.x86.tilestored64.internal(i16 %7, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %8)
  br label %exit

exit:
  ret void
}

define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movl $buf, %r8d
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    leal -1(%rsi), %ecx
; CHECK-NEXT:    jmp .LBB4_1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB4_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    movl %ecx, %esi
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    jne .LBB4_5
; CHECK-NEXT:  .LBB4_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB4_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%r8,%rdx)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    je .LBB4_1
; CHECK-NEXT:  .LBB4_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %2 = phi i16 [ %1, %entry ], [ %5, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %2)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %2, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %3)
  br label %loop.bb2

if.false:
  %4 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %5 = phi i16 [ %2, %if.true ], [ %4, %if.false ]
  %6 = icmp eq i16 %5, 7
  br i1 %6, label %loop.bb1, label %exit

exit:
  ret void
}

define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %r8d, %r8d
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    xorl %esi, %esi
; CHECK-NEXT:    jmp .LBB5_1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB5_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    decl %esi
; CHECK-NEXT:  .LBB5_4: # %loop.bb2
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    leal (%rdi,%rsi), %eax
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; CHECK-NEXT:    jne .LBB5_5
; CHECK-NEXT:  .LBB5_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    testb %r8b, %r8b
; CHECK-NEXT:    jne .LBB5_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    incl %esi
; CHECK-NEXT:    jmp .LBB5_4
; CHECK-NEXT:  .LBB5_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %1 = phi i16 [ 0, %entry ], [ %4, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %2 = add i16 %1, 1
  br label %loop.bb2

if.false:
  %3 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %4 = phi i16 [ %2, %if.true ], [ %3, %if.false ]
  %5 = icmp eq i16 %4, 7
  %6 = add i16 %0, %4
  %7 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %6)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %6, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %7)
  br i1 %5, label %loop.bb1, label %exit

exit:
  ret void
}

declare dso_local void @foo() nounwind
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!1}

!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !DIFile(filename: "1", directory: "1"))
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = distinct !DISubprogram(unit: !0)
!3 = !DILocation(line: 1, column: 1, scope: !2)

@@ -117,12 +117,12 @@
; CHECK-NEXT:       X86 EFLAGS copy lowering
; CHECK-NEXT:       X86 WinAlloca Expander
; CHECK-NEXT:       MachineDominator Tree Construction
; CHECK-NEXT:       Machine Natural Loop Construction
; CHECK-NEXT:       Tile Register Pre-configure
; CHECK-NEXT:       Detect Dead Lanes
; CHECK-NEXT:       Process Implicit Definitions
; CHECK-NEXT:       Remove unreachable machine basic blocks
; CHECK-NEXT:       Live Variable Analysis
; CHECK-NEXT:       Machine Natural Loop Construction
; CHECK-NEXT:       Eliminate PHI nodes for register allocation
; CHECK-NEXT:       Two-Address instruction pass
; CHECK-NEXT:       Slot index numbering