forked from OSchip/llvm-project
303 lines
12 KiB
C++
303 lines
12 KiB
C++
//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
|
|
//
|
|
// The LLVM Compiler Infrastructure
|
|
//
|
|
// This file is distributed under the University of Illinois Open Source
|
|
// License. See LICENSE.TXT for details.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "Uops.h"
|
|
|
|
#include "Assembler.h"
|
|
#include "BenchmarkRunner.h"
|
|
#include "MCInstrDescView.h"
|
|
#include "PerfHelper.h"
|
|
#include "Target.h"
|
|
|
|
// FIXME: Load constants into registers (e.g. with fld1) to not break
|
|
// instructions like x87.
|
|
|
|
// Ideally we would like the only limitation on executing uops to be the issue
|
|
// ports. Maximizing port pressure increases the likelihood that the load is
|
|
// distributed evenly across possible ports.
|
|
|
|
// To achieve that, one approach is to generate instructions that do not have
|
|
// data dependencies between them.
|
|
//
|
|
// For some instructions, this is trivial:
|
|
// mov rax, qword ptr [rsi]
|
|
// mov rax, qword ptr [rsi]
|
|
// mov rax, qword ptr [rsi]
|
|
// mov rax, qword ptr [rsi]
|
|
// For the above snippet, haswell just renames rax four times and executes the
|
|
// four instructions two at a time on P23 and P0126.
|
|
//
|
|
// For some instructions, we just need to make sure that the source is
|
|
// different from the destination. For example, IDIV8r reads from GPR and
|
|
// writes to AX. We just need to ensure that the Var is assigned a
|
|
// register which is different from AX:
|
|
// idiv bx
|
|
// idiv bx
|
|
// idiv bx
|
|
// idiv bx
|
|
// The above snippet will be able to fully saturate the ports, while the same
|
|
// with ax would issue one uop every `latency(IDIV8r)` cycles.
|
|
//
|
|
// Some instructions make this harder because they both read and write from
|
|
// the same register:
|
|
// inc rax
|
|
// inc rax
|
|
// inc rax
|
|
// inc rax
|
|
// This has a data dependency from each instruction to the next, limit the
|
|
// number of instructions that can be issued in parallel.
|
|
// It turns out that this is not a big issue on recent Intel CPUs because they
|
|
// have heuristics to balance port pressure. In the snippet above, subsequent
|
|
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
|
|
// might end up executing them all on P0 (just because they can), or try
|
|
// avoiding P5 because it's usually under high pressure from vector
|
|
// instructions.
|
|
// This issue is even more important for high-latency instructions because
|
|
// they increase the idle time of the CPU, e.g. :
|
|
// imul rax, rbx
|
|
// imul rax, rbx
|
|
// imul rax, rbx
|
|
// imul rax, rbx
|
|
//
|
|
// To avoid that, we do the renaming statically by generating as many
|
|
// independent exclusive assignments as possible (until all possible registers
|
|
// are exhausted) e.g.:
|
|
// imul rax, rbx
|
|
// imul rcx, rbx
|
|
// imul rdx, rbx
|
|
// imul r8, rbx
|
|
//
|
|
// Some instruction even make the above static renaming impossible because
|
|
// they implicitly read and write from the same operand, e.g. ADC16rr reads
|
|
// and writes from EFLAGS.
|
|
// In that case we just use a greedy register assignment and hope for the
|
|
// best.
|
|
|
|
namespace exegesis {
|
|
|
|
static bool hasUnknownOperand(const llvm::MCOperandInfo &OpInfo) {
|
|
return OpInfo.OperandType == llvm::MCOI::OPERAND_UNKNOWN;
|
|
}
|
|
|
|
llvm::Error
|
|
UopsSnippetGenerator::isInfeasible(const llvm::MCInstrDesc &MCInstrDesc) const {
|
|
if (llvm::any_of(MCInstrDesc.operands(), hasUnknownOperand))
|
|
return llvm::make_error<BenchmarkFailure>(
|
|
"Infeasible : has unknown operands");
|
|
return llvm::Error::success();
|
|
}
|
|
|
|
// Returns whether this Variable ties Use and Def operands together.
|
|
static bool hasTiedOperands(const Instruction &Instr, const Variable &Var) {
|
|
bool HasUse = false;
|
|
bool HasDef = false;
|
|
for (const unsigned OpIndex : Var.TiedOperands) {
|
|
const Operand &Op = Instr.Operands[OpIndex];
|
|
if (Op.IsDef)
|
|
HasDef = true;
|
|
else
|
|
HasUse = true;
|
|
}
|
|
return HasUse && HasDef;
|
|
}
|
|
|
|
static llvm::SmallVector<const Variable *, 8>
|
|
getTiedVariables(const Instruction &Instr) {
|
|
llvm::SmallVector<const Variable *, 8> Result;
|
|
for (const auto &Var : Instr.Variables)
|
|
if (hasTiedOperands(Instr, Var))
|
|
Result.push_back(&Var);
|
|
return Result;
|
|
}
|
|
|
|
static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
|
|
assert(a.size() == b.size());
|
|
for (auto I : b.set_bits())
|
|
a.reset(I);
|
|
}
|
|
|
|
UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
|
|
UopsSnippetGenerator::~UopsSnippetGenerator() = default;
|
|
|
|
void UopsSnippetGenerator::instantiateMemoryOperands(
|
|
const unsigned ScratchSpacePointerInReg,
|
|
std::vector<InstructionTemplate> &Instructions) const {
|
|
if (ScratchSpacePointerInReg == 0)
|
|
return; // no memory operands.
|
|
const auto &ET = State.getExegesisTarget();
|
|
const unsigned MemStep = ET.getMaxMemoryAccessSize();
|
|
const size_t OriginalInstructionsSize = Instructions.size();
|
|
size_t I = 0;
|
|
for (InstructionTemplate &IT : Instructions) {
|
|
ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
|
|
++I;
|
|
}
|
|
|
|
while (Instructions.size() < kMinNumDifferentAddresses) {
|
|
InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
|
|
ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
|
|
++I;
|
|
Instructions.push_back(std::move(IT));
|
|
}
|
|
assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
|
|
"not enough scratch space");
|
|
}
|
|
|
|
llvm::Expected<CodeTemplate>
|
|
UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
|
|
const auto &InstrDesc = State.getInstrInfo().get(Opcode);
|
|
if (auto E = isInfeasible(InstrDesc))
|
|
return std::move(E);
|
|
const Instruction Instr(InstrDesc, RATC);
|
|
const auto &ET = State.getExegesisTarget();
|
|
CodeTemplate CT;
|
|
|
|
const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
|
|
if (Instr.hasMemoryOperands()) {
|
|
CT.ScratchSpacePointerInReg =
|
|
ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
|
|
if (CT.ScratchSpacePointerInReg == 0)
|
|
return llvm::make_error<BenchmarkFailure>(
|
|
"Infeasible : target does not support memory instructions");
|
|
ScratchSpaceAliasedRegs =
|
|
&RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
|
|
// If the instruction implicitly writes to ScratchSpacePointerInReg , abort.
|
|
// FIXME: We could make a copy of the scratch register.
|
|
for (const auto &Op : Instr.Operands) {
|
|
if (Op.IsDef && Op.ImplicitReg &&
|
|
ScratchSpaceAliasedRegs->test(*Op.ImplicitReg))
|
|
return llvm::make_error<BenchmarkFailure>(
|
|
"Infeasible : memory instruction uses scratch memory register");
|
|
}
|
|
}
|
|
|
|
const AliasingConfigurations SelfAliasing(Instr, Instr);
|
|
InstructionTemplate IT(Instr);
|
|
if (SelfAliasing.empty()) {
|
|
CT.Info = "instruction is parallel, repeating a random one.";
|
|
CT.Instructions.push_back(std::move(IT));
|
|
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
|
|
return std::move(CT);
|
|
}
|
|
if (SelfAliasing.hasImplicitAliasing()) {
|
|
CT.Info = "instruction is serial, repeating a random one.";
|
|
CT.Instructions.push_back(std::move(IT));
|
|
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
|
|
return std::move(CT);
|
|
}
|
|
const auto TiedVariables = getTiedVariables(Instr);
|
|
if (!TiedVariables.empty()) {
|
|
if (TiedVariables.size() > 1)
|
|
return llvm::make_error<llvm::StringError>(
|
|
"Infeasible : don't know how to handle several tied variables",
|
|
llvm::inconvertibleErrorCode());
|
|
const Variable *Var = TiedVariables.front();
|
|
assert(Var);
|
|
assert(!Var->TiedOperands.empty());
|
|
const Operand &Op = Instr.Operands[Var->TiedOperands.front()];
|
|
assert(Op.Tracker);
|
|
CT.Info = "instruction has tied variables using static renaming.";
|
|
for (const llvm::MCPhysReg Reg : Op.Tracker->sourceBits().set_bits()) {
|
|
if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
|
|
continue; // Do not use the scratch memory address register.
|
|
InstructionTemplate TmpIT = IT;
|
|
TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
|
|
CT.Instructions.push_back(std::move(TmpIT));
|
|
}
|
|
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
|
|
return std::move(CT);
|
|
}
|
|
// No tied variables, we pick random values for defs.
|
|
llvm::BitVector Defs(State.getRegInfo().getNumRegs());
|
|
for (const auto &Op : Instr.Operands) {
|
|
if (Op.Tracker && Op.IsExplicit && Op.IsDef && !Op.IsMem) {
|
|
auto PossibleRegisters = Op.Tracker->sourceBits();
|
|
remove(PossibleRegisters, RATC.reservedRegisters());
|
|
// Do not use the scratch memory address register.
|
|
if (ScratchSpaceAliasedRegs)
|
|
remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
|
|
assert(PossibleRegisters.any() && "No register left to choose from");
|
|
const auto RandomReg = randomBit(PossibleRegisters);
|
|
Defs.set(RandomReg);
|
|
IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
|
|
}
|
|
}
|
|
// And pick random use values that are not reserved and don't alias with defs.
|
|
const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
|
|
for (const auto &Op : Instr.Operands) {
|
|
if (Op.Tracker && Op.IsExplicit && !Op.IsDef && !Op.IsMem) {
|
|
auto PossibleRegisters = Op.Tracker->sourceBits();
|
|
remove(PossibleRegisters, RATC.reservedRegisters());
|
|
// Do not use the scratch memory address register.
|
|
if (ScratchSpaceAliasedRegs)
|
|
remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
|
|
remove(PossibleRegisters, DefAliases);
|
|
assert(PossibleRegisters.any() && "No register left to choose from");
|
|
const auto RandomReg = randomBit(PossibleRegisters);
|
|
IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
|
|
}
|
|
}
|
|
CT.Info =
|
|
"instruction has no tied variables picking Uses different from defs";
|
|
CT.Instructions.push_back(std::move(IT));
|
|
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
|
|
return std::move(CT);
|
|
}
|
|
|
|
std::vector<BenchmarkMeasure>
|
|
UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
|
|
ScratchSpace &Scratch) const {
|
|
const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
|
|
|
|
const auto RunMeasurement = [&Function,
|
|
&Scratch](const char *const Counters) {
|
|
// We sum counts when there are several counters for a single ProcRes
|
|
// (e.g. P23 on SandyBridge).
|
|
int64_t CounterValue = 0;
|
|
llvm::SmallVector<llvm::StringRef, 2> CounterNames;
|
|
llvm::StringRef(Counters).split(CounterNames, ',');
|
|
for (const auto &CounterName : CounterNames) {
|
|
pfm::PerfEvent UopPerfEvent(CounterName);
|
|
if (!UopPerfEvent.valid())
|
|
llvm::report_fatal_error(
|
|
llvm::Twine("invalid perf event ").concat(Counters));
|
|
pfm::Counter Counter(UopPerfEvent);
|
|
Scratch.clear();
|
|
Counter.start();
|
|
Function(Scratch.ptr());
|
|
Counter.stop();
|
|
CounterValue += Counter.read();
|
|
}
|
|
return CounterValue;
|
|
};
|
|
|
|
std::vector<BenchmarkMeasure> Result;
|
|
const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
|
|
// Uops per port.
|
|
for (unsigned ProcResIdx = 1;
|
|
ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
|
|
const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
|
|
if (!Counters)
|
|
continue;
|
|
const double CounterValue = RunMeasurement(Counters);
|
|
Result.push_back(BenchmarkMeasure::Create(
|
|
SchedModel.getProcResource(ProcResIdx)->Name, CounterValue));
|
|
}
|
|
// NumMicroOps.
|
|
if (const char *const UopsCounter = PfmCounters.UopsCounter) {
|
|
const double CounterValue = RunMeasurement(UopsCounter);
|
|
Result.push_back(BenchmarkMeasure::Create("NumMicroOps", CounterValue));
|
|
}
|
|
return Result;
|
|
}
|
|
|
|
constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
|
|
|
|
} // namespace exegesis
|