Reland [llvm-exegesis] Add benchmark latency option on X86 that uses LBR for more precise measurements.

Starting with Skylake, the LBR contains the precise number of cycles between the two
        consecutive branches.
        Making use of this will hopefully make the measurements more precise than the
        existing methods of using RDTSC.

                Differential Revision: https://reviews.llvm.org/D77422

New change: check for existence of field `cycles` in perf_branch_entry before enabling this mode.
This should prevent compilation errors when building against older kernels whose headers don't support it.
This commit is contained in:
Vy Nguyen 2020-07-27 12:38:05 -04:00
parent 52dd18ab1d
commit ee7caa7593
12 changed files with 415 additions and 12 deletions

View File

@ -192,10 +192,24 @@ OPTIONS
.. option:: -mode=[latency|uops|inverse_throughput|analysis]
Specify the run mode. Note that if you pick `analysis` mode, you also need
to specify at least one of the `-analysis-clusters-output-file=` and
`-analysis-inconsistencies-output-file=`.
Specify the run mode. Note that some modes have additional requirements and options.
`latency` mode can make use of either RDTSC or LBR.
`latency[LBR]` is only available on X86 (at least `Skylake`).
To run in this mode, a positive value must be specified for `x86-lbr-sample-period`, and `--repetition-mode=loop` must be used.
In `analysis` mode, you also need to specify at least one of the
`-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`.
.. option:: -x86-lbr-sample-period=<nBranches/sample>
Specify the LBR sampling period - how many branches before we take a sample.
When a positive value is specified for this option and when the mode is `latency`,
we will use LBRs for measuring.
On choosing the "right" sampling period, a small value is preferred, but throttling
could occur if the sampling is too frequent. A prime number should be used to
avoid consistently skipping certain blocks.
.. option:: -repetition-mode=[duplicate|loop|min]
Specify the repetition mode. `duplicate` will create a large, straight line

View File

@ -0,0 +1,4 @@
# Moves the immediate 2 into RDI, then adds 0x10 to it.
# LLVM-EXEGESIS-LIVEIN RDI
# LLVM-EXEGESIS-DEFREG XMM1 42
movq $2, %rdi
addq $0x10, %rdi

View File

@ -0,0 +1,31 @@
import os
import subprocess
import lit.util

# Marks this test directory as unsupported unless llvm-exegesis was built with
# X86 support, the host is x86_64, and llvm-exegesis itself accepts the host
# (probed by running it in both `uops` and `latency` mode on an empty snippet).
if not ('X86' in config.root.targets):
    # We need support for X86.
    config.unsupported = True
elif not ('x86_64' in config.root.host_triple):
    # We need to be running on an X86 host.
    config.unsupported = True
else:
    # We need libpfm to be installed and the host to be at least skylake.
    llvm_exegesis_exe = lit.util.which('llvm-exegesis', config.llvm_tools_dir)
    if not llvm_exegesis_exe:
        print('llvm-exegesis not found')
        config.unsupported = True
    else:
        probe_prefix = [llvm_exegesis_exe,
                        '-allowed-host-cpu', 'skylake',
                        '-allowed-host-cpu', 'skylake-avx512']
        try:
            with open(os.devnull, 'w') as quiet:
                # Both probes are always run; any non-zero exit code disables
                # the tests in this directory.
                for mode in ('uops', 'latency'):
                    probe_result = subprocess.call(
                        probe_prefix + ['-mode', mode,
                                        '-snippets-file', '/dev/null'],
                        stdout=quiet, stderr=quiet)
                    if probe_result != 0:
                        config.unsupported = True
        except OSError:
            # The exit codes are only inspected inside the `try`, so a failed
            # exec no longer leads to reading an undefined result variable.
            print('could not exec llvm-exegesis')
            config.unsupported = True

View File

@ -0,0 +1,18 @@
# RUN: llvm-exegesis -mode=latency --repetition-mode=loop --x86-lbr-sample-period=521 --snippets-file=%p/Inputs/mov_add.att
CHECK: ---
CHECK-NEXT: mode: latency
CHECK-NEXT: key:
CHECK-NEXT: instructions:
CHECK-NEXT: 'MOV64ri32 RDI i_0x2'
CHECK-NEXT: 'ADD64ri8 RDI RDI i_0x10'
CHECK-NEXT: config: ''
CHECK-NEXT: {{.*}}
CHECK-NEXT: {{.*}}
CHECK-NEXT: {{.*}}
CHECK-NEXT: {{.*}}
CHECK-NEXT: num_repetitions: 10000
CHECK-NEXT: measurements:
CHECK-NEXT: {{.*}} value: 0.0001, per_snippet_value: 0.0002 {{.*}}
CHECK-LAST: ...

View File

@ -55,7 +55,6 @@ private:
static void
accumulateCounterValues(const llvm::SmallVector<int64_t, 4> &NewValues,
llvm::SmallVector<int64_t, 4> *Result) {
const size_t NumValues = std::max(NewValues.size(), Result->size());
if (NumValues > Result->size())
Result->resize(NumValues, 0);
@ -106,10 +105,10 @@ private:
if (Crashed)
return make_error<SnippetCrash>("snippet crashed while running");
}
auto ValueOrError = Counter->readOrError();
auto ValueOrError = Counter->readOrError(Function.getFunctionBytes());
if (!ValueOrError)
return ValueOrError.takeError();
accumulateCounterValues(ValueOrError.get(), &CounterValues);
}
return CounterValues;

View File

@ -128,7 +128,8 @@ int64_t Counter::read() const {
return -1;
}
llvm::Expected<llvm::SmallVector<int64_t, 4>> Counter::readOrError() const {
llvm::Expected<llvm::SmallVector<int64_t, 4>>
Counter::readOrError(StringRef /*unused*/) const {
int64_t Count = 0;
ssize_t ReadSize = ::read(FileDescriptor, &Count, sizeof(Count));
if (ReadSize != sizeof(Count))
@ -152,7 +153,8 @@ void Counter::stop() {}
int64_t Counter::read() const { return 42; }
llvm::Expected<llvm::SmallVector<int64_t, 4>> Counter::readOrError() const {
llvm::Expected<llvm::SmallVector<int64_t, 4>>
Counter::readOrError(StringRef /*unused*/) const {
return llvm::make_error<llvm::StringError>("Not implemented",
llvm::errc::io_error);
}

View File

@ -59,8 +59,9 @@ public:
// e.g. "snb_ep::INSTRUCTION_RETIRED:e=0:i=0:c=0:t=0:u=1:k=0:mg=0:mh=1"
StringRef getPfmEventString() const;
private:
const std::string EventString;
protected:
PerfEvent() = default;
std::string EventString;
std::string FullQualifiedEventString;
perf_event_attr *Attr;
};
@ -87,11 +88,17 @@ public:
int64_t read() const;
/// Returns the current value of the counter or error if it cannot be read.
virtual llvm::Expected<llvm::SmallVector<int64_t, 4>> readOrError() const;
/// FunctionBytes: The benchmark function being executed.
/// This is used to filter out the measurements to ensure they are only
/// within the benchmarked code.
/// If empty (or not specified), then no filtering will be done.
/// Not all counters choose to use this.
virtual llvm::Expected<llvm::SmallVector<int64_t, 4>>
readOrError(StringRef FunctionBytes = StringRef()) const;
virtual int numValues() const;
private:
protected:
PerfEvent Event;
#ifdef HAVE_LIBPFM
int FileDescriptor = -1;

View File

@ -6,6 +6,7 @@ include_directories(
add_library(LLVMExegesisX86
STATIC
Target.cpp
X86Counter.cpp
)
llvm_update_compile_flags(LLVMExegesisX86)

View File

@ -14,15 +14,40 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include <memory>
#include <string>
#include <vector>
namespace llvm {
namespace exegesis {
// Command-line category that groups the X86-specific benchmark options.
static cl::OptionCategory
BenchmarkOptions("llvm-exegesis benchmark x86-options");
// If a positive value is specified, we are going to use the LBR in
// latency-mode. The default of 0 leaves LBR sampling disabled.
//
// Note:
// - A small value is preferred, but too low a value could result in
// throttling.
// - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
"x86-lbr-sample-period",
cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
cl::cat(BenchmarkOptions), cl::init(0));
// FIXME: Validate that repetition-mode is loop if LBR is requested.
// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
@ -568,10 +593,32 @@ void ConstantInliner::initStack(unsigned Bytes) {
#include "X86GenExegesis.inc"
namespace {
class ExegesisX86Target : public ExegesisTarget {
public:
ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
/// Creates the counter used for a measurement.
///
/// When `x86-lbr-sample-period` is 0 (the default) this defers to the generic
/// ExegesisTarget implementation. Otherwise an LBR counter is created and
/// \p CounterName is ignored, because there is only one counter for LBR.
/// Returns an error if LBR was requested but this build cannot support it.
Expected<std::unique_ptr<pfm::Counter>>
createCounter(StringRef CounterName, const LLVMState &State) const override {
  const unsigned SamplingPeriod = LbrSamplingPeriod;
  if (SamplingPeriod == 0)
    return ExegesisTarget::createCounter(CounterName, State);

  // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or without
  // __linux__ (for now)
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \
    defined(__linux__)
  return std::make_unique<X86LbrCounter>(X86LbrPerfEvent(SamplingPeriod));
#else
  return llvm::make_error<llvm::StringError>(
      "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
      "or running on Linux.",
      llvm::errc::invalid_argument);
#endif
}
private:
void addTargetSpecificPasses(PassManagerBase &PM) const override;

View File

@ -0,0 +1,212 @@
//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "X86Counter.h"
// FIXME: Use appropriate wrappers for poll.h and mman.h
// to support Windows and remove this linux-only guard.
#ifdef __linux__
#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"
#ifdef HAVE_LIBPFM
#include "perfmon/perf_event.h"
#include "perfmon/pfmlib.h"
#include "perfmon/pfmlib_perf_event.h"
#endif // HAVE_LIBPFM
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
#include <poll.h>
#include <sys/mman.h>
#include <unistd.h>
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES)
namespace llvm {
namespace exegesis {
// Number of pages in the perf data area. The mapping made by X86LbrCounter has
// one additional leading page reserved for perf_event_mmap_page.
static constexpr size_t kBufferPages = 8;
// Size in bytes of the data area. Not constexpr because the page size is
// queried at run time.
static const size_t kDataBufferSize = kBufferPages * getpagesize();
// Blocks until the perf event behind FileDescriptor becomes readable.
// Returns the result of poll(): positive when the descriptor is ready, 0 when
// the 10 second timeout expired, and -1 on error.
static int pollLbrPerfEvent(const int FileDescriptor) {
  constexpr int kTimeoutMs = 10000;
  struct pollfd Request = {};
  Request.fd = FileDescriptor;
  Request.events = POLLIN;
  return poll(&Request, /*nfds=*/1, kTimeoutMs);
}
// Linearizes the cyclic perf data area into Buf.
//
// MMappedBuffer: start of the mapping; its first page holds
//                perf_event_mmap_page and the data area begins on the next.
// Buf:           destination, must be able to hold kDataBufferSize bytes.
// Tail:          read position of the oldest unread byte.
// DataSize:      number of unread bytes; the wrapped-around part is only
//                copied when the data extends past the end of the area.
static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
                           size_t DataSize) {
  const char *const RingBegin =
      static_cast<const char *>(MMappedBuffer) + getpagesize();
  const uint64_t WrapOffset = Tail % kDataBufferSize;
  // Everything from the read position up to the end of the data area.
  const size_t FirstChunk = kDataBufferSize - WrapOffset;
  memcpy(Buf, RingBegin + WrapOffset, FirstChunk);
  // The remainder, if any, wrapped around to the beginning of the area.
  if (FirstChunk < DataSize)
    memcpy(Buf + FirstChunk, RingBegin, WrapOffset);
}
// Parses the perf records in DataBuf and appends to CycleArray the cycle count
// of every branch entry that lies within [From, To].
//
// Non-sample records are skipped. Parsing stops successfully after the first
// sample record that carries at least one branch entry has been consumed.
// Returns an io_error if no such record is found, or if a record is malformed
// or extends past DataSize.
static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize,
                                   const void *From, const void *To,
                                   llvm::SmallVector<int64_t, 4> *CycleArray) {
  assert(From != nullptr && To != nullptr);
  // The filter bounds do not change while parsing, so compute them once. The
  // nullptr fallbacks keep the "no filtering" behavior when asserts are off.
  const uint64_t BlockStart = From == nullptr
                                  ? std::numeric_limits<uint64_t>::min()
                                  : reinterpret_cast<uint64_t>(From);
  const uint64_t BlockEnd = To == nullptr
                                ? std::numeric_limits<uint64_t>::max()
                                : reinterpret_cast<uint64_t>(To);
  const auto MakeParseError = []() {
    return llvm::make_error<llvm::StringError>("Unable to parse databuffer.",
                                               llvm::errc::io_error);
  };
  const char *DataPtr = DataBuf;
  const char *const DataEnd = DataBuf + DataSize;
  // Number of bytes left to parse; only called while DataPtr <= DataEnd.
  const auto Remaining = [&]() {
    return static_cast<size_t>(DataEnd - DataPtr);
  };
  while (DataPtr < DataEnd) {
    struct perf_event_header Header;
    if (Remaining() < sizeof(Header))
      return MakeParseError();
    memcpy(&Header, DataPtr, sizeof(Header));
    if (Header.type != PERF_RECORD_SAMPLE) {
      // Ignores non-sample records. A size smaller than the header itself
      // (in particular 0) would never make progress, and a size larger than
      // the remaining data would step outside the buffer.
      if (Header.size < sizeof(Header) || Header.size > Remaining())
        return MakeParseError();
      DataPtr += Header.size;
      continue;
    }
    DataPtr += sizeof(Header);
    if (Remaining() < sizeof(uint64_t))
      return MakeParseError();
    const uint64_t Count =
        llvm::support::endian::read64(DataPtr, support::native);
    DataPtr += sizeof(Count);
    // Read the perf_branch_entry array.
    for (uint64_t i = 0; i < Count; ++i) {
      struct perf_branch_entry Entry;
      if (Remaining() < sizeof(Entry))
        return MakeParseError();
      memcpy(&Entry, DataPtr, sizeof(Entry));
      DataPtr += sizeof(Entry);
      if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
        CycleArray->push_back(Entry.cycles);
    }
    if (Count > 0)
      // We've consumed the last entry of the sample.
      return llvm::Error::success();
  }
  return MakeParseError();
}
// Builds the perf event description used for LBR sampling: a raw
// BR_INST_RETIRED.NEAR_TAKEN event that records the branch stack once every
// SamplingPeriod events. SamplingPeriod must be positive.
X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
  EventString = "BR_INST_RETIRED.NEAR_TAKEN";
  FullQualifiedEventString = EventString;

  Attr = new perf_event_attr();
  perf_event_attr &Config = *Attr;
  Config.size = sizeof(Config);

  // What to count.
  Config.type = PERF_TYPE_RAW;
  // FIXME This is SKL's encoding. Not sure if it'll change.
  Config.config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN
  Config.exclude_kernel = 1;
  Config.exclude_hv = 1;
  Config.disabled = 1;

  // What to record and how often.
  Config.sample_type = PERF_SAMPLE_BRANCH_STACK;
  // Don't need to specify "USER" because we've already excluded HV and Kernel.
  Config.branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
  Config.sample_period = SamplingPeriod;
  Config.wakeup_events = 1; // We need this even when using ioctl REFRESH.
  Config.read_format = PERF_FORMAT_GROUP;
}
// Takes over NewEvent and maps the perf buffer of the counter's file
// descriptor. A failed mapping is only reported on stderr.
X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
    : Counter(std::move(NewEvent)) {
  // First page is reserved for perf_event_mmap_page. Data buffer starts on
  // the next page, so we allocate one more page.
  const size_t MappingSize = (kBufferPages + 1) * getpagesize();
  MMappedBuffer = mmap(nullptr, MappingSize, PROT_READ | PROT_WRITE,
                       MAP_SHARED, FileDescriptor, 0);
  if (MMappedBuffer != MAP_FAILED)
    return;
  llvm::errs() << "Failed to mmap buffer.";
}
// Releases the mapped perf buffer and closes the counter's file descriptor.
X86LbrCounter::~X86LbrCounter() {
  // The constructor leaves MAP_FAILED in MMappedBuffer when mmap fails, so only
  // unmap a real mapping, using the same length that was requested there.
  if (MMappedBuffer != nullptr && MMappedBuffer != MAP_FAILED &&
      munmap(MMappedBuffer, (kBufferPages + 1) * getpagesize()) != 0)
    llvm::errs() << "Failed to munmap buffer.";
  // NOTE(review): if pfm::Counter's destructor also closes FileDescriptor this
  // is a double close - confirm against PerfHelper.cpp.
  close(FileDescriptor);
}
// Starts the measurement by issuing PERF_EVENT_IOC_REFRESH on the counter's
// file descriptor with a budget of kMaxPollsPerFd.
void X86LbrCounter::start() {
  constexpr int kMaxPollsPerFd = 1024;
  ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, kMaxPollsPerFd);
}
// Stops the event, waits for LBR samples and returns the cycle counts of the
// branch records that fall within FunctionBytes.
//
// FunctionBytes: the benchmarked function; its address range is used to keep
//                only the relevant records. Must not be empty.
// Returns an error if FunctionBytes is empty, the perf buffer is not mapped,
// polling fails or keeps timing out, or the buffer cannot be parsed.
llvm::Expected<llvm::SmallVector<int64_t, 4>>
X86LbrCounter::readOrError(StringRef FunctionBytes) const {
  // The max number of time-outs/retries before we give up.
  static constexpr int kMaxTimeouts = 160;

  // Disable the event before reading
  ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0);

  // Find the boundary of the function so that we could filter the LBRs
  // to keep only the relevant records.
  if (FunctionBytes.empty())
    return llvm::make_error<llvm::StringError>("Empty function bytes",
                                               llvm::errc::invalid_argument);
  const void *From = reinterpret_cast<const void *>(FunctionBytes.data());
  const void *To = reinterpret_cast<const void *>(FunctionBytes.data() +
                                                  FunctionBytes.size());

  // The constructor only logs a failed mmap; reading through MAP_FAILED below
  // would crash, so report it as an error instead.
  if (MMappedBuffer == nullptr || MMappedBuffer == MAP_FAILED)
    return llvm::make_error<llvm::StringError>("LBR buffer is not mapped.",
                                               llvm::errc::io_error);

  // Wait until samples are available, retrying on time-outs.
  int NumTimeouts = 0;
  while (true) {
    const int PollResult = pollLbrPerfEvent(FileDescriptor);
    if (PollResult > 0)
      break;
    if (PollResult == -1)
      return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
                                                 llvm::errc::io_error);
    if (NumTimeouts++ >= kMaxTimeouts)
      return llvm::make_error<llvm::StringError>(
          "LBR polling still timed out after max number of attempts.",
          llvm::errc::device_or_resource_busy);
  }

  struct perf_event_mmap_page Page;
  memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));

  const uint64_t DataTail = Page.data_tail;
  const uint64_t DataHead = Page.data_head;
  // We're supposed to use a barrier after reading data_head.
  std::atomic_thread_fence(std::memory_order_acq_rel);
  const size_t DataSize = DataHead - DataTail;
  if (DataSize > kDataBufferSize)
    return llvm::make_error<llvm::StringError>(
        "DataSize larger than buffer size.", llvm::errc::invalid_argument);

  // Parses the LBR buffer and fills CycleArray with the sequence of cycle
  // counts from the buffer.
  llvm::SmallVector<int64_t, 4> CycleArray;
  std::unique_ptr<char[]> DataBuf(new char[kDataBufferSize]);
  copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
  llvm::Error error =
      parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray);
  if (!error)
    return CycleArray;
  return std::move(error);
}
} // namespace exegesis
} // namespace llvm
#endif // defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES)
#endif // __linux__

View File

@ -0,0 +1,55 @@
//===-- X86Counter.h --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Perf counter that reads the LBRs for measuring the benchmarked block's
/// throughput.
///
/// More info at: https://lwn.net/Articles/680985
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H
#define LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H
#include "../PerfHelper.h"
#include "llvm/Support/Error.h"
// FIXME: Use appropriate wrappers for poll.h and mman.h
// to support Windows and remove this linux-only guard.
#if defined(__linux__) && defined(HAVE_LIBPFM) && \
defined(LIBPFM_HAS_FIELD_CYCLES)
namespace llvm {
namespace exegesis {
/// Perf event description used for LBR sampling.
class X86LbrPerfEvent : public pfm::PerfEvent {
public:
/// \param SamplingPeriod the sample period (number of branches per sample);
/// must be positive.
X86LbrPerfEvent(unsigned SamplingPeriod);
};
/// Counter that reads cycle counts from LBR samples through a memory-mapped
/// perf buffer.
class X86LbrCounter : public pfm::Counter {
public:
/// Takes over \p Event and maps the perf buffer for its file descriptor.
explicit X86LbrCounter(pfm::PerfEvent &&Event);
virtual ~X86LbrCounter();
/// Starts the measurement.
void start() override;
/// Returns the cycle counts of the sampled branch records that lie within
/// \p FunctionBytes, or an error. Unlike the base class default, an empty
/// \p FunctionBytes is rejected with an error.
llvm::Expected<llvm::SmallVector<int64_t, 4>>
readOrError(StringRef FunctionBytes) const override;
private:
// Start of the mapped perf buffer; set by the constructor.
void *MMappedBuffer = nullptr;
};
} // namespace exegesis
} // namespace llvm
#endif // defined(__linux__) && defined(HAVE_LIBPFM) &&
// defined(LIBPFM_HAS_FIELD_CYCLES)
#endif // LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H

View File

@ -160,6 +160,12 @@ static cl::opt<std::string>
cl::desc(""), cl::cat(AnalysisOptions),
cl::init(""));
// Host CPU names given via -allowed-host-cpu. The option may be omitted or
// repeated (cl::ZeroOrMore).
static cl::list<std::string>
AllowedHostCpus("allowed-host-cpu",
cl::desc("If specified, only run the benchmark if the host "
"CPU matches the names"),
cl::cat(Options), cl::ZeroOrMore);
static cl::opt<bool> AnalysisDisplayUnstableOpcodes(
"analysis-display-unstable-clusters",
cl::desc("if there is more than one benchmark for an opcode, said "
@ -296,6 +302,13 @@ void benchmarkMain() {
const LLVMState State(CpuName);
llvm::StringRef ActualCpu = State.getTargetMachine().getTargetCPU();
for (auto Begin = AllowedHostCpus.begin(); Begin != AllowedHostCpus.end();
++Begin) {
if (ActualCpu != *Begin)
ExitWithError(llvm::Twine("Unexpected host CPU ").concat(ActualCpu));
}
const std::unique_ptr<BenchmarkRunner> Runner =
ExitOnErr(State.getExegesisTarget().createBenchmarkRunner(
BenchmarkMode, State, ResultAggMode));