Revert "[llvm-exegesis] Add benchmark latency option on X86 that uses LBR for more precise measurements."

From @erichkeane:
```
This patch doesn't seem to build for me:
/iusers/ekeane1/workspaces/llvm-project/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp: In function ‘llvm::Error llvm::exegesis::parseDataBuffer(const char*, size_t, const void*, const void*, llvm::SmallVector<long int, 4>*)’:
/iusers/ekeane1/workspaces/llvm-project/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp:99:37: error: ‘struct perf_branch_entry’ has no member named ‘cycles’

CycleArray->push_back(Entry.cycles);
I'm on RHEL7, so I have kernel 3.10, so it doesn't have 'cycles'.

According to this: https://elixir.bootlin.com/linux/v4.3/source/include/uapi/linux/perf_event.h#L963 kernel 4.3 is the first time that 'cycles' appeared in this structure.
```
This commit is contained in:
Clement Courbet 2020-07-17 16:53:44 +02:00
parent 7ebc6bed84
commit 6bddd099ac
12 changed files with 12 additions and 416 deletions

View File

@ -192,24 +192,10 @@ OPTIONS
.. option:: -mode=[latency|uops|inverse_throughput|analysis]
Specify the run mode. Note that some modes have additional requirements and options.
Specify the run mode. Note that if you pick `analysis` mode, you also need
to specify at least one of the `-analysis-clusters-output-file=` and
`-analysis-inconsistencies-output-file=`.
`latency` mode can make use of either RDTSC or LBR.
`latency[LBR]` is only available on X86 (at least `Skylake`).
To run in this mode, a positive value must be specified for `x86-lbr-sample-period`, and `--repetition-mode=loop` must be used.
In `analysis` mode, you also need to specify at least one of the
`-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`.
.. option:: -x86-lbr-sample-period=<nBranches/sample>
Specify the LBR sampling period - how many branches before we take a sample.
When a positive value is specified for this option and when the mode is `latency`,
we will use LBRs for measuring.
On choosing the "right" sampling period, a small value is preferred, but throttling
could occur if the sampling is too frequent. A prime number should be used to
avoid consistently skipping certain blocks.
.. option:: -repetition-mode=[duplicate|loop|min]
Specify the repetition mode. `duplicate` will create a large, straight line

View File

@ -1,4 +0,0 @@
# LLVM-EXEGESIS-LIVEIN RDI
# LLVM-EXEGESIS-DEFREG XMM1 42
movq $2, %rdi
addq $0x10, %rdi

View File

@ -1,31 +0,0 @@
# Imported explicitly so that `os.devnull` below does not depend on whatever
# globals the lit loader happens to inject into this config.
import os
import subprocess
import lit.util

# Marks the tests in this directory as unsupported unless:
#   * the build has the X86 target enabled,
#   * the host triple is x86_64, and
#   * llvm-exegesis can be found and exits with status 0 in both `uops` and
#     `latency` modes when restricted to the skylake / skylake-avx512 CPUs.

if not ('X86' in config.root.targets):
    # We need support for X86.
    config.unsupported = True

elif not ('x86_64' in config.root.host_triple):
    # We need to be running on an X86 host.
    config.unsupported = True

else:
    # We need libpfm to be installed and the host to be at least skylake.
    llvm_exegesis_exe = lit.util.which('llvm-exegesis', config.llvm_tools_dir)
    if not llvm_exegesis_exe:
        print('llvm-exegesis not found')
        config.unsupported = True
    else:
        # Start from a failing status: if launching the tool raises OSError
        # the results below would otherwise be unbound and the checks after
        # the `except` would die with a NameError instead of simply marking
        # the directory unsupported.
        check_llvm_exegesis_uops_result = -1
        check_llvm_exegesis_latency_result = -1
        try:
            with open(os.devnull, 'w') as quiet:
                check_llvm_exegesis_uops_result = subprocess.call(
                    [llvm_exegesis_exe, '-allowed-host-cpu', 'skylake', '-allowed-host-cpu', 'skylake-avx512', '-mode', 'uops', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet)
                check_llvm_exegesis_latency_result = subprocess.call(
                    [llvm_exegesis_exe, '-allowed-host-cpu', 'skylake', '-allowed-host-cpu', 'skylake-avx512', '-mode', 'latency', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet)
        except OSError:
            print('could not exec llvm-exegesis')
            config.unsupported = True
        if not check_llvm_exegesis_uops_result == 0:
            config.unsupported = True
        if not check_llvm_exegesis_latency_result == 0:
            config.unsupported = True

View File

@ -1,18 +0,0 @@
# Runs llvm-exegesis in LBR latency mode (loop repetition, sample period 521)
# on the two-instruction snippet in Inputs/mov_add.att.
# NOTE(review): the RUN line below does not pipe its output into FileCheck, so
# the expectations in this file do not appear to be verified - confirm intent.
# RUN: llvm-exegesis -mode=latency --repetition-mode=loop --x86-lbr-sample-period=521 --snippets-file=%p/Inputs/mov_add.att
CHECK: ---
CHECK-NEXT: mode: latency
CHECK-NEXT: key:
CHECK-NEXT: instructions:
CHECK-NEXT: 'MOV64ri32 RDI i_0x2'
CHECK-NEXT: 'ADD64ri8 RDI RDI i_0x10'
CHECK-NEXT: config: ''
CHECK-NEXT: {{.*}}
CHECK-NEXT: {{.*}}
CHECK-NEXT: {{.*}}
CHECK-NEXT: {{.*}}
CHECK-NEXT: num_repetitions: 10000
CHECK-NEXT: measurements:
CHECK-NEXT: {{.*}} value: 0.0001, per_snippet_value: 0.0002 {{.*}}
# NOTE(review): the `-LAST` suffix used on the next line is not one of
# FileCheck's directive suffixes - confirm what was intended here.
CHECK-LAST: ...

View File

@ -55,6 +55,7 @@ private:
static void
accumulateCounterValues(const llvm::SmallVector<int64_t, 4> &NewValues,
llvm::SmallVector<int64_t, 4> *Result) {
const size_t NumValues = std::max(NewValues.size(), Result->size());
if (NumValues > Result->size())
Result->resize(NumValues, 0);
@ -105,10 +106,10 @@ private:
if (Crashed)
return make_error<SnippetCrash>("snippet crashed while running");
}
auto ValueOrError = Counter->readOrError(Function.getFunctionBytes());
auto ValueOrError = Counter->readOrError();
if (!ValueOrError)
return ValueOrError.takeError();
accumulateCounterValues(ValueOrError.get(), &CounterValues);
}
return CounterValues;

View File

@ -128,8 +128,7 @@ int64_t Counter::read() const {
return -1;
}
llvm::Expected<llvm::SmallVector<int64_t, 4>>
Counter::readOrError(StringRef /*unused*/) const {
llvm::Expected<llvm::SmallVector<int64_t, 4>> Counter::readOrError() const {
int64_t Count = 0;
ssize_t ReadSize = ::read(FileDescriptor, &Count, sizeof(Count));
if (ReadSize != sizeof(Count))
@ -153,8 +152,7 @@ void Counter::stop() {}
int64_t Counter::read() const { return 42; }
llvm::Expected<llvm::SmallVector<int64_t, 4>>
Counter::readOrError(StringRef /*unused*/) const {
llvm::Expected<llvm::SmallVector<int64_t, 4>> Counter::readOrError() const {
return llvm::make_error<llvm::StringError>("Not implemented",
llvm::errc::io_error);
}

View File

@ -59,9 +59,8 @@ public:
// e.g. "snb_ep::INSTRUCTION_RETIRED:e=0:i=0:c=0:t=0:u=1:k=0:mg=0:mh=1"
StringRef getPfmEventString() const;
protected:
PerfEvent() = default;
std::string EventString;
private:
const std::string EventString;
std::string FullQualifiedEventString;
perf_event_attr *Attr;
};
@ -88,17 +87,11 @@ public:
int64_t read() const;
/// Returns the current value of the counter or error if it cannot be read.
/// FunctionBytes: The benchmark function being executed.
/// This is used to filter out the measurements to ensure they are only
/// within the benchmarked code.
/// If empty (or not specified), then no filtering will be done.
/// Not all counters choose to use this.
virtual llvm::Expected<llvm::SmallVector<int64_t, 4>>
readOrError(StringRef FunctionBytes = StringRef()) const;
virtual llvm::Expected<llvm::SmallVector<int64_t, 4>> readOrError() const;
virtual int numValues() const;
protected:
private:
PerfEvent Event;
#ifdef HAVE_LIBPFM
int FileDescriptor = -1;

View File

@ -6,7 +6,6 @@ include_directories(
add_library(LLVMExegesisX86
STATIC
Target.cpp
X86Counter.cpp
)
llvm_update_compile_flags(LLVMExegesisX86)

View File

@ -14,40 +14,15 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include <memory>
#include <string>
#include <vector>
namespace llvm {
namespace exegesis {
static cl::OptionCategory
BenchmarkOptions("llvm-exegesis benchmark x86-options");
// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
// - A small value is preferred, but too low a value could result in
// throttling.
// - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
"x86-lbr-sample-period",
cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
cl::cat(BenchmarkOptions), cl::init(0));
// FIXME: Validates that repetition-mode is loop if LBR is requested.
// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
@ -593,29 +568,10 @@ void ConstantInliner::initStack(unsigned Bytes) {
#include "X86GenExegesis.inc"
namespace {
class ExegesisX86Target : public ExegesisTarget {
public:
ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
Expected<std::unique_ptr<pfm::Counter>>
createCounter(StringRef CounterName, const LLVMState &State) const override {
// If LbrSamplingPeriod was provided, then ignore the
// CounterName because we only have one for LBR.
if (LbrSamplingPeriod > 0) {
// Can't use LBR without HAVE_LIBPFM, or __linux__ (for now)
#if defined(HAVE_LIBPFM) && defined(__linux__)
return std::make_unique<X86LbrCounter>(
X86LbrPerfEvent(LbrSamplingPeriod));
#else
return llvm::make_error<llvm::StringError>(
"LBR counter requested without HAVE_LIBPFM or running on Linux.",
llvm::errc::invalid_argument);
#endif
}
return ExegesisTarget::createCounter(CounterName, State);
}
private:
void addTargetSpecificPasses(PassManagerBase &PM) const override;

View File

@ -1,218 +0,0 @@
//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "X86Counter.h"
// FIXME: Use appropriate wrappers for poll.h and mman.h
// to support Windows and remove this linux-only guard.
#ifdef __linux__
#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"
#ifdef HAVE_LIBPFM
#include "perfmon/perf_event.h"
#include "perfmon/pfmlib.h"
#include "perfmon/pfmlib_perf_event.h"
#endif // HAVE_LIBPFM
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
#include <poll.h>
#include <sys/mman.h>
#include <unistd.h>
#ifdef HAVE_LIBPFM
namespace llvm {
namespace exegesis {
// Number of pages in the perf data buffer. The mapping created by
// X86LbrCounter is one page larger than this, because the first page is
// reserved for the perf_event_mmap_page.
static constexpr size_t kBufferPages = 8;
// Size in bytes of the data buffer. Not constexpr because getpagesize() is
// only known at run time, so this is initialized during static initialization.
static const size_t kDataBufferSize = kBufferPages * getpagesize();
// Waits (for at most 10 seconds) until FileDescriptor has data to read.
// Returns poll()'s result unchanged: positive when the descriptor is ready,
// 0 when the wait timed out and -1 on error.
static int pollLbrPerfEvent(const int FileDescriptor) {
  constexpr int kTimeoutMs = 10000;
  // Zero-initialized so that `revents` starts out cleared.
  struct pollfd Request[1] = {};
  Request[0].fd = FileDescriptor;
  Request[0].events = POLLIN;
  return poll(Request, /*nfds=*/1, kTimeoutMs);
}
// Linearizes the pending bytes of the cyclic perf data buffer into Buf.
//
// MMappedBuffer: start of the perf mapping. Its first page is reserved for
//                the perf_event_mmap_page; the data buffer starts on the
//                next page.
// Buf:           destination, must hold at least kDataBufferSize bytes.
// Tail:          read position (data_tail) in the cyclic buffer.
// DataSize:      number of pending bytes; values above kDataBufferSize are
//                clamped so that Buf can never be overrun.
//
// Exactly DataSize bytes are copied: first the part from Tail up to the end
// of the ring, then, if the data wraps around, the remainder from the start
// of the ring.
static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
                           size_t DataSize) {
  const char *RingStart =
      static_cast<const char *>(MMappedBuffer) + getpagesize();
  if (DataSize > kDataBufferSize)
    DataSize = kDataBufferSize;

  const size_t Offset = Tail % kDataBufferSize;
  const size_t BytesToRingEnd = kDataBufferSize - Offset;
  const size_t FirstChunk =
      DataSize < BytesToRingEnd ? DataSize : BytesToRingEnd;
  memcpy(Buf, RingStart + Offset, FirstChunk);

  // The data wrapped around the end of the ring: append the rest.
  if (FirstChunk < DataSize)
    memcpy(Buf + FirstChunk, RingStart, DataSize - FirstChunk);
}
// Parses the perf records stored in [DataBuf, DataBuf + DataSize) and fills
// CycleArray from the first sample record that contains branch entries.
//
// Non-sample records are skipped. For a sample record, the branch count is
// read followed by that many perf_branch_entry items; the `cycles` value of
// every entry whose [from, to] range lies within [From, To] is appended to
// CycleArray.
//
// Returns success once the last branch entry of such a sample has been
// processed. Returns an io_error if the buffer is exhausted first, or if a
// record is truncated or has an impossible size (this also prevents reading
// past the buffer and looping forever on a zero-sized record).
//
// NOTE: perf_branch_entry::cycles only exists in kernel headers >= 4.3.
static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize,
                                   const void *From, const void *To,
                                   llvm::SmallVector<int64_t, 4> *CycleArray) {
  assert(From != nullptr && To != nullptr);
  // The filter bounds do not depend on the record being parsed, so compute
  // them once. The null fallbacks only matter when asserts are disabled.
  const uint64_t BlockStart = From == nullptr
                                  ? std::numeric_limits<uint64_t>::min()
                                  : reinterpret_cast<uint64_t>(From);
  const uint64_t BlockEnd = To == nullptr
                                ? std::numeric_limits<uint64_t>::max()
                                : reinterpret_cast<uint64_t>(To);

  const char *DataPtr = DataBuf;
  const char *const DataEnd = DataBuf + DataSize;
  while (DataPtr < DataEnd) {
    size_t Remaining = static_cast<size_t>(DataEnd - DataPtr);
    if (Remaining < sizeof(struct perf_event_header))
      break; // Truncated header.
    struct perf_event_header Header;
    memcpy(&Header, DataPtr, sizeof(Header));
    if (Header.type != PERF_RECORD_SAMPLE) {
      // Ignores non-sample records, but refuses sizes that would not make
      // progress or that would step outside of the buffer.
      if (Header.size < sizeof(Header) || Header.size > Remaining)
        break;
      DataPtr += Header.size;
      continue;
    }
    DataPtr += sizeof(Header);
    Remaining -= sizeof(Header);

    if (Remaining < sizeof(uint64_t))
      break; // Truncated branch count.
    const uint64_t Count =
        llvm::support::endian::read64(DataPtr, support::native);
    DataPtr += sizeof(Count);
    Remaining -= sizeof(Count);

    // All announced entries must be fully contained in the buffer.
    if (Count > Remaining / sizeof(struct perf_branch_entry))
      break;

    // Read the perf_branch_entry array.
    for (uint64_t I = 0; I < Count; ++I) {
      struct perf_branch_entry Entry;
      memcpy(&Entry, DataPtr, sizeof(Entry));
      DataPtr += sizeof(Entry);
      if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
        CycleArray->push_back(Entry.cycles);
    }
    if (Count > 0)
      // We've consumed the last entry of a non-empty sample.
      return llvm::Error::success();
  }
  return llvm::make_error<llvm::StringError>("Unable to parse databuffer.",
                                             llvm::errc::io_error);
}
// Describes the perf event used for LBR sampling: a raw
// BR_INST_RETIRED.NEAR_TAKEN event that records the branch stack every
// SamplingPeriod occurrences. The event is created disabled and excludes
// kernel and hypervisor activity.
//
// This definition already sits inside a HAVE_LIBPFM-guarded region of this
// file, so no separate guard or non-libpfm fallback is needed here.
X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
  EventString = "BR_INST_RETIRED.NEAR_TAKEN";
  Attr = new perf_event_attr();
  Attr->size = sizeof(*Attr);
  Attr->type = PERF_TYPE_RAW;
  // FIXME This is SKL's encoding. Not sure if it'll change.
  Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN
  Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
  // Don't need to specify "USER" because we've already excluded HV and Kernel.
  Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
  Attr->sample_period = SamplingPeriod;
  Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
  Attr->disabled = 1;
  Attr->exclude_kernel = 1;
  Attr->exclude_hv = 1;
  Attr->read_format = PERF_FORMAT_GROUP;
  FullQualifiedEventString = EventString;
}
// Takes over NewEvent through the base Counter and maps the perf buffer of
// the resulting file descriptor into memory. A mapping failure is only
// reported on llvm::errs(); MMappedBuffer is then left as MAP_FAILED.
X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
    : Counter(std::move(NewEvent)) {
  // One leading page for the perf_event_mmap_page plus the data pages.
  const size_t MappingSize = (kBufferPages + 1) * getpagesize();
  MMappedBuffer = mmap(nullptr, MappingSize, PROT_READ | PROT_WRITE,
                       MAP_SHARED, FileDescriptor, 0);
  if (MMappedBuffer != MAP_FAILED)
    return;
  llvm::errs() << "Failed to mmap buffer.";
}
// Releases the perf buffer mapping created by the constructor and closes the
// perf event file descriptor.
X86LbrCounter::~X86LbrCounter() {
  // The mapping was previously never released. Skip the unmap when the
  // constructor's mmap failed or never ran.
  if (MMappedBuffer != nullptr && MMappedBuffer != MAP_FAILED)
    munmap(MMappedBuffer, (kBufferPages + 1) * getpagesize());
  // NOTE(review): the base pfm::Counter may also close FileDescriptor in its
  // own destructor - confirm this is not a double close.
  close(FileDescriptor);
}
// Starts sampling by issuing PERF_EVENT_IOC_REFRESH on the event's file
// descriptor with a refresh count of 1024.
void X86LbrCounter::start() {
  constexpr int kMaxPollsPerFd = 1024;
  ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, kMaxPollsPerFd);
}
// Stops the event, waits for a sample and returns the cycle counts of the
// recorded branches that lie within the benchmarked function.
//
// FunctionBytes: the code of the benchmark function. Its address range is
//                used to filter the branch records; it must not be empty.
//
// Returns the cycle counts on success, or an error when FunctionBytes is
// empty, the perf buffer was never mapped, polling fails or keeps timing
// out, the pending data is larger than the buffer, or it cannot be parsed.
llvm::Expected<llvm::SmallVector<int64_t, 4>>
X86LbrCounter::readOrError(StringRef FunctionBytes) const {
  // The max number of time-outs/retries before we give up.
  static constexpr int kMaxTimeouts = 160;

  // Disable the event before reading
  ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0);

  // Find the boundary of the function so that we could filter the LBRs
  // to keep only the relevant records.
  if (FunctionBytes.empty())
    return llvm::make_error<llvm::StringError>("Empty function bytes",
                                               llvm::errc::invalid_argument);
  const void *From = FunctionBytes.data();
  const void *To = FunctionBytes.data() + FunctionBytes.size();

  // The constructor only logs a failed mmap; reading through an invalid
  // mapping below would crash, so report it as an error instead.
  if (MMappedBuffer == nullptr || MMappedBuffer == MAP_FAILED)
    return llvm::make_error<llvm::StringError>(
        "LBR sample buffer is not mapped.", llvm::errc::io_error);

  // Wait until the event signals that a sample is available.
  int NumTimeouts = 0;
  for (;;) {
    const int PollResult = pollLbrPerfEvent(FileDescriptor);
    if (PollResult > 0)
      break;
    if (PollResult == -1)
      return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
                                                 llvm::errc::io_error);
    if (NumTimeouts++ >= kMaxTimeouts)
      return llvm::make_error<llvm::StringError>(
          "LBR polling still timed out after max number of attempts.",
          llvm::errc::device_or_resource_busy);
  }

  struct perf_event_mmap_page Page;
  memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));

  const uint64_t DataTail = Page.data_tail;
  const uint64_t DataHead = Page.data_head;
  // We're supposed to use a barrier after reading data_head.
  std::atomic_thread_fence(std::memory_order_acq_rel);
  const size_t DataSize = DataHead - DataTail;
  if (DataSize > kDataBufferSize)
    return llvm::make_error<llvm::StringError>(
        "DataSize larger than buffer size.", llvm::errc::invalid_argument);

  // Only allocate the scratch copy once we know there is valid data to copy.
  std::unique_ptr<char[]> DataBuf(new char[kDataBufferSize]);
  copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);

  // Parses the LBR buffer and fills CycleArray with the sequence of cycle
  // counts from the buffer.
  llvm::SmallVector<int64_t, 4> CycleArray;
  llvm::Error error =
      parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray);
  if (!error)
    return CycleArray;
  return std::move(error);
}
} // namespace exegesis
} // namespace llvm
#endif // HAVE_LIBPFM
#endif // __linux__

View File

@ -1,53 +0,0 @@
//===-- X86Counter.h --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Perf counter that reads the LBRs for measuring the benchmarked block's
/// throughput.
///
/// More info at: https://lwn.net/Articles/680985
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H
#define LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H
#include "../PerfHelper.h"
#include "llvm/Support/Error.h"
// FIXME: Use appropriate wrappers for poll.h and mman.h
// to support Windows and remove this linux-only guard.
#if defined(__linux__) && defined(HAVE_LIBPFM)
namespace llvm {
namespace exegesis {
/// A pfm::PerfEvent preconfigured for LBR (branch stack) sampling.
class X86LbrPerfEvent : public pfm::PerfEvent {
public:
/// \param SamplingPeriod The sample period (number of branches per sample);
/// must be positive.
X86LbrPerfEvent(unsigned SamplingPeriod);
};
/// A pfm::Counter that maps the perf buffer of its event into memory and
/// reads per-branch cycle counts from the recorded samples.
class X86LbrCounter : public pfm::Counter {
public:
/// Takes over \p Event and maps the perf buffer for it.
explicit X86LbrCounter(pfm::PerfEvent &&Event);
virtual ~X86LbrCounter();
/// Starts sampling.
void start() override;
/// Returns the cycle counts of the sampled branches that fall within
/// \p FunctionBytes (the benchmarked code), or an error if they cannot be
/// read. \p FunctionBytes must not be empty.
llvm::Expected<llvm::SmallVector<int64_t, 4>>
readOrError(StringRef FunctionBytes) const override;
private:
// Start of the mapped perf buffer; the first page holds the
// perf_event_mmap_page and the sample data follows it.
void *MMappedBuffer = nullptr;
};
} // namespace exegesis
} // namespace llvm
#endif // defined(__linux__) && defined(HAVE_LIBPFM)
#endif // LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H

View File

@ -160,12 +160,6 @@ static cl::opt<std::string>
cl::desc(""), cl::cat(AnalysisOptions),
cl::init(""));
static cl::list<std::string>
AllowedHostCpus("allowed-host-cpu",
cl::desc("If specified, only run the benchmark if the host "
"CPU matches the names"),
cl::cat(Options), cl::ZeroOrMore);
static cl::opt<bool> AnalysisDisplayUnstableOpcodes(
"analysis-display-unstable-clusters",
cl::desc("if there is more than one benchmark for an opcode, said "
@ -302,13 +296,6 @@ void benchmarkMain() {
const LLVMState State(CpuName);
llvm::StringRef ActualCpu = State.getTargetMachine().getTargetCPU();
for (auto Begin = AllowedHostCpus.begin(); Begin != AllowedHostCpus.end();
++Begin) {
if (ActualCpu != *Begin)
ExitWithError(llvm::Twine("Unexpected host CPU ").concat(ActualCpu));
}
const std::unique_ptr<BenchmarkRunner> Runner =
ExitOnErr(State.getExegesisTarget().createBenchmarkRunner(
BenchmarkMode, State, ResultAggMode));