[llvm-libc] Add memory function benchmarks

Summary:
This patch adds a benchmarking infrastructure for llvm-libc memory functions.

In a nutshell, the code can benchmark small and large buffers for the memcpy, memset and memcmp functions.
It also produces graphs of size vs latency by running targets of the form `render-libc-{memcpy|memset|memcmp}-benchmark-{small|big}`.

The configurations are provided as JSON files and the benchmark also produces a JSON file.
This file is then parsed and rendered as a PNG file via the `render.py3` script (make sure to run `pip3 install matplotlib scipy numpy`).
The script can take several JSON files as input and will superimpose the curves if they are from the same host.

TODO:
 - The code benchmarks whatever memory functions are available on the host; it should be configured to benchmark the (yet to be added) llvm-libc memory functions.
 - Add a README file with instructions and rationale.
 - Produce scores to track the performance of the functions over time to allow for regression detection.

Reviewers: sivachandra, ckennelly

Subscribers: mgorny, MaskRay, libc-commits

Tags: #libc-project

Differential Revision: https://reviews.llvm.org/D72516
Guillaume Chatelet 2020-01-06 13:17:04 +01:00
parent b901335193
commit aba80d0734
22 changed files with 2588 additions and 0 deletions

@@ -1,2 +1,3 @@
add_subdirectory(HdrGen)
add_subdirectory(UnitTest)
add_subdirectory(benchmarks)

@@ -1,3 +1,5 @@
set(LLVM_LINK_COMPONENTS Support)
add_tablegen(libc-hdrgen llvm-libc
Command.h
Command.cpp

@@ -0,0 +1,184 @@
find_package(Threads)
include(ExternalProject)
set(LLVM_LINK_COMPONENTS Support)
#==============================================================================
# Build Google Benchmark
#==============================================================================
set(GOOGLE_BENCHMARK_TARGET_FLAGS ${BENCHMARK_DIALECT_FLAG})
if (LIBCXX_BENCHMARK_GCC_TOOLCHAIN)
set(GOOGLE_BENCHMARK_TARGET_FLAGS
-gcc-toolchain ${LIBCXX_BENCHMARK_GCC_TOOLCHAIN})
endif()
string(REPLACE ";" " " GOOGLE_BENCHMARK_TARGET_FLAGS "${GOOGLE_BENCHMARK_TARGET_FLAGS}")
ExternalProject_Add(google-benchmark
EXCLUDE_FROM_ALL ON
PREFIX google-benchmark
SOURCE_DIR ${LIBC_SOURCE_DIR}/../llvm/utils/benchmark
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark
CMAKE_CACHE_ARGS
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS:STRING=${GOOGLE_BENCHMARK_TARGET_FLAGS}
-DCMAKE_BUILD_TYPE:STRING=RELEASE
-DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-DBENCHMARK_ENABLE_TESTING:BOOL=OFF)
set(GOOGLE_BENCHMARK_LIBC_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark)
set(GOOGLE_BENCHMARK_LINK_FLAGS -L${GOOGLE_BENCHMARK_LIBC_INSTALL}/lib/)
#==============================================================================
# Add Unit Testing Support
#==============================================================================
function(add_libc_benchmark_unittest target_name)
if(NOT LLVM_INCLUDE_TESTS)
return()
endif()
cmake_parse_arguments(
"LIBC_BENCHMARKS_UNITTEST"
"" # No optional arguments
"SUITE" # Single value arguments
"SRCS;DEPENDS" # Multi-value arguments
${ARGN}
)
add_executable(${target_name}
EXCLUDE_FROM_ALL
${LIBC_BENCHMARKS_UNITTEST_SRCS}
)
target_include_directories(${target_name}
PRIVATE
${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include
${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include
)
target_link_libraries(${target_name}
PRIVATE
gtest_main
gtest
${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
)
add_custom_command(
TARGET ${target_name}
POST_BUILD
COMMAND $<TARGET_FILE:${target_name}>
)
add_dependencies(check-libc-benchmark ${target_name})
endfunction()
#==============================================================================
# Build Google Benchmark for libc
#==============================================================================
add_custom_target(check-libc-benchmark)
function(fix_rtti target)
# TODO: Make this portable and in line with the RTTI mode from llvm/
target_compile_options(${target} PUBLIC -fno-rtti)
endfunction()
# libc-benchmark
add_library(libc-benchmark
STATIC
EXCLUDE_FROM_ALL
LibcBenchmark.cpp
LibcBenchmark.h
)
add_dependencies(libc-benchmark google-benchmark)
target_include_directories(libc-benchmark PUBLIC "${GOOGLE_BENCHMARK_LIBC_INSTALL}/include")
target_link_options(libc-benchmark PUBLIC "${GOOGLE_BENCHMARK_LINK_FLAGS}")
target_link_libraries(libc-benchmark PUBLIC LLVMSupport -lbenchmark Threads::Threads)
fix_rtti(libc-benchmark)
add_libc_benchmark_unittest(libc-benchmark-test
SRCS LibcBenchmarkTest.cpp
DEPENDS libc-benchmark
)
# libc-memory-benchmark
add_library(libc-memory-benchmark
STATIC
EXCLUDE_FROM_ALL
LibcMemoryBenchmark.cpp
LibcMemoryBenchmark.h
)
target_link_libraries(libc-memory-benchmark PUBLIC libc-benchmark)
fix_rtti(libc-memory-benchmark)
add_libc_benchmark_unittest(libc-memory-benchmark-test
SRCS LibcMemoryBenchmarkTest.cpp
DEPENDS libc-memory-benchmark
)
# json
add_library(json
STATIC
EXCLUDE_FROM_ALL
JSON.cpp
JSON.h
)
target_link_libraries(json PUBLIC libc-memory-benchmark)
fix_rtti(json)
add_libc_benchmark_unittest(json-test
SRCS JSONTest.cpp
DEPENDS json
)
#==============================================================================
# Benchmark tests configuration
#==============================================================================
function(add_libc_benchmark_analysis conf_target run_target)
set(png_file "/tmp/last-${conf_target}.png")
set(render_target render-${conf_target})
add_custom_target(${render_target}
COMMAND python3 render.py3 ${json_file} --headless --output=${png_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "render ${libc_target} to ${png_file}"
)
add_dependencies(${render_target} ${run_target})
set(display_target display-${conf_target})
add_custom_target(${display_target}
COMMAND python3 render.py3 ${json_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "display ${libc_target}"
)
add_dependencies(${display_target} ${run_target})
endfunction()
function(add_libc_benchmark_configuration target configuration)
set(conf_target ${target}-${configuration})
set(json_file "/tmp/last-${conf_target}.json")
set(run_target run-${conf_target})
add_custom_target(${run_target}
COMMAND ${libc_target} --conf=configuration_${configuration}.json -o ${json_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
add_libc_benchmark_analysis(${conf_target} ${run_target})
endfunction()
function(add_libc_benchmark name file)
set(libc_target libc-${name}-benchmark)
add_executable(${libc_target}
EXCLUDE_FROM_ALL
${file}
LibcMemoryBenchmarkMain.h
LibcMemoryBenchmarkMain.cpp
)
target_link_libraries(${libc_target} PUBLIC json)
foreach(configuration "small" "big")
add_libc_benchmark_configuration(${libc_target} ${configuration})
endforeach()
endfunction()
add_libc_benchmark(memcpy Memcpy.cpp)
add_libc_benchmark(memcmp Memcmp.cpp)
add_libc_benchmark(memset Memset.cpp)

@@ -0,0 +1,367 @@
//===-------- JSON serialization routines ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "JSON.h"
#include "LibcBenchmark.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MathExtras.h"
#include <chrono>
#include <limits>
#include <memory>
#include <vector>
namespace llvm {
namespace libc_benchmarks {
template <typename T>
static Error intFromJsonTemplate(const json::Value &V, T &Out) {
if (const auto &MaybeInt64 = V.getAsInteger()) {
int64_t Value = *MaybeInt64;
if (Value < std::numeric_limits<T>::min() ||
Value > std::numeric_limits<T>::max())
return createStringError(errc::io_error, "Out of bound Integer");
Out = Value;
return Error::success();
}
return createStringError(errc::io_error, "Can't parse Integer");
}
static Error fromJson(const json::Value &V, double &Out) {
if (auto S = V.getAsNumber()) {
Out = *S;
return Error::success();
}
return createStringError(errc::io_error, "Can't parse Double");
}
static Error fromJson(const json::Value &V, std::string &Out) {
if (auto S = V.getAsString()) {
Out = *S;
return Error::success();
}
return createStringError(errc::io_error, "Can't parse String");
}
static Error fromJson(const json::Value &V, uint32_t &Out) {
return intFromJsonTemplate(V, Out);
}
static Error fromJson(const json::Value &V, uint8_t &Out) {
return intFromJsonTemplate(V, Out);
}
static Error fromJson(const json::Value &V, int &Out) {
return intFromJsonTemplate(V, Out);
}
static Error fromJson(const json::Value &V, libc_benchmarks::Duration &D) {
if (V.kind() != json::Value::Kind::Number)
return createStringError(errc::io_error, "Can't parse Duration");
D = libc_benchmarks::Duration(*V.getAsNumber());
return Error::success();
}
static Error fromJson(const json::Value &V, MaybeAlign &Out) {
const auto MaybeInt = V.getAsInteger();
if (!MaybeInt)
return createStringError(errc::io_error,
"Can't parse Align, not an Integer");
const int64_t Value = *MaybeInt;
if (!Value) {
Out = None;
return Error::success();
}
if (isPowerOf2_64(Value)) {
Out = Align(Value);
return Error::success();
}
return createStringError(errc::io_error,
"Can't parse Align, not a power of two");
}
static Error fromJson(const json::Value &V,
libc_benchmarks::BenchmarkLog &Out) {
if (V.kind() != json::Value::Kind::String)
return createStringError(errc::io_error,
"Can't parse BenchmarkLog, not a String");
const auto String = *V.getAsString();
auto Parsed =
llvm::StringSwitch<Optional<libc_benchmarks::BenchmarkLog>>(String)
.Case("None", libc_benchmarks::BenchmarkLog::None)
.Case("Last", libc_benchmarks::BenchmarkLog::Last)
.Case("Full", libc_benchmarks::BenchmarkLog::Full)
.Default(None);
if (!Parsed)
return createStringError(errc::io_error,
Twine("Can't parse BenchmarkLog, invalid value '")
.concat(String)
.concat("'"));
Out = *Parsed;
return Error::success();
}
template <typename C>
Error vectorFromJsonTemplate(const json::Value &V, C &Out) {
auto *A = V.getAsArray();
if (!A)
return createStringError(errc::io_error, "Can't parse Array");
Out.clear();
Out.resize(A->size());
for (auto InOutPair : llvm::zip(*A, Out))
if (auto E = fromJson(std::get<0>(InOutPair), std::get<1>(InOutPair)))
return std::move(E);
return Error::success();
}
template <typename T>
static Error fromJson(const json::Value &V, std::vector<T> &Out) {
return vectorFromJsonTemplate(V, Out);
}
template <typename T>
static Error fromJson(const json::Value &V, SmallVectorImpl<T> &Out) {
return vectorFromJsonTemplate(V, Out);
}
// Same as llvm::json::ObjectMapper but adds a finer error reporting mechanism.
class JsonObjectMapper {
const json::Object *O;
Error E;
SmallDenseSet<StringRef> SeenFields;
public:
explicit JsonObjectMapper(const json::Value &V)
: O(V.getAsObject()),
E(O ? Error::success()
: createStringError(errc::io_error, "Expected JSON Object")) {}
Error takeError() {
if (E)
return std::move(E);
for (const auto &Itr : *O) {
const StringRef Key = Itr.getFirst();
if (!SeenFields.count(Key))
E = createStringError(errc::io_error,
Twine("Unknown field: ").concat(Key));
}
return std::move(E);
}
template <typename T> void map(StringRef Key, T &Out) {
if (E)
return;
if (const json::Value *Value = O->get(Key)) {
SeenFields.insert(Key);
E = fromJson(*Value, Out);
}
}
};
static Error fromJson(const json::Value &V,
libc_benchmarks::BenchmarkOptions &Out) {
JsonObjectMapper O(V);
O.map("MinDuration", Out.MinDuration);
O.map("MaxDuration", Out.MaxDuration);
O.map("InitialIterations", Out.InitialIterations);
O.map("MaxIterations", Out.MaxIterations);
O.map("MinSamples", Out.MinSamples);
O.map("MaxSamples", Out.MaxSamples);
O.map("Epsilon", Out.Epsilon);
O.map("ScalingFactor", Out.ScalingFactor);
O.map("Log", Out.Log);
return O.takeError();
}
static Error fromJson(const json::Value &V, libc_benchmarks::SizeRange &Out) {
JsonObjectMapper O(V);
O.map("From", Out.From);
O.map("To", Out.To);
O.map("Step", Out.Step);
return O.takeError();
}
static Error fromJson(const json::Value &V,
libc_benchmarks::StudyConfiguration &Out) {
JsonObjectMapper O(V);
O.map("Runs", Out.Runs);
O.map("BufferSize", Out.BufferSize);
O.map("Size", Out.Size);
O.map("AddressAlignment", Out.AddressAlignment);
O.map("MemsetValue", Out.MemsetValue);
O.map("MemcmpMismatchAt", Out.MemcmpMismatchAt);
return O.takeError();
}
static Error fromJson(const json::Value &V, libc_benchmarks::CacheInfo &Out) {
JsonObjectMapper O(V);
O.map("Type", Out.Type);
O.map("Level", Out.Level);
O.map("Size", Out.Size);
O.map("NumSharing", Out.NumSharing);
return O.takeError();
}
static Error fromJson(const json::Value &V, libc_benchmarks::HostState &Out) {
JsonObjectMapper O(V);
O.map("CpuName", Out.CpuName);
O.map("CpuFrequency", Out.CpuFrequency);
O.map("Caches", Out.Caches);
return O.takeError();
}
static Error fromJson(const json::Value &V,
libc_benchmarks::FunctionMeasurements &Out) {
JsonObjectMapper O(V);
O.map("Name", Out.Name);
std::vector<uint32_t> Sizes;
O.map("Sizes", Sizes);
std::vector<libc_benchmarks::Duration> Runtimes;
O.map("Runtimes", Runtimes);
if (Sizes.size() != Runtimes.size())
return createStringError(errc::io_error,
"Measurement Size and Runtime mismatch");
Out.Measurements.resize(Sizes.size());
for (size_t I = 0; I < Sizes.size(); ++I) {
Out.Measurements[I].Size = Sizes[I];
Out.Measurements[I].Runtime = Runtimes[I];
}
return O.takeError();
}
static Error fromJson(const json::Value &V, libc_benchmarks::Study &Out) {
JsonObjectMapper O(V);
O.map("Host", Out.Host);
O.map("Options", Out.Options);
O.map("Configuration", Out.Configuration);
O.map("Functions", Out.Functions);
return O.takeError();
}
static double Seconds(const Duration &D) {
return std::chrono::duration<double>(D).count();
}
Expected<Study> ParseJsonStudy(StringRef Content) {
Expected<json::Value> EV = json::parse(Content);
if (!EV)
return EV.takeError();
Study S;
if (Error E = fromJson(*EV, S))
return std::move(E);
return S;
}
static StringRef Serialize(const BenchmarkLog &L) {
switch (L) {
case BenchmarkLog::None:
return "None";
case BenchmarkLog::Last:
return "Last";
case BenchmarkLog::Full:
return "Full";
}
llvm_unreachable("Unhandled BenchmarkLog value");
}
static void Serialize(const BenchmarkOptions &BO, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("MinDuration", Seconds(BO.MinDuration));
JOS.attribute("MaxDuration", Seconds(BO.MaxDuration));
JOS.attribute("InitialIterations", BO.InitialIterations);
JOS.attribute("MaxIterations", BO.MaxIterations);
JOS.attribute("MinSamples", BO.MinSamples);
JOS.attribute("MaxSamples", BO.MaxSamples);
JOS.attribute("Epsilon", BO.Epsilon);
JOS.attribute("ScalingFactor", BO.ScalingFactor);
JOS.attribute("Log", Serialize(BO.Log));
});
}
static void Serialize(const CacheInfo &CI, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("Type", CI.Type);
JOS.attribute("Level", CI.Level);
JOS.attribute("Size", CI.Size);
JOS.attribute("NumSharing", CI.NumSharing);
});
}
static void Serialize(const HostState &HS, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("CpuName", HS.CpuName);
JOS.attribute("CpuFrequency", HS.CpuFrequency);
JOS.attributeArray("Caches", [&]() {
for (const auto &CI : HS.Caches)
Serialize(CI, JOS);
});
});
}
static void Serialize(const StudyConfiguration &SC, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("Runs", SC.Runs);
JOS.attribute("BufferSize", SC.BufferSize);
JOS.attributeObject("Size", [&]() {
JOS.attribute("From", SC.Size.From);
JOS.attribute("To", SC.Size.To);
JOS.attribute("Step", SC.Size.Step);
});
if (SC.AddressAlignment)
JOS.attribute("AddressAlignment",
static_cast<int64_t>(SC.AddressAlignment->value()));
JOS.attribute("MemsetValue", SC.MemsetValue);
JOS.attribute("MemcmpMismatchAt", SC.MemcmpMismatchAt);
});
}
static void Serialize(const FunctionMeasurements &FM, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("Name", FM.Name);
JOS.attributeArray("Sizes", [&]() {
for (const auto &M : FM.Measurements)
JOS.value(M.Size);
});
JOS.attributeArray("Runtimes", [&]() {
for (const auto &M : FM.Measurements)
JOS.value(Seconds(M.Runtime));
});
});
}
void SerializeToJson(const Study &S, json::OStream &JOS) {
JOS.object([&]() {
JOS.attributeBegin("Host");
Serialize(S.Host, JOS);
JOS.attributeEnd();
JOS.attributeBegin("Options");
Serialize(S.Options, JOS);
JOS.attributeEnd();
JOS.attributeBegin("Configuration");
Serialize(S.Configuration, JOS);
JOS.attributeEnd();
if (!S.Functions.empty()) {
JOS.attributeArray("Functions", [&]() {
for (const auto &FM : S.Functions)
Serialize(FM, JOS);
});
}
});
}
} // namespace libc_benchmarks
} // namespace llvm
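For reference, a sketch of the kind of document ParseJsonStudy accepts. The field names come from the JsonObjectMapper mappings above; the values and the memcpy entry are made up for illustration, and any field left out simply keeps its default since map() only assigns keys that are present.

// Hypothetical input accepted by ParseJsonStudy; values are illustrative only.
static const char *ExampleStudy = R"json({
  "Options": { "MinDuration": 0.001, "MaxDuration": 1.0, "Log": "None" },
  "Configuration": {
    "Runs": 1,
    "BufferSize": 8192,
    "Size": { "From": 0, "To": 1024, "Step": 1 },
    "AddressAlignment": 16
  },
  "Functions": [
    { "Name": "memcpy", "Sizes": [0, 1, 2], "Runtimes": [1.0e-9, 1.1e-9, 1.2e-9] }
  ]
})json";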

@@ -0,0 +1,28 @@
//===-------- JSON serialization routines -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_UTILS_BENCHMARK_JSON_H
#define LLVM_LIBC_UTILS_BENCHMARK_JSON_H
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/JSON.h"
namespace llvm {
namespace libc_benchmarks {
// Parses a Study from a json string.
Expected<Study> ParseJsonStudy(StringRef Content);
// Serialize a Study as json.
void SerializeToJson(const Study &S, llvm::json::OStream &JOS);
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_JSON_H
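A minimal usage sketch of the two entry points declared above, assuming the caller links against the json helper library added in the benchmarks CMakeLists; it mirrors what JSONTest.cpp exercises with gtest matchers.

#include "JSON.h"
#include "llvm/Support/raw_ostream.h"

// Serializes `S` to a string, then parses it back; any failure surfaces
// through the returned Expected.
llvm::Expected<llvm::libc_benchmarks::Study>
roundTrip(const llvm::libc_benchmarks::Study &S) {
  std::string Buffer;
  llvm::raw_string_ostream RSO(Buffer);
  llvm::json::OStream JOS(RSO);
  llvm::libc_benchmarks::SerializeToJson(S, JOS);
  RSO.flush(); // Make sure the buffered stream reaches `Buffer`.
  return llvm::libc_benchmarks::ParseJsonStudy(Buffer);
}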

@@ -0,0 +1,190 @@
#include "JSON.h"
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
using testing::AllOf;
using testing::ExplainMatchResult;
using testing::Field;
using testing::Pointwise;
namespace llvm {
namespace libc_benchmarks {
namespace {
Study getStudy() {
return Study{
HostState{
"CpuName", 123, {CacheInfo{"A", 1, 2, 3}, CacheInfo{"B", 4, 5, 6}}},
BenchmarkOptions{std::chrono::seconds(1), std::chrono::seconds(2), 10,
100, 6, 100, 0.1, 2, BenchmarkLog::Full},
StudyConfiguration{2, 3, SizeRange{4, 5, 6}, Align(8), 9, 10},
{FunctionMeasurements{"A",
{Measurement{3, std::chrono::seconds(3)},
Measurement{3, std::chrono::seconds(4)}}},
FunctionMeasurements{"B", {}}}};
}
static std::string SerializeToString(const Study &S) {
std::string Buffer;
raw_string_ostream RSO(Buffer);
json::OStream JOS(RSO);
SerializeToJson(S, JOS);
return Buffer;
}
MATCHER(EqualsCacheInfo, "") {
const CacheInfo &A = ::testing::get<0>(arg);
const CacheInfo &B = ::testing::get<1>(arg);
return ExplainMatchResult(AllOf(Field(&CacheInfo::Type, B.Type),
Field(&CacheInfo::Level, B.Level),
Field(&CacheInfo::Size, B.Size),
Field(&CacheInfo::NumSharing, B.NumSharing)),
A, result_listener);
}
auto Equals(const HostState &H) -> auto {
return AllOf(
Field(&HostState::CpuName, H.CpuName),
Field(&HostState::CpuFrequency, H.CpuFrequency),
Field(&HostState::Caches, Pointwise(EqualsCacheInfo(), H.Caches)));
}
auto Equals(const BenchmarkOptions &BO) -> auto {
return AllOf(
Field(&BenchmarkOptions::MinDuration, BO.MinDuration),
Field(&BenchmarkOptions::MaxDuration, BO.MaxDuration),
Field(&BenchmarkOptions::InitialIterations, BO.InitialIterations),
Field(&BenchmarkOptions::MaxIterations, BO.MaxIterations),
Field(&BenchmarkOptions::MinSamples, BO.MinSamples),
Field(&BenchmarkOptions::MaxSamples, BO.MaxSamples),
Field(&BenchmarkOptions::Epsilon, BO.Epsilon),
Field(&BenchmarkOptions::ScalingFactor, BO.ScalingFactor),
Field(&BenchmarkOptions::Log, BO.Log));
}
auto Equals(const SizeRange &SR) -> auto {
return AllOf(Field(&SizeRange::From, SR.From), Field(&SizeRange::To, SR.To),
Field(&SizeRange::Step, SR.Step));
}
auto Equals(const StudyConfiguration &SC) -> auto {
return AllOf(
Field(&StudyConfiguration::Runs, SC.Runs),
Field(&StudyConfiguration::BufferSize, SC.BufferSize),
Field(&StudyConfiguration::Size, Equals(SC.Size)),
Field(&StudyConfiguration::AddressAlignment, SC.AddressAlignment),
Field(&StudyConfiguration::MemsetValue, SC.MemsetValue),
Field(&StudyConfiguration::MemcmpMismatchAt, SC.MemcmpMismatchAt));
}
MATCHER(EqualsMeasurement, "") {
const Measurement &A = ::testing::get<0>(arg);
const Measurement &B = ::testing::get<1>(arg);
return ExplainMatchResult(AllOf(Field(&Measurement::Size, B.Size),
Field(&Measurement::Runtime, B.Runtime)),
A, result_listener);
}
MATCHER(EqualsFunctions, "") {
const FunctionMeasurements &A = ::testing::get<0>(arg);
const FunctionMeasurements &B = ::testing::get<1>(arg);
return ExplainMatchResult(
AllOf(Field(&FunctionMeasurements::Name, B.Name),
Field(&FunctionMeasurements::Measurements,
Pointwise(EqualsMeasurement(), B.Measurements))),
A, result_listener);
}
auto Equals(const Study &S) -> auto {
return AllOf(
Field(&Study::Host, Equals(S.Host)),
Field(&Study::Options, Equals(S.Options)),
Field(&Study::Configuration, Equals(S.Configuration)),
Field(&Study::Functions, Pointwise(EqualsFunctions(), S.Functions)));
}
TEST(JsonTest, RoundTrip) {
const Study S = getStudy();
auto StudyOrError = ParseJsonStudy(SerializeToString(S));
if (auto Err = StudyOrError.takeError())
EXPECT_FALSE(Err) << "Unexpected error";
const Study &Parsed = *StudyOrError;
EXPECT_THAT(Parsed, Equals(S));
}
TEST(JsonTest, SupplementaryField) {
auto Failure = ParseJsonStudy(R"({
"UnknownField": 10
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Unknown field: UnknownField");
}
TEST(JsonTest, InvalidType) {
auto Failure = ParseJsonStudy(R"({
"Options": 1
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Expected JSON Object");
}
TEST(JsonTest, InvalidDuration) {
auto Failure = ParseJsonStudy(R"({
"Options": {
"MinDuration": "Duration should be a Number"
}
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Can't parse Duration");
}
TEST(JsonTest, InvalidAlignType) {
auto Failure = ParseJsonStudy(R"({
"Configuration":{
"AddressAlignment": "Align should be an Integer"
}
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Can't parse Align, not an Integer");
}
TEST(JsonTest, InvalidAlign) {
auto Failure = ParseJsonStudy(R"({
"Configuration":{
"AddressAlignment":3
}
}
)");
EXPECT_EQ(toString(Failure.takeError()),
"Can't parse Align, not a power of two");
}
TEST(JsonTest, InvalidBenchmarkLogType) {
auto Failure = ParseJsonStudy(R"({
"Options":{
"Log": 3
}
}
)");
EXPECT_EQ(toString(Failure.takeError()),
"Can't parse BenchmarkLog, not a String");
}
TEST(JsonTest, InvalidBenchmarkLog) {
auto Failure = ParseJsonStudy(R"({
"Options":{
"Log": "Unknown"
}
}
)");
EXPECT_EQ(toString(Failure.takeError()),
"Can't parse BenchmarkLog, invalid value 'Unknown'");
}
} // namespace
} // namespace libc_benchmarks
} // namespace llvm

@@ -0,0 +1,40 @@
//===-------- `Benchmark` function ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "llvm/Support/Host.h"
namespace llvm {
namespace libc_benchmarks {
void checkRequirements() {
const auto &CpuInfo = benchmark::CPUInfo::Get();
if (CpuInfo.scaling_enabled)
report_fatal_error(
"CPU scaling is enabled, the benchmark real time measurements may be "
"noisy and will incur extra overhead.");
}
HostState HostState::get() {
const auto &CpuInfo = benchmark::CPUInfo::Get();
HostState H;
H.CpuFrequency = CpuInfo.cycles_per_second;
H.CpuName = llvm::sys::getHostCPUName().str();
for (const auto &BenchmarkCacheInfo : CpuInfo.caches) {
CacheInfo CI;
CI.Type = BenchmarkCacheInfo.type;
CI.Level = BenchmarkCacheInfo.level;
CI.Size = BenchmarkCacheInfo.size;
CI.NumSharing = BenchmarkCacheInfo.num_sharing;
H.Caches.push_back(std::move(CI));
}
return H;
}
} // namespace libc_benchmarks
} // namespace llvm

@@ -0,0 +1,324 @@
//===-------- `Benchmark` function ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file mainly defines a `Benchmark` function.
//
// The benchmarking process is as follows:
// - We start by measuring the time it takes to run the function
// `InitialIterations` times. This is called a Sample. From this we can derive
// the time it took to run a single iteration.
//
// - We repeat the previous step with a greater number of iterations to lower
// the impact of the measurement. We can derive a more precise estimation of the
// runtime for a single iteration.
//
// - Each sample gives a more accurate estimation of the runtime for a single
// iteration but also takes more time to run. We stop the process when:
// * The measurement stabilizes within a certain precision (Epsilon),
// * The overall benchmarking time is greater than MaxDuration,
// * The overall sample count is greater than MaxSamples,
// * The last sample used more than MaxIterations iterations.
//
// - We also make sure that the benchmark doesn't run for too short a period of
// time by defining MinDuration and MinSamples.
#ifndef LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
#define LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
#include "benchmark/benchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include <array>
#include <chrono>
#include <cstdint>
namespace llvm {
namespace libc_benchmarks {
// Makes sure the binary was compiled in release mode and that the frequency
// governor is set to performance.
void checkRequirements();
using Duration = std::chrono::duration<double>;
enum class BenchmarkLog {
None, // Don't keep the internal state of the benchmark.
Last, // Keep only the last batch.
Full // Keep all iterations states, useful for testing or debugging.
};
// An object to configure the benchmark stopping conditions.
// See documentation at the beginning of the file for the overall algorithm and
// meaning of each field.
struct BenchmarkOptions {
// The minimum time for which the benchmark is running.
Duration MinDuration = std::chrono::seconds(0);
// The maximum time for which the benchmark is running.
Duration MaxDuration = std::chrono::seconds(10);
// The number of iterations in the first sample.
uint32_t InitialIterations = 1;
// The maximum number of iterations for any given sample.
uint32_t MaxIterations = 10000000;
// The minimum number of samples.
uint32_t MinSamples = 4;
// The maximum number of samples.
uint32_t MaxSamples = 1000;
// The benchmark will stop if the relative difference between the current and
// the last estimation is less than Epsilon. This is 1% by default.
double Epsilon = 0.01;
// The number of iterations grows exponentially between each sample.
// Must be greater than or equal to 1.
double ScalingFactor = 1.4;
BenchmarkLog Log = BenchmarkLog::None;
};
// The state of a benchmark.
enum class BenchmarkStatus {
Running,
MaxDurationReached,
MaxIterationsReached,
MaxSamplesReached,
PrecisionReached,
};
// The internal state of the benchmark, useful to debug, test or report
// statistics.
struct BenchmarkState {
size_t LastSampleIterations;
Duration LastBatchElapsed;
BenchmarkStatus CurrentStatus;
Duration CurrentBestGuess; // The time estimation for a single run of `foo`.
double ChangeRatio; // The change in time estimation between previous and
// current samples.
};
// A lightweight result for a benchmark.
struct BenchmarkResult {
BenchmarkStatus TerminationStatus = BenchmarkStatus::Running;
Duration BestGuess = {};
llvm::Optional<llvm::SmallVector<BenchmarkState, 16>> MaybeBenchmarkLog;
};
// Stores information about a cache in the host memory system.
struct CacheInfo {
std::string Type; // e.g. "Instruction", "Data", "Unified".
int Level; // 0 is closest to processing unit.
int Size; // In bytes.
int NumSharing; // The number of processing units (Hyper-Threading Thread)
// with which this cache is shared.
};
// Stores information about the host.
struct HostState {
std::string CpuName; // A string compatible with the -march option.
double CpuFrequency; // in Hertz.
std::vector<CacheInfo> Caches;
static HostState get();
};
namespace internal {
struct Measurement {
size_t Iterations = 0;
Duration Elapsed = {};
};
// Updates the estimation of the elapsed time for a single iteration.
class RefinableRuntimeEstimation {
Duration TotalTime = {};
size_t TotalIterations = 0;
public:
Duration update(const Measurement &M) {
assert(M.Iterations > 0);
// Duration is encoded as a double (see definition).
// `TotalTime` and `M.Elapsed` are of the same magnitude so we don't expect
// loss of precision due to radically different scales.
TotalTime += M.Elapsed;
TotalIterations += M.Iterations;
return TotalTime / TotalIterations;
}
};
// This class tracks the progression of the runtime estimation.
class RuntimeEstimationProgression {
RefinableRuntimeEstimation RRE;
public:
Duration CurrentEstimation = {};
// Returns the change ratio between our best guess so far and the one from the
// new measurement.
double computeImprovement(const Measurement &M) {
const Duration NewEstimation = RRE.update(M);
const double Ratio = fabs(((CurrentEstimation / NewEstimation) - 1.0));
CurrentEstimation = NewEstimation;
return Ratio;
}
};
} // namespace internal
// Measures the runtime of `foo` until conditions defined by `Options` are met.
//
// To avoid measurement imprecision, we measure batches of `foo`.
// The batch size grows by `ScalingFactor` to minimize the measurement
// overhead.
//
// Note: The benchmark is not responsible for serializing the executions of
// `foo`. It is not suitable for measuring very small, side-effect-free
// functions, as the processor is free to run several executions in
// parallel.
//
// - Options: A set of parameters controlling the stopping conditions for the
// benchmark.
// - foo: The function under test. It takes one value and returns one value.
// The input value is used to randomize the execution of `foo` as part of a
// batch to mitigate the effect of the branch predictor. Signature:
// `ProductType foo(ParameterProvider::value_type value);`
// The output value is a product of the execution of `foo` and prevents the
// compiler from optimizing out foo's body.
// - ParameterProvider: An object responsible for providing a range of
// `Iterations` values to use as input for `foo`. The `value_type` of the
// returned container has to be compatible with `foo` argument.
// Must implement one of:
// `Container<ParameterType> generateBatch(size_t Iterations);`
// `const Container<ParameterType>& generateBatch(size_t Iterations);`
// - Clock: An object providing the current time. Must implement:
// `std::chrono::time_point now();`
template <typename Function, typename ParameterProvider,
typename BenchmarkClock = const std::chrono::high_resolution_clock>
BenchmarkResult benchmark(const BenchmarkOptions &Options,
ParameterProvider &PP, Function foo,
BenchmarkClock &Clock = BenchmarkClock()) {
BenchmarkResult Result;
internal::RuntimeEstimationProgression REP;
Duration TotalBenchmarkDuration = {};
size_t Iterations = std::max(Options.InitialIterations, uint32_t(1));
size_t Samples = 0;
if (Options.ScalingFactor < 1.0)
report_fatal_error("ScalingFactor should be >= 1");
if (Options.Log != BenchmarkLog::None)
Result.MaybeBenchmarkLog.emplace();
for (;;) {
// Request a new Batch of size `Iterations`.
const auto &Batch = PP.generateBatch(Iterations);
// Measuring this Batch.
const auto StartTime = Clock.now();
for (const auto Parameter : Batch) {
const auto Production = foo(Parameter);
benchmark::DoNotOptimize(Production);
}
const auto EndTime = Clock.now();
const Duration Elapsed = EndTime - StartTime;
// Updating statistics.
++Samples;
TotalBenchmarkDuration += Elapsed;
const double ChangeRatio = REP.computeImprovement({Iterations, Elapsed});
Result.BestGuess = REP.CurrentEstimation;
// Stopping condition.
if (TotalBenchmarkDuration >= Options.MinDuration &&
Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon)
Result.TerminationStatus = BenchmarkStatus::PrecisionReached;
else if (Samples >= Options.MaxSamples)
Result.TerminationStatus = BenchmarkStatus::MaxSamplesReached;
else if (TotalBenchmarkDuration >= Options.MaxDuration)
Result.TerminationStatus = BenchmarkStatus::MaxDurationReached;
else if (Iterations >= Options.MaxIterations)
Result.TerminationStatus = BenchmarkStatus::MaxIterationsReached;
if (Result.MaybeBenchmarkLog) {
auto &BenchmarkLog = *Result.MaybeBenchmarkLog;
if (Options.Log == BenchmarkLog::Last && !BenchmarkLog.empty())
BenchmarkLog.pop_back();
BenchmarkState BS;
BS.LastSampleIterations = Iterations;
BS.LastBatchElapsed = Elapsed;
BS.CurrentStatus = Result.TerminationStatus;
BS.CurrentBestGuess = Result.BestGuess;
BS.ChangeRatio = ChangeRatio;
BenchmarkLog.push_back(BS);
}
if (Result.TerminationStatus != BenchmarkStatus::Running)
return Result;
if (Options.ScalingFactor > 1 &&
Iterations * Options.ScalingFactor == Iterations)
report_fatal_error(
"`Iterations *= ScalingFactor` is idempotent, increase ScalingFactor "
"or InitialIterations.");
Iterations *= Options.ScalingFactor;
}
}
// Interprets `Array` as a circular buffer of `Size` elements.
template <typename T> class CircularArrayRef {
llvm::ArrayRef<T> Array;
size_t Size;
public:
using value_type = T;
using reference = T &;
using const_reference = const T &;
using difference_type = ssize_t;
using size_type = size_t;
class const_iterator
: public std::iterator<std::input_iterator_tag, T, ssize_t> {
llvm::ArrayRef<T> Array;
size_t Index;
public:
explicit const_iterator(llvm::ArrayRef<T> Array, size_t Index = 0)
: Array(Array), Index(Index) {}
const_iterator &operator++() {
++Index;
return *this;
}
bool operator==(const_iterator Other) const { return Index == Other.Index; }
bool operator!=(const_iterator Other) const { return !(*this == Other); }
const T &operator*() const { return Array[Index % Array.size()]; }
};
CircularArrayRef(llvm::ArrayRef<T> Array, size_t Size)
: Array(Array), Size(Size) {
assert(Array.size() > 0);
}
const_iterator begin() const { return const_iterator(Array); }
const_iterator end() const { return const_iterator(Array, Size); }
};
// A convenient helper to produce a CircularArrayRef from an ArrayRef.
template <typename T>
CircularArrayRef<T> cycle(llvm::ArrayRef<T> Array, size_t Size) {
return {Array, Size};
}
// Creates a std::array whose storage size is constrained to at most `Bytes`.
template <typename T, size_t Bytes>
using ByteConstrainedArray = std::array<T, Bytes / sizeof(T)>;
// A convenient helper to produce a CircularArrayRef from a
// ByteConstrainedArray.
template <typename T, size_t N>
CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) {
return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size};
}
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
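To make the sampling loop described at the top of this header concrete, here is a minimal sketch of driving `benchmark` with the default high-resolution clock. `IncrementProvider` and `measureIncrement` are invented for the illustration and are not part of the patch; as the Note above says, such a tiny side-effect-free function mostly exercises the plumbing rather than producing a meaningful measurement.

#include "LibcBenchmark.h"
#include <vector>

namespace {
// Hypothetical parameter provider: a batch of `Iterations` zero-valued ints.
struct IncrementProvider {
  std::vector<int> generateBatch(size_t Iterations) {
    return std::vector<int>(Iterations);
  }
};
} // namespace

llvm::libc_benchmarks::Duration measureIncrement() {
  using namespace llvm::libc_benchmarks;
  BenchmarkOptions Opts;
  Opts.MaxDuration = std::chrono::seconds(1); // Cap the whole run at 1 second.
  IncrementProvider PP;
  // The lambda is the function under test; its return value is fed to
  // benchmark::DoNotOptimize so the body is not optimized away.
  const BenchmarkResult Result =
      benchmark(Opts, PP, [](int V) { return V + 1; });
  return Result.BestGuess; // Estimated time for a single call.
}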

@@ -0,0 +1,168 @@
#include "LibcBenchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include <chrono>
#include <limits>
#include <queue>
#include <vector>
using std::chrono::nanoseconds;
using ::testing::ElementsAre;
using ::testing::Field;
using ::testing::IsEmpty;
using ::testing::SizeIs;
namespace llvm {
namespace libc_benchmarks {
namespace {
// A simple parameter provider returning a zero initialized vector of size
// `iterations`.
struct DummyParameterProvider {
std::vector<char> generateBatch(size_t iterations) {
return std::vector<char>(iterations);
}
};
class LibcBenchmark : public ::testing::Test {
public:
// A Clock interface suitable for testing.
// - Either it returns 0,
// - Or a timepoint coming from the `setMeasurements` call.
Duration now() {
if (!MaybeTimepoints)
return {};
assert(!MaybeTimepoints->empty());
const Duration timepoint = MaybeTimepoints->front();
MaybeTimepoints->pop();
return timepoint;
}
protected:
void SetUp() override { Options.Log = BenchmarkLog::Full; }
void TearDown() override {
// We make sure all the expected measurements were performed.
if (MaybeTimepoints)
EXPECT_THAT(*MaybeTimepoints, IsEmpty());
}
BenchmarkResult run() {
return benchmark(Options, ParameterProvider, DummyFunction, *this);
}
void setMeasurements(llvm::ArrayRef<Duration> Durations) {
MaybeTimepoints.emplace(); // Create the optional value.
Duration CurrentTime = nanoseconds(1);
for (const auto &Duration : Durations) {
MaybeTimepoints->push(CurrentTime);
CurrentTime += Duration;
MaybeTimepoints->push(CurrentTime);
CurrentTime += nanoseconds(1);
}
}
BenchmarkOptions Options;
private:
DummyParameterProvider ParameterProvider;
static char DummyFunction(char Payload) { return Payload; }
llvm::Optional<std::queue<Duration>> MaybeTimepoints;
};
TEST_F(LibcBenchmark, MaxSamplesReached) {
Options.MaxSamples = 1;
const auto Result = run();
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 1);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxSamplesReached);
}
TEST_F(LibcBenchmark, MaxDurationReached) {
Options.MaxDuration = nanoseconds(10);
setMeasurements({nanoseconds(11)});
const auto Result = run();
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 1);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxDurationReached);
}
TEST_F(LibcBenchmark, MaxIterationsReached) {
Options.InitialIterations = 1;
Options.MaxIterations = 20;
Options.ScalingFactor = 2;
Options.Epsilon = 0; // unreachable.
const auto Result = run();
EXPECT_THAT(*Result.MaybeBenchmarkLog,
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
Field(&BenchmarkState::LastSampleIterations, 2),
Field(&BenchmarkState::LastSampleIterations, 4),
Field(&BenchmarkState::LastSampleIterations, 8),
Field(&BenchmarkState::LastSampleIterations, 16),
Field(&BenchmarkState::LastSampleIterations, 32)));
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 6);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxIterationsReached);
}
TEST_F(LibcBenchmark, MinSamples) {
Options.MinSamples = 4;
Options.ScalingFactor = 2;
Options.Epsilon = std::numeric_limits<double>::max(); // always reachable.
setMeasurements(
{nanoseconds(1), nanoseconds(2), nanoseconds(4), nanoseconds(8)});
const auto Result = run();
EXPECT_THAT(*Result.MaybeBenchmarkLog,
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
Field(&BenchmarkState::LastSampleIterations, 2),
Field(&BenchmarkState::LastSampleIterations, 4),
Field(&BenchmarkState::LastSampleIterations, 8)));
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 4);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::PrecisionReached);
}
TEST_F(LibcBenchmark, Epsilon) {
Options.MinSamples = 4;
Options.ScalingFactor = 2;
Options.Epsilon = std::numeric_limits<double>::max(); // always reachable.
setMeasurements(
{nanoseconds(1), nanoseconds(2), nanoseconds(4), nanoseconds(8)});
const auto Result = run();
EXPECT_THAT(*Result.MaybeBenchmarkLog,
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
Field(&BenchmarkState::LastSampleIterations, 2),
Field(&BenchmarkState::LastSampleIterations, 4),
Field(&BenchmarkState::LastSampleIterations, 8)));
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 4);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::PrecisionReached);
}
TEST(ArrayRefLoop, Cycle) {
std::array<int, 2> array = {1, 2};
EXPECT_THAT(cycle(array, 0), ElementsAre());
EXPECT_THAT(cycle(array, 1), ElementsAre(1));
EXPECT_THAT(cycle(array, 2), ElementsAre(1, 2));
EXPECT_THAT(cycle(array, 3), ElementsAre(1, 2, 1));
EXPECT_THAT(cycle(array, 4), ElementsAre(1, 2, 1, 2));
EXPECT_THAT(cycle(array, 5), ElementsAre(1, 2, 1, 2, 1));
}
TEST(ByteConstrainedArray, Simple) {
EXPECT_THAT((ByteConstrainedArray<char, 17>()), SizeIs(17));
EXPECT_THAT((ByteConstrainedArray<uint16_t, 17>()), SizeIs(8));
EXPECT_THAT((ByteConstrainedArray<uint32_t, 17>()), SizeIs(4));
EXPECT_THAT((ByteConstrainedArray<uint64_t, 17>()), SizeIs(2));
EXPECT_LE(sizeof(ByteConstrainedArray<char, 17>), 17U);
EXPECT_LE(sizeof(ByteConstrainedArray<uint16_t, 17>), 17U);
EXPECT_LE(sizeof(ByteConstrainedArray<uint32_t, 17>), 17U);
EXPECT_LE(sizeof(ByteConstrainedArray<uint64_t, 17>), 17U);
}
TEST(ByteConstrainedArray, Cycle) {
ByteConstrainedArray<uint64_t, 17> TwoValues{{1UL, 2UL}};
EXPECT_THAT(cycle(TwoValues, 5), ElementsAre(1, 2, 1, 2, 1));
}
} // namespace
} // namespace libc_benchmarks
} // namespace llvm

@@ -0,0 +1,62 @@
//===-------- Benchmark memory specific tools -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcMemoryBenchmark.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
namespace llvm {
namespace libc_benchmarks {
// Returns a distribution that samples the buffer to satisfy the required
// alignment.
// When alignment is set, the distribution is scaled down by `Factor` and scaled
// up again by the same amount during sampling.
static std::uniform_int_distribution<uint32_t>
GetOffsetDistribution(const StudyConfiguration &Conf) {
if (Conf.AddressAlignment &&
*Conf.AddressAlignment > AlignedBuffer::Alignment)
report_fatal_error(
"AddressAlignment must be less or equal to AlignedBuffer::Alignment");
if (!Conf.AddressAlignment)
return std::uniform_int_distribution<uint32_t>(0, 0); // Always 0.
// If we test up to Size bytes, the returned offset must stay under
// BufferSize - Size.
int64_t MaxOffset = Conf.BufferSize;
MaxOffset -= Conf.Size.To;
MaxOffset -= 1;
if (MaxOffset < 0)
report_fatal_error(
"BufferSize too small to exercise specified Size configuration");
MaxOffset /= Conf.AddressAlignment->value();
return std::uniform_int_distribution<uint32_t>(0, MaxOffset);
}
OffsetDistribution::OffsetDistribution(const StudyConfiguration &Conf)
: Distribution(GetOffsetDistribution(Conf)),
Factor(Conf.AddressAlignment.valueOrOne().value()) {}
// Precomputes the offsets at which to insert mismatches between the buffers.
MismatchOffsetDistribution::MismatchOffsetDistribution(
const StudyConfiguration &Conf)
: MismatchAt(Conf.MemcmpMismatchAt) {
if (MismatchAt <= 1)
return;
const auto ToSize = Conf.Size.To;
for (size_t I = ToSize + 1; I < Conf.BufferSize; I += ToSize)
MismatchIndices.push_back(I);
if (MismatchIndices.empty())
llvm::report_fatal_error("Unable to generate mismatch");
MismatchIndexSelector =
std::uniform_int_distribution<size_t>(0, MismatchIndices.size() - 1);
}
} // namespace libc_benchmarks
} // namespace llvm
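A worked example of the arithmetic above, with illustrative values only: for BufferSize = 8192, Size.To = 1024 and AddressAlignment = Align(16), GetOffsetDistribution draws uniformly from [0, (8192 - 1024 - 1) / 16] = [0, 447], and OffsetDistribution::operator() scales the draw back up by 16, so every returned offset is 16-byte aligned, is at most 7152, and leaves room for a 1024-byte access inside the buffer.

#include "LibcMemoryBenchmark.h"
#include <random>

// Sketch only: sample one aligned offset under the configuration above.
uint32_t sampleAlignedOffset() {
  using namespace llvm;
  using namespace llvm::libc_benchmarks;
  StudyConfiguration Conf;
  Conf.BufferSize = 8192;
  Conf.Size.To = 1024;
  Conf.AddressAlignment = Align(16);
  OffsetDistribution OD(Conf);
  std::default_random_engine Gen;
  const uint32_t Offset = OD(Gen); // Multiple of 16, at most 7152.
  return Offset;                   // Offset + Size.To stays within BufferSize.
}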

@@ -0,0 +1,183 @@
//===-------- Benchmark memory specific tools -------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file complements the `benchmark` header with memory specific tools and
// benchmarking facilities.
#ifndef LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H
#include "LibcBenchmark.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Alignment.h"
#include <cstdint>
#include <random>
namespace llvm {
namespace libc_benchmarks {
//--------------
// Configuration
//--------------
// Specifies a range of sizes to explore.
struct SizeRange {
uint32_t From = 0; // Inclusive
uint32_t To = 1024; // Inclusive
uint32_t Step = 1;
};
// An object to define how to test a memory function.
struct StudyConfiguration {
// The number of runs for the study.
uint32_t Runs = 1;
// The size of the buffers (1 buffer for memset but 2 for memcpy or memcmp).
// When testing small sizes, it's important to keep the total allocated
// size under the size of the L1 cache (usually 16 or 32KiB). The framework
// will also use 2KiB of additional L1 memory to store the function
// parameters.
uint32_t BufferSize = 8192;
// The range of sizes to exercise.
SizeRange Size;
MaybeAlign AddressAlignment; // Unset : Use start of buffer (which is at
// least cache-line aligned),
// 1 : Use random address,
// >1 : Use random address aligned to value.
// The value to use for memset.
uint8_t MemsetValue = 0;
// The mismatch position for memcmp.
uint32_t MemcmpMismatchAt = 0; // 0 : Buffer compare equal,
// >0 : Buffer compare different at byte N-1.
};
//--------
// Results
//--------
// The time to run one iteration of the function under test for the specified
// Size.
struct Measurement {
uint32_t Size = 0;
Duration Runtime = {};
};
// The measurements for a specific function.
struct FunctionMeasurements {
std::string Name;
std::vector<Measurement> Measurements;
};
// The root object containing all the data (configuration and measurements).
struct Study {
HostState Host;
BenchmarkOptions Options;
StudyConfiguration Configuration;
SmallVector<FunctionMeasurements, 4> Functions;
};
// Provides an aligned, dynamically allocated buffer.
class AlignedBuffer {
char *const Buffer = nullptr;
size_t Size = 0;
public:
static constexpr size_t Alignment = 1024;
explicit AlignedBuffer(size_t Size)
: Buffer(static_cast<char *>(aligned_alloc(1024, Size))), Size(Size) {}
~AlignedBuffer() { free(Buffer); }
inline char *operator+(size_t Index) { return Buffer + Index; }
inline const char *operator+(size_t Index) const { return Buffer + Index; }
inline char &operator[](size_t Index) { return Buffer[Index]; }
inline const char &operator[](size_t Index) const { return Buffer[Index]; }
inline char *begin() { return Buffer; }
inline char *end() { return Buffer + Size; }
};
// Implements the ParameterProvider abstraction needed by the `benchmark`
// function. This implementation makes sure that all parameters will fit into
// `StorageSize` bytes. The total memory accessed during the benchmark (the
// ParameterProvider storage plus the memory buffers) should be less than the
// size of the L1 data cache.
template <typename Context, size_t StorageSize = 8 * 1024>
class SmallParameterProvider {
using ParameterType = typename Context::ParameterType;
ByteConstrainedArray<ParameterType, StorageSize> Parameters;
size_t LastIterations;
Context &Ctx;
public:
explicit SmallParameterProvider(Context &C) : Ctx(C) {}
SmallParameterProvider(const SmallParameterProvider &) = delete;
SmallParameterProvider &operator=(const SmallParameterProvider &) = delete;
// Useful to compute the histogram of the size parameter.
CircularArrayRef<ParameterType> getLastBatch() const {
return cycle(Parameters, LastIterations);
}
// Implements the interface needed by the `benchmark` function.
CircularArrayRef<ParameterType> generateBatch(size_t Iterations) {
LastIterations = Iterations;
Ctx.Randomize(Parameters);
return getLastBatch();
}
};
// Helper to generate random buffer offsets that satisfy the configuration
// constraints.
class OffsetDistribution {
std::uniform_int_distribution<uint32_t> Distribution;
uint32_t Factor;
public:
explicit OffsetDistribution(const StudyConfiguration &Conf);
template <class Generator> uint32_t operator()(Generator &G) {
return Distribution(G) * Factor;
}
};
// Helper to generate random buffer offsets that satisfy the configuration
// constraints. It is specifically designed to benchmark `memcmp` functions
// where we may want the Nth byte to differ.
class MismatchOffsetDistribution {
std::uniform_int_distribution<size_t> MismatchIndexSelector;
llvm::SmallVector<uint32_t, 16> MismatchIndices;
const uint32_t MismatchAt;
public:
explicit MismatchOffsetDistribution(const StudyConfiguration &Conf);
explicit operator bool() const { return !MismatchIndices.empty(); }
const llvm::SmallVectorImpl<uint32_t> &getMismatchIndices() const {
return MismatchIndices;
}
template <class Generator> uint32_t operator()(Generator &G, uint32_t Size) {
const uint32_t MismatchIndex = MismatchIndices[MismatchIndexSelector(G)];
// We need to position the offset so that a mismatch occurs at MismatchAt.
if (Size >= MismatchAt)
return MismatchIndex - MismatchAt;
// Size is too small to trigger the mismatch.
return MismatchIndex - Size - 1;
}
};
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H

@@ -0,0 +1,100 @@
//===-------- Benchmark --------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcMemoryBenchmarkMain.h"
#include "JSON.h"
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace libc_benchmarks {
static cl::opt<std::string>
Configuration("conf", cl::desc("Specify configuration filename"),
cl::value_desc("filename"), cl::init(""));
static cl::opt<std::string> Output("o", cl::desc("Specify output filename"),
cl::value_desc("filename"), cl::init("-"));
extern std::unique_ptr<BenchmarkRunner>
getRunner(const StudyConfiguration &Conf);
void Main() {
#ifndef NDEBUG
static_assert(
false,
"For reproducibility benchmarks should not be compiled in DEBUG mode.");
#endif
checkRequirements();
ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
MemoryBuffer::getFileOrSTDIN(Configuration);
if (!MB)
report_fatal_error(
Twine("Could not open configuration file: ").concat(Configuration));
auto ErrorOrStudy = ParseJsonStudy((*MB)->getBuffer());
if (!ErrorOrStudy)
report_fatal_error(ErrorOrStudy.takeError());
const auto StudyPrototype = *ErrorOrStudy;
Study S;
S.Host = HostState::get();
S.Options = StudyPrototype.Options;
S.Configuration = StudyPrototype.Configuration;
const auto Runs = S.Configuration.Runs;
const auto &SR = S.Configuration.Size;
std::unique_ptr<BenchmarkRunner> Runner = getRunner(S.Configuration);
const size_t TotalSteps =
Runner->getFunctionNames().size() * Runs * ((SR.To - SR.From) / SR.Step);
size_t Steps = 0;
for (auto FunctionName : Runner->getFunctionNames()) {
FunctionMeasurements FM;
FM.Name = FunctionName;
for (size_t Run = 0; Run < Runs; ++Run) {
for (uint32_t Size = SR.From; Size <= SR.To; Size += SR.Step) {
const auto Result = Runner->benchmark(S.Options, FunctionName, Size);
Measurement Measurement;
Measurement.Runtime = Result.BestGuess;
Measurement.Size = Size;
FM.Measurements.push_back(Measurement);
outs() << format("%3d%% run: %2d / %2d size: %5d ",
(Steps * 100 / TotalSteps), Run, Runs, Size)
<< FunctionName
<< " \r";
++Steps;
}
}
S.Functions.push_back(std::move(FM));
}
std::error_code EC;
raw_fd_ostream FOS(Output, EC);
if (EC)
report_fatal_error(Twine("Could not open file: ")
.concat(EC.message())
.concat(", ")
.concat(Output));
json::OStream JOS(FOS);
SerializeToJson(S, JOS);
}
} // namespace libc_benchmarks
} // namespace llvm
int main(int argc, char **argv) {
llvm::cl::ParseCommandLineOptions(argc, argv);
llvm::libc_benchmarks::Main();
return EXIT_SUCCESS;
}

@@ -0,0 +1,36 @@
//===-------- BenchmarkRunner interface -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
#include "LibcBenchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
namespace llvm {
namespace libc_benchmarks {
// Each memory function benchmark implements this interface.
// It is used by the main function to run all benchmarks in a uniform manner.
class BenchmarkRunner {
public:
virtual ~BenchmarkRunner() {}
// Returns a list of all available functions to test.
virtual ArrayRef<StringRef> getFunctionNames() const = 0;
// Performs the benchmarking for a particular FunctionName and Size.
virtual BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) = 0;
};
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H

@@ -0,0 +1,112 @@
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/Alignment.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
using testing::AllOf;
using testing::AnyOf;
using testing::ElementsAre;
using testing::Ge;
using testing::Gt;
using testing::Le;
using testing::Lt;
namespace llvm {
namespace libc_benchmarks {
namespace {
TEST(AlignedBuffer, IsAligned) {
AlignedBuffer AB(0);
EXPECT_TRUE(isAddrAligned(Align(AlignedBuffer::Alignment), AB.begin()));
}
TEST(AlignedBuffer, Empty) {
AlignedBuffer AB(0);
EXPECT_EQ(std::distance(AB.begin(), AB.end()), 0U);
}
TEST(OffsetDistribution, AlignToBegin) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.AddressAlignment = None;
OffsetDistribution OD(Conf);
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_EQ(OD(Gen), 0U);
}
TEST(OffsetDistribution, NoAlignment) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.AddressAlignment = Align::None();
Conf.Size.To = 1;
OffsetDistribution OD(Conf);
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U)));
}
MATCHER_P(IsDivisibleBy, n, "") {
*result_listener << "where the remainder is " << (arg % n);
return (arg % n) == 0;
}
TEST(OffsetDistribution, Aligned) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.AddressAlignment = Align(16);
Conf.Size.To = 1;
OffsetDistribution OD(Conf);
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U), IsDivisibleBy(16U)));
}
TEST(MismatchOffsetDistribution, EqualBufferDisablesDistribution) {
StudyConfiguration Conf;
Conf.MemcmpMismatchAt = 0; // buffers are equal.
MismatchOffsetDistribution MOD(Conf);
EXPECT_FALSE(MOD);
}
TEST(MismatchOffsetDistribution, DifferentBufferDisablesDistribution) {
StudyConfiguration Conf;
Conf.MemcmpMismatchAt = 1; // buffers are different.
MismatchOffsetDistribution MOD(Conf);
EXPECT_FALSE(MOD);
}
TEST(MismatchOffsetDistribution, MismatchAt2) {
const uint32_t MismatchAt = 2;
const uint32_t ToSize = 4;
StudyConfiguration Conf;
Conf.BufferSize = 16;
Conf.MemcmpMismatchAt = MismatchAt; // buffers differ at position 2.
Conf.Size.To = ToSize;
MismatchOffsetDistribution MOD(Conf);
EXPECT_TRUE(MOD);
// We test equality up to ToSize (=4) so we need spans of 4 equal bytes spaced
// by one mismatch.
EXPECT_THAT(MOD.getMismatchIndices(), ElementsAre(5, 9, 13));
std::default_random_engine Gen;
for (size_t Iterations = 0; Iterations <= 10; ++Iterations) {
for (size_t Size = Conf.Size.From; Size <= ToSize; ++Size) {
if (Size >= MismatchAt)
EXPECT_THAT(MOD(Gen, Size),
AnyOf(5 - MismatchAt, 9 - MismatchAt, 13 - MismatchAt));
else
EXPECT_THAT(MOD(Gen, Size),
AnyOf(5 - Size - 1, 9 - Size - 1, 13 - Size - 1));
}
}
}
} // namespace
} // namespace libc_benchmarks
} // namespace llvm

@@ -0,0 +1,87 @@
//===-------- Benchmark memcmp implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measure.
struct MemcmpContext : public BenchmarkRunner {
using FunctionPrototype = int (*)(const void *, const void *, size_t);
struct ParameterType {
uint16_t Offset = 0;
};
explicit MemcmpContext(const StudyConfiguration &Conf)
: MOD(Conf), OD(Conf), ABuffer(Conf.BufferSize), BBuffer(Conf.BufferSize),
PP(*this) {
std::uniform_int_distribution<char> Dis;
// Generate random buffer A.
for (size_t I = 0; I < Conf.BufferSize; ++I)
ABuffer[I] = Dis(Gen);
// Copy buffer A to B.
::memcpy(BBuffer.begin(), ABuffer.begin(), Conf.BufferSize);
if (Conf.MemcmpMismatchAt == 0)
return; // all same.
else if (Conf.MemcmpMismatchAt == 1)
for (char &c : BBuffer)
++c; // all different.
else
for (const auto I : MOD.getMismatchIndices())
++BBuffer[I];
}
// Needed by the ParameterProvider to update the current batch of parameters.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
if (MOD)
for (auto &P : Parameters)
P.Offset = MOD(Gen, CurrentSize);
else
for (auto &P : Parameters)
P.Offset = OD(Gen);
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memcmp"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
CurrentSize = Size;
// FIXME: Add `bcmp` once we're guaranteed that the function is provided.
FunctionPrototype Function =
StringSwitch<FunctionPrototype>(FunctionName).Case("memcmp", &::memcmp);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
return Function(ABuffer + p.Offset, BBuffer + p.Offset, Size);
});
}
private:
std::default_random_engine Gen;
MismatchOffsetDistribution MOD;
OffsetDistribution OD;
size_t CurrentSize = 0;
AlignedBuffer ABuffer;
AlignedBuffer BBuffer;
SmallParameterProvider<MemcmpContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemcmpContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm

View File

@ -0,0 +1,69 @@
//===-------- Benchmark memcpy implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measurement.
struct MemcpyContext : public BenchmarkRunner {
using FunctionPrototype = void *(*)(void *, const void *, size_t);
struct ParameterType {
uint16_t SrcOffset = 0;
uint16_t DstOffset = 0;
};
explicit MemcpyContext(const StudyConfiguration &Conf)
: OD(Conf), SrcBuffer(Conf.BufferSize), DstBuffer(Conf.BufferSize),
PP(*this) {}
// Needed by the ParameterProvider to update the current batch of parameters.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
for (auto &P : Parameters) {
P.DstOffset = OD(Gen);
P.SrcOffset = OD(Gen);
}
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memcpy"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
FunctionPrototype Function =
StringSwitch<FunctionPrototype>(FunctionName).Case("memcpy", &::memcpy);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
Function(DstBuffer + p.DstOffset, SrcBuffer + p.SrcOffset, Size);
return DstBuffer + p.DstOffset;
});
}
private:
std::default_random_engine Gen;
OffsetDistribution OD;
AlignedBuffer SrcBuffer;
AlignedBuffer DstBuffer;
SmallParameterProvider<MemcpyContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemcpyContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm

View File

@ -0,0 +1,66 @@
//===-------- Benchmark memset implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measurement.
struct MemsetContext : public BenchmarkRunner {
using FunctionPrototype = void *(*)(void *, int, size_t);
struct ParameterType {
uint16_t DstOffset = 0;
};
explicit MemsetContext(const StudyConfiguration &Conf)
: OD(Conf), DstBuffer(Conf.BufferSize), MemsetValue(Conf.MemsetValue),
PP(*this) {}
// Needed by the ParameterProvider to update the current batch of parameters.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
for (auto &P : Parameters) {
P.DstOffset = OD(Gen);
}
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memset"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
FunctionPrototype Function =
StringSwitch<FunctionPrototype>(FunctionName).Case("memset", &::memset);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
Function(DstBuffer + p.DstOffset, MemsetValue, Size);
return DstBuffer + p.DstOffset;
});
}
private:
std::default_random_engine Gen;
OffsetDistribution OD;
AlignedBuffer DstBuffer;
const uint8_t MemsetValue;
SmallParameterProvider<MemsetContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemsetContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm

View File

@ -0,0 +1,243 @@
# Benchmarking `llvm-libc`'s memory functions
## Foreword
Microbenchmarks are valuable tools to assess and compare the performance of
isolated pieces of code. However, they don't capture all the interactions of a
complex system, so other metrics can be equally important:
- **code size** (to reduce instruction cache pressure),
- **Profile Guided Optimization** friendliness,
- **hyperthreading / multithreading** friendliness.
## Rationale
The goal here is to satisfy the [Benchmarking
Principles](https://en.wikipedia.org/wiki/Benchmark_\(computing\)#Benchmarking_Principles).
1. **Relevance**: Benchmarks should measure relatively vital features.
2. **Representativeness**: Benchmark performance metrics should be broadly
accepted by industry and academia.
3. **Equity**: All systems should be fairly compared.
4. **Repeatability**: Benchmark results can be verified.
5. **Cost-effectiveness**: Benchmark tests are economical.
6. **Scalability**: Benchmark tests should measure from single server to
multiple servers.
7. **Transparency**: Benchmark metrics should be easy to understand.
Benchmarking is a [subtle
art](https://en.wikipedia.org/wiki/Benchmark_\(computing\)#Challenges) and
benchmarking memory functions is no exception. Here we'll dive into the
peculiarities of designing good microbenchmarks for `llvm-libc` memory
functions.
## Challenges
As seen in the [README.md](README.md#benchmarking-regimes), the microbenchmarking
facility should focus on measuring **low latency code**. If copying a few bytes
takes on the order of a few cycles, the benchmark should be able to **measure
accurately down to the cycle**.
### Measuring instruments
There are different sources of time in a computer (ordered from high to low resolution):
- [Performance
Counters](https://en.wikipedia.org/wiki/Hardware_performance_counter): used to
introspect the internals of the CPU,
- [High Precision Event
Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer): used to
trigger short lived actions,
- [Real-Time Clocks (RTC)](https://en.wikipedia.org/wiki/Real-time_clock): used
to keep track of the computer's time.
In theory, **Performance Counters** provide cycle-accurate measurements via the
`cpu cycles` event. But as we'll see, they are not really practical in this
context.
### Performance counters and modern processor architecture
Modern CPUs are [out of
order](https://en.wikipedia.org/wiki/Out-of-order_execution) and
[superscalar](https://en.wikipedia.org/wiki/Superscalar_processor). As a
consequence it is [hard to know what is included when the counter is
read](https://en.wikipedia.org/wiki/Hardware_performance_counter#Instruction_based_sampling):
some instructions may still be **in flight**, and others may be executing
[**speculatively**](https://en.wikipedia.org/wiki/Speculative_execution). As a
matter of fact, **on the same machine, measuring the same piece of code twice
will yield different results.**
### Performance counters semantics inconsistencies and availability
Although they have the same name, the exact semantics of performance counters
are micro-architecture dependent: **it is generally not possible to compare two
micro-architectures exposing the same performance counters.**
Each vendor decides which performance counters to implement and their exact
meaning. Although we want to benchmark `llvm-libc` memory functions for all
available [target
triples](https://clang.llvm.org/docs/CrossCompilation.html#target-triple), there
are **no guarantees that the counter we're interested in is available.**
### Additional imprecisions
- Reading performance counters is done through Kernel [System
calls](https://en.wikipedia.org/wiki/System_call). The System call itself
is costly (hundreds of cycles) and will perturb the counter's value.
- [Interruptions](https://en.wikipedia.org/wiki/Interrupt#Processor_response)
can occur during measurement.
- If the system is already under monitoring (virtual machines or system wide
profiling) the kernel can decide to multiplex the performance counters
leading to lower precision or even completely missed measurements.
- The Kernel can decide to [migrate the
process](https://en.wikipedia.org/wiki/Process_migration) to a different
core.
- [Dynamic frequency
scaling](https://en.wikipedia.org/wiki/Dynamic_frequency_scaling) can kick
in during the measurement and change the duration of a cycle. **Ultimately we
care about the amount of work done over a period of time**, which weakens the
case for measuring cycles rather than **raw time**.
### Cycle accuracy conclusion
We have seen that performance counters are not widely available, semantically
inconsistent across micro-architectures, and imprecise on modern CPUs for small
snippets of code.
## Design decisions
In order to achieve the needed precision we have to resort to more widely
available counters and derive the time from a large number of runs: going from a
single deterministic measurement to a probabilistic one.
**To get a good signal to noise ratio we need the running time of the piece of
code to be orders of magnitude greater than the measurement precision.**
For instance, if the measurement precision is 10 cycles, the function under test
needs to run for more than 1000 cycles to achieve a 1%
[SNR](https://en.wikipedia.org/wiki/Signal-to-noise_ratio).
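To make the arithmetic concrete, here is a minimal sketch of that rule of thumb
(the 10-cycle precision and the 1% target are just the figures from the example
above):

```python
# Rough signal-to-noise arithmetic for the example above.
measurement_precision_cycles = 10  # resolution of the available timer
target_snr = 0.01                  # we want at most 1% measurement noise

# The measured region must run long enough for the timer error to be negligible.
required_runtime_cycles = measurement_precision_cycles / target_snr
assert required_runtime_cycles == 1000
```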
### Repeating code N-times until precision is sufficient
The algorithm is as follows:
- We measure the time it takes to run the code _N_ times (initially _N_ is 10,
for instance)
- We deduce an approximation of the runtime of one iteration (= _runtime_ /
_N_).
- We increase _N_ by _X%_ and repeat the measurement (geometric progression).
- We keep track of the _one iteration runtime approximation_ and build a
weighted mean of all the samples so far (weight is proportional to _N_)
- We stop the process when the difference between the weighted mean and the
last estimation is smaller than _ε_ or when other stopping conditions are
met (total runtime, maximum iterations or maximum sample count).
This method allows us to be as precise as needed provided that the measured
runtime is proportional to _N_. Longer run times also smooth out imprecision
related to _interrupts_ and _context switches_.
Note: When measuring longer runtimes (e.g. copying several megabytes of data)
the above assumption doesn't hold anymore and the _ε_ precision cannot be
reached by increasing iterations. The whole benchmarking process becomes
prohibitively slow. In this case the algorithm is limited to a single sample and
repeated several times to get a decent 95% confidence interval.
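The following Python sketch illustrates the adaptive loop described above. It is
only an illustration of the idea, not the actual C++ implementation; the growth
factor, tolerance and limits are placeholders mirroring the `ScalingFactor`,
`Epsilon` and `MaxSamples` knobs of the JSON configurations.

```python
import time

def estimate_iteration_runtime(run_snippet, initial_n=10, growth=1.4,
                               epsilon=0.01, max_samples=1000):
    """Estimates the runtime of one call to `run_snippet` (illustrative sketch)."""
    n = initial_n
    weighted_sum = 0.0  # sum of per-iteration estimates, weighted by N
    total_weight = 0.0
    mean = None
    for _ in range(max_samples):
        begin = time.perf_counter()
        for _ in range(n):
            run_snippet()
        elapsed = time.perf_counter() - begin
        estimate = elapsed / n  # approximate runtime of one iteration
        weighted_sum += estimate * n
        total_weight += n
        new_mean = weighted_sum / total_weight
        # Stop once the last estimate agrees with the weighted mean
        # (epsilon is treated here as a relative tolerance).
        if mean is not None and abs(new_mean - estimate) <= epsilon * new_mean:
            return new_mean
        mean = new_mean
        n = int(n * growth)  # geometric progression of N
    return mean

# Example: estimate the cost of a small memory operation.
print(estimate_iteration_runtime(lambda: bytes(64)))
```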
### Effect of branch prediction
When measuring code with branches, repeating the same call again and again will
allow the processor to learn the branching patterns and perfectly predict all
the branches, leading to unrealistic results.
**Decision: When benchmarking small buffer sizes, the function parameters should
be randomized between calls to prevent perfect branch predictions.**
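The C++ runners above expose a `Randomize` method that refills a batch of
parameters; presumably the batch is generated outside the timed region so that
the cost of randomization is not measured. A minimal Python sketch of that idea
(the buffer size matches `configuration_small.json`; everything else is
illustrative):

```python
import os
import random

BUFFER_SIZE = 8192  # BufferSize in configuration_small.json
src = bytearray(os.urandom(BUFFER_SIZE))
dst = bytearray(BUFFER_SIZE)

def randomize_offsets(batch_size, size):
    """Pre-computes a batch of random, in-bounds offsets outside the timed region."""
    return [random.randrange(0, BUFFER_SIZE - size + 1) for _ in range(batch_size)]

def timed_region(offsets, size):
    # Only this loop would be timed: every call sees a different offset, so the
    # branch predictor cannot lock onto a single pattern, and the cost of the
    # random number generator stays out of the measurement.
    for offset in offsets:
        dst[offset:offset + size] = src[offset:offset + size]

timed_region(randomize_offsets(batch_size=1000, size=16), size=16)
```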
### Effect of the memory subsystem
The CPU is tightly coupled to the memory subsystem. It is common to see `L1`,
`L2` and `L3` data caches.
We may be tempted to randomize data accesses widely to exercise all the caching
layers down to RAM but the [cost of accessing lower layers of
memory](https://people.eecs.berkeley.edu/~rcs/research/interactive_latency.html)
completely dominates the runtime for small sizes.
So to respect the **Equity** and **Repeatability** principles we should make sure we
**do not** depend on the memory subsystem.
**Decision: When benchmarking small buffer sizes, the data accessed by the
function should stay in `L1`.**
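As a rough sanity check on the small configuration (see
`configuration_small.json`, which uses an 8 KiB buffer), the working set should
fit comfortably in the L1 data cache. The 32 KiB figure below is only an
assumption; actual sizes vary per CPU and are reported in the benchmark's host
information.

```python
TYPICAL_L1D_BYTES = 32 * 1024  # assumption; the real size is host dependent
SMALL_BUFFER_SIZE = 8192       # BufferSize in configuration_small.json

# memcpy and memcmp touch two buffers, so the worst-case working set is:
working_set = 2 * SMALL_BUFFER_SIZE
assert working_set <= TYPICAL_L1D_BYTES, "the data would spill out of L1"
```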
### Effect of prefetching
For small buffer sizes,
[prefetching](https://en.wikipedia.org/wiki/Cache_prefetching) should not kick
in, but for large buffers it may introduce a bias.
**Decision: When benchmarking large buffer sizes, the data should be accessed in
a random fashion to lower the impact of prefetching between calls.**
### Effect of dynamic frequency scaling
Modern processors implement [dynamic frequency
scaling](https://en.wikipedia.org/wiki/Dynamic_frequency_scaling). In so-called
`performance` mode the CPU will increase its frequency and run faster than usual
within [some limits](https://en.wikipedia.org/wiki/Intel_Turbo_Boost): _"The
increased clock rate is limited by the processor's power, current, and thermal
limits, the number of cores currently in use, and the maximum frequency of the
active cores."_
**Decision: When benchmarking we want to make sure the dynamic frequency scaling
is always set to `performance`. We also want to make sure that the time based
events are not impacted by frequency scaling.**
See [README.md](README.md) for how to set this up.
### Reserved and pinned cores
Some operating systems allow [core
reservation](https://stackoverflow.com/questions/13583146/whole-one-core-dedicated-to-single-process).
Reserving a core removes a set of perturbation sources: process migration,
context switches and interrupts. When a core is hyperthreaded, both of its
logical cores should be reserved.
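On Linux, once cores have been reserved (for instance with the `isolcpus`
kernel parameter), the benchmark process can be pinned to them. A minimal,
Linux-only sketch using the standard library; core `3` is an arbitrary example:

```python
import os

RESERVED_CORES = {3}  # arbitrary example: the isolated core(s) on this machine

# Pin the current process (pid 0 means "self") to the reserved cores so the
# kernel does not migrate it during the measurement.
os.sched_setaffinity(0, RESERVED_CORES)
print("Now restricted to CPUs:", os.sched_getaffinity(0))
```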
## Microbenchmarks limitations
As stated in the Foreword section a number of effects do play a role in
production but are not directly measurable through microbenchmarks. The code
size of the benchmark is (much) smaller than the hot code of real applications
and so **doesn't exhibit as much instruction cache pressure**.
### iCache pressure
Fundamental functions that are called frequently will occupy the L1 iCache
([illustration](https://en.wikipedia.org/wiki/CPU_cache#Example:_the_K8)). If
they are too big they will prevent other hot code from staying in the cache and incur
[stalls](https://en.wikipedia.org/wiki/CPU_cache#CPU_stalls). So the memory
functions should be as small as possible.
### iTLB pressure
The same reasoning goes for instruction Translation Lookaside Buffer
([iTLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer)) incurring
[TLB
misses](https://en.wikipedia.org/wiki/Translation_lookaside_buffer#TLB-miss_handling).
## FAQ
1. Why don't you use Google Benchmark directly?
We reuse some parts of Google Benchmark (detection of frequency scaling, CPU
cache hierarchy information), but when it comes to measuring memory
functions Google Benchmark has a few issues:
- Google Benchmark favors code-based configuration via macros and
builders, typically done in a static manner. In our case the
parameters we need to set up are a mix of what's usually controlled by
the framework (number of trials, maximum number of iterations, size
ranges) and parameters that are more tied to the function under test
(randomization strategies, custom values). Achieving this with Google
Benchmark is cumbersome as it involves templated benchmarks and
duplicated code. In the end, the configuration would be spread across
command line flags (via the framework's options or custom flags) and code
constants.
- Output of the measurements is done through a `BenchmarkReporter` class,
which makes it hard to access the parameters discussed above.

View File

@ -0,0 +1,103 @@
# Libc mem* benchmarks
This framework has been designed to evaluate and compare the relative performance
of memory function implementations on a particular host.
It will also be used to track implementation performance over time.
## Quick start
### Setup
**Python 2** [being deprecated](https://www.python.org/doc/sunset-python-2/), it is
advised to use **Python 3**.
Then make sure `matplotlib`, `scipy` and `numpy` are set up correctly:
```shell
apt-get install python3-pip
pip3 install matplotlib scipy numpy
```
To get good reproducibility it is important to make sure that the system runs in
`performance` mode. This is achieved by running:
```shell
cpupower frequency-set --governor performance
```
### Run and display `memcpy` benchmark
The following commands will run the benchmark and display a curve of **time per
copied byte** with a 95% confidence interval. The graph also shows **host
information** and the **benchmarking configuration**.
```shell
cd llvm-project
cmake -B/tmp/build -Sllvm -DLLVM_ENABLE_PROJECTS=libc -DCMAKE_BUILD_TYPE=Release
make -C /tmp/build -j display-libc-memcpy-benchmark-small
```
## Benchmarking regimes
Using a profiler to observe size distributions for calls into libc functions, it
was found that most operations act on a small number of bytes.
Function | % of calls with size ≤ 128 | % of calls with size ≤ 1024
------------------ | --------------------------: | ---------------------------:
memcpy | 96% | 99%
memset | 91% | 99.9%
memcmp<sup>1</sup> | 99.5% | ~100%
Benchmarking configurations come in two flavors:
- [small](libc/utils/benchmarks/configuration_small.json)
- Exercises sizes up to `1KiB`, representative of normal usage
- The data is kept in the `L1` cache to prevent measuring the memory
subsystem
- [big](libc/utils/benchmarks/configuration_big.json)
- Exercises sizes up to `32MiB` to test large operations
- Caching effects can show up here, which prevents meaningful comparison across different hosts
_<sup>1</sup> - The size refers to the size of the buffers to compare and not
the number of bytes until the first difference._
## Benchmarking targets
The benchmarking process occurs in two steps:
1. Benchmark the functions and produce a `json` file
2. Display (or render) the `json` file
Targets are of the form `<action>-libc-<function>-benchmark-<configuration>`
- `action` is one of:
- `run`, runs the benchmark and writes the `json` file
- `display`, displays the graph on screen
- `render`, renders the graph on disk as a `png` file
- `function` is one of: `memcpy`, `memcmp`, `memset`
- `configuration` is one of: `small`, `big`
## Superposing curves
It is possible to **merge** several `json` files into a single graph. This is
useful to **compare** implementations.
In the following example we superpose the curves for `memcpy`, `memset` and
`memcmp`:
```shell
> make -C /tmp/build run-libc-memcpy-benchmark-small run-libc-memcmp-benchmark-small run-libc-memset-benchmark-small
> python libc/utils/benchmarks/render.py3 /tmp/last-libc-memcpy-benchmark-small.json /tmp/last-libc-memcmp-benchmark-small.json /tmp/last-libc-memset-benchmark-small.json
```
## Useful `render.py3` flags
- To save the produced graph, use `--output=/tmp/benchmark_curve.png`.
- To prevent the graph from appearing on screen, use `--headless`.
## Under the hood
To learn more about the design decisions behind the benchmarking framework,
have a look at the [RATIONALE.md](RATIONALE.md) file.

View File

@ -0,0 +1,24 @@
{
"Options":{
"MinDuration":0.001,
"MaxDuration":1,
"InitialIterations":100,
"MaxIterations":10000000,
"MinSamples":1,
"MaxSamples":1,
"Epsilon":0.01,
"ScalingFactor":1.4
},
"Configuration":{
"Runs":5,
"BufferSize":134217728,
"Size":{
"From":0,
"To":33554432,
"Step":1048576
},
"AddressAlignment":1,
"MemsetValue":0,
"MemcmpMismatchAt":0
}
}

View File

@ -0,0 +1,24 @@
{
"Options":{
"MinDuration":0.001,
"MaxDuration":1,
"InitialIterations":100,
"MaxIterations":10000000,
"MinSamples":4,
"MaxSamples":1000,
"Epsilon":0.01,
"ScalingFactor":1.4
},
"Configuration":{
"Runs":10,
"BufferSize":8192,
"Size":{
"From":0,
"To":1024,
"Step":1
},
"AddressAlignment":1,
"MemsetValue":0,
"MemcmpMismatchAt":0
}
}

View File

@ -0,0 +1,175 @@
"""Reads JSON files produced by the benchmarking framework and renders them.
Installation:
> apt-get install python3-pip
> pip3 install matplotlib scipy numpy
Run:
> python3 render.py3 <files>
Rendering can occur on disk by specifying the --output option or on screen if
the --headless flag is not set.
"""
import argparse
import collections
import json
import math
import pprint
import sys
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter
import numpy as np
import scipy.stats
def format_freq(number):
    """Returns a human readable frequency."""
    magnitude = 0
    while math.fabs(number) >= 1000:
        number /= 1000.0
        magnitude += 1
    return "%g%sHz" % (number, ["", "k", "M", "G"][magnitude])


def format_size(number):
    """Returns number in human readable form."""
    magnitude = 0
    while number >= 1000 and number % 1000 == 0:
        number /= 1000
        magnitude += 1
    return "%g%s" % (number, ["", "K", "M", "G"][magnitude])


def mean_confidence_interval(dataset, confidence=0.95):
    """Returns the mean and half confidence interval for the dataset."""
    a = 1.0 * np.array(dataset)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h


def add_plot(function_name, points):
    """Plots measurements for a function."""
    n = len(points.keys())
    x = np.zeros(n)
    y = np.zeros(n)
    yerr = np.zeros(n)
    for i, key in enumerate(sorted(points.keys())):
        values = points[key]
        m, e = mean_confidence_interval(values)
        x[i] = key
        y[i] = m
        yerr[i] = e
    plt.plot(x, y, linewidth=1, label=function_name)
    plt.fill_between(x, y - yerr, y + yerr, alpha=0.5)


def get_title(host):
    """Formats the Host object into a title for the plot."""
    cpu_name = host["CpuName"]
    cpu_freq = format_freq(host["CpuFrequency"])
    cache_strings = []
    for cache in host["Caches"]:
        prefix = {
            "Instruction": "i",
            "Data": "d",
            "Unified": "u",
        }.get(cache["Type"])
        cache_strings.append(r"%sL_%d %s_{/%d}" %
                             (prefix, cache["Level"], format_size(cache["Size"]),
                              cache["NumSharing"]))
    title = "%s (%s)" % (cpu_name, cpu_freq)
    subtitle = r"$" + ", ".join(sorted(cache_strings)) + r"$"
    return title + "\n" + subtitle


def get_host(jsons):
    """Returns the host of the different json objects iff they are all the same.
    """
    host = None
    for root in jsons:
        if host and host != root["Host"]:
            sys.exit("The datasets are not coming from the same Host")
        if not host:
            host = root["Host"]
    return host


def get_configuration(jsons):
    """Returns the configuration of the different json objects iff they are all
    the same.
    """
    config = None
    for root in jsons:
        if config and config != root["Configuration"]:
            return None
        if not config:
            config = root["Configuration"]
    return config


def setup_graphs(files):
    """Sets up the graphs to render from the json files."""
    jsons = []
    for file in files:
        with open(file) as json_file:
            jsons.append(json.load(json_file))
    if not jsons:
        sys.exit("Nothing to process")
    for root in jsons:
        for function in root["Functions"]:
            function_name = function["Name"]
            sizes = function["Sizes"]
            runtimes = function["Runtimes"]
            assert len(sizes) == len(runtimes)
            values = collections.defaultdict(lambda: [])
            for i in range(len(sizes)):
                values[sizes[i]].append(runtimes[i])
            add_plot(function_name, values)
    config = get_configuration(jsons)
    if config:
        plt.figtext(
            0.95,
            0.15,
            pprint.pformat(config),
            verticalalignment="bottom",
            horizontalalignment="right",
            multialignment="left",
            fontsize="small",
            bbox=dict(boxstyle="round", facecolor="wheat"))
    axes = plt.gca()
    axes.set_title(get_title(get_host(jsons)))
    axes.set_ylim(bottom=0)
    axes.set_xlabel("Size")
    axes.set_ylabel("Time")
    axes.xaxis.set_major_formatter(EngFormatter(unit="B"))
    axes.yaxis.set_major_formatter(EngFormatter(unit="s"))
    plt.legend()
    plt.grid()


def main():
    parser = argparse.ArgumentParser(
        description="Process benchmark json files.")
    parser.add_argument("files", nargs="+", help="The json files to read from.")
    parser.add_argument("--output", help="The output file to write the graph.")
    parser.add_argument(
        "--headless",
        help="If set do not display the graph.",
        action="store_true")
    args = parser.parse_args()
    setup_graphs(args.files)
    if args.output:
        plt.savefig(args.output)
    if not args.headless:
        plt.show()


if __name__ == "__main__":
    main()