[llvm-libc] Add memory function benchmarks
Summary:
This patch adds a benchmarking infrastructure for llvm-libc memory functions.

In a nutshell, the code can benchmark small and large buffers for the memcpy, memset and memcmp functions. It also produces graphs of size vs latency by running targets of the form `render-libc-{memcpy|memset|memcmp}-benchmark-{small|big}`.

The configurations are provided as JSON files and the benchmark also produces a JSON file. This file is then parsed and rendered as a PNG file via the `render.py` script (make sure to run `pip3 install matplotlib scipy numpy`). The script can take several JSON files as input and will superimpose the curves if they are from the same host.

TODO:
 - The code benchmarks whatever is available on the host but should be configured to benchmark the (to be added) llvm-libc memory functions.
 - Add a README file with instructions and rationale.
 - Produce scores to track the performance of the functions over time to allow for regression detection.

Reviewers: sivachandra, ckennelly

Subscribers: mgorny, MaskRay, libc-commits

Tags: #libc-project

Differential Revision: https://reviews.llvm.org/D72516
Parent: b901335193
Commit: aba80d0734
libc/utils/CMakeLists.txt
@@ -1,2 +1,3 @@
add_subdirectory(HdrGen)
add_subdirectory(UnitTest)
add_subdirectory(benchmarks)
libc/utils/HdrGen/CMakeLists.txt
@@ -1,3 +1,5 @@
set(LLVM_LINK_COMPONENTS Support)

add_tablegen(libc-hdrgen llvm-libc
  Command.h
  Command.cpp
libc/utils/benchmarks/CMakeLists.txt
@@ -0,0 +1,184 @@
find_package(Threads)
|
||||
|
||||
include(ExternalProject)
|
||||
|
||||
set(LLVM_LINK_COMPONENTS Support)
|
||||
|
||||
#==============================================================================
|
||||
# Build Google Benchmark
|
||||
#==============================================================================
|
||||
set(GOOGLE_BENCHMARK_TARGET_FLAGS ${BENCHMARK_DIALECT_FLAG})
|
||||
if (LIBCXX_BENCHMARK_GCC_TOOLCHAIN)
|
||||
set(GOOGLE_BENCHMARK_TARGET_FLAGS
|
||||
-gcc-toolchain ${LIBCXX_BENCHMARK_GCC_TOOLCHAIN})
|
||||
endif()
|
||||
string(REPLACE ";" " " GOOGLE_BENCHMARK_TARGET_FLAGS "${GOOGLE_BENCHMARK_TARGET_FLAGS}")
|
||||
|
||||
ExternalProject_Add(google-benchmark
|
||||
EXCLUDE_FROM_ALL ON
|
||||
PREFIX google-benchmark
|
||||
SOURCE_DIR ${LIBC_SOURCE_DIR}/../llvm/utils/benchmark
|
||||
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark
|
||||
CMAKE_CACHE_ARGS
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
|
||||
-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
|
||||
-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
|
||||
-DCMAKE_CXX_FLAGS:STRING=${GOOGLE_BENCHMARK_TARGET_FLAGS}
|
||||
-DCMAKE_BUILD_TYPE:STRING=RELEASE
|
||||
-DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
|
||||
-DBENCHMARK_ENABLE_TESTING:BOOL=OFF)
|
||||
|
||||
set(GOOGLE_BENCHMARK_LIBC_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark)
|
||||
set(GOOGLE_BENCHMARK_LINK_FLAGS -L${GOOGLE_BENCHMARK_LIBC_INSTALL}/lib/)
|
||||
|
||||
#==============================================================================
|
||||
# Add Unit Testing Support
|
||||
#==============================================================================
|
||||
|
||||
function(add_libc_benchmark_unittest target_name)
|
||||
if(NOT LLVM_INCLUDE_TESTS)
|
||||
return()
|
||||
endif()
|
||||
|
||||
cmake_parse_arguments(
|
||||
"LIBC_BENCHMARKS_UNITTEST"
|
||||
"" # No optional arguments
|
||||
"SUITE" # Single value arguments
|
||||
"SRCS;DEPENDS" # Multi-value arguments
|
||||
${ARGN}
|
||||
)
|
||||
|
||||
add_executable(${target_name}
|
||||
EXCLUDE_FROM_ALL
|
||||
${LIBC_BENCHMARKS_UNITTEST_SRCS}
|
||||
)
|
||||
target_include_directories(${target_name}
|
||||
PRIVATE
|
||||
${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include
|
||||
${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include
|
||||
)
|
||||
target_link_libraries(${target_name}
|
||||
PRIVATE
|
||||
gtest_main
|
||||
gtest
|
||||
${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
|
||||
)
|
||||
|
||||
add_custom_command(
|
||||
TARGET ${target_name}
|
||||
POST_BUILD
|
||||
COMMAND $<TARGET_FILE:${target_name}>
|
||||
)
|
||||
add_dependencies(check-libc-benchmark ${target_name})
|
||||
endfunction()
|
||||
|
||||
#==============================================================================
|
||||
# Build Google Benchmark for libc
|
||||
#==============================================================================
|
||||
|
||||
add_custom_target(check-libc-benchmark)
|
||||
|
||||
function(fix_rtti target)
|
||||
# TODO: Make this portable and inline with rtti mode from llvm/
|
||||
target_compile_options(${target} PUBLIC -fno-rtti)
|
||||
endfunction()
|
||||
|
||||
# libc-benchmark
|
||||
add_library(libc-benchmark
|
||||
STATIC
|
||||
EXCLUDE_FROM_ALL
|
||||
LibcBenchmark.cpp
|
||||
LibcBenchmark.h
|
||||
)
|
||||
add_dependencies(libc-benchmark google-benchmark)
|
||||
target_include_directories(libc-benchmark PUBLIC "${GOOGLE_BENCHMARK_LIBC_INSTALL}/include")
|
||||
target_link_options(libc-benchmark PUBLIC "${GOOGLE_BENCHMARK_LINK_FLAGS}")
|
||||
target_link_libraries(libc-benchmark PUBLIC LLVMSupport -lbenchmark Threads::Threads)
|
||||
fix_rtti(libc-benchmark)
|
||||
|
||||
add_libc_benchmark_unittest(libc-benchmark-test
|
||||
SRCS LibcBenchmarkTest.cpp
|
||||
DEPENDS libc-benchmark
|
||||
)
|
||||
|
||||
# libc-memory-benchmark
|
||||
add_library(libc-memory-benchmark
|
||||
STATIC
|
||||
EXCLUDE_FROM_ALL
|
||||
LibcMemoryBenchmark.cpp
|
||||
LibcMemoryBenchmark.h
|
||||
)
|
||||
target_link_libraries(libc-memory-benchmark PUBLIC libc-benchmark)
|
||||
fix_rtti(libc-memory-benchmark)
|
||||
|
||||
add_libc_benchmark_unittest(libc-memory-benchmark-test
|
||||
SRCS LibcMemoryBenchmarkTest.cpp
|
||||
DEPENDS libc-memory-benchmark
|
||||
)
|
||||
|
||||
# json
|
||||
add_library(json
|
||||
STATIC
|
||||
EXCLUDE_FROM_ALL
|
||||
JSON.cpp
|
||||
JSON.h
|
||||
)
|
||||
target_link_libraries(json PUBLIC libc-memory-benchmark)
|
||||
fix_rtti(json)
|
||||
|
||||
add_libc_benchmark_unittest(json-test
|
||||
SRCS JSONTest.cpp
|
||||
DEPENDS json
|
||||
)
|
||||
|
||||
#==============================================================================
|
||||
# Benchmark tests configuration
|
||||
#==============================================================================
|
||||
|
||||
function(add_libc_benchmark_analysis conf_target run_target)
|
||||
set(png_file "/tmp/last-${conf_target}.png")
|
||||
set(render_target render-${conf_target})
|
||||
add_custom_target(${render_target}
|
||||
COMMAND python3 render.py3 ${json_file} --headless --output=${png_file}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "render ${libc_target} to ${png_file}"
|
||||
)
|
||||
add_dependencies(${render_target} ${run_target})
|
||||
|
||||
set(display_target display-${conf_target})
|
||||
add_custom_target(${display_target}
|
||||
COMMAND python3 render.py3 ${json_file}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "display ${libc_target}"
|
||||
)
|
||||
add_dependencies(${display_target} ${run_target})
|
||||
endfunction()
|
||||
|
||||
function(add_libc_benchmark_configuration target configuration)
|
||||
set(conf_target ${target}-${configuration})
|
||||
set(json_file "/tmp/last-${conf_target}.json")
|
||||
set(run_target run-${conf_target})
|
||||
add_custom_target(${run_target}
|
||||
COMMAND ${libc_target} --conf=configuration_${configuration}.json -o ${json_file}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
)
|
||||
add_libc_benchmark_analysis(${conf_target} ${run_target})
|
||||
endfunction()
|
||||
|
||||
function(add_libc_benchmark name file)
|
||||
set(libc_target libc-${name}-benchmark)
|
||||
add_executable(${libc_target}
|
||||
EXCLUDE_FROM_ALL
|
||||
${file}
|
||||
LibcMemoryBenchmarkMain.h
|
||||
LibcMemoryBenchmarkMain.cpp
|
||||
)
|
||||
target_link_libraries(${libc_target} PUBLIC json)
|
||||
foreach(configuration "small" "big")
|
||||
add_libc_benchmark_configuration(${libc_target} ${configuration})
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
add_libc_benchmark(memcpy Memcpy.cpp)
|
||||
add_libc_benchmark(memcmp Memcmp.cpp)
|
||||
add_libc_benchmark(memset Memset.cpp)
libc/utils/benchmarks/JSON.cpp
@@ -0,0 +1,367 @@
//===-------- JSON serialization routines ---------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "JSON.h"
|
||||
#include "LibcBenchmark.h"
|
||||
#include "llvm/ADT/DenseSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/StringSwitch.h"
|
||||
#include "llvm/Support/Errc.h"
|
||||
#include "llvm/Support/Error.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/JSON.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include <chrono>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
template <typename T>
|
||||
static Error intFromJsonTemplate(const json::Value &V, T &Out) {
|
||||
if (const auto &MaybeInt64 = V.getAsInteger()) {
|
||||
int64_t Value = *MaybeInt64;
|
||||
if (Value < std::numeric_limits<T>::min() ||
|
||||
Value > std::numeric_limits<T>::max())
|
||||
return createStringError(errc::io_error, "Out of bound Integer");
|
||||
Out = Value;
|
||||
return Error::success();
|
||||
}
|
||||
return createStringError(errc::io_error, "Can't parse Integer");
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, double &Out) {
|
||||
if (auto S = V.getAsNumber()) {
|
||||
Out = *S;
|
||||
return Error::success();
|
||||
}
|
||||
return createStringError(errc::io_error, "Can't parse Double");
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, std::string &Out) {
|
||||
if (auto S = V.getAsString()) {
|
||||
Out = *S;
|
||||
return Error::success();
|
||||
}
|
||||
return createStringError(errc::io_error, "Can't parse String");
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, uint32_t &Out) {
|
||||
return intFromJsonTemplate(V, Out);
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, uint8_t &Out) {
|
||||
return intFromJsonTemplate(V, Out);
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, int &Out) {
|
||||
return intFromJsonTemplate(V, Out);
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, libc_benchmarks::Duration &D) {
|
||||
if (V.kind() != json::Value::Kind::Number)
|
||||
return createStringError(errc::io_error, "Can't parse Duration");
|
||||
D = libc_benchmarks::Duration(*V.getAsNumber());
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, MaybeAlign &Out) {
|
||||
const auto MaybeInt = V.getAsInteger();
|
||||
if (!MaybeInt)
|
||||
return createStringError(errc::io_error,
|
||||
"Can't parse Align, not an Integer");
|
||||
const int64_t Value = *MaybeInt;
|
||||
if (!Value) {
|
||||
Out = None;
|
||||
return Error::success();
|
||||
}
|
||||
if (isPowerOf2_64(Value)) {
|
||||
Out = Align(Value);
|
||||
return Error::success();
|
||||
}
|
||||
return createStringError(errc::io_error,
|
||||
"Can't parse Align, not a power of two");
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V,
|
||||
libc_benchmarks::BenchmarkLog &Out) {
|
||||
if (V.kind() != json::Value::Kind::String)
|
||||
return createStringError(errc::io_error,
|
||||
"Can't parse BenchmarkLog, not a String");
|
||||
const auto String = *V.getAsString();
|
||||
auto Parsed =
|
||||
llvm::StringSwitch<Optional<libc_benchmarks::BenchmarkLog>>(String)
|
||||
.Case("None", libc_benchmarks::BenchmarkLog::None)
|
||||
.Case("Last", libc_benchmarks::BenchmarkLog::Last)
|
||||
.Case("Full", libc_benchmarks::BenchmarkLog::Full)
|
||||
.Default(None);
|
||||
if (!Parsed)
|
||||
return createStringError(errc::io_error,
|
||||
Twine("Can't parse BenchmarkLog, invalid value '")
|
||||
.concat(String)
|
||||
.concat("'"));
|
||||
Out = *Parsed;
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
Error vectorFromJsonTemplate(const json::Value &V, C &Out) {
|
||||
auto *A = V.getAsArray();
|
||||
if (!A)
|
||||
return createStringError(errc::io_error, "Can't parse Array");
|
||||
Out.clear();
|
||||
Out.resize(A->size());
|
||||
for (auto InOutPair : llvm::zip(*A, Out))
|
||||
if (auto E = fromJson(std::get<0>(InOutPair), std::get<1>(InOutPair)))
|
||||
return std::move(E);
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static Error fromJson(const json::Value &V, std::vector<T> &Out) {
|
||||
return vectorFromJsonTemplate(V, Out);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static Error fromJson(const json::Value &V, SmallVectorImpl<T> &Out) {
|
||||
return vectorFromJsonTemplate(V, Out);
|
||||
}
|
||||
|
||||
// Same as llvm::json::ObjectMapper but adds a finer error reporting mechanism.
|
||||
class JsonObjectMapper {
|
||||
const json::Object *O;
|
||||
Error E;
|
||||
SmallDenseSet<StringRef> SeenFields;
|
||||
|
||||
public:
|
||||
explicit JsonObjectMapper(const json::Value &V)
|
||||
: O(V.getAsObject()),
|
||||
E(O ? Error::success()
|
||||
: createStringError(errc::io_error, "Expected JSON Object")) {}
|
||||
|
||||
Error takeError() {
|
||||
if (E)
|
||||
return std::move(E);
|
||||
for (const auto &Itr : *O) {
|
||||
const StringRef Key = Itr.getFirst();
|
||||
if (!SeenFields.count(Key))
|
||||
E = createStringError(errc::io_error,
|
||||
Twine("Unknown field: ").concat(Key));
|
||||
}
|
||||
return std::move(E);
|
||||
}
|
||||
|
||||
template <typename T> void map(StringRef Key, T &Out) {
|
||||
if (E)
|
||||
return;
|
||||
if (const json::Value *Value = O->get(Key)) {
|
||||
SeenFields.insert(Key);
|
||||
E = fromJson(*Value, Out);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static Error fromJson(const json::Value &V,
|
||||
libc_benchmarks::BenchmarkOptions &Out) {
|
||||
JsonObjectMapper O(V);
|
||||
O.map("MinDuration", Out.MinDuration);
|
||||
O.map("MaxDuration", Out.MaxDuration);
|
||||
O.map("InitialIterations", Out.InitialIterations);
|
||||
O.map("MaxIterations", Out.MaxIterations);
|
||||
O.map("MinSamples", Out.MinSamples);
|
||||
O.map("MaxSamples", Out.MaxSamples);
|
||||
O.map("Epsilon", Out.Epsilon);
|
||||
O.map("ScalingFactor", Out.ScalingFactor);
|
||||
O.map("Log", Out.Log);
|
||||
return O.takeError();
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, libc_benchmarks::SizeRange &Out) {
|
||||
JsonObjectMapper O(V);
|
||||
O.map("From", Out.From);
|
||||
O.map("To", Out.To);
|
||||
O.map("Step", Out.Step);
|
||||
return O.takeError();
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V,
|
||||
libc_benchmarks::StudyConfiguration &Out) {
|
||||
JsonObjectMapper O(V);
|
||||
O.map("Runs", Out.Runs);
|
||||
O.map("BufferSize", Out.BufferSize);
|
||||
O.map("Size", Out.Size);
|
||||
O.map("AddressAlignment", Out.AddressAlignment);
|
||||
O.map("MemsetValue", Out.MemsetValue);
|
||||
O.map("MemcmpMismatchAt", Out.MemcmpMismatchAt);
|
||||
return O.takeError();
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, libc_benchmarks::CacheInfo &Out) {
|
||||
JsonObjectMapper O(V);
|
||||
O.map("Type", Out.Type);
|
||||
O.map("Level", Out.Level);
|
||||
O.map("Size", Out.Size);
|
||||
O.map("NumSharing", Out.NumSharing);
|
||||
return O.takeError();
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, libc_benchmarks::HostState &Out) {
|
||||
JsonObjectMapper O(V);
|
||||
O.map("CpuName", Out.CpuName);
|
||||
O.map("CpuFrequency", Out.CpuFrequency);
|
||||
O.map("Caches", Out.Caches);
|
||||
return O.takeError();
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V,
|
||||
libc_benchmarks::FunctionMeasurements &Out) {
|
||||
JsonObjectMapper O(V);
|
||||
O.map("Name", Out.Name);
|
||||
std::vector<uint32_t> Sizes;
|
||||
O.map("Sizes", Sizes);
|
||||
std::vector<libc_benchmarks::Duration> Runtimes;
|
||||
O.map("Runtimes", Runtimes);
|
||||
if (Sizes.size() != Runtimes.size())
|
||||
return createStringError(errc::io_error,
|
||||
"Measurement Size and Runtime mistmatch");
|
||||
Out.Measurements.resize(Sizes.size());
|
||||
for (size_t I = 0; I < Sizes.size(); ++I) {
|
||||
Out.Measurements[I].Size = Sizes[I];
|
||||
Out.Measurements[I].Runtime = Runtimes[I];
|
||||
}
|
||||
return O.takeError();
|
||||
}
|
||||
|
||||
static Error fromJson(const json::Value &V, libc_benchmarks::Study &Out) {
|
||||
JsonObjectMapper O(V);
|
||||
O.map("Host", Out.Host);
|
||||
O.map("Options", Out.Options);
|
||||
O.map("Configuration", Out.Configuration);
|
||||
O.map("Functions", Out.Functions);
|
||||
return O.takeError();
|
||||
}
|
||||
|
||||
static double Seconds(const Duration &D) {
|
||||
return std::chrono::duration<double>(D).count();
|
||||
}
|
||||
|
||||
Expected<Study> ParseJsonStudy(StringRef Content) {
|
||||
Expected<json::Value> EV = json::parse(Content);
|
||||
if (!EV)
|
||||
return EV.takeError();
|
||||
Study S;
|
||||
if (Error E = fromJson(*EV, S))
|
||||
return std::move(E);
|
||||
return S;
|
||||
}
|
||||
|
||||
static StringRef Serialize(const BenchmarkLog &L) {
|
||||
switch (L) {
|
||||
case BenchmarkLog::None:
|
||||
return "None";
|
||||
case BenchmarkLog::Last:
|
||||
return "Last";
|
||||
case BenchmarkLog::Full:
|
||||
return "Full";
|
||||
}
|
||||
llvm_unreachable("Unhandled BenchmarkLog value");
|
||||
}
|
||||
|
||||
static void Serialize(const BenchmarkOptions &BO, json::OStream &JOS) {
|
||||
JOS.object([&]() {
|
||||
JOS.attribute("MinDuration", Seconds(BO.MinDuration));
|
||||
JOS.attribute("MaxDuration", Seconds(BO.MaxDuration));
|
||||
JOS.attribute("InitialIterations", BO.InitialIterations);
|
||||
JOS.attribute("MaxIterations", BO.MaxIterations);
|
||||
JOS.attribute("MinSamples", BO.MinSamples);
|
||||
JOS.attribute("MaxSamples", BO.MaxSamples);
|
||||
JOS.attribute("Epsilon", BO.Epsilon);
|
||||
JOS.attribute("ScalingFactor", BO.ScalingFactor);
|
||||
JOS.attribute("Log", Serialize(BO.Log));
|
||||
});
|
||||
}
|
||||
|
||||
static void Serialize(const CacheInfo &CI, json::OStream &JOS) {
|
||||
JOS.object([&]() {
|
||||
JOS.attribute("Type", CI.Type);
|
||||
JOS.attribute("Level", CI.Level);
|
||||
JOS.attribute("Size", CI.Size);
|
||||
JOS.attribute("NumSharing", CI.NumSharing);
|
||||
});
|
||||
}
|
||||
|
||||
static void Serialize(const HostState &HS, json::OStream &JOS) {
|
||||
JOS.object([&]() {
|
||||
JOS.attribute("CpuName", HS.CpuName);
|
||||
JOS.attribute("CpuFrequency", HS.CpuFrequency);
|
||||
JOS.attributeArray("Caches", [&]() {
|
||||
for (const auto &CI : HS.Caches)
|
||||
Serialize(CI, JOS);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void Serialize(const StudyConfiguration &SC, json::OStream &JOS) {
|
||||
JOS.object([&]() {
|
||||
JOS.attribute("Runs", SC.Runs);
|
||||
JOS.attribute("BufferSize", SC.BufferSize);
|
||||
JOS.attributeObject("Size", [&]() {
|
||||
JOS.attribute("From", SC.Size.From);
|
||||
JOS.attribute("To", SC.Size.To);
|
||||
JOS.attribute("Step", SC.Size.Step);
|
||||
});
|
||||
if (SC.AddressAlignment)
|
||||
JOS.attribute("AddressAlignment",
|
||||
static_cast<int64_t>(SC.AddressAlignment->value()));
|
||||
JOS.attribute("MemsetValue", SC.MemsetValue);
|
||||
JOS.attribute("MemcmpMismatchAt", SC.MemcmpMismatchAt);
|
||||
});
|
||||
}
|
||||
|
||||
static void Serialize(const FunctionMeasurements &FM, json::OStream &JOS) {
|
||||
JOS.object([&]() {
|
||||
JOS.attribute("Name", FM.Name);
|
||||
JOS.attributeArray("Sizes", [&]() {
|
||||
for (const auto &M : FM.Measurements)
|
||||
JOS.value(M.Size);
|
||||
});
|
||||
JOS.attributeArray("Runtimes", [&]() {
|
||||
for (const auto &M : FM.Measurements)
|
||||
JOS.value(Seconds(M.Runtime));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
void SerializeToJson(const Study &S, json::OStream &JOS) {
|
||||
JOS.object([&]() {
|
||||
JOS.attributeBegin("Host");
|
||||
Serialize(S.Host, JOS);
|
||||
JOS.attributeEnd();
|
||||
|
||||
JOS.attributeBegin("Options");
|
||||
Serialize(S.Options, JOS);
|
||||
JOS.attributeEnd();
|
||||
|
||||
JOS.attributeBegin("Configuration");
|
||||
Serialize(S.Configuration, JOS);
|
||||
JOS.attributeEnd();
|
||||
|
||||
if (!S.Functions.empty()) {
|
||||
JOS.attributeArray("Functions", [&]() {
|
||||
for (const auto &FM : S.Functions)
|
||||
Serialize(FM, JOS);
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
libc/utils/benchmarks/JSON.h
@@ -0,0 +1,28 @@
//===-------- JSON serialization routines -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_UTILS_BENCHMARK_JSON_H
#define LLVM_LIBC_UTILS_BENCHMARK_JSON_H

#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/JSON.h"

namespace llvm {
namespace libc_benchmarks {

// Parses a Study from a json string.
Expected<Study> ParseJsonStudy(StringRef Content);

// Serialize a Study as json.
void SerializeToJson(const Study &S, llvm::json::OStream &JOS);

} // namespace libc_benchmarks
} // namespace llvm

#endif // LLVM_LIBC_UTILS_BENCHMARK_JSON_H
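The two entry points above are meant to round-trip a `Study`. A minimal usage sketch (illustrative only, not part of this patch; the `roundTrip` helper is hypothetical) could look like this:

// Illustrative sketch only -- not part of this commit.
#include "JSON.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::libc_benchmarks;

// Serializes `S` into a string, then parses it back.
static Expected<Study> roundTrip(const Study &S) {
  std::string Buffer;
  raw_string_ostream RSO(Buffer);
  json::OStream JOS(RSO);
  SerializeToJson(S, JOS); // Writes the Study into `Buffer`.
  RSO.flush();
  return ParseJsonStudy(Buffer); // Returns the Study or a descriptive Error.
}

This mirrors what JSONTest.cpp below exercises in its RoundTrip test.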
libc/utils/benchmarks/JSONTest.cpp
@@ -0,0 +1,190 @@
#include "JSON.h"
|
||||
#include "LibcBenchmark.h"
|
||||
#include "LibcMemoryBenchmark.h"
|
||||
#include "llvm/Support/JSON.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using testing::AllOf;
|
||||
using testing::ExplainMatchResult;
|
||||
using testing::Field;
|
||||
using testing::Pointwise;
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
namespace {
|
||||
|
||||
Study getStudy() {
|
||||
return Study{
|
||||
HostState{
|
||||
"CpuName", 123, {CacheInfo{"A", 1, 2, 3}, CacheInfo{"B", 4, 5, 6}}},
|
||||
BenchmarkOptions{std::chrono::seconds(1), std::chrono::seconds(2), 10,
|
||||
100, 6, 100, 0.1, 2, BenchmarkLog::Full},
|
||||
StudyConfiguration{2, 3, SizeRange{4, 5, 6}, Align(8), 9, 10},
|
||||
{FunctionMeasurements{"A",
|
||||
{Measurement{3, std::chrono::seconds(3)},
|
||||
Measurement{3, std::chrono::seconds(4)}}},
|
||||
FunctionMeasurements{"B", {}}}};
|
||||
}
|
||||
|
||||
static std::string SerializeToString(const Study &S) {
|
||||
std::string Buffer;
|
||||
raw_string_ostream RSO(Buffer);
|
||||
json::OStream JOS(RSO);
|
||||
SerializeToJson(S, JOS);
|
||||
return Buffer;
|
||||
}
|
||||
|
||||
MATCHER(EqualsCacheInfo, "") {
|
||||
const CacheInfo &A = ::testing::get<0>(arg);
|
||||
const CacheInfo &B = ::testing::get<1>(arg);
|
||||
return ExplainMatchResult(AllOf(Field(&CacheInfo::Type, B.Type),
|
||||
Field(&CacheInfo::Level, B.Level),
|
||||
Field(&CacheInfo::Size, B.Size),
|
||||
Field(&CacheInfo::NumSharing, B.NumSharing)),
|
||||
A, result_listener);
|
||||
}
|
||||
|
||||
auto Equals(const HostState &H) -> auto {
|
||||
return AllOf(
|
||||
Field(&HostState::CpuName, H.CpuName),
|
||||
Field(&HostState::CpuFrequency, H.CpuFrequency),
|
||||
Field(&HostState::Caches, Pointwise(EqualsCacheInfo(), H.Caches)));
|
||||
}
|
||||
|
||||
auto Equals(const BenchmarkOptions &BO) -> auto {
|
||||
return AllOf(
|
||||
Field(&BenchmarkOptions::MinDuration, BO.MinDuration),
|
||||
Field(&BenchmarkOptions::MaxDuration, BO.MaxDuration),
|
||||
Field(&BenchmarkOptions::InitialIterations, BO.InitialIterations),
|
||||
Field(&BenchmarkOptions::MaxIterations, BO.MaxIterations),
|
||||
Field(&BenchmarkOptions::MinSamples, BO.MinSamples),
|
||||
Field(&BenchmarkOptions::MaxSamples, BO.MaxSamples),
|
||||
Field(&BenchmarkOptions::Epsilon, BO.Epsilon),
|
||||
Field(&BenchmarkOptions::ScalingFactor, BO.ScalingFactor),
|
||||
Field(&BenchmarkOptions::Log, BO.Log));
|
||||
}
|
||||
|
||||
auto Equals(const SizeRange &SR) -> auto {
|
||||
return AllOf(Field(&SizeRange::From, SR.From), Field(&SizeRange::To, SR.To),
|
||||
Field(&SizeRange::Step, SR.Step));
|
||||
}
|
||||
|
||||
auto Equals(const StudyConfiguration &SC) -> auto {
|
||||
return AllOf(
|
||||
Field(&StudyConfiguration::Runs, SC.Runs),
|
||||
Field(&StudyConfiguration::BufferSize, SC.BufferSize),
|
||||
Field(&StudyConfiguration::Size, Equals(SC.Size)),
|
||||
Field(&StudyConfiguration::AddressAlignment, SC.AddressAlignment),
|
||||
Field(&StudyConfiguration::MemsetValue, SC.MemsetValue),
|
||||
Field(&StudyConfiguration::MemcmpMismatchAt, SC.MemcmpMismatchAt));
|
||||
}
|
||||
|
||||
MATCHER(EqualsMeasurement, "") {
|
||||
const Measurement &A = ::testing::get<0>(arg);
|
||||
const Measurement &B = ::testing::get<1>(arg);
|
||||
return ExplainMatchResult(AllOf(Field(&Measurement::Size, B.Size),
|
||||
Field(&Measurement::Runtime, B.Runtime)),
|
||||
A, result_listener);
|
||||
}
|
||||
|
||||
MATCHER(EqualsFunctions, "") {
|
||||
const FunctionMeasurements &A = ::testing::get<0>(arg);
|
||||
const FunctionMeasurements &B = ::testing::get<1>(arg);
|
||||
return ExplainMatchResult(
|
||||
AllOf(Field(&FunctionMeasurements::Name, B.Name),
|
||||
Field(&FunctionMeasurements::Measurements,
|
||||
Pointwise(EqualsMeasurement(), B.Measurements))),
|
||||
A, result_listener);
|
||||
}
|
||||
|
||||
auto Equals(const Study &S) -> auto {
|
||||
return AllOf(
|
||||
Field(&Study::Host, Equals(S.Host)),
|
||||
Field(&Study::Options, Equals(S.Options)),
|
||||
Field(&Study::Configuration, Equals(S.Configuration)),
|
||||
Field(&Study::Functions, Pointwise(EqualsFunctions(), S.Functions)));
|
||||
}
|
||||
|
||||
TEST(JsonTest, RoundTrip) {
|
||||
const Study S = getStudy();
|
||||
auto StudyOrError = ParseJsonStudy(SerializeToString(S));
|
||||
if (auto Err = StudyOrError.takeError())
|
||||
EXPECT_FALSE(Err) << "Unexpected error";
|
||||
const Study &Parsed = *StudyOrError;
|
||||
EXPECT_THAT(Parsed, Equals(S));
|
||||
}
|
||||
|
||||
TEST(JsonTest, SupplementaryField) {
|
||||
auto Failure = ParseJsonStudy(R"({
|
||||
"UnknownField": 10
|
||||
}
|
||||
)");
|
||||
EXPECT_EQ(toString(Failure.takeError()), "Unknown field: UnknownField");
|
||||
}
|
||||
|
||||
TEST(JsonTest, InvalidType) {
|
||||
auto Failure = ParseJsonStudy(R"({
|
||||
"Options": 1
|
||||
}
|
||||
)");
|
||||
EXPECT_EQ(toString(Failure.takeError()), "Expected JSON Object");
|
||||
}
|
||||
|
||||
TEST(JsonTest, InvalidDuration) {
|
||||
auto Failure = ParseJsonStudy(R"({
|
||||
"Options": {
|
||||
"MinDuration": "Duration should be a Number"
|
||||
}
|
||||
}
|
||||
)");
|
||||
EXPECT_EQ(toString(Failure.takeError()), "Can't parse Duration");
|
||||
}
|
||||
|
||||
TEST(JsonTest, InvalidAlignType) {
|
||||
auto Failure = ParseJsonStudy(R"({
|
||||
"Configuration":{
|
||||
"AddressAlignment": "Align should be an Integer"
|
||||
}
|
||||
}
|
||||
)");
|
||||
EXPECT_EQ(toString(Failure.takeError()), "Can't parse Align, not an Integer");
|
||||
}
|
||||
|
||||
TEST(JsonTest, InvalidAlign) {
|
||||
auto Failure = ParseJsonStudy(R"({
|
||||
"Configuration":{
|
||||
"AddressAlignment":3
|
||||
}
|
||||
}
|
||||
)");
|
||||
EXPECT_EQ(toString(Failure.takeError()),
|
||||
"Can't parse Align, not a power of two");
|
||||
}
|
||||
|
||||
TEST(JsonTest, InvalidBenchmarkLogType) {
|
||||
auto Failure = ParseJsonStudy(R"({
|
||||
"Options":{
|
||||
"Log": 3
|
||||
}
|
||||
}
|
||||
)");
|
||||
EXPECT_EQ(toString(Failure.takeError()),
|
||||
"Can't parse BenchmarkLog, not a String");
|
||||
}
|
||||
|
||||
TEST(JsonTest, InvalidBenchmarkLog) {
|
||||
auto Failure = ParseJsonStudy(R"({
|
||||
"Options":{
|
||||
"Log": "Unknown"
|
||||
}
|
||||
}
|
||||
)");
|
||||
EXPECT_EQ(toString(Failure.takeError()),
|
||||
"Can't parse BenchmarkLog, invalid value 'Unknown'");
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
libc/utils/benchmarks/LibcBenchmark.cpp
@@ -0,0 +1,40 @@
//===-------- `Benchmark` function ----------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "LibcBenchmark.h"
|
||||
#include "llvm/Support/Host.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
void checkRequirements() {
|
||||
const auto &CpuInfo = benchmark::CPUInfo::Get();
|
||||
if (CpuInfo.scaling_enabled)
|
||||
report_fatal_error(
|
||||
"CPU scaling is enabled, the benchmark real time measurements may be "
|
||||
"noisy and will incur extra overhead.");
|
||||
}
|
||||
|
||||
HostState HostState::get() {
|
||||
const auto &CpuInfo = benchmark::CPUInfo::Get();
|
||||
HostState H;
|
||||
H.CpuFrequency = CpuInfo.cycles_per_second;
|
||||
H.CpuName = llvm::sys::getHostCPUName().str();
|
||||
for (const auto &BenchmarkCacheInfo : CpuInfo.caches) {
|
||||
CacheInfo CI;
|
||||
CI.Type = BenchmarkCacheInfo.type;
|
||||
CI.Level = BenchmarkCacheInfo.level;
|
||||
CI.Size = BenchmarkCacheInfo.size;
|
||||
CI.NumSharing = BenchmarkCacheInfo.num_sharing;
|
||||
H.Caches.push_back(std::move(CI));
|
||||
}
|
||||
return H;
|
||||
}
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
libc/utils/benchmarks/LibcBenchmark.h
@@ -0,0 +1,324 @@
//===-------- `Benchmark` function ------------------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// This file mainly defines a `Benchmark` function.
|
||||
//
|
||||
// The benchmarking process is as follows:
|
||||
// - We start by measuring the time it takes to run the function
|
||||
// `InitialIterations` times. This is called a Sample. From this we can derive
|
||||
// the time it took to run a single iteration.
|
||||
//
|
||||
// - We repeat the previous step with a greater number of iterations to lower
|
||||
// the impact of the measurement. We can derive a more precise estimation of the
|
||||
// runtime for a single iteration.
|
||||
//
|
||||
// - Each sample gives a more accurate estimation of the runtime for a single
|
||||
// iteration but also takes more time to run. We stop the process when:
|
||||
// * The measurement stabilizes under a certain precision (Epsilon),
|
||||
// * The overall benchmarking time is greater than MaxDuration,
|
||||
// * The overall sample count is greater than MaxSamples,
|
||||
// * The last sample used more than MaxIterations iterations.
|
||||
//
|
||||
// - We also make sure that the benchmark doesn't run for too short a period of
|
||||
// time by defining MinDuration and MinSamples.
|
||||
|
||||
#ifndef LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
|
||||
#define LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
// Makes sure the binary was compiled in release mode and that the frequency
// governor is set to performance.
|
||||
void checkRequirements();
|
||||
|
||||
using Duration = std::chrono::duration<double>;
|
||||
|
||||
enum class BenchmarkLog {
|
||||
None, // Don't keep the internal state of the benchmark.
|
||||
Last, // Keep only the last batch.
|
||||
Full // Keep all iterations states, useful for testing or debugging.
|
||||
};
|
||||
|
||||
// An object to configure the benchmark stopping conditions.
|
||||
// See documentation at the beginning of the file for the overall algorithm and
|
||||
// meaning of each field.
|
||||
struct BenchmarkOptions {
|
||||
// The minimum time for which the benchmark is running.
|
||||
Duration MinDuration = std::chrono::seconds(0);
|
||||
// The maximum time for which the benchmark is running.
|
||||
Duration MaxDuration = std::chrono::seconds(10);
|
||||
// The number of iterations in the first sample.
|
||||
uint32_t InitialIterations = 1;
|
||||
// The maximum number of iterations for any given sample.
|
||||
uint32_t MaxIterations = 10000000;
|
||||
// The minimum number of samples.
|
||||
uint32_t MinSamples = 4;
|
||||
// The maximum number of samples.
|
||||
uint32_t MaxSamples = 1000;
|
||||
// The benchmark will stop if the relative difference between the current and
|
||||
// the last estimation is less than epsilon. This is 1% by default.
|
||||
double Epsilon = 0.01;
|
||||
// The number of iterations grows exponentially between each sample.
|
||||
// Must be greater or equal to 1.
|
||||
double ScalingFactor = 1.4;
|
||||
BenchmarkLog Log = BenchmarkLog::None;
|
||||
};
|
||||
|
||||
// The state of a benchmark.
|
||||
enum class BenchmarkStatus {
|
||||
Running,
|
||||
MaxDurationReached,
|
||||
MaxIterationsReached,
|
||||
MaxSamplesReached,
|
||||
PrecisionReached,
|
||||
};
|
||||
|
||||
// The internal state of the benchmark, useful to debug, test or report
|
||||
// statistics.
|
||||
struct BenchmarkState {
|
||||
size_t LastSampleIterations;
|
||||
Duration LastBatchElapsed;
|
||||
BenchmarkStatus CurrentStatus;
|
||||
Duration CurrentBestGuess; // The time estimation for a single run of `foo`.
|
||||
double ChangeRatio; // The change in time estimation between previous and
|
||||
// current samples.
|
||||
};
|
||||
|
||||
// A lightweight result for a benchmark.
|
||||
struct BenchmarkResult {
|
||||
BenchmarkStatus TerminationStatus = BenchmarkStatus::Running;
|
||||
Duration BestGuess = {};
|
||||
llvm::Optional<llvm::SmallVector<BenchmarkState, 16>> MaybeBenchmarkLog;
|
||||
};
|
||||
|
||||
// Stores information about a cache in the host memory system.
|
||||
struct CacheInfo {
|
||||
std::string Type; // e.g. "Instruction", "Data", "Unified".
|
||||
int Level; // 0 is closest to processing unit.
|
||||
int Size; // In bytes.
|
||||
int NumSharing; // The number of processing units (e.g. Hyper-Threading siblings)
// with which this cache is shared.
|
||||
};
|
||||
|
||||
// Stores information about the host.
|
||||
struct HostState {
|
||||
std::string CpuName; // A string compatible with the -march option.
|
||||
double CpuFrequency; // in Hertz.
|
||||
std::vector<CacheInfo> Caches;
|
||||
|
||||
static HostState get();
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
struct Measurement {
|
||||
size_t Iterations = 0;
|
||||
Duration Elapsed = {};
|
||||
};
|
||||
|
||||
// Updates the estimation of the elapsed time for a single iteration.
|
||||
class RefinableRuntimeEstimation {
|
||||
Duration TotalTime = {};
|
||||
size_t TotalIterations = 0;
|
||||
|
||||
public:
|
||||
Duration update(const Measurement &M) {
|
||||
assert(M.Iterations > 0);
|
||||
// Duration is encoded as a double (see definition).
|
||||
// `TotalTime` and `M.Elapsed` are of the same magnitude so we don't expect
|
||||
// loss of precision due to radically different scales.
|
||||
TotalTime += M.Elapsed;
|
||||
TotalIterations += M.Iterations;
|
||||
return TotalTime / TotalIterations;
|
||||
}
|
||||
};
|
||||
|
||||
// This class tracks the progression of the runtime estimation.
|
||||
class RuntimeEstimationProgression {
|
||||
RefinableRuntimeEstimation RRE;
|
||||
|
||||
public:
|
||||
Duration CurrentEstimation = {};
|
||||
|
||||
// Returns the change ratio between our best guess so far and the one from the
|
||||
// new measurement.
|
||||
double computeImprovement(const Measurement &M) {
|
||||
const Duration NewEstimation = RRE.update(M);
|
||||
const double Ratio = fabs(((CurrentEstimation / NewEstimation) - 1.0));
|
||||
CurrentEstimation = NewEstimation;
|
||||
return Ratio;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// Measures the runtime of `foo` until conditions defined by `Options` are met.
|
||||
//
|
||||
// To reduce measurement imprecision we measure batches of `foo`.
// The batch size grows by `ScalingFactor` to minimize the effect of
|
||||
// measuring.
|
||||
//
|
||||
// Note: The benchmark is not responsible for serializing the executions of
|
||||
// `foo`. It is not suitable for measuring very small, side-effect-free
// functions, as the processor is free to execute several calls in
|
||||
// parallel.
|
||||
//
|
||||
// - Options: A set of parameters controlling the stopping conditions for the
|
||||
// benchmark.
|
||||
// - foo: The function under test. It takes one value and returns one value.
|
||||
// The input value is used to randomize the execution of `foo` as part of a
|
||||
// batch to mitigate the effect of the branch predictor. Signature:
|
||||
// `ProductType foo(ParameterProvider::value_type value);`
|
||||
// The output value is a product of the execution of `foo` and prevents the
|
||||
// compiler from optimizing out foo's body.
|
||||
// - ParameterProvider: An object responsible for providing a range of
|
||||
// `Iterations` values to use as input for `foo`. The `value_type` of the
|
||||
// returned container has to be compatible with `foo` argument.
|
||||
// Must implement one of:
|
||||
// `Container<ParameterType> generateBatch(size_t Iterations);`
|
||||
// `const Container<ParameterType>& generateBatch(size_t Iterations);`
|
||||
// - Clock: An object providing the current time. Must implement:
|
||||
// `std::chrono::time_point now();`
|
||||
template <typename Function, typename ParameterProvider,
|
||||
typename BenchmarkClock = const std::chrono::high_resolution_clock>
|
||||
BenchmarkResult benchmark(const BenchmarkOptions &Options,
|
||||
ParameterProvider &PP, Function foo,
|
||||
BenchmarkClock &Clock = BenchmarkClock()) {
|
||||
BenchmarkResult Result;
|
||||
internal::RuntimeEstimationProgression REP;
|
||||
Duration TotalBenchmarkDuration = {};
|
||||
size_t Iterations = std::max(Options.InitialIterations, uint32_t(1));
|
||||
size_t Samples = 0;
|
||||
if (Options.ScalingFactor < 1.0)
|
||||
report_fatal_error("ScalingFactor should be >= 1");
|
||||
if (Options.Log != BenchmarkLog::None)
|
||||
Result.MaybeBenchmarkLog.emplace();
|
||||
for (;;) {
|
||||
// Request a new Batch of size `Iterations`.
|
||||
const auto &Batch = PP.generateBatch(Iterations);
|
||||
|
||||
// Measuring this Batch.
|
||||
const auto StartTime = Clock.now();
|
||||
for (const auto Parameter : Batch) {
|
||||
const auto Production = foo(Parameter);
|
||||
benchmark::DoNotOptimize(Production);
|
||||
}
|
||||
const auto EndTime = Clock.now();
|
||||
const Duration Elapsed = EndTime - StartTime;
|
||||
|
||||
// Updating statistics.
|
||||
++Samples;
|
||||
TotalBenchmarkDuration += Elapsed;
|
||||
const double ChangeRatio = REP.computeImprovement({Iterations, Elapsed});
|
||||
Result.BestGuess = REP.CurrentEstimation;
|
||||
|
||||
// Stopping condition.
|
||||
if (TotalBenchmarkDuration >= Options.MinDuration &&
|
||||
Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon)
|
||||
Result.TerminationStatus = BenchmarkStatus::PrecisionReached;
|
||||
else if (Samples >= Options.MaxSamples)
|
||||
Result.TerminationStatus = BenchmarkStatus::MaxSamplesReached;
|
||||
else if (TotalBenchmarkDuration >= Options.MaxDuration)
|
||||
Result.TerminationStatus = BenchmarkStatus::MaxDurationReached;
|
||||
else if (Iterations >= Options.MaxIterations)
|
||||
Result.TerminationStatus = BenchmarkStatus::MaxIterationsReached;
|
||||
|
||||
if (Result.MaybeBenchmarkLog) {
|
||||
auto &BenchmarkLog = *Result.MaybeBenchmarkLog;
|
||||
if (Options.Log == BenchmarkLog::Last && !BenchmarkLog.empty())
|
||||
BenchmarkLog.pop_back();
|
||||
BenchmarkState BS;
|
||||
BS.LastSampleIterations = Iterations;
|
||||
BS.LastBatchElapsed = Elapsed;
|
||||
BS.CurrentStatus = Result.TerminationStatus;
|
||||
BS.CurrentBestGuess = Result.BestGuess;
|
||||
BS.ChangeRatio = ChangeRatio;
|
||||
BenchmarkLog.push_back(BS);
|
||||
}
|
||||
|
||||
if (Result.TerminationStatus != BenchmarkStatus::Running)
|
||||
return Result;
|
||||
|
||||
if (Options.ScalingFactor > 1 &&
|
||||
Iterations * Options.ScalingFactor == Iterations)
|
||||
report_fatal_error(
|
||||
"`Iterations *= ScalingFactor` is idempotent, increase ScalingFactor "
|
||||
"or InitialIterations.");
|
||||
|
||||
Iterations *= Options.ScalingFactor;
|
||||
}
|
||||
}
|
||||
|
||||
// Interprets `Array` as a circular buffer of `Size` elements.
|
||||
template <typename T> class CircularArrayRef {
|
||||
llvm::ArrayRef<T> Array;
|
||||
size_t Size;
|
||||
|
||||
public:
|
||||
using value_type = T;
|
||||
using reference = T &;
|
||||
using const_reference = const T &;
|
||||
using difference_type = ssize_t;
|
||||
using size_type = size_t;
|
||||
|
||||
class const_iterator
|
||||
: public std::iterator<std::input_iterator_tag, T, ssize_t> {
|
||||
llvm::ArrayRef<T> Array;
|
||||
size_t Index;
|
||||
|
||||
public:
|
||||
explicit const_iterator(llvm::ArrayRef<T> Array, size_t Index = 0)
|
||||
: Array(Array), Index(Index) {}
|
||||
const_iterator &operator++() {
|
||||
++Index;
|
||||
return *this;
|
||||
}
|
||||
bool operator==(const_iterator Other) const { return Index == Other.Index; }
|
||||
bool operator!=(const_iterator Other) const { return !(*this == Other); }
|
||||
const T &operator*() const { return Array[Index % Array.size()]; }
|
||||
};
|
||||
|
||||
CircularArrayRef(llvm::ArrayRef<T> Array, size_t Size)
|
||||
: Array(Array), Size(Size) {
|
||||
assert(Array.size() > 0);
|
||||
}
|
||||
|
||||
const_iterator begin() const { return const_iterator(Array); }
|
||||
const_iterator end() const { return const_iterator(Array, Size); }
|
||||
};
|
||||
|
||||
// A convenient helper to produce a CircularArrayRef from an ArrayRef.
|
||||
template <typename T>
|
||||
CircularArrayRef<T> cycle(llvm::ArrayRef<T> Array, size_t Size) {
|
||||
return {Array, Size};
|
||||
}
|
||||
|
||||
// Creates a std::array whose storage size is constrained to at most `Bytes`.
|
||||
template <typename T, size_t Bytes>
|
||||
using ByteConstrainedArray = std::array<T, Bytes / sizeof(T)>;
|
||||
|
||||
// A convenient helper to produce a CircularArrayRef from a
|
||||
// ByteConstrainedArray.
|
||||
template <typename T, size_t N>
|
||||
CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) {
|
||||
return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size};
|
||||
}
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
|
||||
|
||||
#endif // LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
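For illustration, here is a minimal sketch of how the `benchmark` entry point above can be driven (not part of this patch; `RandomSizeProvider` and the measured lambda are hypothetical, and the stopping conditions come from `BenchmarkOptions` as documented above):

// Illustrative sketch only -- not part of this commit.
#include "LibcBenchmark.h"
#include <cstdlib>
#include <vector>

using namespace llvm::libc_benchmarks;

// Hypothetical provider: returns `Iterations` pseudo-random sizes.
struct RandomSizeProvider {
  std::vector<size_t> generateBatch(size_t Iterations) {
    std::vector<size_t> Sizes(Iterations);
    for (auto &S : Sizes)
      S = 1 + (std::rand() % 1024);
    return Sizes;
  }
};

int main() {
  checkRequirements(); // Aborts if CPU frequency scaling is enabled.
  BenchmarkOptions Options;
  Options.MaxDuration = std::chrono::seconds(2);
  Options.Epsilon = 0.02; // Stop once the estimate is stable within 2%.
  RandomSizeProvider PP;
  // The lambda's return value keeps the call from being optimized out.
  const BenchmarkResult Result =
      benchmark(Options, PP, [](size_t Size) { return Size * 3; });
  // Result.BestGuess now holds the estimated time of a single call.
  return 0;
}

The unit tests below follow the same pattern, substituting a fake clock to make the stopping conditions deterministic.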
libc/utils/benchmarks/LibcBenchmarkTest.cpp
@@ -0,0 +1,168 @@
#include "LibcBenchmark.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <chrono>
|
||||
#include <limits>
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
|
||||
using std::chrono::nanoseconds;
|
||||
using ::testing::ElementsAre;
|
||||
using ::testing::Field;
|
||||
using ::testing::IsEmpty;
|
||||
using ::testing::SizeIs;
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
namespace {
|
||||
|
||||
// A simple parameter provider returning a zero initialized vector of size
|
||||
// `iterations`.
|
||||
struct DummyParameterProvider {
|
||||
std::vector<char> generateBatch(size_t iterations) {
|
||||
return std::vector<char>(iterations);
|
||||
}
|
||||
};
|
||||
|
||||
class LibcBenchmark : public ::testing::Test {
|
||||
public:
|
||||
// A Clock interface suitable for testing.
|
||||
// - Either it returns 0,
|
||||
// - Or a timepoint coming from the `setMeasurements` call.
|
||||
Duration now() {
|
||||
if (!MaybeTimepoints)
|
||||
return {};
|
||||
assert(!MaybeTimepoints->empty());
|
||||
const Duration timepoint = MaybeTimepoints->front();
|
||||
MaybeTimepoints->pop();
|
||||
return timepoint;
|
||||
}
|
||||
|
||||
protected:
|
||||
void SetUp() override { Options.Log = BenchmarkLog::Full; }
|
||||
|
||||
void TearDown() override {
|
||||
// We make sure all the expected measurements were performed.
|
||||
if (MaybeTimepoints)
|
||||
EXPECT_THAT(*MaybeTimepoints, IsEmpty());
|
||||
}
|
||||
|
||||
BenchmarkResult run() {
|
||||
return benchmark(Options, ParameterProvider, DummyFunction, *this);
|
||||
}
|
||||
|
||||
void setMeasurements(llvm::ArrayRef<Duration> Durations) {
|
||||
MaybeTimepoints.emplace(); // Create the optional value.
|
||||
Duration CurrentTime = nanoseconds(1);
|
||||
for (const auto &Duration : Durations) {
|
||||
MaybeTimepoints->push(CurrentTime);
|
||||
CurrentTime += Duration;
|
||||
MaybeTimepoints->push(CurrentTime);
|
||||
CurrentTime += nanoseconds(1);
|
||||
}
|
||||
}
|
||||
|
||||
BenchmarkOptions Options;
|
||||
|
||||
private:
|
||||
DummyParameterProvider ParameterProvider;
|
||||
static char DummyFunction(char Payload) { return Payload; }
|
||||
llvm::Optional<std::queue<Duration>> MaybeTimepoints;
|
||||
};
|
||||
|
||||
TEST_F(LibcBenchmark, MaxSamplesReached) {
|
||||
Options.MaxSamples = 1;
|
||||
const auto Result = run();
|
||||
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 1);
|
||||
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxSamplesReached);
|
||||
}
|
||||
|
||||
TEST_F(LibcBenchmark, MaxDurationReached) {
|
||||
Options.MaxDuration = nanoseconds(10);
|
||||
setMeasurements({nanoseconds(11)});
|
||||
const auto Result = run();
|
||||
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 1);
|
||||
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxDurationReached);
|
||||
}
|
||||
|
||||
TEST_F(LibcBenchmark, MaxIterationsReached) {
|
||||
Options.InitialIterations = 1;
|
||||
Options.MaxIterations = 20;
|
||||
Options.ScalingFactor = 2;
|
||||
Options.Epsilon = 0; // unreachable.
|
||||
const auto Result = run();
|
||||
EXPECT_THAT(*Result.MaybeBenchmarkLog,
|
||||
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
|
||||
Field(&BenchmarkState::LastSampleIterations, 2),
|
||||
Field(&BenchmarkState::LastSampleIterations, 4),
|
||||
Field(&BenchmarkState::LastSampleIterations, 8),
|
||||
Field(&BenchmarkState::LastSampleIterations, 16),
|
||||
Field(&BenchmarkState::LastSampleIterations, 32)));
|
||||
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 6);
|
||||
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxIterationsReached);
|
||||
}
|
||||
|
||||
TEST_F(LibcBenchmark, MinSamples) {
|
||||
Options.MinSamples = 4;
|
||||
Options.ScalingFactor = 2;
|
||||
Options.Epsilon = std::numeric_limits<double>::max(); // always reachable.
|
||||
setMeasurements(
|
||||
{nanoseconds(1), nanoseconds(2), nanoseconds(4), nanoseconds(8)});
|
||||
const auto Result = run();
|
||||
EXPECT_THAT(*Result.MaybeBenchmarkLog,
|
||||
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
|
||||
Field(&BenchmarkState::LastSampleIterations, 2),
|
||||
Field(&BenchmarkState::LastSampleIterations, 4),
|
||||
Field(&BenchmarkState::LastSampleIterations, 8)));
|
||||
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 4);
|
||||
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::PrecisionReached);
|
||||
}
|
||||
|
||||
TEST_F(LibcBenchmark, Epsilon) {
|
||||
Options.MinSamples = 4;
|
||||
Options.ScalingFactor = 2;
|
||||
Options.Epsilon = std::numeric_limits<double>::max(); // always reachable.
|
||||
setMeasurements(
|
||||
{nanoseconds(1), nanoseconds(2), nanoseconds(4), nanoseconds(8)});
|
||||
const auto Result = run();
|
||||
EXPECT_THAT(*Result.MaybeBenchmarkLog,
|
||||
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
|
||||
Field(&BenchmarkState::LastSampleIterations, 2),
|
||||
Field(&BenchmarkState::LastSampleIterations, 4),
|
||||
Field(&BenchmarkState::LastSampleIterations, 8)));
|
||||
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 4);
|
||||
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::PrecisionReached);
|
||||
}
|
||||
|
||||
TEST(ArrayRefLoop, Cycle) {
|
||||
std::array<int, 2> array = {1, 2};
|
||||
EXPECT_THAT(cycle(array, 0), ElementsAre());
|
||||
EXPECT_THAT(cycle(array, 1), ElementsAre(1));
|
||||
EXPECT_THAT(cycle(array, 2), ElementsAre(1, 2));
|
||||
EXPECT_THAT(cycle(array, 3), ElementsAre(1, 2, 1));
|
||||
EXPECT_THAT(cycle(array, 4), ElementsAre(1, 2, 1, 2));
|
||||
EXPECT_THAT(cycle(array, 5), ElementsAre(1, 2, 1, 2, 1));
|
||||
}
|
||||
|
||||
TEST(ByteConstrainedArray, Simple) {
|
||||
EXPECT_THAT((ByteConstrainedArray<char, 17>()), SizeIs(17));
|
||||
EXPECT_THAT((ByteConstrainedArray<uint16_t, 17>()), SizeIs(8));
|
||||
EXPECT_THAT((ByteConstrainedArray<uint32_t, 17>()), SizeIs(4));
|
||||
EXPECT_THAT((ByteConstrainedArray<uint64_t, 17>()), SizeIs(2));
|
||||
|
||||
EXPECT_LE(sizeof(ByteConstrainedArray<char, 17>), 17U);
|
||||
EXPECT_LE(sizeof(ByteConstrainedArray<uint16_t, 17>), 17U);
|
||||
EXPECT_LE(sizeof(ByteConstrainedArray<uint32_t, 17>), 17U);
|
||||
EXPECT_LE(sizeof(ByteConstrainedArray<uint64_t, 17>), 17U);
|
||||
}
|
||||
|
||||
TEST(ByteConstrainedArray, Cycle) {
|
||||
ByteConstrainedArray<uint64_t, 17> TwoValues{{1UL, 2UL}};
|
||||
EXPECT_THAT(cycle(TwoValues, 5), ElementsAre(1, 2, 1, 2, 1));
|
||||
}
|
||||
} // namespace
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
libc/utils/benchmarks/LibcMemoryBenchmark.cpp
@@ -0,0 +1,62 @@
//===-------- Benchmark memory specific tools -----------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "LibcMemoryBenchmark.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include <algorithm>
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
// Returns a distribution that samples the buffer to satisfy the required
|
||||
// alignment.
|
||||
// When alignment is set, the distribution is scaled down by `Factor` and scaled
|
||||
// up again by the same amount during sampling.
|
||||
static std::uniform_int_distribution<uint32_t>
|
||||
GetOffsetDistribution(const StudyConfiguration &Conf) {
|
||||
if (Conf.AddressAlignment &&
|
||||
*Conf.AddressAlignment > AlignedBuffer::Alignment)
|
||||
report_fatal_error(
|
||||
"AddressAlignment must be less or equal to AlignedBuffer::Alignment");
|
||||
if (!Conf.AddressAlignment)
|
||||
return std::uniform_int_distribution<uint32_t>(0, 0); // Always 0.
|
||||
// If we test up to Size bytes, the returned offset must stay under
|
||||
// BufferSize - Size.
|
||||
int64_t MaxOffset = Conf.BufferSize;
|
||||
MaxOffset -= Conf.Size.To;
|
||||
MaxOffset -= 1;
|
||||
if (MaxOffset < 0)
|
||||
report_fatal_error(
|
||||
"BufferSize too small to exercise specified Size configuration");
|
||||
MaxOffset /= Conf.AddressAlignment->value();
|
||||
return std::uniform_int_distribution<uint32_t>(0, MaxOffset);
|
||||
}
|
||||
|
||||
OffsetDistribution::OffsetDistribution(const StudyConfiguration &Conf)
|
||||
: Distribution(GetOffsetDistribution(Conf)),
|
||||
Factor(Conf.AddressAlignment.valueOrOne().value()) {}
|
||||
|
||||
// Precomputes the offsets at which mismatches between the two buffers are inserted.
|
||||
MismatchOffsetDistribution::MismatchOffsetDistribution(
|
||||
const StudyConfiguration &Conf)
|
||||
: MismatchAt(Conf.MemcmpMismatchAt) {
|
||||
if (MismatchAt <= 1)
|
||||
return;
|
||||
const auto ToSize = Conf.Size.To;
|
||||
for (size_t I = ToSize + 1; I < Conf.BufferSize; I += ToSize)
|
||||
MismatchIndices.push_back(I);
|
||||
if (MismatchIndices.empty())
|
||||
llvm::report_fatal_error("Unable to generate mismatch");
|
||||
MismatchIndexSelector =
|
||||
std::uniform_int_distribution<size_t>(0, MismatchIndices.size() - 1);
|
||||
}
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
libc/utils/benchmarks/LibcMemoryBenchmark.h
@@ -0,0 +1,183 @@
//===-------- Benchmark memory specific tools -------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// This file complements the `benchmark` header with memory specific tools and
|
||||
// benchmarking facilities.
|
||||
|
||||
#ifndef LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H
|
||||
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H
|
||||
|
||||
#include "LibcBenchmark.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/Alignment.h"
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
//--------------
|
||||
// Configuration
|
||||
//--------------
|
||||
|
||||
// Specifies a range of sizes to explore.
|
||||
struct SizeRange {
|
||||
uint32_t From = 0; // Inclusive
|
||||
uint32_t To = 1024; // Inclusive
|
||||
uint32_t Step = 1;
|
||||
};
|
||||
|
||||
// An object to define how to test a memory function.
|
||||
struct StudyConfiguration {
|
||||
// The number of runs for the study.
|
||||
uint32_t Runs = 1;
|
||||
|
||||
// The size of the buffers (1 buffer for memset but 2 for memcpy or memcmp).
|
||||
// When testing small sizes, it's important to keep the total allocated
|
||||
// size under the size of the L1 cache (usually 16 or 32KiB). The framework
|
||||
// will also use 2KiB of additional L1 memory to store the function
|
||||
// parameters.
|
||||
uint32_t BufferSize = 8192;
|
||||
|
||||
// The range of sizes to exercise.
|
||||
SizeRange Size;
|
||||
|
||||
MaybeAlign AddressAlignment; // Unset : Use start of buffer, which is at
// least cache line aligned,
|
||||
// 1 : Use random address,
|
||||
// >1 : Use random address aligned to value.
|
||||
|
||||
// The value to use for memset.
|
||||
uint8_t MemsetValue = 0;
|
||||
|
||||
// The mismatch position for memcmp.
|
||||
uint32_t MemcmpMismatchAt = 0; // 0 : Buffer compare equal,
|
||||
// >0 : Buffer compare different at byte N-1.
|
||||
};
|
||||
|
||||
//--------
|
||||
// Results
|
||||
//--------
|
||||
|
||||
// The time to run one iteration of the function under test for the specified
|
||||
// Size.
|
||||
struct Measurement {
|
||||
uint32_t Size = 0;
|
||||
Duration Runtime = {};
|
||||
};
|
||||
|
||||
// The measurements for a specific function.
|
||||
struct FunctionMeasurements {
|
||||
std::string Name;
|
||||
std::vector<Measurement> Measurements;
|
||||
};
|
||||
|
||||
// The root object containing all the data (configuration and measurements).
|
||||
struct Study {
|
||||
HostState Host;
|
||||
BenchmarkOptions Options;
|
||||
StudyConfiguration Configuration;
|
||||
SmallVector<FunctionMeasurements, 4> Functions;
|
||||
};
|
||||
|
||||
// Provides an aligned, dynamically allocated buffer.
|
||||
class AlignedBuffer {
|
||||
char *const Buffer = nullptr;
|
||||
size_t Size = 0;
|
||||
|
||||
public:
|
||||
static constexpr size_t Alignment = 1024;
|
||||
|
||||
explicit AlignedBuffer(size_t Size)
|
||||
: Buffer(static_cast<char *>(aligned_alloc(1024, Size))), Size(Size) {}
|
||||
~AlignedBuffer() { free(Buffer); }
|
||||
|
||||
inline char *operator+(size_t Index) { return Buffer + Index; }
|
||||
inline const char *operator+(size_t Index) const { return Buffer + Index; }
|
||||
inline char &operator[](size_t Index) { return Buffer[Index]; }
|
||||
inline const char &operator[](size_t Index) const { return Buffer[Index]; }
|
||||
inline char *begin() { return Buffer; }
|
||||
inline char *end() { return Buffer + Size; }
|
||||
};
|
||||
|
||||
// Implements the ParameterProvider abstraction needed by the `benchmark`
|
||||
// function. This implementation makes sure that all parameters will fit into
|
||||
// `StorageSize` bytes. The total memory accessed during the benchmark
|
||||
// (i.e. the storage for the ParameterProvider plus the memory buffers)
|
||||
// should be smaller than the L1 data cache.
|
||||
template <typename Context, size_t StorageSize = 8 * 1024>
|
||||
class SmallParameterProvider {
|
||||
using ParameterType = typename Context::ParameterType;
|
||||
ByteConstrainedArray<ParameterType, StorageSize> Parameters;
|
||||
size_t LastIterations;
|
||||
Context &Ctx;
|
||||
|
||||
public:
|
||||
explicit SmallParameterProvider(Context &C) : Ctx(C) {}
|
||||
SmallParameterProvider(const SmallParameterProvider &) = delete;
|
||||
SmallParameterProvider &operator=(const SmallParameterProvider &) = delete;
|
||||
|
||||
// Useful to compute the histogram of the size parameter.
|
||||
CircularArrayRef<ParameterType> getLastBatch() const {
|
||||
return cycle(Parameters, LastIterations);
|
||||
}
|
||||
|
||||
// Implements the interface needed by the `benchmark` function.
|
||||
CircularArrayRef<ParameterType> generateBatch(size_t Iterations) {
|
||||
LastIterations = Iterations;
|
||||
Ctx.Randomize(Parameters);
|
||||
return getLastBatch();
|
||||
}
|
||||
};
|
||||
|
||||
// Helper to generate random buffer offsets that satisfy the configuration
|
||||
// constraints.
|
||||
class OffsetDistribution {
|
||||
std::uniform_int_distribution<uint32_t> Distribution;
|
||||
uint32_t Factor;
|
||||
|
||||
public:
|
||||
explicit OffsetDistribution(const StudyConfiguration &Conf);
|
||||
|
||||
template <class Generator> uint32_t operator()(Generator &G) {
|
||||
return Distribution(G) * Factor;
|
||||
}
|
||||
};
|
||||
|
||||
// Helper to generate random buffer offsets that satisfy the configuration
|
||||
// constraints. It is specifically designed to benchmark `memcmp` functions
|
||||
// where we may want the Nth byte to differ.
|
||||
class MismatchOffsetDistribution {
|
||||
std::uniform_int_distribution<size_t> MismatchIndexSelector;
|
||||
llvm::SmallVector<uint32_t, 16> MismatchIndices;
|
||||
const uint32_t MismatchAt;
|
||||
|
||||
public:
|
||||
explicit MismatchOffsetDistribution(const StudyConfiguration &Conf);
|
||||
|
||||
explicit operator bool() const { return !MismatchIndices.empty(); }
|
||||
|
||||
const llvm::SmallVectorImpl<uint32_t> &getMismatchIndices() const {
|
||||
return MismatchIndices;
|
||||
}
|
||||
|
||||
template <class Generator> uint32_t operator()(Generator &G, uint32_t Size) {
|
||||
const uint32_t MismatchIndex = MismatchIndices[MismatchIndexSelector(G)];
|
||||
// We need to position the offset so that a mismatch occurs at MismatchAt.
|
||||
if (Size >= MismatchAt)
|
||||
return MismatchIndex - MismatchAt;
|
||||
// Size is too small to trigger the mismatch.
|
||||
return MismatchIndex - Size - 1;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
|
||||
|
||||
#endif // LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H
|
|
@ -0,0 +1,100 @@
|
|||
//===-------- Benchmark --------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "LibcMemoryBenchmarkMain.h"
|
||||
#include "JSON.h"
|
||||
#include "LibcBenchmark.h"
|
||||
#include "LibcMemoryBenchmark.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/FileSystem.h"
|
||||
#include "llvm/Support/JSON.h"
|
||||
#include "llvm/Support/MemoryBuffer.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
static cl::opt<std::string>
|
||||
Configuration("conf", cl::desc("Specify configuration filename"),
|
||||
cl::value_desc("filename"), cl::init(""));
|
||||
|
||||
static cl::opt<std::string> Output("o", cl::desc("Specify output filename"),
|
||||
cl::value_desc("filename"), cl::init("-"));
|
||||
|
||||
extern std::unique_ptr<BenchmarkRunner>
|
||||
getRunner(const StudyConfiguration &Conf);
|
||||
|
||||
void Main() {
|
||||
#ifndef NDEBUG
|
||||
static_assert(
|
||||
false,
|
||||
"For reproducibility benchmarks should not be compiled in DEBUG mode.");
|
||||
#endif
|
||||
checkRequirements();
|
||||
ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
|
||||
MemoryBuffer::getFileOrSTDIN(Configuration);
|
||||
if (!MB)
|
||||
report_fatal_error(
|
||||
Twine("Could not open configuration file: ").concat(Configuration));
|
||||
auto ErrorOrStudy = ParseJsonStudy((*MB)->getBuffer());
|
||||
if (!ErrorOrStudy)
|
||||
report_fatal_error(ErrorOrStudy.takeError());
|
||||
|
||||
const auto StudyPrototype = *ErrorOrStudy;
|
||||
|
||||
Study S;
|
||||
S.Host = HostState::get();
|
||||
S.Options = StudyPrototype.Options;
|
||||
S.Configuration = StudyPrototype.Configuration;
|
||||
|
||||
const auto Runs = S.Configuration.Runs;
|
||||
const auto &SR = S.Configuration.Size;
|
||||
std::unique_ptr<BenchmarkRunner> Runner = getRunner(S.Configuration);
|
||||
const size_t TotalSteps =
|
||||
Runner->getFunctionNames().size() * Runs * ((SR.To - SR.From) / SR.Step);
|
||||
size_t Steps = 0;
|
||||
for (auto FunctionName : Runner->getFunctionNames()) {
|
||||
FunctionMeasurements FM;
|
||||
FM.Name = FunctionName;
|
||||
for (size_t Run = 0; Run < Runs; ++Run) {
|
||||
for (uint32_t Size = SR.From; Size <= SR.To; Size += SR.Step) {
|
||||
const auto Result = Runner->benchmark(S.Options, FunctionName, Size);
|
||||
Measurement Measurement;
|
||||
Measurement.Runtime = Result.BestGuess;
|
||||
Measurement.Size = Size;
|
||||
FM.Measurements.push_back(Measurement);
|
||||
outs() << format("%3d%% run: %2d / %2d size: %5d ",
|
||||
(Steps * 100 / TotalSteps), Run, Runs, Size)
|
||||
<< FunctionName
|
||||
<< " \r";
|
||||
++Steps;
|
||||
}
|
||||
}
|
||||
S.Functions.push_back(std::move(FM));
|
||||
}
|
||||
|
||||
std::error_code EC;
|
||||
raw_fd_ostream FOS(Output, EC);
|
||||
if (EC)
|
||||
report_fatal_error(Twine("Could not open file: ")
|
||||
.concat(EC.message())
|
||||
.concat(", ")
|
||||
.concat(Output));
|
||||
json::OStream JOS(FOS);
|
||||
SerializeToJson(S, JOS);
|
||||
}
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
llvm::cl::ParseCommandLineOptions(argc, argv);
|
||||
llvm::libc_benchmarks::Main();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
//===-------- BenchmarkRunner interface -------------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
|
||||
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
|
||||
|
||||
#include "LibcBenchmark.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
// Each memory function benchmark implements this interface.
|
||||
// It is used by the main function to run all benchmarks in a uniform manner.
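// A sketch of how this interface is consumed, mirroring the driver's `Main()`
// function earlier in this patch; `Conf`, `Options` and `Size` stand in for
// values taken from the parsed study configuration:
//   std::unique_ptr<BenchmarkRunner> Runner = getRunner(Conf);
//   for (StringRef FunctionName : Runner->getFunctionNames())
//     BenchmarkResult Result = Runner->benchmark(Options, FunctionName, Size);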
|
||||
class BenchmarkRunner {
|
||||
public:
|
||||
virtual ~BenchmarkRunner() {}
|
||||
|
||||
// Returns a list of all available functions to test.
|
||||
virtual ArrayRef<StringRef> getFunctionNames() const = 0;
|
||||
|
||||
// Performs the benchmarking for a particular FunctionName and Size.
|
||||
virtual BenchmarkResult benchmark(const BenchmarkOptions &Options,
|
||||
StringRef FunctionName, size_t Size) = 0;
|
||||
};
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
|
||||
|
||||
#endif // LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
|
|
@ -0,0 +1,112 @@
|
|||
#include "LibcMemoryBenchmark.h"
|
||||
#include "llvm/Support/Alignment.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using testing::AllOf;
|
||||
using testing::AnyOf;
|
||||
using testing::ElementsAre;
|
||||
using testing::Ge;
|
||||
using testing::Gt;
|
||||
using testing::Le;
|
||||
using testing::Lt;
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
namespace {
|
||||
|
||||
TEST(AlignedBuffer, IsAligned) {
|
||||
AlignedBuffer AB(0);
|
||||
EXPECT_TRUE(isAddrAligned(Align(AlignedBuffer::Alignment), AB.begin()));
|
||||
}
|
||||
|
||||
TEST(AlignedBuffer, Empty) {
|
||||
AlignedBuffer AB(0);
|
||||
EXPECT_EQ(std::distance(AB.begin(), AB.end()), 0U);
|
||||
}
|
||||
|
||||
TEST(OffsetDistribution, AlignToBegin) {
|
||||
StudyConfiguration Conf;
|
||||
Conf.BufferSize = 8192;
|
||||
Conf.AddressAlignment = None;
|
||||
|
||||
OffsetDistribution OD(Conf);
|
||||
std::default_random_engine Gen;
|
||||
for (size_t I = 0; I <= 10; ++I)
|
||||
EXPECT_EQ(OD(Gen), 0U);
|
||||
}
|
||||
|
||||
TEST(OffsetDistribution, NoAlignment) {
|
||||
StudyConfiguration Conf;
|
||||
Conf.BufferSize = 8192;
|
||||
Conf.AddressAlignment = Align::None();
|
||||
Conf.Size.To = 1;
|
||||
|
||||
OffsetDistribution OD(Conf);
|
||||
std::default_random_engine Gen;
|
||||
for (size_t I = 0; I <= 10; ++I)
|
||||
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U)));
|
||||
}
|
||||
|
||||
MATCHER_P(IsDivisibleBy, n, "") {
|
||||
*result_listener << "where the remainder is " << (arg % n);
|
||||
return (arg % n) == 0;
|
||||
}
|
||||
|
||||
TEST(OffsetDistribution, Aligned) {
|
||||
StudyConfiguration Conf;
|
||||
Conf.BufferSize = 8192;
|
||||
Conf.AddressAlignment = Align(16);
|
||||
Conf.Size.To = 1;
|
||||
|
||||
OffsetDistribution OD(Conf);
|
||||
std::default_random_engine Gen;
|
||||
for (size_t I = 0; I <= 10; ++I)
|
||||
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U), IsDivisibleBy(16U)));
|
||||
}
|
||||
|
||||
TEST(MismatchOffsetDistribution, EqualBufferDisablesDistribution) {
|
||||
StudyConfiguration Conf;
|
||||
Conf.MemcmpMismatchAt = 0; // buffers are equal.
|
||||
|
||||
MismatchOffsetDistribution MOD(Conf);
|
||||
EXPECT_FALSE(MOD);
|
||||
}
|
||||
|
||||
TEST(MismatchOffsetDistribution, DifferentBufferDisablesDistribution) {
|
||||
StudyConfiguration Conf;
|
||||
Conf.MemcmpMismatchAt = 1; // buffers are different.
|
||||
|
||||
MismatchOffsetDistribution MOD(Conf);
|
||||
EXPECT_FALSE(MOD);
|
||||
}
|
||||
|
||||
TEST(MismatchOffsetDistribution, MismatchAt2) {
|
||||
const uint32_t MismatchAt = 2;
|
||||
const uint32_t ToSize = 4;
|
||||
StudyConfiguration Conf;
|
||||
Conf.BufferSize = 16;
|
||||
Conf.MemcmpMismatchAt = MismatchAt; // buffers differ at position 2.
|
||||
Conf.Size.To = ToSize;
|
||||
|
||||
MismatchOffsetDistribution MOD(Conf);
|
||||
EXPECT_TRUE(MOD);
|
||||
// We test equality up to ToSize (=4) so we need spans of 4 equal bytes spaced
|
||||
// by one mismatch.
|
||||
EXPECT_THAT(MOD.getMismatchIndices(), ElementsAre(5, 9, 13));
|
||||
std::default_random_engine Gen;
|
||||
for (size_t Iterations = 0; Iterations <= 10; ++Iterations) {
|
||||
for (size_t Size = Conf.Size.From; Size <= ToSize; ++Size) {
|
||||
if (Size >= MismatchAt)
|
||||
EXPECT_THAT(MOD(Gen, Size),
|
||||
AnyOf(5 - MismatchAt, 9 - MismatchAt, 13 - MismatchAt));
|
||||
else
|
||||
EXPECT_THAT(MOD(Gen, Size),
|
||||
AnyOf(5 - Size - 1, 9 - Size - 1, 13 - Size - 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
|
|
@ -0,0 +1,87 @@
|
|||
//===-------- Benchmark memcmp implementation -----------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "LibcBenchmark.h"
|
||||
#include "LibcMemoryBenchmark.h"
|
||||
#include "LibcMemoryBenchmarkMain.h"
|
||||
#include "llvm/ADT/StringSwitch.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
// The context encapsulates the buffers, the parameters and the measurement.
|
||||
struct MemcmpContext : public BenchmarkRunner {
|
||||
using FunctionPrototype = int (*)(const void *, const void *, size_t);
|
||||
|
||||
struct ParameterType {
|
||||
uint16_t Offset = 0;
|
||||
};
|
||||
|
||||
explicit MemcmpContext(const StudyConfiguration &Conf)
|
||||
: MOD(Conf), OD(Conf), ABuffer(Conf.BufferSize), BBuffer(Conf.BufferSize),
|
||||
PP(*this) {
|
||||
std::uniform_int_distribution<char> Dis;
|
||||
// Generate random buffer A.
|
||||
for (size_t I = 0; I < Conf.BufferSize; ++I)
|
||||
ABuffer[I] = Dis(Gen);
|
||||
// Copy buffer A to B.
|
||||
::memcpy(BBuffer.begin(), ABuffer.begin(), Conf.BufferSize);
|
||||
if (Conf.MemcmpMismatchAt == 0)
|
||||
return; // all same.
|
||||
else if (Conf.MemcmpMismatchAt == 1)
|
||||
for (char &c : BBuffer)
|
||||
++c; // all different.
|
||||
else
|
||||
for (const auto I : MOD.getMismatchIndices())
|
||||
++BBuffer[I];
|
||||
}
|
||||
|
||||
// Needed by the ParameterProvider to update the current batch of parameters.
|
||||
void Randomize(MutableArrayRef<ParameterType> Parameters) {
|
||||
if (MOD)
|
||||
for (auto &P : Parameters)
|
||||
P.Offset = MOD(Gen, CurrentSize);
|
||||
else
|
||||
for (auto &P : Parameters)
|
||||
P.Offset = OD(Gen);
|
||||
}
|
||||
|
||||
ArrayRef<StringRef> getFunctionNames() const override {
|
||||
static std::array<StringRef, 1> kFunctionNames = {"memcmp"};
|
||||
return kFunctionNames;
|
||||
}
|
||||
|
||||
BenchmarkResult benchmark(const BenchmarkOptions &Options,
|
||||
StringRef FunctionName, size_t Size) override {
|
||||
CurrentSize = Size;
|
||||
// FIXME: Add `bcmp` once we're guaranteed that the function is provided.
|
||||
FunctionPrototype Function =
|
||||
StringSwitch<FunctionPrototype>(FunctionName).Case("memcmp", &::memcmp);
|
||||
return llvm::libc_benchmarks::benchmark(
|
||||
Options, PP, [this, Function, Size](ParameterType p) {
|
||||
return Function(ABuffer + p.Offset, BBuffer + p.Offset, Size);
|
||||
});
|
||||
}
|
||||
|
||||
private:
|
||||
std::default_random_engine Gen;
|
||||
MismatchOffsetDistribution MOD;
|
||||
OffsetDistribution OD;
|
||||
size_t CurrentSize = 0;
|
||||
AlignedBuffer ABuffer;
|
||||
AlignedBuffer BBuffer;
|
||||
SmallParameterProvider<MemcmpContext> PP;
|
||||
};
|
||||
|
||||
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
|
||||
return std::make_unique<MemcmpContext>(Conf);
|
||||
}
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
|
|
@ -0,0 +1,69 @@
|
|||
//===-------- Benchmark memcpy implementation -----------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "LibcBenchmark.h"
|
||||
#include "LibcMemoryBenchmark.h"
|
||||
#include "LibcMemoryBenchmarkMain.h"
|
||||
#include "llvm/ADT/StringSwitch.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <memory>
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
// The context encapsulates the buffers, the parameters and the measurement.
|
||||
struct MemcpyContext : public BenchmarkRunner {
|
||||
using FunctionPrototype = void *(*)(void *, const void *, size_t);
|
||||
|
||||
struct ParameterType {
|
||||
uint16_t SrcOffset = 0;
|
||||
uint16_t DstOffset = 0;
|
||||
};
|
||||
|
||||
explicit MemcpyContext(const StudyConfiguration &Conf)
|
||||
: OD(Conf), SrcBuffer(Conf.BufferSize), DstBuffer(Conf.BufferSize),
|
||||
PP(*this) {}
|
||||
|
||||
// Needed by the ParameterProvider to update the current batch of parameters.
|
||||
void Randomize(MutableArrayRef<ParameterType> Parameters) {
|
||||
for (auto &P : Parameters) {
|
||||
P.DstOffset = OD(Gen);
|
||||
P.SrcOffset = OD(Gen);
|
||||
}
|
||||
}
|
||||
|
||||
ArrayRef<StringRef> getFunctionNames() const override {
|
||||
static std::array<StringRef, 1> kFunctionNames = {"memcpy"};
|
||||
return kFunctionNames;
|
||||
}
|
||||
|
||||
BenchmarkResult benchmark(const BenchmarkOptions &Options,
|
||||
StringRef FunctionName, size_t Size) override {
|
||||
FunctionPrototype Function =
|
||||
StringSwitch<FunctionPrototype>(FunctionName).Case("memcpy", &::memcpy);
|
||||
return llvm::libc_benchmarks::benchmark(
|
||||
Options, PP, [this, Function, Size](ParameterType p) {
|
||||
Function(DstBuffer + p.DstOffset, SrcBuffer + p.SrcOffset, Size);
|
||||
return DstBuffer + p.DstOffset;
|
||||
});
|
||||
}
|
||||
|
||||
private:
|
||||
std::default_random_engine Gen;
|
||||
OffsetDistribution OD;
|
||||
AlignedBuffer SrcBuffer;
|
||||
AlignedBuffer DstBuffer;
|
||||
SmallParameterProvider<MemcpyContext> PP;
|
||||
};
|
||||
|
||||
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
|
||||
return std::make_unique<MemcpyContext>(Conf);
|
||||
}
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
|
|
@ -0,0 +1,66 @@
|
|||
//===-------- Benchmark memset implementation -----------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "LibcBenchmark.h"
|
||||
#include "LibcMemoryBenchmark.h"
|
||||
#include "LibcMemoryBenchmarkMain.h"
|
||||
#include "llvm/ADT/StringSwitch.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace libc_benchmarks {
|
||||
|
||||
// The context encapsulates the buffers, the parameters and the measurement.
|
||||
struct MemsetContext : public BenchmarkRunner {
|
||||
using FunctionPrototype = void *(*)(void *, int, size_t);
|
||||
|
||||
struct ParameterType {
|
||||
uint16_t DstOffset = 0;
|
||||
};
|
||||
|
||||
explicit MemsetContext(const StudyConfiguration &Conf)
|
||||
: OD(Conf), DstBuffer(Conf.BufferSize), MemsetValue(Conf.MemsetValue),
|
||||
PP(*this) {}
|
||||
|
||||
// Needed by the ParameterProvider to update the current batch of parameters.
|
||||
void Randomize(MutableArrayRef<ParameterType> Parameters) {
|
||||
for (auto &P : Parameters) {
|
||||
P.DstOffset = OD(Gen);
|
||||
}
|
||||
}
|
||||
|
||||
ArrayRef<StringRef> getFunctionNames() const override {
|
||||
static std::array<StringRef, 1> kFunctionNames = {"memset"};
|
||||
return kFunctionNames;
|
||||
}
|
||||
|
||||
BenchmarkResult benchmark(const BenchmarkOptions &Options,
|
||||
StringRef FunctionName, size_t Size) override {
|
||||
FunctionPrototype Function =
|
||||
StringSwitch<FunctionPrototype>(FunctionName).Case("memset", &::memset);
|
||||
return llvm::libc_benchmarks::benchmark(
|
||||
Options, PP, [this, Function, Size](ParameterType p) {
|
||||
Function(DstBuffer + p.DstOffset, MemsetValue, Size);
|
||||
return DstBuffer + p.DstOffset;
|
||||
});
|
||||
}
|
||||
|
||||
private:
|
||||
std::default_random_engine Gen;
|
||||
OffsetDistribution OD;
|
||||
AlignedBuffer DstBuffer;
|
||||
const uint8_t MemsetValue;
|
||||
SmallParameterProvider<MemsetContext> PP;
|
||||
};
|
||||
|
||||
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
|
||||
return std::make_unique<MemsetContext>(Conf);
|
||||
}
|
||||
|
||||
} // namespace libc_benchmarks
|
||||
} // namespace llvm
|
|
@ -0,0 +1,243 @@
|
|||
# Benchmarking `llvm-libc`'s memory functions
|
||||
|
||||
## Foreword
|
||||
|
||||
Microbenchmarks are valuable tools to assess and compare the performance of
|
||||
isolated pieces of code. However, they don't capture all the interactions of complex
|
||||
systems, so other metrics can be equally important:
|
||||
|
||||
- **code size** (to reduce instruction cache pressure),
|
||||
- **Profile Guided Optimization** friendliness,
|
||||
- **hyperthreading / multithreading** friendliness.
|
||||
|
||||
## Rationale
|
||||
|
||||
The goal here is to satisfy the [Benchmarking
|
||||
Principles](https://en.wikipedia.org/wiki/Benchmark_\(computing\)#Benchmarking_Principles).
|
||||
|
||||
1. **Relevance**: Benchmarks should measure relatively vital features.
|
||||
2. **Representativeness**: Benchmark performance metrics should be broadly
|
||||
accepted by industry and academia.
|
||||
3. **Equity**: All systems should be fairly compared.
|
||||
4. **Repeatability**: Benchmark results can be verified.
|
||||
5. **Cost-effectiveness**: Benchmark tests are economical.
|
||||
6. **Scalability**: Benchmark tests should measure from single server to
|
||||
multiple servers.
|
||||
7. **Transparency**: Benchmark metrics should be easy to understand.
|
||||
|
||||
Benchmarking is a [subtle
|
||||
art](https://en.wikipedia.org/wiki/Benchmark_\(computing\)#Challenges) and
|
||||
benchmarking memory functions is no exception. Here we'll dive into
|
||||
peculiarities of designing good microbenchmarks for `llvm-libc` memory
|
||||
functions.
|
||||
|
||||
## Challenges
|
||||
|
||||
As seen in [README.md](README.md#benchmarking-regimes), the microbenchmarking
|
||||
facility should focus on measuring **low latency code**. If copying a few bytes
|
||||
takes on the order of a few cycles, the benchmark should be able to **measure
|
||||
accurately down to the cycle**.
|
||||
|
||||
### Measuring instruments
|
||||
|
||||
There are different sources of time in a computer (ordered from high to low resolution):
|
||||
- [Performance
|
||||
Counters](https://en.wikipedia.org/wiki/Hardware_performance_counter): used to
|
||||
introspect the internals of the CPU,
|
||||
- [High Precision Event
|
||||
Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer): used to
|
||||
trigger short lived actions,
|
||||
- [Real-Time Clocks (RTC)](https://en.wikipedia.org/wiki/Real-time_clock): used
|
||||
to keep track of the computer's time.
|
||||
|
||||
In theory, **Performance Counters** provide cycle-accurate measurements via the
|
||||
`cpu cycles` event. But as we'll see, they are not really practical in this
|
||||
context.
|
||||
|
||||
### Performance counters and modern processor architecture
|
||||
|
||||
Modern CPUs are [out of
|
||||
order](https://en.wikipedia.org/wiki/Out-of-order_execution) and
|
||||
[superscalar](https://en.wikipedia.org/wiki/Superscalar_processor). As a
|
||||
consequence, it is [hard to know what is included when the counter is
|
||||
read](https://en.wikipedia.org/wiki/Hardware_performance_counter#Instruction_based_sampling):
|
||||
some instructions may still be **in flight**, others may be executing
|
||||
[**speculatively**](https://en.wikipedia.org/wiki/Speculative_execution). As a
|
||||
matter of fact, **on the same machine, measuring the same piece of code twice will yield
|
||||
different results.**
|
||||
|
||||
### Performance counter semantics: inconsistencies and availability
|
||||
|
||||
Although they have the same name, the exact semantics of performance counters
|
||||
are micro-architecture dependent: **it is generally not possible to compare two
|
||||
micro-architectures exposing the same performance counters.**
|
||||
|
||||
Each vendor decides which performance counters to implement and their exact
|
||||
meaning. Although we want to benchmark `llvm-libc` memory functions for all
|
||||
available [target
|
||||
triples](https://clang.llvm.org/docs/CrossCompilation.html#target-triple), there
|
||||
are **no guarantees that the counter we're interested in is available.**
|
||||
|
||||
### Additional imprecisions
|
||||
|
||||
- Reading performance counters is done through Kernel [System
|
||||
calls](https://en.wikipedia.org/wiki/System_call). The System call itself
|
||||
is costly (hundreds of cycles) and will perturb the counter's value.
|
||||
- [Interrupts](https://en.wikipedia.org/wiki/Interrupt#Processor_response)
|
||||
can occur during measurement.
|
||||
- If the system is already under monitoring (virtual machines or system wide
|
||||
profiling), the kernel can decide to multiplex the performance counters,
|
||||
leading to lower precision or even a completely missed measurement.
|
||||
- The Kernel can decide to [migrate the
|
||||
process](https://en.wikipedia.org/wiki/Process_migration) to a different
|
||||
core.
|
||||
- [Dynamic frequency
|
||||
scaling](https://en.wikipedia.org/wiki/Dynamic_frequency_scaling) can kick
|
||||
in during the measurement and change the ticking duration. **Ultimately we
|
||||
care about the amount of work over a period of time**. This weakens the case
|
||||
for measuring cycles rather than **raw time**.
|
||||
|
||||
### Cycle accuracy conclusion
|
||||
|
||||
We have seen that performance counters are not widely available, semantically
|
||||
inconsistent across micro-architectures and imprecise on modern CPUs for small
|
||||
snippets of code.
|
||||
|
||||
## Design decisions
|
||||
|
||||
In order to achieve the needed precision, we need to resort to more widely
|
||||
available counters and derive the time from a high number of runs: going from a
|
||||
single deterministic measure to a probabilistic one.
|
||||
|
||||
**To get a good signal-to-noise ratio, we need the running time of the piece of
|
||||
code to be orders of magnitude greater than the measurement precision.**
|
||||
|
||||
For instance, if the measurement precision is 10 cycles, we need the function
|
||||
runtime to take more than 1000 cycles to achieve 1%
|
||||
[SNR](https://en.wikipedia.org/wiki/Signal-to-noise_ratio).
|
||||
|
||||
### Repeating code N-times until precision is sufficient
|
||||
|
||||
The algorithm is as follows:
|
||||
|
||||
- We measure the time it takes to run the code _N_ times (initially _N_ is 10,
|
||||
for instance).
|
||||
- We deduce an approximation of the runtime of one iteration (= _runtime_ /
|
||||
_N_).
|
||||
- We increase _N_ by _X%_ and repeat the measurement (geometric progression).
|
||||
- We keep track of the _one iteration runtime approximation_ and build a
|
||||
weighted mean of all the samples so far (the weight is proportional to _N_).
|
||||
- We stop the process when the difference between the weighted mean and the
|
||||
last estimation is smaller than _ε_ or when other stopping conditions are
|
||||
met (total runtime, maximum iterations or maximum sample count).
|
||||
|
||||
This method allows us to be as precise as needed provided that the measured
|
||||
runtime is proportional to _N_. Longer run times also smooth out imprecision
|
||||
related to _interrupts_ and _context switches_.
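As a concrete illustration, here is a minimal sketch of that loop in C++. It is not the framework's implementation: `runFunctionNTimes` is a hypothetical helper standing in for the actual measurement primitive, and the constants are only example values.

```cpp
#include <chrono>
#include <cmath>
#include <cstddef>

using Duration = std::chrono::duration<double>;

// Hypothetical: returns the wall time of N back-to-back calls of the function
// under test.
Duration runFunctionNTimes(size_t N);

Duration estimateSingleIterationTime() {
  const double Epsilon = 0.01;      // stop when the estimate moves by less than 1%
  const double ScalingFactor = 1.4; // geometric growth of N
  const size_t MaxIterations = 10000000;

  double N = 10;          // initial number of repetitions
  double WeightedSum = 0; // sum of (per-iteration time * weight)
  double TotalWeight = 0; // weights are proportional to N
  double Mean = 0;
  while (N <= MaxIterations) {
    const double PerIteration =
        runFunctionNTimes(static_cast<size_t>(N)).count() / N;
    WeightedSum += PerIteration * N;
    TotalWeight += N;
    const double PreviousMean = Mean;
    Mean = WeightedSum / TotalWeight;
    // Stop once the weighted mean stabilizes; other stopping conditions
    // (total runtime, maximum sample count) are omitted from this sketch.
    if (TotalWeight > N && std::fabs(Mean - PreviousMean) <= Epsilon * Mean)
      break;
    N *= ScalingFactor;
  }
  return Duration(Mean);
}
```

These knobs loosely mirror the `Epsilon`, `ScalingFactor`, `InitialIterations` and `MaxIterations` options of the JSON configuration files included in this patch.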
|
||||
|
||||
Note: When measuring longer runtimes (e.g. copying several megabytes of data)
|
||||
the above assumption doesn't hold anymore and the _ε_ precision cannot be
|
||||
reached by increasing iterations. The whole benchmarking process becomes
|
||||
prohibitively slow. In this case the algorithm is limited to a single sample and
|
||||
repeated several times to get a decent 95% confidence interval.
|
||||
|
||||
### Effect of branch prediction
|
||||
|
||||
When measuring code with branches, repeating the same call again and again will
|
||||
allow the processor to learn the branching patterns and perfectly predict all
|
||||
the branches, leading to unrealistic results.
|
||||
|
||||
**Decision: When benchmarking small buffer sizes, the function parameters should
|
||||
be randomized between calls to prevent perfect branch predictions.**
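A minimal sketch of what this means in practice, using plain `memcpy`. The buffer and size parameters here are hypothetical; the real framework drives this randomization through its parameter provider.

```cpp
#include <cstddef>
#include <cstring>
#include <random>

// Each call uses freshly drawn offsets so the processor cannot settle on a
// single, perfectly predictable branch pattern.
void copyWithRandomizedParameters(char *Dst, const char *Src, size_t BufferSize,
                                  size_t CopySize, size_t Iterations) {
  std::default_random_engine Gen;
  std::uniform_int_distribution<size_t> Offset(0, BufferSize - CopySize);
  for (size_t I = 0; I < Iterations; ++I)
    ::memcpy(Dst + Offset(Gen), Src + Offset(Gen), CopySize);
}
```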
|
||||
|
||||
### Effect of the memory subsystem
|
||||
|
||||
The CPU is tightly coupled to the memory subsystem. It is common to see `L1`,
|
||||
`L2` and `L3` data caches.
|
||||
|
||||
We may be tempted to randomize data accesses widely to exercise all the caching
|
||||
layers down to RAM, but the [cost of accessing lower layers of
|
||||
memory](https://people.eecs.berkeley.edu/~rcs/research/interactive_latency.html)
|
||||
completely dominates the runtime for small sizes.
|
||||
|
||||
So to respect the **Equity** and **Repeatability** principles, we should make sure we
|
||||
**do not** depend on the memory subsystem.
|
||||
|
||||
**Decision: When benchmarking small buffer sizes, the data accessed by the
|
||||
function should stay in `L1`.**
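For instance, a sanity check along these lines keeps the small-size measurements honest. This is only a sketch: `kL1DataCacheSize` and `kParameterStorage` are assumed, hypothetical constants (the L1 size is host dependent).

```cpp
#include <cassert>
#include <cstddef>

constexpr size_t kL1DataCacheSize = 32 * 1024; // assumption, host dependent
constexpr size_t kParameterStorage = 8 * 1024; // storage for a batch of parameters

// BufferCount is 1 for memset and 2 for memcpy/memcmp.
void checkWorkingSetFitsInL1(size_t BufferSize, size_t BufferCount) {
  assert(BufferSize * BufferCount + kParameterStorage <= kL1DataCacheSize &&
         "working set would spill out of the L1 data cache");
}
```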
|
||||
|
||||
### Effect of prefetching
|
||||
|
||||
In case of small buffer sizes,
|
||||
[prefetching](https://en.wikipedia.org/wiki/Cache_prefetching) should not kick
|
||||
in, but for large buffers it may introduce a bias.
|
||||
|
||||
**Decision: When benchmarking large buffer sizes, the data should be accessed in
|
||||
a random fashion to lower the impact of prefetching between calls.**
|
||||
|
||||
### Effect of dynamic frequency scaling
|
||||
|
||||
Modern processors implement [dynamic frequency
|
||||
scaling](https://en.wikipedia.org/wiki/Dynamic_frequency_scaling). In so-called
|
||||
`performance` mode, the CPU will increase its frequency and run faster than usual
|
||||
within [some limits](https://en.wikipedia.org/wiki/Intel_Turbo_Boost): _"The
|
||||
increased clock rate is limited by the processor's power, current, and thermal
|
||||
limits, the number of cores currently in use, and the maximum frequency of the
|
||||
active cores."_
|
||||
|
||||
**Decision: When benchmarking, we want to make sure the dynamic frequency scaling
|
||||
is always set to `performance`. We also want to make sure that the time-based
|
||||
events are not impacted by frequency scaling.**
|
||||
|
||||
See [README.md](README.md) for how to set this up.
|
||||
|
||||
### Reserved and pinned cores
|
||||
|
||||
Some operating systems allow [core
|
||||
reservation](https://stackoverflow.com/questions/13583146/whole-one-core-dedicated-to-single-process).
|
||||
This removes a set of perturbation sources such as process migration, context
|
||||
switches and interrupts. When a core is hyperthreaded, both cores should be
|
||||
reserved.
|
||||
|
||||
## Microbenchmarks limitations
|
||||
|
||||
As stated in the Foreword section, a number of effects play a role in
|
||||
production but are not directly measurable through microbenchmarks. The code
|
||||
size of the benchmark is (much) smaller than the hot code of real applications
|
||||
and **doesn't exhibit instruction cache pressure as much**.
|
||||
|
||||
### iCache pressure
|
||||
|
||||
Fundamental functions that are called frequently will occupy the L1 iCache
|
||||
([illustration](https://en.wikipedia.org/wiki/CPU_cache#Example:_the_K8)). If
|
||||
they are too big, they will prevent other hot code from staying in the cache and incur
|
||||
[stalls](https://en.wikipedia.org/wiki/CPU_cache#CPU_stalls). So the memory
|
||||
functions should be as small as possible.
|
||||
|
||||
### iTLB pressure
|
||||
|
||||
The same reasoning applies to the instruction Translation Lookaside Buffer
|
||||
([iTLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer)), which can incur
|
||||
[TLB
|
||||
misses](https://en.wikipedia.org/wiki/Translation_lookaside_buffer#TLB-miss_handling).
|
||||
|
||||
## FAQ
|
||||
|
||||
1. Why don't you use Google Benchmark directly?
|
||||
|
||||
We reuse some parts of Google Benchmark (detection of frequency scaling, CPU
|
||||
cache hierarchy information), but when it comes to measuring memory
|
||||
functions, Google Benchmark has a few issues:
|
||||
|
||||
- Google Benchmark favors code-based configuration via macros and
|
||||
builders. It is typically done in a static manner. In our case the
|
||||
parameters we need to set up are a mix of what's usually controlled by
|
||||
the framework (number of trials, maximum number of iterations, size
|
||||
ranges) and parameters that are more tied to the function under test
|
||||
(randomization strategies, custom values). Achieving this with Google
|
||||
Benchmark is cumbersome as it involves templated benchmarks and
|
||||
duplicated code. In the end, the configuration would be spread across
|
||||
command-line flags (via the framework's options or custom flags) and code
|
||||
constants.
|
||||
- Output of the measurements is done through a `BenchmarkReporter` class,
|
||||
which makes it hard to access the parameters discussed above.
|
|
@ -0,0 +1,103 @@
|
|||
# Libc mem* benchmarks
|
||||
|
||||
This framework has been designed to evaluate and compare the relative performance of
|
||||
memory function implementations on a particular host.
|
||||
|
||||
It will also be used to track implementation performance over time.
|
||||
|
||||
## Quick start
|
||||
|
||||
### Setup
|
||||
|
||||
**Python 2** is [being deprecated](https://www.python.org/doc/sunset-python-2/), so it is
|
||||
advised to use **Python 3**.
|
||||
|
||||
Then make sure to have `matplotlib`, `scipy` and `numpy` set up correctly:
|
||||
|
||||
```shell
|
||||
apt-get install python3-pip
|
||||
pip3 install matplotlib scipy numpy
|
||||
```
|
||||
|
||||
To get good reproducibility it is important to make sure that the system runs in
|
||||
`performance` mode. This is achieved by running:
|
||||
|
||||
```shell
|
||||
cpupower frequency-set --governor performance
|
||||
```
|
||||
|
||||
### Run and display `memcpy` benchmark
|
||||
|
||||
The following commands will run the benchmark and display a 95% confidence
|
||||
interval curve of **time per copied byte**. It also shows **host
|
||||
information** and the **benchmarking configuration**.
|
||||
|
||||
```shell
|
||||
cd llvm-project
|
||||
cmake -B/tmp/build -Sllvm -DLLVM_ENABLE_PROJECTS=libc -DCMAKE_BUILD_TYPE=Release
|
||||
make -C /tmp/build -j display-libc-memcpy-benchmark-small
|
||||
```
|
||||
|
||||
## Benchmarking regimes
|
||||
|
||||
Using a profiler to observe size distributions for calls into libc functions, it
|
||||
was found that most operations act on a small number of bytes.
|
||||
|
||||
Function | % of calls with size ≤ 128 | % of calls with size ≤ 1024
|
||||
------------------ | --------------------------: | ---------------------------:
|
||||
memcpy | 96% | 99%
|
||||
memset | 91% | 99.9%
|
||||
memcmp<sup>1</sup> | 99.5% | ~100%
|
||||
|
||||
Benchmarking configurations come in two flavors:
|
||||
|
||||
- [small](libc/utils/benchmarks/configuration_small.json)
|
||||
- Exercises sizes up to `1KiB`, representative of normal usage
|
||||
- The data is kept in the `L1` cache to prevent measuring the memory
|
||||
subsystem
|
||||
- [big](libc/utils/benchmarks/configuration_big.json)
|
||||
- Exercises sizes up to `32MiB` to test large operations
|
||||
- Caching effects can show up here, which prevents comparing different hosts
|
||||
|
||||
_<sup>1</sup> - The size refers to the size of the buffers to compare and not
|
||||
the number of bytes until the first difference._
|
||||
|
||||
## Benchmarking targets
|
||||
|
||||
The benchmarking process occurs in two steps:
|
||||
|
||||
1. Benchmark the functions and produce a `json` file
|
||||
2. Display (or render) the `json` file
|
||||
|
||||
Targets are of the form `<action>-libc-<function>-benchmark-<configuration>`
|
||||
|
||||
- `action` is one of:
|
||||
- `run`, runs the benchmark and writes the `json` file
|
||||
- `display`, displays the graph on screen
|
||||
- `render`, renders the graph on disk as a `png` file
|
||||
- `function` is one of: `memcpy`, `memcmp`, `memset`
|
||||
- `configuration` is one of: `small`, `big`
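
For example, `run-libc-memset-benchmark-big` runs the `memset` benchmark with the
`big` configuration and writes the corresponding `json` file.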
|
||||
|
||||
## Superposing curves
|
||||
|
||||
It is possible to **merge** several `json` files into a single graph. This is
|
||||
useful to **compare** implementations.
|
||||
|
||||
In the following example we superpose the curves for `memcpy`, `memset` and
|
||||
`memcmp`:
|
||||
|
||||
```shell
|
||||
> make -C /tmp/build run-libc-memcpy-benchmark-small run-libc-memcmp-benchmark-small run-libc-memset-benchmark-small
|
||||
> python libc/utils/benchmarks/render.py3 /tmp/last-libc-memcpy-benchmark-small.json /tmp/last-libc-memcmp-benchmark-small.json /tmp/last-libc-memset-benchmark-small.json
|
||||
```
|
||||
|
||||
## Useful `render.py3` flags
|
||||
|
||||
- To save the produced graph `--output=/tmp/benchmark_curve.png`.
|
||||
- To prevent the graph from appearing on the screen `--headless`.
|
||||
|
||||
|
||||
## Under the hood
|
||||
|
||||
To learn more about the design decisions behind the benchmarking framework,
|
||||
have a look at the [RATIONALE.md](RATIONALE.md) file.
|
|
@ -0,0 +1,24 @@
|
|||
{
|
||||
"Options":{
|
||||
"MinDuration":0.001,
|
||||
"MaxDuration":1,
|
||||
"InitialIterations":100,
|
||||
"MaxIterations":10000000,
|
||||
"MinSamples":1,
|
||||
"MaxSamples":1,
|
||||
"Epsilon":0.01,
|
||||
"ScalingFactor":1.4
|
||||
},
|
||||
"Configuration":{
|
||||
"Runs":5,
|
||||
"BufferSize":134217728,
|
||||
"Size":{
|
||||
"From":0,
|
||||
"To":33554432,
|
||||
"Step":1048576
|
||||
},
|
||||
"AddressAlignment":1,
|
||||
"MemsetValue":0,
|
||||
"MemcmpMismatchAt":0
|
||||
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
{
|
||||
"Options":{
|
||||
"MinDuration":0.001,
|
||||
"MaxDuration":1,
|
||||
"InitialIterations":100,
|
||||
"MaxIterations":10000000,
|
||||
"MinSamples":4,
|
||||
"MaxSamples":1000,
|
||||
"Epsilon":0.01,
|
||||
"ScalingFactor":1.4
|
||||
},
|
||||
"Configuration":{
|
||||
"Runs":10,
|
||||
"BufferSize":8192,
|
||||
"Size":{
|
||||
"From":0,
|
||||
"To":1024,
|
||||
"Step":1
|
||||
},
|
||||
"AddressAlignment":1,
|
||||
"MemsetValue":0,
|
||||
"MemcmpMismatchAt":0
|
||||
}
|
||||
}
|
|
@ -0,0 +1,175 @@
|
|||
"""Reads JSON files produced by the benchmarking framework and renders them.
|
||||
|
||||
Installation:
|
||||
> apt-get install python3-pip
|
||||
> pip3 install matplotlib scipy numpy
|
||||
|
||||
Run:
|
||||
> python3 render.py3 <files>
|
||||
|
||||
Rendering can occur on disk by specifying the --output option or on screen if
|
||||
the --headless flag is not set.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import collections
|
||||
import json
|
||||
import math
|
||||
import pprint
|
||||
import sys
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.ticker import EngFormatter
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
|
||||
|
||||
def format_freq(number):
|
||||
"""Returns a human readable frequency."""
|
||||
magnitude = 0
|
||||
while math.fabs(number) >= 1000:
|
||||
number /= 1000.0
|
||||
magnitude += 1
|
||||
return "%g%sHz" % (number, ["", "k", "M", "G"][magnitude])
|
||||
|
||||
|
||||
def format_size(number):
|
||||
"""Returns number in human readable form."""
|
||||
magnitude = 0
|
||||
while number >= 1000 and number % 1000 == 0:
|
||||
number /= 1000
|
||||
magnitude += 1
|
||||
return "%g%s" % (number, ["", "K", "M", "G"][magnitude])
|
||||
|
||||
|
||||
def mean_confidence_interval(dataset, confidence=0.95):
|
||||
"""Returns the mean and half confidence interval for the dataset."""
|
||||
a = 1.0 * np.array(dataset)
|
||||
n = len(a)
|
||||
m, se = np.mean(a), scipy.stats.sem(a)
|
||||
h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
|
||||
return m, h
|
||||
|
||||
|
||||
def add_plot(function_name, points):
|
||||
"""Plots measurements for a function."""
|
||||
n = len(points.keys())
|
||||
x = np.zeros(n)
|
||||
y = np.zeros(n)
|
||||
yerr = np.zeros(n)
|
||||
|
||||
for i, key in enumerate(sorted(points.keys())):
|
||||
values = points[key]
|
||||
m, e = mean_confidence_interval(values)
|
||||
x[i] = key
|
||||
y[i] = m
|
||||
yerr[i] = e
|
||||
|
||||
plt.plot(x, y, linewidth=1, label=function_name)
|
||||
plt.fill_between(x, y - yerr, y + yerr, alpha=0.5)
|
||||
|
||||
|
||||
def get_title(host):
|
||||
"""Formats the Host object into a title for the plot."""
|
||||
cpu_name = host["CpuName"]
|
||||
cpu_freq = format_freq(host["CpuFrequency"])
|
||||
cache_strings = []
|
||||
for cache in host["Caches"]:
|
||||
prefix = {
|
||||
"Instruction": "i",
|
||||
"Data": "d",
|
||||
"Unified": "u",
|
||||
}.get(cache["Type"])
|
||||
cache_strings.append(r"%sL_%d %s_{/%d}" %
|
||||
(prefix, cache["Level"], format_size(
|
||||
cache["Size"]), cache["NumSharing"]))
|
||||
title = "%s (%s)" % (cpu_name, cpu_freq)
|
||||
subtitle = r"$" + ", ".join(sorted(cache_strings)) + r"$"
|
||||
return title + "\n" + subtitle
|
||||
|
||||
|
||||
def get_host(jsons):
|
||||
"""Returns the host of the different json objects iff they are all the same.
|
||||
"""
|
||||
host = None
|
||||
for root in jsons:
|
||||
if host and host != root["Host"]:
|
||||
sys.exit("The datasets are not coming from the same Host")
|
||||
if not host:
|
||||
host = root["Host"]
|
||||
return host
|
||||
|
||||
|
||||
def get_configuration(jsons):
|
||||
"""Returns the configuration of the different json objects iff they are all
|
||||
the same.
|
||||
"""
|
||||
config = None
|
||||
for root in jsons:
|
||||
if config and config != root["Configuration"]:
|
||||
return None
|
||||
if not config:
|
||||
config = root["Configuration"]
|
||||
return config
|
||||
|
||||
|
||||
def setup_graphs(files):
|
||||
"""Setups the graphs to render from the json files."""
|
||||
jsons = []
|
||||
for file in files:
|
||||
with open(file) as json_file:
|
||||
jsons.append(json.load(json_file))
|
||||
if not jsons:
|
||||
sys.exit("Nothing to process")
|
||||
|
||||
for root in jsons:
|
||||
for function in root["Functions"]:
|
||||
function_name = function["Name"]
|
||||
sizes = function["Sizes"]
|
||||
runtimes = function["Runtimes"]
|
||||
assert len(sizes) == len(runtimes)
|
||||
values = collections.defaultdict(lambda: [])
|
||||
for i in range(len(sizes)):
|
||||
values[sizes[i]].append(runtimes[i])
|
||||
add_plot(function_name, values)
|
||||
|
||||
config = get_configuration(jsons)
|
||||
if config:
|
||||
plt.figtext(
|
||||
0.95,
|
||||
0.15,
|
||||
pprint.pformat(config),
|
||||
verticalalignment="bottom",
|
||||
horizontalalignment="right",
|
||||
multialignment="left",
|
||||
fontsize="small",
|
||||
bbox=dict(boxstyle="round", facecolor="wheat"))
|
||||
|
||||
axes = plt.gca()
|
||||
axes.set_title(get_title(get_host(jsons)))
|
||||
axes.set_ylim(bottom=0)
|
||||
axes.set_xlabel("Size")
|
||||
axes.set_ylabel("Time")
|
||||
axes.xaxis.set_major_formatter(EngFormatter(unit="B"))
|
||||
axes.yaxis.set_major_formatter(EngFormatter(unit="s"))
|
||||
plt.legend()
|
||||
plt.grid()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Process benchmark json files.")
|
||||
parser.add_argument("files", nargs="+", help="The json files to read from.")
|
||||
parser.add_argument("--output", help="The output file to write the graph.")
|
||||
parser.add_argument(
|
||||
"--headless",
|
||||
help="If set do not display the graph.",
|
||||
action="store_true")
|
||||
args = parser.parse_args()
|
||||
setup_graphs(args.files)
|
||||
if args.output:
|
||||
plt.savefig(args.output)
|
||||
if not args.headless:
|
||||
plt.show()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|