[llvm-libc] Add memory function benchmarks

Summary:
This patch adds a benchmarking infrastructure for llvm-libc memory functions.

In a nutshell, the code can benchmark small and large buffers for the memcpy, memset and memcmp functions.
It also produces graphs of size vs latency by running targets of the form `render-libc-{memcpy|memset|memcmp}-benchmark-{small|big}`.

The configurations are provided as JSON files and the benchmark also produces a JSON file.
This file is then parsed and rendered as a PNG file via the `render.py3` script (make sure to run `pip3 install matplotlib scipy numpy`).
The script can take several JSON files as input and will superimpose the curves if they are from the same host.

TODO:
 - The code benchmarks whatever memory functions are available on the host; it should be configured to benchmark the (yet to be added) llvm-libc memory functions.
 - Add a README file with instructions and rationale.
 - Produce scores to track the performance of the functions over time to allow for regression detection.

Reviewers: sivachandra, ckennelly

Subscribers: mgorny, MaskRay, libc-commits

Tags: #libc-project

Differential Revision: https://reviews.llvm.org/D72516
Guillaume Chatelet 2020-01-06 13:17:04 +01:00
parent b901335193
commit aba80d0734
22 changed files with 2588 additions and 0 deletions

@@ -1,2 +1,3 @@
add_subdirectory(HdrGen)
add_subdirectory(UnitTest)
add_subdirectory(benchmarks)

@@ -1,3 +1,5 @@
set(LLVM_LINK_COMPONENTS Support)
add_tablegen(libc-hdrgen llvm-libc
Command.h
Command.cpp

@@ -0,0 +1,184 @@
find_package(Threads)
include(ExternalProject)
set(LLVM_LINK_COMPONENTS Support)
#==============================================================================
# Build Google Benchmark
#==============================================================================
set(GOOGLE_BENCHMARK_TARGET_FLAGS ${BENCHMARK_DIALECT_FLAG})
if (LIBCXX_BENCHMARK_GCC_TOOLCHAIN)
set(GOOGLE_BENCHMARK_TARGET_FLAGS
-gcc-toolchain ${LIBCXX_BENCHMARK_GCC_TOOLCHAIN})
endif()
string(REPLACE ";" " " GOOGLE_BENCHMARK_TARGET_FLAGS "${GOOGLE_BENCHMARK_TARGET_FLAGS}")
ExternalProject_Add(google-benchmark
EXCLUDE_FROM_ALL ON
PREFIX google-benchmark
SOURCE_DIR ${LIBC_SOURCE_DIR}/../llvm/utils/benchmark
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark
CMAKE_CACHE_ARGS
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS:STRING=${GOOGLE_BENCHMARK_TARGET_FLAGS}
-DCMAKE_BUILD_TYPE:STRING=RELEASE
-DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-DBENCHMARK_ENABLE_TESTING:BOOL=OFF)
set(GOOGLE_BENCHMARK_LIBC_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark)
set(GOOGLE_BENCHMARK_LINK_FLAGS -L${GOOGLE_BENCHMARK_LIBC_INSTALL}/lib/)
#==============================================================================
# Add Unit Testing Support
#==============================================================================
function(add_libc_benchmark_unittest target_name)
if(NOT LLVM_INCLUDE_TESTS)
return()
endif()
cmake_parse_arguments(
"LIBC_BENCHMARKS_UNITTEST"
"" # No optional arguments
"SUITE" # Single value arguments
"SRCS;DEPENDS" # Multi-value arguments
${ARGN}
)
add_executable(${target_name}
EXCLUDE_FROM_ALL
${LIBC_BENCHMARKS_UNITTEST_SRCS}
)
target_include_directories(${target_name}
PRIVATE
${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include
${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include
)
target_link_libraries(${target_name}
PRIVATE
gtest_main
gtest
${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
)
add_custom_command(
TARGET ${target_name}
POST_BUILD
COMMAND $<TARGET_FILE:${target_name}>
)
add_dependencies(check-libc-benchmark ${target_name})
endfunction()
#==============================================================================
# Build Google Benchmark for libc
#==============================================================================
add_custom_target(check-libc-benchmark)
function(fix_rtti target)
# TODO: Make this portable and in line with the RTTI mode from llvm/
target_compile_options(${target} PUBLIC -fno-rtti)
endfunction()
# libc-benchmark
add_library(libc-benchmark
STATIC
EXCLUDE_FROM_ALL
LibcBenchmark.cpp
LibcBenchmark.h
)
add_dependencies(libc-benchmark google-benchmark)
target_include_directories(libc-benchmark PUBLIC "${GOOGLE_BENCHMARK_LIBC_INSTALL}/include")
target_link_options(libc-benchmark PUBLIC "${GOOGLE_BENCHMARK_LINK_FLAGS}")
target_link_libraries(libc-benchmark PUBLIC LLVMSupport -lbenchmark Threads::Threads)
fix_rtti(libc-benchmark)
add_libc_benchmark_unittest(libc-benchmark-test
SRCS LibcBenchmarkTest.cpp
DEPENDS libc-benchmark
)
# libc-memory-benchmark
add_library(libc-memory-benchmark
STATIC
EXCLUDE_FROM_ALL
LibcMemoryBenchmark.cpp
LibcMemoryBenchmark.h
)
target_link_libraries(libc-memory-benchmark PUBLIC libc-benchmark)
fix_rtti(libc-memory-benchmark)
add_libc_benchmark_unittest(libc-memory-benchmark-test
SRCS LibcMemoryBenchmarkTest.cpp
DEPENDS libc-memory-benchmark
)
# json
add_library(json
STATIC
EXCLUDE_FROM_ALL
JSON.cpp
JSON.h
)
target_link_libraries(json PUBLIC libc-memory-benchmark)
fix_rtti(json)
add_libc_benchmark_unittest(json-test
SRCS JSONTest.cpp
DEPENDS json
)
#==============================================================================
# Benchmark tests configuration
#==============================================================================
function(add_libc_benchmark_analysis conf_target run_target)
set(png_file "/tmp/last-${conf_target}.png")
set(render_target render-${conf_target})
add_custom_target(${render_target}
COMMAND python3 render.py3 ${json_file} --headless --output=${png_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "render ${libc_target} to ${png_file}"
)
add_dependencies(${render_target} ${run_target})
set(display_target display-${conf_target})
add_custom_target(${display_target}
COMMAND python3 render.py3 ${json_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "display ${libc_target}"
)
add_dependencies(${display_target} ${run_target})
endfunction()
function(add_libc_benchmark_configuration target configuration)
set(conf_target ${target}-${configuration})
set(json_file "/tmp/last-${conf_target}.json")
set(run_target run-${conf_target})
add_custom_target(${run_target}
COMMAND ${libc_target} --conf=configuration_${configuration}.json -o ${json_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
add_libc_benchmark_analysis(${conf_target} ${run_target})
endfunction()
function(add_libc_benchmark name file)
set(libc_target libc-${name}-benchmark)
add_executable(${libc_target}
EXCLUDE_FROM_ALL
${file}
LibcMemoryBenchmarkMain.h
LibcMemoryBenchmarkMain.cpp
)
target_link_libraries(${libc_target} PUBLIC json)
foreach(configuration "small" "big")
add_libc_benchmark_configuration(${libc_target} ${configuration})
endforeach()
endfunction()
add_libc_benchmark(memcpy Memcpy.cpp)
add_libc_benchmark(memcmp Memcmp.cpp)
add_libc_benchmark(memset Memset.cpp)

@@ -0,0 +1,367 @@
//===-------- JSON serialization routines ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "JSON.h"
#include "LibcBenchmark.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MathExtras.h"
#include <chrono>
#include <limits>
#include <memory>
#include <vector>
namespace llvm {
namespace libc_benchmarks {
template <typename T>
static Error intFromJsonTemplate(const json::Value &V, T &Out) {
if (const auto &MaybeInt64 = V.getAsInteger()) {
int64_t Value = *MaybeInt64;
if (Value < std::numeric_limits<T>::min() ||
Value > std::numeric_limits<T>::max())
return createStringError(errc::io_error, "Out of bound Integer");
Out = Value;
return Error::success();
}
return createStringError(errc::io_error, "Can't parse Integer");
}
static Error fromJson(const json::Value &V, double &Out) {
if (auto S = V.getAsNumber()) {
Out = *S;
return Error::success();
}
return createStringError(errc::io_error, "Can't parse Double");
}
static Error fromJson(const json::Value &V, std::string &Out) {
if (auto S = V.getAsString()) {
Out = *S;
return Error::success();
}
return createStringError(errc::io_error, "Can't parse String");
}
static Error fromJson(const json::Value &V, uint32_t &Out) {
return intFromJsonTemplate(V, Out);
}
static Error fromJson(const json::Value &V, uint8_t &Out) {
return intFromJsonTemplate(V, Out);
}
static Error fromJson(const json::Value &V, int &Out) {
return intFromJsonTemplate(V, Out);
}
static Error fromJson(const json::Value &V, libc_benchmarks::Duration &D) {
if (V.kind() != json::Value::Kind::Number)
return createStringError(errc::io_error, "Can't parse Duration");
D = libc_benchmarks::Duration(*V.getAsNumber());
return Error::success();
}
static Error fromJson(const json::Value &V, MaybeAlign &Out) {
const auto MaybeInt = V.getAsInteger();
if (!MaybeInt)
return createStringError(errc::io_error,
"Can't parse Align, not an Integer");
const int64_t Value = *MaybeInt;
if (!Value) {
Out = None;
return Error::success();
}
if (isPowerOf2_64(Value)) {
Out = Align(Value);
return Error::success();
}
return createStringError(errc::io_error,
"Can't parse Align, not a power of two");
}
static Error fromJson(const json::Value &V,
libc_benchmarks::BenchmarkLog &Out) {
if (V.kind() != json::Value::Kind::String)
return createStringError(errc::io_error,
"Can't parse BenchmarkLog, not a String");
const auto String = *V.getAsString();
auto Parsed =
llvm::StringSwitch<Optional<libc_benchmarks::BenchmarkLog>>(String)
.Case("None", libc_benchmarks::BenchmarkLog::None)
.Case("Last", libc_benchmarks::BenchmarkLog::Last)
.Case("Full", libc_benchmarks::BenchmarkLog::Full)
.Default(None);
if (!Parsed)
return createStringError(errc::io_error,
Twine("Can't parse BenchmarkLog, invalid value '")
.concat(String)
.concat("'"));
Out = *Parsed;
return Error::success();
}
template <typename C>
Error vectorFromJsonTemplate(const json::Value &V, C &Out) {
auto *A = V.getAsArray();
if (!A)
return createStringError(errc::io_error, "Can't parse Array");
Out.clear();
Out.resize(A->size());
for (auto InOutPair : llvm::zip(*A, Out))
if (auto E = fromJson(std::get<0>(InOutPair), std::get<1>(InOutPair)))
return std::move(E);
return Error::success();
}
template <typename T>
static Error fromJson(const json::Value &V, std::vector<T> &Out) {
return vectorFromJsonTemplate(V, Out);
}
template <typename T>
static Error fromJson(const json::Value &V, SmallVectorImpl<T> &Out) {
return vectorFromJsonTemplate(V, Out);
}
// Same as llvm::json::ObjectMapper but adds a finer error reporting mechanism.
class JsonObjectMapper {
const json::Object *O;
Error E;
SmallDenseSet<StringRef> SeenFields;
public:
explicit JsonObjectMapper(const json::Value &V)
: O(V.getAsObject()),
E(O ? Error::success()
: createStringError(errc::io_error, "Expected JSON Object")) {}
Error takeError() {
if (E)
return std::move(E);
for (const auto &Itr : *O) {
const StringRef Key = Itr.getFirst();
if (!SeenFields.count(Key))
E = createStringError(errc::io_error,
Twine("Unknown field: ").concat(Key));
}
return std::move(E);
}
template <typename T> void map(StringRef Key, T &Out) {
if (E)
return;
if (const json::Value *Value = O->get(Key)) {
SeenFields.insert(Key);
E = fromJson(*Value, Out);
}
}
};
static Error fromJson(const json::Value &V,
libc_benchmarks::BenchmarkOptions &Out) {
JsonObjectMapper O(V);
O.map("MinDuration", Out.MinDuration);
O.map("MaxDuration", Out.MaxDuration);
O.map("InitialIterations", Out.InitialIterations);
O.map("MaxIterations", Out.MaxIterations);
O.map("MinSamples", Out.MinSamples);
O.map("MaxSamples", Out.MaxSamples);
O.map("Epsilon", Out.Epsilon);
O.map("ScalingFactor", Out.ScalingFactor);
O.map("Log", Out.Log);
return O.takeError();
}
static Error fromJson(const json::Value &V, libc_benchmarks::SizeRange &Out) {
JsonObjectMapper O(V);
O.map("From", Out.From);
O.map("To", Out.To);
O.map("Step", Out.Step);
return O.takeError();
}
static Error fromJson(const json::Value &V,
libc_benchmarks::StudyConfiguration &Out) {
JsonObjectMapper O(V);
O.map("Runs", Out.Runs);
O.map("BufferSize", Out.BufferSize);
O.map("Size", Out.Size);
O.map("AddressAlignment", Out.AddressAlignment);
O.map("MemsetValue", Out.MemsetValue);
O.map("MemcmpMismatchAt", Out.MemcmpMismatchAt);
return O.takeError();
}
static Error fromJson(const json::Value &V, libc_benchmarks::CacheInfo &Out) {
JsonObjectMapper O(V);
O.map("Type", Out.Type);
O.map("Level", Out.Level);
O.map("Size", Out.Size);
O.map("NumSharing", Out.NumSharing);
return O.takeError();
}
static Error fromJson(const json::Value &V, libc_benchmarks::HostState &Out) {
JsonObjectMapper O(V);
O.map("CpuName", Out.CpuName);
O.map("CpuFrequency", Out.CpuFrequency);
O.map("Caches", Out.Caches);
return O.takeError();
}
static Error fromJson(const json::Value &V,
libc_benchmarks::FunctionMeasurements &Out) {
JsonObjectMapper O(V);
O.map("Name", Out.Name);
std::vector<uint32_t> Sizes;
O.map("Sizes", Sizes);
std::vector<libc_benchmarks::Duration> Runtimes;
O.map("Runtimes", Runtimes);
if (Sizes.size() != Runtimes.size())
return createStringError(errc::io_error,
"Measurement Size and Runtime mismatch");
Out.Measurements.resize(Sizes.size());
for (size_t I = 0; I < Sizes.size(); ++I) {
Out.Measurements[I].Size = Sizes[I];
Out.Measurements[I].Runtime = Runtimes[I];
}
return O.takeError();
}
static Error fromJson(const json::Value &V, libc_benchmarks::Study &Out) {
JsonObjectMapper O(V);
O.map("Host", Out.Host);
O.map("Options", Out.Options);
O.map("Configuration", Out.Configuration);
O.map("Functions", Out.Functions);
return O.takeError();
}
static double Seconds(const Duration &D) {
return std::chrono::duration<double>(D).count();
}
Expected<Study> ParseJsonStudy(StringRef Content) {
Expected<json::Value> EV = json::parse(Content);
if (!EV)
return EV.takeError();
Study S;
if (Error E = fromJson(*EV, S))
return std::move(E);
return S;
}
static StringRef Serialize(const BenchmarkLog &L) {
switch (L) {
case BenchmarkLog::None:
return "None";
case BenchmarkLog::Last:
return "Last";
case BenchmarkLog::Full:
return "Full";
}
llvm_unreachable("Unhandled BenchmarkLog value");
}
static void Serialize(const BenchmarkOptions &BO, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("MinDuration", Seconds(BO.MinDuration));
JOS.attribute("MaxDuration", Seconds(BO.MaxDuration));
JOS.attribute("InitialIterations", BO.InitialIterations);
JOS.attribute("MaxIterations", BO.MaxIterations);
JOS.attribute("MinSamples", BO.MinSamples);
JOS.attribute("MaxSamples", BO.MaxSamples);
JOS.attribute("Epsilon", BO.Epsilon);
JOS.attribute("ScalingFactor", BO.ScalingFactor);
JOS.attribute("Log", Serialize(BO.Log));
});
}
static void Serialize(const CacheInfo &CI, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("Type", CI.Type);
JOS.attribute("Level", CI.Level);
JOS.attribute("Size", CI.Size);
JOS.attribute("NumSharing", CI.NumSharing);
});
}
static void Serialize(const HostState &HS, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("CpuName", HS.CpuName);
JOS.attribute("CpuFrequency", HS.CpuFrequency);
JOS.attributeArray("Caches", [&]() {
for (const auto &CI : HS.Caches)
Serialize(CI, JOS);
});
});
}
static void Serialize(const StudyConfiguration &SC, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("Runs", SC.Runs);
JOS.attribute("BufferSize", SC.BufferSize);
JOS.attributeObject("Size", [&]() {
JOS.attribute("From", SC.Size.From);
JOS.attribute("To", SC.Size.To);
JOS.attribute("Step", SC.Size.Step);
});
if (SC.AddressAlignment)
JOS.attribute("AddressAlignment",
static_cast<int64_t>(SC.AddressAlignment->value()));
JOS.attribute("MemsetValue", SC.MemsetValue);
JOS.attribute("MemcmpMismatchAt", SC.MemcmpMismatchAt);
});
}
static void Serialize(const FunctionMeasurements &FM, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("Name", FM.Name);
JOS.attributeArray("Sizes", [&]() {
for (const auto &M : FM.Measurements)
JOS.value(M.Size);
});
JOS.attributeArray("Runtimes", [&]() {
for (const auto &M : FM.Measurements)
JOS.value(Seconds(M.Runtime));
});
});
}
void SerializeToJson(const Study &S, json::OStream &JOS) {
JOS.object([&]() {
JOS.attributeBegin("Host");
Serialize(S.Host, JOS);
JOS.attributeEnd();
JOS.attributeBegin("Options");
Serialize(S.Options, JOS);
JOS.attributeEnd();
JOS.attributeBegin("Configuration");
Serialize(S.Configuration, JOS);
JOS.attributeEnd();
if (!S.Functions.empty()) {
JOS.attributeArray("Functions", [&]() {
for (const auto &FM : S.Functions)
Serialize(FM, JOS);
});
}
});
}
} // namespace libc_benchmarks
} // namespace llvm
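For reference, a sketch of the kind of document ParseJsonStudy accepts. The field names come from the JsonObjectMapper mappings above; the values and the memcpy entry are made up for illustration, and any field left out simply keeps its default since map() only assigns keys that are present.

// Hypothetical input accepted by ParseJsonStudy; values are illustrative only.
static const char *ExampleStudy = R"json({
  "Options": { "MinDuration": 0.001, "MaxDuration": 1.0, "Log": "None" },
  "Configuration": {
    "Runs": 1,
    "BufferSize": 8192,
    "Size": { "From": 0, "To": 1024, "Step": 1 },
    "AddressAlignment": 16
  },
  "Functions": [
    { "Name": "memcpy", "Sizes": [0, 1, 2], "Runtimes": [1.0e-9, 1.1e-9, 1.2e-9] }
  ]
})json";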

@@ -0,0 +1,28 @@
//===-------- JSON serialization routines -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_UTILS_BENCHMARK_JSON_H
#define LLVM_LIBC_UTILS_BENCHMARK_JSON_H
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/JSON.h"
namespace llvm {
namespace libc_benchmarks {
// Parses a Study from a json string.
Expected<Study> ParseJsonStudy(StringRef Content);
// Serialize a Study as json.
void SerializeToJson(const Study &S, llvm::json::OStream &JOS);
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_JSON_H
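A minimal usage sketch of the two entry points declared above, assuming the caller links against the json helper library added in the benchmarks CMakeLists; it mirrors what JSONTest.cpp exercises with gtest matchers.

#include "JSON.h"
#include "llvm/Support/raw_ostream.h"

// Serializes `S` to a string, then parses it back; any failure surfaces
// through the returned Expected.
llvm::Expected<llvm::libc_benchmarks::Study>
roundTrip(const llvm::libc_benchmarks::Study &S) {
  std::string Buffer;
  llvm::raw_string_ostream RSO(Buffer);
  llvm::json::OStream JOS(RSO);
  llvm::libc_benchmarks::SerializeToJson(S, JOS);
  RSO.flush(); // Make sure the buffered stream reaches `Buffer`.
  return llvm::libc_benchmarks::ParseJsonStudy(Buffer);
}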

@@ -0,0 +1,190 @@
#include "JSON.h"
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
using testing::AllOf;
using testing::ExplainMatchResult;
using testing::Field;
using testing::Pointwise;
namespace llvm {
namespace libc_benchmarks {
namespace {
Study getStudy() {
return Study{
HostState{
"CpuName", 123, {CacheInfo{"A", 1, 2, 3}, CacheInfo{"B", 4, 5, 6}}},
BenchmarkOptions{std::chrono::seconds(1), std::chrono::seconds(2), 10,
100, 6, 100, 0.1, 2, BenchmarkLog::Full},
StudyConfiguration{2, 3, SizeRange{4, 5, 6}, Align(8), 9, 10},
{FunctionMeasurements{"A",
{Measurement{3, std::chrono::seconds(3)},
Measurement{3, std::chrono::seconds(4)}}},
FunctionMeasurements{"B", {}}}};
}
static std::string SerializeToString(const Study &S) {
std::string Buffer;
raw_string_ostream RSO(Buffer);
json::OStream JOS(RSO);
SerializeToJson(S, JOS);
return Buffer;
}
MATCHER(EqualsCacheInfo, "") {
const CacheInfo &A = ::testing::get<0>(arg);
const CacheInfo &B = ::testing::get<1>(arg);
return ExplainMatchResult(AllOf(Field(&CacheInfo::Type, B.Type),
Field(&CacheInfo::Level, B.Level),
Field(&CacheInfo::Size, B.Size),
Field(&CacheInfo::NumSharing, B.NumSharing)),
A, result_listener);
}
auto Equals(const HostState &H) -> auto {
return AllOf(
Field(&HostState::CpuName, H.CpuName),
Field(&HostState::CpuFrequency, H.CpuFrequency),
Field(&HostState::Caches, Pointwise(EqualsCacheInfo(), H.Caches)));
}
auto Equals(const BenchmarkOptions &BO) -> auto {
return AllOf(
Field(&BenchmarkOptions::MinDuration, BO.MinDuration),
Field(&BenchmarkOptions::MaxDuration, BO.MaxDuration),
Field(&BenchmarkOptions::InitialIterations, BO.InitialIterations),
Field(&BenchmarkOptions::MaxIterations, BO.MaxIterations),
Field(&BenchmarkOptions::MinSamples, BO.MinSamples),
Field(&BenchmarkOptions::MaxSamples, BO.MaxSamples),
Field(&BenchmarkOptions::Epsilon, BO.Epsilon),
Field(&BenchmarkOptions::ScalingFactor, BO.ScalingFactor),
Field(&BenchmarkOptions::Log, BO.Log));
}
auto Equals(const SizeRange &SR) -> auto {
return AllOf(Field(&SizeRange::From, SR.From), Field(&SizeRange::To, SR.To),
Field(&SizeRange::Step, SR.Step));
}
auto Equals(const StudyConfiguration &SC) -> auto {
return AllOf(
Field(&StudyConfiguration::Runs, SC.Runs),
Field(&StudyConfiguration::BufferSize, SC.BufferSize),
Field(&StudyConfiguration::Size, Equals(SC.Size)),
Field(&StudyConfiguration::AddressAlignment, SC.AddressAlignment),
Field(&StudyConfiguration::MemsetValue, SC.MemsetValue),
Field(&StudyConfiguration::MemcmpMismatchAt, SC.MemcmpMismatchAt));
}
MATCHER(EqualsMeasurement, "") {
const Measurement &A = ::testing::get<0>(arg);
const Measurement &B = ::testing::get<1>(arg);
return ExplainMatchResult(AllOf(Field(&Measurement::Size, B.Size),
Field(&Measurement::Runtime, B.Runtime)),
A, result_listener);
}
MATCHER(EqualsFunctions, "") {
const FunctionMeasurements &A = ::testing::get<0>(arg);
const FunctionMeasurements &B = ::testing::get<1>(arg);
return ExplainMatchResult(
AllOf(Field(&FunctionMeasurements::Name, B.Name),
Field(&FunctionMeasurements::Measurements,
Pointwise(EqualsMeasurement(), B.Measurements))),
A, result_listener);
}
auto Equals(const Study &S) -> auto {
return AllOf(
Field(&Study::Host, Equals(S.Host)),
Field(&Study::Options, Equals(S.Options)),
Field(&Study::Configuration, Equals(S.Configuration)),
Field(&Study::Functions, Pointwise(EqualsFunctions(), S.Functions)));
}
TEST(JsonTest, RoundTrip) {
const Study S = getStudy();
auto StudyOrError = ParseJsonStudy(SerializeToString(S));
if (auto Err = StudyOrError.takeError())
EXPECT_FALSE(Err) << "Unexpected error";
const Study &Parsed = *StudyOrError;
EXPECT_THAT(Parsed, Equals(S));
}
TEST(JsonTest, SupplementaryField) {
auto Failure = ParseJsonStudy(R"({
"UnknownField": 10
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Unknown field: UnknownField");
}
TEST(JsonTest, InvalidType) {
auto Failure = ParseJsonStudy(R"({
"Options": 1
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Expected JSON Object");
}
TEST(JsonTest, InvalidDuration) {
auto Failure = ParseJsonStudy(R"({
"Options": {
"MinDuration": "Duration should be a Number"
}
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Can't parse Duration");
}
TEST(JsonTest, InvalidAlignType) {
auto Failure = ParseJsonStudy(R"({
"Configuration":{
"AddressAlignment": "Align should be an Integer"
}
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Can't parse Align, not an Integer");
}
TEST(JsonTest, InvalidAlign) {
auto Failure = ParseJsonStudy(R"({
"Configuration":{
"AddressAlignment":3
}
}
)");
EXPECT_EQ(toString(Failure.takeError()),
"Can't parse Align, not a power of two");
}
TEST(JsonTest, InvalidBenchmarkLogType) {
auto Failure = ParseJsonStudy(R"({
"Options":{
"Log": 3
}
}
)");
EXPECT_EQ(toString(Failure.takeError()),
"Can't parse BenchmarkLog, not a String");
}
TEST(JsonTest, InvalidBenchmarkLog) {
auto Failure = ParseJsonStudy(R"({
"Options":{
"Log": "Unknown"
}
}
)");
EXPECT_EQ(toString(Failure.takeError()),
"Can't parse BenchmarkLog, invalid value 'Unknown'");
}
} // namespace
} // namespace libc_benchmarks
} // namespace llvm

@@ -0,0 +1,40 @@
//===-------- `Benchmark` function ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "llvm/Support/Host.h"
namespace llvm {
namespace libc_benchmarks {
void checkRequirements() {
const auto &CpuInfo = benchmark::CPUInfo::Get();
if (CpuInfo.scaling_enabled)
report_fatal_error(
"CPU scaling is enabled, the benchmark real time measurements may be "
"noisy and will incur extra overhead.");
}
HostState HostState::get() {
const auto &CpuInfo = benchmark::CPUInfo::Get();
HostState H;
H.CpuFrequency = CpuInfo.cycles_per_second;
H.CpuName = llvm::sys::getHostCPUName().str();
for (const auto &BenchmarkCacheInfo : CpuInfo.caches) {
CacheInfo CI;
CI.Type = BenchmarkCacheInfo.type;
CI.Level = BenchmarkCacheInfo.level;
CI.Size = BenchmarkCacheInfo.size;
CI.NumSharing = BenchmarkCacheInfo.num_sharing;
H.Caches.push_back(std::move(CI));
}
return H;
}
} // namespace libc_benchmarks
} // namespace llvm

@@ -0,0 +1,324 @@
//===-------- `Benchmark` function ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file mainly defines a `Benchmark` function.
//
// The benchmarking process is as follows:
// - We start by measuring the time it takes to run the function
// `InitialIterations` times. This is called a Sample. From this we can derive
// the time it took to run a single iteration.
//
// - We repeat the previous step with a greater number of iterations to lower
// the impact of the measurement. We can derive a more precise estimation of the
// runtime for a single iteration.
//
// - Each sample gives a more accurate estimation of the runtime for a single
// iteration but also takes more time to run. We stop the process when:
// * The measurement stabilizes within a certain precision (Epsilon),
// * The overall benchmarking time is greater than MaxDuration,
// * The overall sample count is greater than MaxSamples,
// * The last sample used more than MaxIterations iterations.
//
// - We also make sure that the benchmark doesn't run for too short a period of
// time by defining MinDuration and MinSamples.
#ifndef LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
#define LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
#include "benchmark/benchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include <array>
#include <chrono>
#include <cstdint>
namespace llvm {
namespace libc_benchmarks {
// Makes sure the binary was compiled in release mode and that the frequency
// governor is set to performance.
void checkRequirements();
using Duration = std::chrono::duration<double>;
enum class BenchmarkLog {
None, // Don't keep the internal state of the benchmark.
Last, // Keep only the last batch.
Full // Keep all iterations states, useful for testing or debugging.
};
// An object to configure the benchmark stopping conditions.
// See documentation at the beginning of the file for the overall algorithm and
// meaning of each field.
struct BenchmarkOptions {
// The minimum time for which the benchmark is running.
Duration MinDuration = std::chrono::seconds(0);
// The maximum time for which the benchmark is running.
Duration MaxDuration = std::chrono::seconds(10);
// The number of iterations in the first sample.
uint32_t InitialIterations = 1;
// The maximum number of iterations for any given sample.
uint32_t MaxIterations = 10000000;
// The minimum number of samples.
uint32_t MinSamples = 4;
// The maximum number of samples.
uint32_t MaxSamples = 1000;
// The benchmark will stop if the relative difference between the current and
// the last estimation is less than Epsilon. This is 1% by default.
double Epsilon = 0.01;
// The number of iterations grows exponentially between each sample.
// Must be greater than or equal to 1.
double ScalingFactor = 1.4;
BenchmarkLog Log = BenchmarkLog::None;
};
// The state of a benchmark.
enum class BenchmarkStatus {
Running,
MaxDurationReached,
MaxIterationsReached,
MaxSamplesReached,
PrecisionReached,
};
// The internal state of the benchmark, useful to debug, test or report
// statistics.
struct BenchmarkState {
size_t LastSampleIterations;
Duration LastBatchElapsed;
BenchmarkStatus CurrentStatus;
Duration CurrentBestGuess; // The time estimation for a single run of `foo`.
double ChangeRatio; // The change in time estimation between previous and
// current samples.
};
// A lightweight result for a benchmark.
struct BenchmarkResult {
BenchmarkStatus TerminationStatus = BenchmarkStatus::Running;
Duration BestGuess = {};
llvm::Optional<llvm::SmallVector<BenchmarkState, 16>> MaybeBenchmarkLog;
};
// Stores information about a cache in the host memory system.
struct CacheInfo {
std::string Type; // e.g. "Instruction", "Data", "Unified".
int Level; // 0 is closest to processing unit.
int Size; // In bytes.
int NumSharing; // The number of processing units (Hyper-Threading Thread)
// with which this cache is shared.
};
// Stores information about the host.
struct HostState {
std::string CpuName; // A string compatible with the -march option.
double CpuFrequency; // in Hertz.
std::vector<CacheInfo> Caches;
static HostState get();
};
namespace internal {
struct Measurement {
size_t Iterations = 0;
Duration Elapsed = {};
};
// Updates the estimation of the elapsed time for a single iteration.
class RefinableRuntimeEstimation {
Duration TotalTime = {};
size_t TotalIterations = 0;
public:
Duration update(const Measurement &M) {
assert(M.Iterations > 0);
// Duration is encoded as a double (see definition).
// `TotalTime` and `M.Elapsed` are of the same magnitude so we don't expect
// loss of precision due to radically different scales.
TotalTime += M.Elapsed;
TotalIterations += M.Iterations;
return TotalTime / TotalIterations;
}
};
// This class tracks the progression of the runtime estimation.
class RuntimeEstimationProgression {
RefinableRuntimeEstimation RRE;
public:
Duration CurrentEstimation = {};
// Returns the change ratio between our best guess so far and the one from the
// new measurement.
double computeImprovement(const Measurement &M) {
const Duration NewEstimation = RRE.update(M);
const double Ratio = fabs(((CurrentEstimation / NewEstimation) - 1.0));
CurrentEstimation = NewEstimation;
return Ratio;
}
};
} // namespace internal
// Measures the runtime of `foo` until conditions defined by `Options` are met.
//
// To avoid measurement imprecision, we measure batches of `foo`.
// The batch size grows by `ScalingFactor` to minimize the measurement
// overhead.
//
// Note: The benchmark is not responsible for serializing the executions of
// `foo`. It is not suitable for measuring very small, side-effect-free
// functions, as the processor is free to run several executions in
// parallel.
//
// - Options: A set of parameters controlling the stopping conditions for the
// benchmark.
// - foo: The function under test. It takes one value and returns one value.
// The input value is used to randomize the execution of `foo` as part of a
// batch to mitigate the effect of the branch predictor. Signature:
// `ProductType foo(ParameterProvider::value_type value);`
// The output value is a product of the execution of `foo` and prevents the
// compiler from optimizing out foo's body.
// - ParameterProvider: An object responsible for providing a range of
// `Iterations` values to use as input for `foo`. The `value_type` of the
// returned container has to be compatible with `foo` argument.
// Must implement one of:
// `Container<ParameterType> generateBatch(size_t Iterations);`
// `const Container<ParameterType>& generateBatch(size_t Iterations);`
// - Clock: An object providing the current time. Must implement:
// `std::chrono::time_point now();`
template <typename Function, typename ParameterProvider,
typename BenchmarkClock = const std::chrono::high_resolution_clock>
BenchmarkResult benchmark(const BenchmarkOptions &Options,
ParameterProvider &PP, Function foo,
BenchmarkClock &Clock = BenchmarkClock()) {
BenchmarkResult Result;
internal::RuntimeEstimationProgression REP;
Duration TotalBenchmarkDuration = {};
size_t Iterations = std::max(Options.InitialIterations, uint32_t(1));
size_t Samples = 0;
if (Options.ScalingFactor < 1.0)
report_fatal_error("ScalingFactor should be >= 1");
if (Options.Log != BenchmarkLog::None)
Result.MaybeBenchmarkLog.emplace();
for (;;) {
// Request a new Batch of size `Iterations`.
const auto &Batch = PP.generateBatch(Iterations);
// Measuring this Batch.
const auto StartTime = Clock.now();
for (const auto Parameter : Batch) {
const auto Production = foo(Parameter);
benchmark::DoNotOptimize(Production);
}
const auto EndTime = Clock.now();
const Duration Elapsed = EndTime - StartTime;
// Updating statistics.
++Samples;
TotalBenchmarkDuration += Elapsed;
const double ChangeRatio = REP.computeImprovement({Iterations, Elapsed});
Result.BestGuess = REP.CurrentEstimation;
// Stopping condition.
if (TotalBenchmarkDuration >= Options.MinDuration &&
Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon)
Result.TerminationStatus = BenchmarkStatus::PrecisionReached;
else if (Samples >= Options.MaxSamples)
Result.TerminationStatus = BenchmarkStatus::MaxSamplesReached;
else if (TotalBenchmarkDuration >= Options.MaxDuration)
Result.TerminationStatus = BenchmarkStatus::MaxDurationReached;
else if (Iterations >= Options.MaxIterations)
Result.TerminationStatus = BenchmarkStatus::MaxIterationsReached;
if (Result.MaybeBenchmarkLog) {
auto &BenchmarkLog = *Result.MaybeBenchmarkLog;
if (Options.Log == BenchmarkLog::Last && !BenchmarkLog.empty())
BenchmarkLog.pop_back();
BenchmarkState BS;
BS.LastSampleIterations = Iterations;
BS.LastBatchElapsed = Elapsed;
BS.CurrentStatus = Result.TerminationStatus;
BS.CurrentBestGuess = Result.BestGuess;
BS.ChangeRatio = ChangeRatio;
BenchmarkLog.push_back(BS);
}
if (Result.TerminationStatus != BenchmarkStatus::Running)
return Result;
if (Options.ScalingFactor > 1 &&
Iterations * Options.ScalingFactor == Iterations)
report_fatal_error(
"`Iterations *= ScalingFactor` is idempotent, increase ScalingFactor "
"or InitialIterations.");
Iterations *= Options.ScalingFactor;
}
}
// Interprets `Array` as a circular buffer of `Size` elements.
template <typename T> class CircularArrayRef {
llvm::ArrayRef<T> Array;
size_t Size;
public:
using value_type = T;
using reference = T &;
using const_reference = const T &;
using difference_type = ssize_t;
using size_type = size_t;
class const_iterator
: public std::iterator<std::input_iterator_tag, T, ssize_t> {
llvm::ArrayRef<T> Array;
size_t Index;
public:
explicit const_iterator(llvm::ArrayRef<T> Array, size_t Index = 0)
: Array(Array), Index(Index) {}
const_iterator &operator++() {
++Index;
return *this;
}
bool operator==(const_iterator Other) const { return Index == Other.Index; }
bool operator!=(const_iterator Other) const { return !(*this == Other); }
const T &operator*() const { return Array[Index % Array.size()]; }
};
CircularArrayRef(llvm::ArrayRef<T> Array, size_t Size)
: Array(Array), Size(Size) {
assert(Array.size() > 0);
}
const_iterator begin() const { return const_iterator(Array); }
const_iterator end() const { return const_iterator(Array, Size); }
};
// A convenient helper to produce a CircularArrayRef from an ArrayRef.
template <typename T>
CircularArrayRef<T> cycle(llvm::ArrayRef<T> Array, size_t Size) {
return {Array, Size};
}
// Creates a std::array whose storage size is constrained to at most `Bytes`.
template <typename T, size_t Bytes>
using ByteConstrainedArray = std::array<T, Bytes / sizeof(T)>;
// A convenient helper to produce a CircularArrayRef from a
// ByteConstrainedArray.
template <typename T, size_t N>
CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) {
return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size};
}
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
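To make the sampling loop described at the top of this header concrete, here is a minimal sketch of driving `benchmark` with the default high-resolution clock. `IncrementProvider` and `measureIncrement` are invented for the illustration and are not part of the patch; as the Note above says, such a tiny side-effect-free function mostly exercises the plumbing rather than producing a meaningful measurement.

#include "LibcBenchmark.h"
#include <vector>

namespace {
// Hypothetical parameter provider: a batch of `Iterations` zero-valued ints.
struct IncrementProvider {
  std::vector<int> generateBatch(size_t Iterations) {
    return std::vector<int>(Iterations);
  }
};
} // namespace

llvm::libc_benchmarks::Duration measureIncrement() {
  using namespace llvm::libc_benchmarks;
  BenchmarkOptions Opts;
  Opts.MaxDuration = std::chrono::seconds(1); // Cap the whole run at 1 second.
  IncrementProvider PP;
  // The lambda is the function under test; its return value is fed to
  // benchmark::DoNotOptimize so the body is not optimized away.
  const BenchmarkResult Result =
      benchmark(Opts, PP, [](int V) { return V + 1; });
  return Result.BestGuess; // Estimated time for a single call.
}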

@@ -0,0 +1,168 @@
#include "LibcBenchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include <chrono>
#include <limits>
#include <queue>
#include <vector>
using std::chrono::nanoseconds;
using ::testing::ElementsAre;
using ::testing::Field;
using ::testing::IsEmpty;
using ::testing::SizeIs;
namespace llvm {
namespace libc_benchmarks {
namespace {
// A simple parameter provider returning a zero initialized vector of size
// `iterations`.
struct DummyParameterProvider {
std::vector<char> generateBatch(size_t iterations) {
return std::vector<char>(iterations);
}
};
class LibcBenchmark : public ::testing::Test {
public:
// A Clock interface suitable for testing.
// - Either it returns 0,
// - Or a timepoint coming from the `setMeasurements` call.
Duration now() {
if (!MaybeTimepoints)
return {};
assert(!MaybeTimepoints->empty());
const Duration timepoint = MaybeTimepoints->front();
MaybeTimepoints->pop();
return timepoint;
}
protected:
void SetUp() override { Options.Log = BenchmarkLog::Full; }
void TearDown() override {
// We make sure all the expected measurements were performed.
if (MaybeTimepoints)
EXPECT_THAT(*MaybeTimepoints, IsEmpty());
}
BenchmarkResult run() {
return benchmark(Options, ParameterProvider, DummyFunction, *this);
}
void setMeasurements(llvm::ArrayRef<Duration> Durations) {
MaybeTimepoints.emplace(); // Create the optional value.
Duration CurrentTime = nanoseconds(1);
for (const auto &Duration : Durations) {
MaybeTimepoints->push(CurrentTime);
CurrentTime += Duration;
MaybeTimepoints->push(CurrentTime);
CurrentTime += nanoseconds(1);
}
}
BenchmarkOptions Options;
private:
DummyParameterProvider ParameterProvider;
static char DummyFunction(char Payload) { return Payload; }
llvm::Optional<std::queue<Duration>> MaybeTimepoints;
};
TEST_F(LibcBenchmark, MaxSamplesReached) {
Options.MaxSamples = 1;
const auto Result = run();
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 1);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxSamplesReached);
}
TEST_F(LibcBenchmark, MaxDurationReached) {
Options.MaxDuration = nanoseconds(10);
setMeasurements({nanoseconds(11)});
const auto Result = run();
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 1);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxDurationReached);
}
TEST_F(LibcBenchmark, MaxIterationsReached) {
Options.InitialIterations = 1;
Options.MaxIterations = 20;
Options.ScalingFactor = 2;
Options.Epsilon = 0; // unreachable.
const auto Result = run();
EXPECT_THAT(*Result.MaybeBenchmarkLog,
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
Field(&BenchmarkState::LastSampleIterations, 2),
Field(&BenchmarkState::LastSampleIterations, 4),
Field(&BenchmarkState::LastSampleIterations, 8),
Field(&BenchmarkState::LastSampleIterations, 16),
Field(&BenchmarkState::LastSampleIterations, 32)));
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 6);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::MaxIterationsReached);
}
TEST_F(LibcBenchmark, MinSamples) {
Options.MinSamples = 4;
Options.ScalingFactor = 2;
Options.Epsilon = std::numeric_limits<double>::max(); // always reachable.
setMeasurements(
{nanoseconds(1), nanoseconds(2), nanoseconds(4), nanoseconds(8)});
const auto Result = run();
EXPECT_THAT(*Result.MaybeBenchmarkLog,
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
Field(&BenchmarkState::LastSampleIterations, 2),
Field(&BenchmarkState::LastSampleIterations, 4),
Field(&BenchmarkState::LastSampleIterations, 8)));
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 4);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::PrecisionReached);
}
TEST_F(LibcBenchmark, Epsilon) {
Options.MinSamples = 4;
Options.ScalingFactor = 2;
Options.Epsilon = std::numeric_limits<double>::max(); // always reachable.
setMeasurements(
{nanoseconds(1), nanoseconds(2), nanoseconds(4), nanoseconds(8)});
const auto Result = run();
EXPECT_THAT(*Result.MaybeBenchmarkLog,
ElementsAre(Field(&BenchmarkState::LastSampleIterations, 1),
Field(&BenchmarkState::LastSampleIterations, 2),
Field(&BenchmarkState::LastSampleIterations, 4),
Field(&BenchmarkState::LastSampleIterations, 8)));
EXPECT_THAT(Result.MaybeBenchmarkLog->size(), 4);
EXPECT_THAT(Result.TerminationStatus, BenchmarkStatus::PrecisionReached);
}
TEST(ArrayRefLoop, Cycle) {
std::array<int, 2> array = {1, 2};
EXPECT_THAT(cycle(array, 0), ElementsAre());
EXPECT_THAT(cycle(array, 1), ElementsAre(1));
EXPECT_THAT(cycle(array, 2), ElementsAre(1, 2));
EXPECT_THAT(cycle(array, 3), ElementsAre(1, 2, 1));
EXPECT_THAT(cycle(array, 4), ElementsAre(1, 2, 1, 2));
EXPECT_THAT(cycle(array, 5), ElementsAre(1, 2, 1, 2, 1));
}
TEST(ByteConstrainedArray, Simple) {
EXPECT_THAT((ByteConstrainedArray<char, 17>()), SizeIs(17));
EXPECT_THAT((ByteConstrainedArray<uint16_t, 17>()), SizeIs(8));
EXPECT_THAT((ByteConstrainedArray<uint32_t, 17>()), SizeIs(4));
EXPECT_THAT((ByteConstrainedArray<uint64_t, 17>()), SizeIs(2));
EXPECT_LE(sizeof(ByteConstrainedArray<char, 17>), 17U);
EXPECT_LE(sizeof(ByteConstrainedArray<uint16_t, 17>), 17U);
EXPECT_LE(sizeof(ByteConstrainedArray<uint32_t, 17>), 17U);
EXPECT_LE(sizeof(ByteConstrainedArray<uint64_t, 17>), 17U);
}
TEST(ByteConstrainedArray, Cycle) {
ByteConstrainedArray<uint64_t, 17> TwoValues{{1UL, 2UL}};
EXPECT_THAT(cycle(TwoValues, 5), ElementsAre(1, 2, 1, 2, 1));
}
} // namespace
} // namespace libc_benchmarks
} // namespace llvm

@@ -0,0 +1,62 @@
//===-------- Benchmark memory specific tools -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcMemoryBenchmark.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
namespace llvm {
namespace libc_benchmarks {
// Returns a distribution that samples the buffer to satisfy the required
// alignment.
// When alignment is set, the distribution is scaled down by `Factor` and scaled
// up again by the same amount during sampling.
static std::uniform_int_distribution<uint32_t>
GetOffsetDistribution(const StudyConfiguration &Conf) {
if (Conf.AddressAlignment &&
*Conf.AddressAlignment > AlignedBuffer::Alignment)
report_fatal_error(
"AddressAlignment must be less or equal to AlignedBuffer::Alignment");
if (!Conf.AddressAlignment)
return std::uniform_int_distribution<uint32_t>(0, 0); // Always 0.
// If we test up to Size bytes, the returned offset must stay under
// BufferSize - Size.
int64_t MaxOffset = Conf.BufferSize;
MaxOffset -= Conf.Size.To;
MaxOffset -= 1;
if (MaxOffset < 0)
report_fatal_error(
"BufferSize too small to exercise specified Size configuration");
MaxOffset /= Conf.AddressAlignment->value();
return std::uniform_int_distribution<uint32_t>(0, MaxOffset);
}
OffsetDistribution::OffsetDistribution(const StudyConfiguration &Conf)
: Distribution(GetOffsetDistribution(Conf)),
Factor(Conf.AddressAlignment.valueOrOne().value()) {}
// Precomputes the offsets at which to insert mismatches between the buffers.
MismatchOffsetDistribution::MismatchOffsetDistribution(
const StudyConfiguration &Conf)
: MismatchAt(Conf.MemcmpMismatchAt) {
if (MismatchAt <= 1)
return;
const auto ToSize = Conf.Size.To;
for (size_t I = ToSize + 1; I < Conf.BufferSize; I += ToSize)
MismatchIndices.push_back(I);
if (MismatchIndices.empty())
llvm::report_fatal_error("Unable to generate mismatch");
MismatchIndexSelector =
std::uniform_int_distribution<size_t>(0, MismatchIndices.size() - 1);
}
} // namespace libc_benchmarks
} // namespace llvm
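A worked example of the arithmetic above, with illustrative values only: for BufferSize = 8192, Size.To = 1024 and AddressAlignment = Align(16), GetOffsetDistribution draws uniformly from [0, (8192 - 1024 - 1) / 16] = [0, 447], and OffsetDistribution::operator() scales the draw back up by 16, so every returned offset is 16-byte aligned, is at most 7152, and leaves room for a 1024-byte access inside the buffer.

#include "LibcMemoryBenchmark.h"
#include <random>

// Sketch only: sample one aligned offset under the configuration above.
uint32_t sampleAlignedOffset() {
  using namespace llvm;
  using namespace llvm::libc_benchmarks;
  StudyConfiguration Conf;
  Conf.BufferSize = 8192;
  Conf.Size.To = 1024;
  Conf.AddressAlignment = Align(16);
  OffsetDistribution OD(Conf);
  std::default_random_engine Gen;
  const uint32_t Offset = OD(Gen); // Multiple of 16, at most 7152.
  return Offset;                   // Offset + Size.To stays within BufferSize.
}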

@@ -0,0 +1,183 @@
//===-------- Benchmark memory specific tools -------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file complements the `benchmark` header with memory specific tools and
// benchmarking facilities.
#ifndef LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H
#include "LibcBenchmark.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Alignment.h"
#include <cstdint>
#include <random>
namespace llvm {
namespace libc_benchmarks {
//--------------
// Configuration
//--------------
// Specifies a range of sizes to explore.
struct SizeRange {
uint32_t From = 0; // Inclusive
uint32_t To = 1024; // Inclusive
uint32_t Step = 1;
};
// An object to define how to test a memory function.
struct StudyConfiguration {
// The number of runs for the study.
uint32_t Runs = 1;
// The size of the buffers (1 buffer for memset but 2 for memcpy or memcmp).
// When testing small sizes, it's important to keep the total allocated
// size under the size of the L1 cache (usually 16 or 32KiB). The framework
// will also use 2KiB of additional L1 memory to store the function
// parameters.
uint32_t BufferSize = 8192;
// The range of sizes to exercise.
SizeRange Size;
MaybeAlign AddressAlignment; // Unset : Use start of buffer (which is at
// least cache-line aligned),
// 1 : Use random address,
// >1 : Use random address aligned to value.
// The value to use for memset.
uint8_t MemsetValue = 0;
// The mismatch position for memcmp.
uint32_t MemcmpMismatchAt = 0; // 0 : Buffer compare equal,
// >0 : Buffer compare different at byte N-1.
};
//--------
// Results
//--------
// The time to run one iteration of the function under test for the specified
// Size.
struct Measurement {
uint32_t Size = 0;
Duration Runtime = {};
};
// The measurements for a specific function.
struct FunctionMeasurements {
std::string Name;
std::vector<Measurement> Measurements;
};
// The root object containing all the data (configuration and measurements).
struct Study {
HostState Host;
BenchmarkOptions Options;
StudyConfiguration Configuration;
SmallVector<FunctionMeasurements, 4> Functions;
};
// Provides an aligned, dynamically allocated buffer.
class AlignedBuffer {
char *const Buffer = nullptr;
size_t Size = 0;
public:
static constexpr size_t Alignment = 1024;
explicit AlignedBuffer(size_t Size)
: Buffer(static_cast<char *>(aligned_alloc(1024, Size))), Size(Size) {}
~AlignedBuffer() { free(Buffer); }
inline char *operator+(size_t Index) { return Buffer + Index; }
inline const char *operator+(size_t Index) const { return Buffer + Index; }
inline char &operator[](size_t Index) { return Buffer[Index]; }
inline const char &operator[](size_t Index) const { return Buffer[Index]; }
inline char *begin() { return Buffer; }
inline char *end() { return Buffer + Size; }
};
// Implements the ParameterProvider abstraction needed by the `benchmark`
// function. This implementation makes sure that all parameters will fit into
// `StorageSize` bytes. The total memory accessed during the benchmark (the
// ParameterProvider storage plus the memory buffers) should be less than the
// size of the L1 data cache.
template <typename Context, size_t StorageSize = 8 * 1024>
class SmallParameterProvider {
using ParameterType = typename Context::ParameterType;
ByteConstrainedArray<ParameterType, StorageSize> Parameters;
size_t LastIterations;
Context &Ctx;
public:
explicit SmallParameterProvider(Context &C) : Ctx(C) {}
SmallParameterProvider(const SmallParameterProvider &) = delete;
SmallParameterProvider &operator=(const SmallParameterProvider &) = delete;
// Useful to compute the histogram of the size parameter.
CircularArrayRef<ParameterType> getLastBatch() const {
return cycle(Parameters, LastIterations);
}
// Implements the interface needed by the `benchmark` function.
CircularArrayRef<ParameterType> generateBatch(size_t Iterations) {
LastIterations = Iterations;
Ctx.Randomize(Parameters);
return getLastBatch();
}
};
// Helper to generate random buffer offsets that satisfy the configuration
// constraints.
class OffsetDistribution {
std::uniform_int_distribution<uint32_t> Distribution;
uint32_t Factor;
public:
explicit OffsetDistribution(const StudyConfiguration &Conf);
template <class Generator> uint32_t operator()(Generator &G) {
return Distribution(G) * Factor;
}
};
// Helper to generate random buffer offsets that satisfy the configuration
// constraints. It is specifically designed to benchmark `memcmp` functions
// where we may want the Nth byte to differ.
class MismatchOffsetDistribution {
std::uniform_int_distribution<size_t> MismatchIndexSelector;
llvm::SmallVector<uint32_t, 16> MismatchIndices;
const uint32_t MismatchAt;
public:
explicit MismatchOffsetDistribution(const StudyConfiguration &Conf);
explicit operator bool() const { return !MismatchIndices.empty(); }
const llvm::SmallVectorImpl<uint32_t> &getMismatchIndices() const {
return MismatchIndices;
}
template <class Generator> uint32_t operator()(Generator &G, uint32_t Size) {
const uint32_t MismatchIndex = MismatchIndices[MismatchIndexSelector(G)];
// We need to position the offset so that a mismatch occurs at MismatchAt.
if (Size >= MismatchAt)
return MismatchIndex - MismatchAt;
// Size is too small to trigger the mismatch.
return MismatchIndex - Size - 1;
}
};
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H

@@ -0,0 +1,100 @@
//===-------- Benchmark --------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcMemoryBenchmarkMain.h"
#include "JSON.h"
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace libc_benchmarks {
static cl::opt<std::string>
Configuration("conf", cl::desc("Specify configuration filename"),
cl::value_desc("filename"), cl::init(""));
static cl::opt<std::string> Output("o", cl::desc("Specify output filename"),
cl::value_desc("filename"), cl::init("-"));
extern std::unique_ptr<BenchmarkRunner>
getRunner(const StudyConfiguration &Conf);
void Main() {
#ifndef NDEBUG
static_assert(
false,
"For reproducibility benchmarks should not be compiled in DEBUG mode.");
#endif
checkRequirements();
ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
MemoryBuffer::getFileOrSTDIN(Configuration);
if (!MB)
report_fatal_error(
Twine("Could not open configuration file: ").concat(Configuration));
auto ErrorOrStudy = ParseJsonStudy((*MB)->getBuffer());
if (!ErrorOrStudy)
report_fatal_error(ErrorOrStudy.takeError());
const auto StudyPrototype = *ErrorOrStudy;
Study S;
S.Host = HostState::get();
S.Options = StudyPrototype.Options;
S.Configuration = StudyPrototype.Configuration;
const auto Runs = S.Configuration.Runs;
const auto &SR = S.Configuration.Size;
std::unique_ptr<BenchmarkRunner> Runner = getRunner(S.Configuration);
const size_t TotalSteps =
Runner->getFunctionNames().size() * Runs * ((SR.To - SR.From) / SR.Step);
size_t Steps = 0;
for (auto FunctionName : Runner->getFunctionNames()) {
FunctionMeasurements FM;
FM.Name = FunctionName;
for (size_t Run = 0; Run < Runs; ++Run) {
for (uint32_t Size = SR.From; Size <= SR.To; Size += SR.Step) {
const auto Result = Runner->benchmark(S.Options, FunctionName, Size);
Measurement Measurement;
Measurement.Runtime = Result.BestGuess;
Measurement.Size = Size;
FM.Measurements.push_back(Measurement);
outs() << format("%3d%% run: %2d / %2d size: %5d ",
(Steps * 100 / TotalSteps), Run, Runs, Size)
<< FunctionName
<< " \r";
++Steps;
}
}
S.Functions.push_back(std::move(FM));
}
std::error_code EC;
raw_fd_ostream FOS(Output, EC);
if (EC)
report_fatal_error(Twine("Could not open file: ")
.concat(EC.message())
.concat(", ")
.concat(Output));
json::OStream JOS(FOS);
SerializeToJson(S, JOS);
}
} // namespace libc_benchmarks
} // namespace llvm
int main(int argc, char **argv) {
llvm::cl::ParseCommandLineOptions(argc, argv);
llvm::libc_benchmarks::Main();
return EXIT_SUCCESS;
}

@@ -0,0 +1,36 @@
//===-------- BenchmarkRunner interface -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
#include "LibcBenchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
namespace llvm {
namespace libc_benchmarks {
// Each memory function benchmark implements this interface.
// It is used by the main function to run all benchmarks in a uniform manner.
class BenchmarkRunner {
public:
virtual ~BenchmarkRunner() {}
// Returns a list of all available functions to test.
virtual ArrayRef<StringRef> getFunctionNames() const = 0;
// Performs the benchmarking for a particular FunctionName and Size.
virtual BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) = 0;
};
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H

@@ -0,0 +1,112 @@
#include "LibcMemoryBenchmark.h"
#include "llvm/Support/Alignment.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
using testing::AllOf;
using testing::AnyOf;
using testing::ElementsAre;
using testing::Ge;
using testing::Gt;
using testing::Le;
using testing::Lt;
namespace llvm {
namespace libc_benchmarks {
namespace {
TEST(AlignedBuffer, IsAligned) {
AlignedBuffer AB(0);
EXPECT_TRUE(isAddrAligned(Align(AlignedBuffer::Alignment), AB.begin()));
}
TEST(AlignedBuffer, Empty) {
AlignedBuffer AB(0);
EXPECT_EQ(std::distance(AB.begin(), AB.end()), 0U);
}
TEST(OffsetDistribution, AlignToBegin) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.AddressAlignment = None;
OffsetDistribution OD(Conf);
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_EQ(OD(Gen), 0U);
}
TEST(OffsetDistribution, NoAlignment) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.AddressAlignment = Align::None();
Conf.Size.To = 1;
OffsetDistribution OD(Conf);
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U)));
}
MATCHER_P(IsDivisibleBy, n, "") {
*result_listener << "where the remainder is " << (arg % n);
return (arg % n) == 0;
}
TEST(OffsetDistribution, Aligned) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.AddressAlignment = Align(16);
Conf.Size.To = 1;
OffsetDistribution OD(Conf);
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U), IsDivisibleBy(16U)));
}
TEST(MismatchOffsetDistribution, EqualBufferDisablesDistribution) {
StudyConfiguration Conf;
Conf.MemcmpMismatchAt = 0; // buffers are equal.
MismatchOffsetDistribution MOD(Conf);
EXPECT_FALSE(MOD);
}
TEST(MismatchOffsetDistribution, DifferentBufferDisablesDistribution) {
StudyConfiguration Conf;
Conf.MemcmpMismatchAt = 1; // buffers are different.
MismatchOffsetDistribution MOD(Conf);
EXPECT_FALSE(MOD);
}
TEST(MismatchOffsetDistribution, MismatchAt2) {
const uint32_t MismatchAt = 2;
const uint32_t ToSize = 4;
StudyConfiguration Conf;
Conf.BufferSize = 16;
Conf.MemcmpMismatchAt = MismatchAt; // buffers differ at position 2.
Conf.Size.To = ToSize;
MismatchOffsetDistribution MOD(Conf);
EXPECT_TRUE(MOD);
// We test equality up to ToSize (=4) so we need spans of 4 equal bytes spaced
// by one mismatch.
EXPECT_THAT(MOD.getMismatchIndices(), ElementsAre(5, 9, 13));
std::default_random_engine Gen;
for (size_t Iterations = 0; Iterations <= 10; ++Iterations) {
for (size_t Size = Conf.Size.From; Size <= ToSize; ++Size) {
if (Size >= MismatchAt)
EXPECT_THAT(MOD(Gen, Size),
AnyOf(5 - MismatchAt, 9 - MismatchAt, 13 - MismatchAt));
else
EXPECT_THAT(MOD(Gen, Size),
AnyOf(5 - Size - 1, 9 - Size - 1, 13 - Size - 1));
}
}
}
} // namespace
} // namespace libc_benchmarks
} // namespace llvm

@@ -0,0 +1,87 @@
//===-------- Benchmark memcmp implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measure.
struct MemcmpContext : public BenchmarkRunner {
using FunctionPrototype = int (*)(const void *, const void *, size_t);
struct ParameterType {
uint16_t Offset = 0;
};
explicit MemcmpContext(const StudyConfiguration &Conf)
: MOD(Conf), OD(Conf), ABuffer(Conf.BufferSize), BBuffer(Conf.BufferSize),
PP(*this) {
std::uniform_int_distribution<char> Dis;
// Generate random buffer A.
for (size_t I = 0; I < Conf.BufferSize; ++I)
ABuffer[I] = Dis(Gen);
// Copy buffer A to B.
::memcpy(BBuffer.begin(), ABuffer.begin(), Conf.BufferSize);
if (Conf.MemcmpMismatchAt == 0)
return; // all same.
else if (Conf.MemcmpMismatchAt == 1)
for (char &c : BBuffer)
++c; // all different.
else
for (const auto I : MOD.getMismatchIndices())
++BBuffer[I];
}
// Needed by the ParameterProvider to update the current batch of parameters.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
if (MOD)
for (auto &P : Parameters)
P.Offset = MOD(Gen, CurrentSize);
else
for (auto &P : Parameters)
P.Offset = OD(Gen);
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memcmp"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
CurrentSize = Size;
// FIXME: Add `bcmp` once we're guaranteed that the function is provided.
FunctionPrototype Function =
StringSwitch<FunctionPrototype>(FunctionName).Case("memcmp", &::memcmp);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
return Function(ABuffer + p.Offset, BBuffer + p.Offset, Size);
});
}
private:
std::default_random_engine Gen;
MismatchOffsetDistribution MOD;
OffsetDistribution OD;
size_t CurrentSize = 0;
AlignedBuffer ABuffer;
AlignedBuffer BBuffer;
SmallParameterProvider<MemcmpContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemcmpContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm

View File

@ -0,0 +1,69 @@
//===-------- Benchmark memcpy implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measurement.
struct MemcpyContext : public BenchmarkRunner {
using FunctionPrototype = void *(*)(void *, const void *, size_t);
struct ParameterType {
uint16_t SrcOffset = 0;
uint16_t DstOffset = 0;
};
explicit MemcpyContext(const StudyConfiguration &Conf)
: OD(Conf), SrcBuffer(Conf.BufferSize), DstBuffer(Conf.BufferSize),
PP(*this) {}
// Needed by the ParameterProvider to update the current batch of parameters.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
for (auto &P : Parameters) {
P.DstOffset = OD(Gen);
P.SrcOffset = OD(Gen);
}
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memcpy"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
FunctionPrototype Function =
StringSwitch<FunctionPrototype>(FunctionName).Case("memcpy", &::memcpy);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
Function(DstBuffer + p.DstOffset, SrcBuffer + p.SrcOffset, Size);
return DstBuffer + p.DstOffset;
});
}
private:
std::default_random_engine Gen;
OffsetDistribution OD;
AlignedBuffer SrcBuffer;
AlignedBuffer DstBuffer;
SmallParameterProvider<MemcpyContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemcpyContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm

View File

@ -0,0 +1,66 @@
//===-------- Benchmark memset implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measurement.
struct MemsetContext : public BenchmarkRunner {
using FunctionPrototype = void *(*)(void *, int, size_t);
struct ParameterType {
uint16_t DstOffset = 0;
};
explicit MemsetContext(const StudyConfiguration &Conf)
: OD(Conf), DstBuffer(Conf.BufferSize), MemsetValue(Conf.MemsetValue),
PP(*this) {}
// Needed by the ParameterProvider to update the current batch of parameters.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
for (auto &P : Parameters) {
P.DstOffset = OD(Gen);
}
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memset"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
FunctionPrototype Function =
StringSwitch<FunctionPrototype>(FunctionName).Case("memset", &::memset);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
Function(DstBuffer + p.DstOffset, MemsetValue, Size);
return DstBuffer + p.DstOffset;
});
}
private:
std::default_random_engine Gen;
OffsetDistribution OD;
AlignedBuffer DstBuffer;
const uint8_t MemsetValue;
SmallParameterProvider<MemsetContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemsetContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm

View File

@ -0,0 +1,243 @@
# Benchmarking `llvm-libc`'s memory functions
## Foreword
Microbenchmarks are valuable tools to assess and compare the performance of
isolated pieces of code. However, they don't capture all the interactions of a
complex system, so other metrics can be equally important:
- **code size** (to reduce instruction cache pressure),
- **Profile Guided Optimization** friendliness,
- **hyperthreading / multithreading** friendliness.
## Rationale
The goal here is to satisfy the [Benchmarking
Principles](https://en.wikipedia.org/wiki/Benchmark_\(computing\)#Benchmarking_Principles).
1. **Relevance**: Benchmarks should measure relatively vital features.
2. **Representativeness**: Benchmark performance metrics should be broadly
accepted by industry and academia.
3. **Equity**: All systems should be fairly compared.
4. **Repeatability**: Benchmark results can be verified.
5. **Cost-effectiveness**: Benchmark tests are economical.
6. **Scalability**: Benchmark tests should measure from single server to
multiple servers.
7. **Transparency**: Benchmark metrics should be easy to understand.
Benchmarking is a [subtle
art](https://en.wikipedia.org/wiki/Benchmark_\(computing\)#Challenges) and
benchmarking memory functions is no exception. Here we'll dive into the
peculiarities of designing good microbenchmarks for `llvm-libc` memory
functions.
## Challenges
As seen in the [README.md](README.md#benchmarking-regimes), the microbenchmarking
facility should focus on measuring **low latency code**. If copying a few bytes
takes on the order of a few cycles, the benchmark should be able to **measure
accurately down to the cycle**.
### Measuring instruments
There are different sources of time in a computer (ordered from high to low resolution):
- [Performance
Counters](https://en.wikipedia.org/wiki/Hardware_performance_counter): used to
introspect the internals of the CPU,
- [High Precision Event
Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer): used to
trigger short lived actions,
- [Real-Time Clocks (RTC)](https://en.wikipedia.org/wiki/Real-time_clock): used
to keep track of the computer's time.
In theory, **Performance Counters** provide cycle-accurate measurements via the
`cpu cycles` event. But as we'll see, they are not really practical in this
context.
### Performance counters and modern processor architecture
Modern CPUs are [out of
order](https://en.wikipedia.org/wiki/Out-of-order_execution) and
[superscalar](https://en.wikipedia.org/wiki/Superscalar_processor). As a
consequence it is [hard to know what is included when the counter is
read](https://en.wikipedia.org/wiki/Hardware_performance_counter#Instruction_based_sampling):
some instructions may still be **in flight**, and others may be executing
[**speculatively**](https://en.wikipedia.org/wiki/Speculative_execution). As a
matter of fact, **on the same machine, measuring the same piece of code twice
will yield different results.**
### Performance counters semantics inconsistencies and availability
Although they have the same name, the exact semantics of performance counters
are micro-architecture dependent: **it is generally not possible to compare two
micro-architectures exposing the same performance counters.**
Each vendor decides which performance counters to implement and their exact
meaning. Although we want to benchmark `llvm-libc` memory functions for all
available [target
triples](https://clang.llvm.org/docs/CrossCompilation.html#target-triple), there
are **no guarantees that the counter we're interested in is available.**
### Additional imprecisions
- Reading performance counters is done through Kernel [System
calls](https://en.wikipedia.org/wiki/System_call). The System call itself
is costly (hundreds of cycles) and will perturb the counter's value.
- [Interruptions](https://en.wikipedia.org/wiki/Interrupt#Processor_response)
can occur during measurement.
- If the system is already under monitoring (virtual machines or system wide
profiling) the kernel can decide to multiplex the performance counters
leading to lower precision or even completely missed measurements.
- The Kernel can decide to [migrate the
process](https://en.wikipedia.org/wiki/Process_migration) to a different
core.
- [Dynamic frequency
scaling](https://en.wikipedia.org/wiki/Dynamic_frequency_scaling) can kick
in during the measurement and change the duration of a cycle. **Ultimately we
care about the amount of work done over a period of time**, which weakens the
case for measuring cycles rather than **raw time**.
### Cycle accuracy conclusion
We have seen that performance counters are not widely available, semantically
inconsistent across micro-architectures, and imprecise on modern CPUs for small
snippets of code.
## Design decisions
In order to achieve the needed precision we have to resort to more widely
available counters and derive the time from a large number of runs: going from a
single deterministic measurement to a probabilistic one.
**To get a good signal to noise ratio we need the running time of the piece of
code to be orders of magnitude greater than the measurement precision.**
For instance, if the measurement precision is 10 cycles, the function under test
needs to run for more than 1000 cycles to achieve a 1%
[SNR](https://en.wikipedia.org/wiki/Signal-to-noise_ratio).
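To make the arithmetic concrete, here is a minimal sketch of that rule of thumb
(the 10-cycle precision and the 1% target are just the figures from the example
above):

```python
# Rough signal-to-noise arithmetic for the example above.
measurement_precision_cycles = 10  # resolution of the available timer
target_snr = 0.01                  # we want at most 1% measurement noise

# The measured region must run long enough for the timer error to be negligible.
required_runtime_cycles = measurement_precision_cycles / target_snr
assert required_runtime_cycles == 1000
```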
### Repeating code N-times until precision is sufficient
The algorithm is as follows:
- We measure the time it takes to run the code _N_ times (initially _N_ is 10,
for instance)
- We deduce an approximation of the runtime of one iteration (= _runtime_ /
_N_).
- We increase _N_ by _X%_ and repeat the measurement (geometric progression).
- We keep track of the _one iteration runtime approximation_ and build a
weighted mean of all the samples so far (weight is proportional to _N_)
- We stop the process when the difference between the weighted mean and the
last estimation is smaller than _ε_ or when other stopping conditions are
met (total runtime, maximum iterations or maximum sample count).
This method allows us to be as precise as needed provided that the measured
runtime is proportional to _N_. Longer run times also smooth out imprecision
related to _interrupts_ and _context switches_.
Note: When measuring longer runtimes (e.g. copying several megabytes of data)
the above assumption doesn't hold anymore and the _ε_ precision cannot be
reached by increasing iterations. The whole benchmarking process becomes
prohibitively slow. In this case the algorithm is limited to a single sample and
repeated several times to get a decent 95% confidence interval.
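The following Python sketch illustrates the adaptive loop described above. It is
only an illustration of the idea, not the actual C++ implementation; the growth
factor, tolerance and limits are placeholders mirroring the `ScalingFactor`,
`Epsilon` and `MaxSamples` knobs of the JSON configurations.

```python
import time

def estimate_iteration_runtime(run_snippet, initial_n=10, growth=1.4,
                               epsilon=0.01, max_samples=1000):
    """Estimates the runtime of one call to `run_snippet` (illustrative sketch)."""
    n = initial_n
    weighted_sum = 0.0  # sum of per-iteration estimates, weighted by N
    total_weight = 0.0
    mean = None
    for _ in range(max_samples):
        begin = time.perf_counter()
        for _ in range(n):
            run_snippet()
        elapsed = time.perf_counter() - begin
        estimate = elapsed / n  # approximate runtime of one iteration
        weighted_sum += estimate * n
        total_weight += n
        new_mean = weighted_sum / total_weight
        # Stop once the last estimate agrees with the weighted mean
        # (epsilon is treated here as a relative tolerance).
        if mean is not None and abs(new_mean - estimate) <= epsilon * new_mean:
            return new_mean
        mean = new_mean
        n = int(n * growth)  # geometric progression of N
    return mean

# Example: estimate the cost of a small memory operation.
print(estimate_iteration_runtime(lambda: bytes(64)))
```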
### Effect of branch prediction
When measuring code with branches, repeating the same call again and again will
allow the processor to learn the branching patterns and perfectly predict all
the branches, leading to unrealistic results.
**Decision: When benchmarking small buffer sizes, the function parameters should
be randomized between calls to prevent perfect branch predictions.**
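The C++ runners above expose a `Randomize` method that refills a batch of
parameters; presumably the batch is generated outside the timed region so that
the cost of randomization is not measured. A minimal Python sketch of that idea
(the buffer size matches `configuration_small.json`; everything else is
illustrative):

```python
import os
import random

BUFFER_SIZE = 8192  # BufferSize in configuration_small.json
src = bytearray(os.urandom(BUFFER_SIZE))
dst = bytearray(BUFFER_SIZE)

def randomize_offsets(batch_size, size):
    """Pre-computes a batch of random, in-bounds offsets outside the timed region."""
    return [random.randrange(0, BUFFER_SIZE - size + 1) for _ in range(batch_size)]

def timed_region(offsets, size):
    # Only this loop would be timed: every call sees a different offset, so the
    # branch predictor cannot lock onto a single pattern, and the cost of the
    # random number generator stays out of the measurement.
    for offset in offsets:
        dst[offset:offset + size] = src[offset:offset + size]

timed_region(randomize_offsets(batch_size=1000, size=16), size=16)
```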
### Effect of the memory subsystem
The CPU is tightly coupled to the memory subsystem. It is common to see `L1`,
`L2` and `L3` data caches.
We may be tempted to randomize data accesses widely to exercise all the caching
layers down to RAM but the [cost of accessing lower layers of
memory](https://people.eecs.berkeley.edu/~rcs/research/interactive_latency.html)
completely dominates the runtime for small sizes.
So to respect the **Equity** and **Repeatability** principles we should make sure we
**do not** depend on the memory subsystem.
**Decision: When benchmarking small buffer sizes, the data accessed by the
function should stay in `L1`.**
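As a rough sanity check on the small configuration (see
`configuration_small.json`, which uses an 8 KiB buffer), the working set should
fit comfortably in the L1 data cache. The 32 KiB figure below is only an
assumption; actual sizes vary per CPU and are reported in the benchmark's host
information.

```python
TYPICAL_L1D_BYTES = 32 * 1024  # assumption; the real size is host dependent
SMALL_BUFFER_SIZE = 8192       # BufferSize in configuration_small.json

# memcpy and memcmp touch two buffers, so the worst-case working set is:
working_set = 2 * SMALL_BUFFER_SIZE
assert working_set <= TYPICAL_L1D_BYTES, "the data would spill out of L1"
```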
### Effect of prefetching
For small buffer sizes,
[prefetching](https://en.wikipedia.org/wiki/Cache_prefetching) should not kick
in, but for large buffers it may introduce a bias.
**Decision: When benchmarking large buffer sizes, the data should be accessed in
a random fashion to lower the impact of prefetching between calls.**
### Effect of dynamic frequency scaling
Modern processors implement [dynamic frequency
scaling](https://en.wikipedia.org/wiki/Dynamic_frequency_scaling). In so-called
`performance` mode the CPU will increase its frequency and run faster than usual
within [some limits](https://en.wikipedia.org/wiki/Intel_Turbo_Boost): _"The
increased clock rate is limited by the processor's power, current, and thermal
limits, the number of cores currently in use, and the maximum frequency of the
active cores."_
**Decision: When benchmarking we want to make sure the dynamic frequency scaling
is always set to `performance`. We also want to make sure that the time based
events are not impacted by frequency scaling.**
See [README.md](README.md) for how to set this up.
### Reserved and pinned cores
Some operating systems allow [core
reservation](https://stackoverflow.com/questions/13583146/whole-one-core-dedicated-to-single-process).
Reserving a core removes a set of perturbation sources: process migration,
context switches and interrupts. When a core is hyperthreaded, both of its
logical cores should be reserved.
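On Linux, once cores have been reserved (for instance with the `isolcpus`
kernel parameter), the benchmark process can be pinned to them. A minimal,
Linux-only sketch using the standard library; core `3` is an arbitrary example:

```python
import os

RESERVED_CORES = {3}  # arbitrary example: the isolated core(s) on this machine

# Pin the current process (pid 0 means "self") to the reserved cores so the
# kernel does not migrate it during the measurement.
os.sched_setaffinity(0, RESERVED_CORES)
print("Now restricted to CPUs:", os.sched_getaffinity(0))
```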
## Microbenchmarks limitations
As stated in the Foreword section a number of effects do play a role in
production but are not directly measurable through microbenchmarks. The code
size of the benchmark is (much) smaller than the hot code of real applications
and so **doesn't exhibit as much instruction cache pressure**.
### iCache pressure
Fundamental functions that are called frequently will occupy the L1 iCache
([illustration](https://en.wikipedia.org/wiki/CPU_cache#Example:_the_K8)). If
they are too big they will prevent other hot code from staying in the cache and incur
[stalls](https://en.wikipedia.org/wiki/CPU_cache#CPU_stalls). So the memory
functions should be as small as possible.
### iTLB pressure
The same reasoning goes for instruction Translation Lookaside Buffer
([iTLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer)) incurring
[TLB
misses](https://en.wikipedia.org/wiki/Translation_lookaside_buffer#TLB-miss_handling).
## FAQ
1. Why don't you use Google Benchmark directly?
We reuse some parts of Google Benchmark (detection of frequency scaling, CPU
cache hierarchy information), but when it comes to measuring memory
functions Google Benchmark has a few issues:
- Google Benchmark favors code-based configuration via macros and
builders, typically done in a static manner. In our case the
parameters we need to set up are a mix of what's usually controlled by
the framework (number of trials, maximum number of iterations, size
ranges) and parameters that are more tied to the function under test
(randomization strategies, custom values). Achieving this with Google
Benchmark is cumbersome as it involves templated benchmarks and
duplicated code. In the end, the configuration would be spread across
command line flags (via the framework's options or custom flags) and code
constants.
- Output of the measurements is done through a `BenchmarkReporter` class,
which makes it hard to access the parameters discussed above.

View File

@ -0,0 +1,103 @@
# Libc mem* benchmarks
This framework has been designed to evaluate and compare the relative performance
of memory function implementations on a particular host.
It will also be used to track implementation performance over time.
## Quick start
### Setup
**Python 2** [being deprecated](https://www.python.org/doc/sunset-python-2/), it is
advised to use **Python 3**.
Then make sure `matplotlib`, `scipy` and `numpy` are set up correctly:
```shell
apt-get install python3-pip
pip3 install matplotlib scipy numpy
```
To get good reproducibility it is important to make sure that the system runs in
`performance` mode. This is achieved by running:
```shell
cpupower frequency-set --governor performance
```
### Run and display `memcpy` benchmark
The following commands will run the benchmark and display a curve of **time per
copied byte** with a 95% confidence interval. The graph also shows **host
information** and the **benchmarking configuration**.
```shell
cd llvm-project
cmake -B/tmp/build -Sllvm -DLLVM_ENABLE_PROJECTS=libc -DCMAKE_BUILD_TYPE=Release
make -C /tmp/build -j display-libc-memcpy-benchmark-small
```
## Benchmarking regimes
Using a profiler to observe size distributions for calls into libc functions, it
was found that most operations act on a small number of bytes.
Function | % of calls with size ≤ 128 | % of calls with size ≤ 1024
------------------ | --------------------------: | ---------------------------:
memcpy | 96% | 99%
memset | 91% | 99.9%
memcmp<sup>1</sup> | 99.5% | ~100%
Benchmarking configurations come in two flavors:
- [small](libc/utils/benchmarks/configuration_small.json)
- Exercises sizes up to `1KiB`, representative of normal usage
- The data is kept in the `L1` cache to prevent measuring the memory
subsystem
- [big](libc/utils/benchmarks/configuration_big.json)
- Exercises sizes up to `32MiB` to test large operations
- Caching effects can show up here, which prevents meaningful comparison across different hosts
_<sup>1</sup> - The size refers to the size of the buffers to compare and not
the number of bytes until the first difference._
## Benchmarking targets
The benchmarking process occurs in two steps:
1. Benchmark the functions and produce a `json` file
2. Display (or render) the `json` file
Targets are of the form `<action>-libc-<function>-benchmark-<configuration>`
- `action` is one of:
- `run`, runs the benchmark and writes the `json` file
- `display`, displays the graph on screen
- `render`, renders the graph on disk as a `png` file
- `function` is one of: `memcpy`, `memcmp`, `memset`
- `configuration` is one of: `small`, `big`
## Superposing curves
It is possible to **merge** several `json` files into a single graph. This is
useful to **compare** implementations.
In the following example we superpose the curves for `memcpy`, `memset` and
`memcmp`:
```shell
> make -C /tmp/build run-libc-memcpy-benchmark-small run-libc-memcmp-benchmark-small run-libc-memset-benchmark-small
> python libc/utils/benchmarks/render.py3 /tmp/last-libc-memcpy-benchmark-small.json /tmp/last-libc-memcmp-benchmark-small.json /tmp/last-libc-memset-benchmark-small.json
```
## Useful `render.py3` flags
- To save the produced graph, use `--output=/tmp/benchmark_curve.png`.
- To prevent the graph from appearing on screen, use `--headless`.
## Under the hood
To learn more about the design decisions behind the benchmarking framework,
have a look at the [RATIONALE.md](RATIONALE.md) file.

View File

@ -0,0 +1,24 @@
{
"Options":{
"MinDuration":0.001,
"MaxDuration":1,
"InitialIterations":100,
"MaxIterations":10000000,
"MinSamples":1,
"MaxSamples":1,
"Epsilon":0.01,
"ScalingFactor":1.4
},
"Configuration":{
"Runs":5,
"BufferSize":134217728,
"Size":{
"From":0,
"To":33554432,
"Step":1048576
},
"AddressAlignment":1,
"MemsetValue":0,
"MemcmpMismatchAt":0
}
}

View File

@ -0,0 +1,24 @@
{
"Options":{
"MinDuration":0.001,
"MaxDuration":1,
"InitialIterations":100,
"MaxIterations":10000000,
"MinSamples":4,
"MaxSamples":1000,
"Epsilon":0.01,
"ScalingFactor":1.4
},
"Configuration":{
"Runs":10,
"BufferSize":8192,
"Size":{
"From":0,
"To":1024,
"Step":1
},
"AddressAlignment":1,
"MemsetValue":0,
"MemcmpMismatchAt":0
}
}

View File

@ -0,0 +1,175 @@
"""Reads JSON files produced by the benchmarking framework and renders them.
Installation:
> apt-get install python3-pip
> pip3 install matplotlib scipy numpy
Run:
> python3 render.py3 <files>
Rendering can occur on disk by specifying the --output option or on screen if
the --headless flag is not set.
"""
import argparse
import collections
import json
import math
import pprint
import sys
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter
import numpy as np
import scipy.stats
def format_freq(number):
    """Returns a human readable frequency."""
    magnitude = 0
    while math.fabs(number) >= 1000:
        number /= 1000.0
        magnitude += 1
    return "%g%sHz" % (number, ["", "k", "M", "G"][magnitude])


def format_size(number):
    """Returns number in human readable form."""
    magnitude = 0
    while number >= 1000 and number % 1000 == 0:
        number /= 1000
        magnitude += 1
    return "%g%s" % (number, ["", "K", "M", "G"][magnitude])


def mean_confidence_interval(dataset, confidence=0.95):
    """Returns the mean and half confidence interval for the dataset."""
    a = 1.0 * np.array(dataset)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h


def add_plot(function_name, points):
    """Plots measurements for a function."""
    n = len(points.keys())
    x = np.zeros(n)
    y = np.zeros(n)
    yerr = np.zeros(n)
    for i, key in enumerate(sorted(points.keys())):
        values = points[key]
        m, e = mean_confidence_interval(values)
        x[i] = key
        y[i] = m
        yerr[i] = e
    plt.plot(x, y, linewidth=1, label=function_name)
    plt.fill_between(x, y - yerr, y + yerr, alpha=0.5)


def get_title(host):
    """Formats the Host object into a title for the plot."""
    cpu_name = host["CpuName"]
    cpu_freq = format_freq(host["CpuFrequency"])
    cache_strings = []
    for cache in host["Caches"]:
        prefix = {
            "Instruction": "i",
            "Data": "d",
            "Unified": "u",
        }.get(cache["Type"])
        cache_strings.append(r"%sL_%d %s_{/%d}" %
                             (prefix, cache["Level"], format_size(cache["Size"]),
                              cache["NumSharing"]))
    title = "%s (%s)" % (cpu_name, cpu_freq)
    subtitle = r"$" + ", ".join(sorted(cache_strings)) + r"$"
    return title + "\n" + subtitle


def get_host(jsons):
    """Returns the host of the different json objects iff they are all the same.
    """
    host = None
    for root in jsons:
        if host and host != root["Host"]:
            sys.exit("The datasets are not coming from the same Host")
        if not host:
            host = root["Host"]
    return host


def get_configuration(jsons):
    """Returns the configuration of the different json objects iff they are all
    the same.
    """
    config = None
    for root in jsons:
        if config and config != root["Configuration"]:
            return None
        if not config:
            config = root["Configuration"]
    return config


def setup_graphs(files):
    """Sets up the graphs to render from the json files."""
    jsons = []
    for file in files:
        with open(file) as json_file:
            jsons.append(json.load(json_file))
    if not jsons:
        sys.exit("Nothing to process")
    for root in jsons:
        for function in root["Functions"]:
            function_name = function["Name"]
            sizes = function["Sizes"]
            runtimes = function["Runtimes"]
            assert len(sizes) == len(runtimes)
            values = collections.defaultdict(lambda: [])
            for i in range(len(sizes)):
                values[sizes[i]].append(runtimes[i])
            add_plot(function_name, values)
    config = get_configuration(jsons)
    if config:
        plt.figtext(
            0.95,
            0.15,
            pprint.pformat(config),
            verticalalignment="bottom",
            horizontalalignment="right",
            multialignment="left",
            fontsize="small",
            bbox=dict(boxstyle="round", facecolor="wheat"))
    axes = plt.gca()
    axes.set_title(get_title(get_host(jsons)))
    axes.set_ylim(bottom=0)
    axes.set_xlabel("Size")
    axes.set_ylabel("Time")
    axes.xaxis.set_major_formatter(EngFormatter(unit="B"))
    axes.yaxis.set_major_formatter(EngFormatter(unit="s"))
    plt.legend()
    plt.grid()


def main():
    parser = argparse.ArgumentParser(
        description="Process benchmark json files.")
    parser.add_argument("files", nargs="+", help="The json files to read from.")
    parser.add_argument("--output", help="The output file to write the graph.")
    parser.add_argument(
        "--headless",
        help="If set do not display the graph.",
        action="store_true")
    args = parser.parse_args()
    setup_graphs(args.files)
    if args.output:
        plt.savefig(args.output)
    if not args.headless:
        plt.show()


if __name__ == "__main__":
    main()