From e77f9701f32b32238d75db1e917cabcb1539ce9f Mon Sep 17 00:00:00 2001 From: Russell Sears Date: Fri, 1 May 2020 13:23:20 -0700 Subject: [PATCH] Settle on using rte_memcpy when we do not know the copy size at runtime, and builtin memcpy otherwise --- cmake/ConfigureCompiler.cmake | 9 ++++++++- flow/flow.cpp | 11 +++++++++++ flow/rte_memcpy.h | 3 +++ flow/test_memcpy.cpp | 1 + flow/test_memcpy_perf.cpp | 10 +++++++--- 5 files changed, 30 insertions(+), 4 deletions(-) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 1c37e8930f..d34a4a52aa 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -255,7 +255,14 @@ else() if (GCC) add_compile_options(-Wno-pragmas) add_compile_options(-mavx) -# add_compile_options(-fno-builtin-memcpy) + # Intentionally using builtin memcpy. G++ does a good job on small memcpy's when the size is known at runtime. + # If the size is not known, then it falls back on the memcpy that's available at runtime (rte_memcpy, as of this + # writing; see flow.cpp). + # + # The downside of the builtin memcpy is that it's slower at large copies, so if we spend a lot of time on large + # copies of sizes that are known at compile time, this might not be a win. See the output of performance/memcpy + # for more information. + #add_compile_options(-fno-builtin-memcpy) # Otherwise `state [[maybe_unused]] int x;` will issue a warning. # https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is add_compile_options(-Wno-attributes) diff --git a/flow/flow.cpp b/flow/flow.cpp index fb1cf81f60..c0d8a40f61 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -21,9 +21,20 @@ #include "flow/flow.h" #include "flow/DeterministicRandom.h" #include "flow/UnitTest.h" +#include "flow/rte_memcpy.h" +#include "flow/folly_memcpy.h" #include #include +void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { + return rte_memcpy(__dest, __src, __n); +} + +// This compilation unit will be linked in to the main binary, so this should override glibc memcpy +__attribute__((visibility ("default"))) void *memcpy (void *__restrict __dest, const void *__restrict __src, size_t __n) { + return rte_memcpy(__dest, __src, __n); +} + INetwork *g_network = 0; FILE* randLog = 0; diff --git a/flow/rte_memcpy.h b/flow/rte_memcpy.h index ba0ab3cf6c..b6eedd65b3 100644 --- a/flow/rte_memcpy.h +++ b/flow/rte_memcpy.h @@ -50,6 +50,9 @@ extern "C" { static force_inline void * rte_memcpy(void *dst, const void *src, size_t n); +//#define RTE_MACHINE_CPUFLAG_AVX512F +#define RTE_MACHINE_CPUFLAG_AVX2 + #ifdef RTE_MACHINE_CPUFLAG_AVX512F #define ALIGNMENT_MASK 0x3F diff --git a/flow/test_memcpy.cpp b/flow/test_memcpy.cpp index 19ab66e35b..bb9fc74343 100644 --- a/flow/test_memcpy.cpp +++ b/flow/test_memcpy.cpp @@ -7,6 +7,7 @@ #include #include +#include "flow/folly_memcpy.h" #include "flow/rte_memcpy.h" #include "flow/IRandom.h" diff --git a/flow/test_memcpy_perf.cpp b/flow/test_memcpy_perf.cpp index c54f70aacf..7138d04599 100644 --- a/flow/test_memcpy_perf.cpp +++ b/flow/test_memcpy_perf.cpp @@ -11,11 +11,15 @@ #include "flow/rte_memcpy.h" #include "flow/IRandom.h" #include "flow/UnitTest.h" +#include "flow/flow.h" extern "C" { - void* folly_memcpy(void* dst, void* src, uint32_t length); + void* folly_memcpy(void* dst, const void* src, uint32_t length); } + +void * rte_memcpy_noinline(void* dst, const void* src, size_t length); // for performance comparisons + /* * Set this to the maximum buffer size you want to test. If it is 0, then the * values in the buf_sizes[] array below will be used. @@ -170,7 +174,7 @@ do_uncached_write(uint8_t *dst, int is_dst_cached, fill_addr_arrays(dst_addrs, is_dst_cached, 0, src_addrs, is_src_cached, 0); for (j = 0; j < TEST_BATCH_SIZE; j++) { - rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size); + memcpy(dst+dst_addrs[j], src+src_addrs[j], size); } } } @@ -191,7 +195,7 @@ do { \ src_addrs, is_src_cached, src_uoffset); \ start_time = rte_rdtsc(); \ for (t = 0; t < TEST_BATCH_SIZE; t++) \ - rte_memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \ + rte_memcpy_noinline(dst+dst_addrs[t], src+src_addrs[t], size); \ total_time += rte_rdtsc() - start_time; \ } \ for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \