Settle on using rte_memcpy when the copy size is not known at compile time, and builtin memcpy otherwise

This commit is contained in:
Russell Sears 2020-05-01 13:23:20 -07:00
parent b84fcbc828
commit e77f9701f3
5 changed files with 30 additions and 4 deletions

View File

@ -255,7 +255,14 @@ else()
if (GCC)
add_compile_options(-Wno-pragmas)
add_compile_options(-mavx)
# add_compile_options(-fno-builtin-memcpy)
# Intentionally using builtin memcpy. G++ does a good job on small memcpy's when the size is known at compile time.
# If the size is not known at compile time, then it falls back on the memcpy that's available at runtime (rte_memcpy, as of this
# writing; see flow.cpp).
#
# The downside of the builtin memcpy is that it's slower at large copies, so if we spend a lot of time on large
# copies of sizes that are known at compile time, this might not be a win. See the output of performance/memcpy
# for more information.
#add_compile_options(-fno-builtin-memcpy)
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
add_compile_options(-Wno-attributes)

View File

@ -21,9 +21,20 @@
#include "flow/flow.h"
#include "flow/DeterministicRandom.h"
#include "flow/UnitTest.h"
#include "flow/rte_memcpy.h"
#include "flow/folly_memcpy.h"
#include <stdarg.h>
#include <cinttypes>
// Out-of-line entry point for rte_memcpy, so that callers (e.g. the memcpy performance test) can
// measure it as a real function call rather than as code inlined at the call site.
//
// The noinline attribute makes the function's name hold even when the optimizer could otherwise
// inline this wrapper into its callers (for example with link-time optimization).
//
// Copies `length` bytes from `src` to `dst`; the __restrict qualifiers mean the two regions must
// not overlap. Returns whatever rte_memcpy returns.
__attribute__((noinline)) void * rte_memcpy_noinline(void *__restrict dst, const void *__restrict src, size_t length) {
	return rte_memcpy(dst, src, length);
}
// Replacement definition of memcpy that forwards every call to rte_memcpy and returns its result.
// This compilation unit will be linked in to the main binary, so this should override glibc memcpy
// The symbol is explicitly given default visibility (presumably so the override is still exported
// if the binary is built with hidden visibility by default -- TODO confirm against the build flags).
// NOTE(review): copies that the compiler expands inline as builtin memcpy would not reach this
// function; only calls that are emitted as real calls to the memcpy symbol are redirected.
__attribute__((visibility ("default"))) void *memcpy (void *__restrict __dest, const void *__restrict __src, size_t __n) {
return rte_memcpy(__dest, __src, __n);
}
INetwork *g_network = 0;
FILE* randLog = 0;

View File

@ -50,6 +50,9 @@ extern "C" {
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
//#define RTE_MACHINE_CPUFLAG_AVX512F
#define RTE_MACHINE_CPUFLAG_AVX2
#ifdef RTE_MACHINE_CPUFLAG_AVX512F
#define ALIGNMENT_MASK 0x3F

View File

@ -7,6 +7,7 @@
#include <string.h>
#include <stdlib.h>
#include "flow/folly_memcpy.h"
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"

View File

@ -11,11 +11,15 @@
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
#include "flow/flow.h"
extern "C" {
void* folly_memcpy(void* dst, void* src, uint32_t length);
void* folly_memcpy(void* dst, const void* src, uint32_t length);
}
void * rte_memcpy_noinline(void* dst, const void* src, size_t length); // for performance comparisons
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
* values in the buf_sizes[] array below will be used.
@ -170,7 +174,7 @@ do_uncached_write(uint8_t *dst, int is_dst_cached,
fill_addr_arrays(dst_addrs, is_dst_cached, 0,
src_addrs, is_src_cached, 0);
for (j = 0; j < TEST_BATCH_SIZE; j++) {
rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
}
}
}
@ -191,7 +195,7 @@ do { \
src_addrs, is_src_cached, src_uoffset); \
start_time = rte_rdtsc(); \
for (t = 0; t < TEST_BATCH_SIZE; t++) \
rte_memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \
rte_memcpy_noinline(dst+dst_addrs[t], src+src_addrs[t], size); \
total_time += rte_rdtsc() - start_time; \
} \
for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \