Settle on using rte_memcpy when we do not know the copy size at runtime, and builtin memcpy otherwise
This commit is contained in:
parent
b84fcbc828
commit
e77f9701f3
|
@ -255,7 +255,14 @@ else()
|
|||
if (GCC)
|
||||
add_compile_options(-Wno-pragmas)
|
||||
add_compile_options(-mavx)
|
||||
# add_compile_options(-fno-builtin-memcpy)
|
||||
# Intentionally using builtin memcpy. G++ does a good job on small memcpy's when the size is known at runtime.
|
||||
# If the size is not known, then it falls back on the memcpy that's available at runtime (rte_memcpy, as of this
|
||||
# writing; see flow.cpp).
|
||||
#
|
||||
# The downside of the builtin memcpy is that it's slower at large copies, so if we spend a lot of time on large
|
||||
# copies of sizes that are known at compile time, this might not be a win. See the output of performance/memcpy
|
||||
# for more information.
|
||||
#add_compile_options(-fno-builtin-memcpy)
|
||||
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
|
||||
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
|
||||
add_compile_options(-Wno-attributes)
|
||||
|
|
|
@ -21,9 +21,20 @@
|
|||
#include "flow/flow.h"
|
||||
#include "flow/DeterministicRandom.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "flow/rte_memcpy.h"
|
||||
#include "flow/folly_memcpy.h"
|
||||
#include <stdarg.h>
|
||||
#include <cinttypes>
|
||||
|
||||
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) {
|
||||
return rte_memcpy(__dest, __src, __n);
|
||||
}
|
||||
|
||||
// This compilation unit will be linked in to the main binary, so this should override glibc memcpy
|
||||
__attribute__((visibility ("default"))) void *memcpy (void *__restrict __dest, const void *__restrict __src, size_t __n) {
|
||||
return rte_memcpy(__dest, __src, __n);
|
||||
}
|
||||
|
||||
INetwork *g_network = 0;
|
||||
|
||||
FILE* randLog = 0;
|
||||
|
|
|
@ -50,6 +50,9 @@ extern "C" {
|
|||
static force_inline void *
|
||||
rte_memcpy(void *dst, const void *src, size_t n);
|
||||
|
||||
//#define RTE_MACHINE_CPUFLAG_AVX512F
|
||||
#define RTE_MACHINE_CPUFLAG_AVX2
|
||||
|
||||
#ifdef RTE_MACHINE_CPUFLAG_AVX512F
|
||||
|
||||
#define ALIGNMENT_MASK 0x3F
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "flow/folly_memcpy.h"
|
||||
#include "flow/rte_memcpy.h"
|
||||
#include "flow/IRandom.h"
|
||||
|
||||
|
|
|
@ -11,11 +11,15 @@
|
|||
#include "flow/rte_memcpy.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "flow/flow.h"
|
||||
|
||||
extern "C" {
|
||||
void* folly_memcpy(void* dst, void* src, uint32_t length);
|
||||
void* folly_memcpy(void* dst, const void* src, uint32_t length);
|
||||
}
|
||||
|
||||
|
||||
void * rte_memcpy_noinline(void* dst, const void* src, size_t length); // for performance comparisons
|
||||
|
||||
/*
|
||||
* Set this to the maximum buffer size you want to test. If it is 0, then the
|
||||
* values in the buf_sizes[] array below will be used.
|
||||
|
@ -170,7 +174,7 @@ do_uncached_write(uint8_t *dst, int is_dst_cached,
|
|||
fill_addr_arrays(dst_addrs, is_dst_cached, 0,
|
||||
src_addrs, is_src_cached, 0);
|
||||
for (j = 0; j < TEST_BATCH_SIZE; j++) {
|
||||
rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
|
||||
memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -191,7 +195,7 @@ do { \
|
|||
src_addrs, is_src_cached, src_uoffset); \
|
||||
start_time = rte_rdtsc(); \
|
||||
for (t = 0; t < TEST_BATCH_SIZE; t++) \
|
||||
rte_memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \
|
||||
rte_memcpy_noinline(dst+dst_addrs[t], src+src_addrs[t], size); \
|
||||
total_time += rte_rdtsc() - start_time; \
|
||||
} \
|
||||
for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
|
||||
|
|
Loading…
Reference in New Issue