port rte_memcpy to flow; add -mavx compiler flag

This commit is contained in:
Russell Sears 2020-04-27 11:00:46 -07:00
parent a910fa9ac7
commit 678b57c0d9
6 changed files with 81 additions and 78 deletions

View File

@ -254,7 +254,7 @@ else()
endif()
if (GCC)
add_compile_options(-Wno-pragmas)
add_compile_options(-mavx)
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
add_compile_options(-Wno-attributes)

View File

@ -26,6 +26,8 @@ void forceLinkIndexedSetTests();
void forceLinkDequeTests();
void forceLinkFlowTests();
void forceLinkVersionedMapTests();
void forceLinkMemcpyTests();
void forceLinkMemcpyPerfTests();
struct UnitTestWorkload : TestWorkload {
bool enabled;
@ -45,6 +47,8 @@ struct UnitTestWorkload : TestWorkload {
forceLinkDequeTests();
forceLinkFlowTests();
forceLinkVersionedMapTests();
forceLinkMemcpyTests();
forceLinkMemcpyPerfTests();
}
virtual std::string description() { return "UnitTests"; }

View File

@ -67,7 +67,7 @@ set(FLOW_SRCS
XmlTraceLogFormatter.cpp
XmlTraceLogFormatter.h
actorcompiler.h
crc32c.h
crc32c.h
crc32c.cpp
error_definitions.h
${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h
@ -79,10 +79,13 @@ set(FLOW_SRCS
genericactors.actor.h
network.cpp
network.h
rte_memcpy.h
serialize.cpp
serialize.h
stacktrace.amalgamation.cpp
stacktrace.h
test_memcpy.cpp
test_memcpy_perf.cpp
version.cpp)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)

View File

@ -25,9 +25,8 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>
#include <flow/Platform.h>
#ifdef __cplusplus
extern "C" {
@ -48,7 +47,7 @@ extern "C" {
* @return
* Pointer to the destination data.
*/
static __rte_always_inline void *
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
#ifdef RTE_MACHINE_CPUFLAG_AVX512F
@ -63,7 +62,7 @@ rte_memcpy(void *dst, const void *src, size_t n);
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
@ -76,7 +75,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
__m256i ymm0;
@ -89,7 +88,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
__m512i zmm0;
@ -102,7 +101,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov64(dst + 0 * 64, src + 0 * 64);
@ -113,7 +112,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
* Copy 256 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
rte_mov64(dst + 0 * 64, src + 0 * 64);
@ -126,7 +125,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
* Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m512i zmm0, zmm1;
@ -174,7 +173,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
}
}
static __rte_always_inline void *
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
@ -304,7 +303,7 @@ COPY_BLOCK_128_BACK63:
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
@ -317,7 +316,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
__m256i ymm0;
@ -330,7 +329,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
@ -341,7 +340,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
@ -354,7 +353,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
* Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m256i ymm0, ymm1, ymm2, ymm3;
@ -374,7 +373,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
}
}
static __rte_always_inline void *
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
@ -497,7 +496,7 @@ COPY_BLOCK_128_BACK31:
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
@ -510,7 +509,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
@ -521,7 +520,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
@ -534,7 +533,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static __rte_always_inline void
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
@ -666,7 +665,7 @@ __extension__ ({ \
} \
})
static __rte_always_inline void *
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
@ -811,7 +810,7 @@ COPY_BLOCK_64_BACK15:
#endif /* RTE_MACHINE_CPUFLAG */
static __rte_always_inline void *
static force_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
void *ret = dst;
@ -871,7 +870,7 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
return ret;
}
static __rte_always_inline void *
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
@ -880,6 +879,23 @@ rte_memcpy(void *dst, const void *src, size_t n)
return rte_memcpy_generic(dst, src, n);
}
static inline uint64_t
rte_rdtsc(void)
{
union {
uint64_t tsc_64;
struct {
uint32_t lo_32;
uint32_t hi_32;
};
} tsc;
asm volatile("rdtsc" :
"=a" (tsc.lo_32),
"=d" (tsc.hi_32));
return tsc.tsc_64;
}
#ifdef __cplusplus
}
#endif

View File

@ -7,11 +7,10 @@
#include <string.h>
#include <stdlib.h>
#include <rte_common.h>
#include <rte_random.h>
#include <rte_memcpy.h>
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "test.h"
#include "flow/UnitTest.h"
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
@ -54,7 +53,7 @@ test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
/* Setup buffers */
for (i = 0; i < SMALL_BUFFER_SIZE + ALIGNMENT_UNIT; i++) {
dest[i] = 0;
src[i] = (uint8_t) rte_rand();
src[i] = (uint8_t) deterministicRandom()->randomUInt32();
}
/* Do the copy */
@ -99,9 +98,7 @@ test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
/*
* Check functionality for various buffer sizes and data offsets/alignments.
*/
static int
func_test(void)
{
TEST_CASE("/rte/memcpy") {
unsigned int off_src, off_dst, i;
unsigned int num_buf_sizes = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
int ret;
@ -111,23 +108,11 @@ func_test(void)
for (i = 0; i < num_buf_sizes; i++) {
ret = test_single_memcpy(off_src, off_dst,
buf_sizes[i]);
if (ret != 0)
return -1;
ASSERT(ret == 0);
}
}
}
return 0;
return Void();
}
static int
test_memcpy(void)
{
int ret;
ret = func_test();
if (ret != 0)
return -1;
return 0;
}
REGISTER_TEST_COMMAND(memcpy_autotest, test_memcpy);
void forceLinkMemcpyTests() { }

View File

@ -8,14 +8,9 @@
#include <stdlib.h>
#include <sys/time.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_random.h>
#include <rte_malloc.h>
#include <rte_memcpy.h>
#include "test.h"
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
@ -67,6 +62,20 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
static uint8_t *large_buf_read, *large_buf_write;
static uint8_t *small_buf_read, *small_buf_write;
static size_t round_up(size_t sz, size_t alignment) {
return (((sz - 1) / alignment) + 1) * alignment;
}
static uint8_t * rte_malloc(char const * ignored, size_t sz, size_t align) {
return (uint8_t*) aligned_alloc(align, round_up(sz, align));
}
static void rte_free(void * ptr) {
if (!!ptr) {
free(ptr);
}
}
/* Initialise data buffers. */
static int
init_buffers(void)
@ -90,9 +99,9 @@ init_buffers(void)
goto error_small_buf_write;
for (i = 0; i < LARGE_BUFFER_SIZE; i++)
large_buf_read[i] = rte_rand();
large_buf_read[i] = deterministicRandom()->randomUInt32();
for (i = 0; i < SMALL_BUFFER_SIZE; i++)
small_buf_read[i] = rte_rand();
small_buf_read[i] = deterministicRandom()->randomUInt32();
return 0;
@ -124,7 +133,7 @@ free_buffers(void)
static inline size_t
get_rand_offset(size_t uoffset)
{
return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
return ((deterministicRandom()->randomUInt32() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
~(ALIGNMENT_UNIT - 1)) + uoffset;
}
@ -269,17 +278,14 @@ perf_test_variable_unaligned(void)
}
/* Run all memcpy tests */
static int
perf_test(void)
{
TEST_CASE("performance/memcpy/rte") {
int ret;
struct timeval tv_begin, tv_end;
double time_aligned, time_unaligned;
double time_aligned_const, time_unaligned_const;
ret = init_buffers();
if (ret != 0)
return ret;
ASSERT(ret == 0);
#if TEST_VALUE_RANGE != 0
/* Set up buf_sizes array, if required */
@ -335,18 +341,7 @@ perf_test(void)
printf("Unaligned constant copy size = %8.3f\n", time_unaligned_const);
free_buffers();
return 0;
return Void();
}
static int
test_memcpy_perf(void)
{
int ret;
ret = perf_test();
if (ret != 0)
return -1;
return 0;
}
REGISTER_TEST_COMMAND(memcpy_perf_autotest, test_memcpy_perf);
void forceLinkMemcpyPerfTests() {}