port rte_memcpy to flow; add -mavx compiler flag
This commit is contained in:
parent
a910fa9ac7
commit
678b57c0d9
|
@ -254,7 +254,7 @@ else()
|
|||
endif()
|
||||
if (GCC)
|
||||
add_compile_options(-Wno-pragmas)
|
||||
|
||||
add_compile_options(-mavx)
|
||||
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
|
||||
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
|
||||
add_compile_options(-Wno-attributes)
|
||||
|
|
|
@ -26,6 +26,8 @@ void forceLinkIndexedSetTests();
|
|||
void forceLinkDequeTests();
|
||||
void forceLinkFlowTests();
|
||||
void forceLinkVersionedMapTests();
|
||||
void forceLinkMemcpyTests();
|
||||
void forceLinkMemcpyPerfTests();
|
||||
|
||||
struct UnitTestWorkload : TestWorkload {
|
||||
bool enabled;
|
||||
|
@ -45,6 +47,8 @@ struct UnitTestWorkload : TestWorkload {
|
|||
forceLinkDequeTests();
|
||||
forceLinkFlowTests();
|
||||
forceLinkVersionedMapTests();
|
||||
forceLinkMemcpyTests();
|
||||
forceLinkMemcpyPerfTests();
|
||||
}
|
||||
|
||||
virtual std::string description() { return "UnitTests"; }
|
||||
|
|
|
@ -67,7 +67,7 @@ set(FLOW_SRCS
|
|||
XmlTraceLogFormatter.cpp
|
||||
XmlTraceLogFormatter.h
|
||||
actorcompiler.h
|
||||
crc32c.h
|
||||
crc32c.h
|
||||
crc32c.cpp
|
||||
error_definitions.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h
|
||||
|
@ -79,10 +79,13 @@ set(FLOW_SRCS
|
|||
genericactors.actor.h
|
||||
network.cpp
|
||||
network.h
|
||||
rte_memcpy.h
|
||||
serialize.cpp
|
||||
serialize.h
|
||||
stacktrace.amalgamation.cpp
|
||||
stacktrace.h
|
||||
test_memcpy.cpp
|
||||
test_memcpy_perf.cpp
|
||||
version.cpp)
|
||||
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)
|
||||
|
|
|
@ -25,9 +25,8 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <rte_vect.h>
|
||||
#include <rte_common.h>
|
||||
#include <rte_config.h>
|
||||
|
||||
#include <flow/Platform.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -48,7 +47,7 @@ extern "C" {
|
|||
* @return
|
||||
* Pointer to the destination data.
|
||||
*/
|
||||
static __rte_always_inline void *
|
||||
static force_inline void *
|
||||
rte_memcpy(void *dst, const void *src, size_t n);
|
||||
|
||||
#ifdef RTE_MACHINE_CPUFLAG_AVX512F
|
||||
|
@ -63,7 +62,7 @@ rte_memcpy(void *dst, const void *src, size_t n);
|
|||
* Copy 16 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov16(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
__m128i xmm0;
|
||||
|
@ -76,7 +75,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 32 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov32(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
__m256i ymm0;
|
||||
|
@ -89,7 +88,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 64 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov64(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
__m512i zmm0;
|
||||
|
@ -102,7 +101,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 128 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov128(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
rte_mov64(dst + 0 * 64, src + 0 * 64);
|
||||
|
@ -113,7 +112,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 256 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov256(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
rte_mov64(dst + 0 * 64, src + 0 * 64);
|
||||
|
@ -126,7 +125,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 128-byte blocks from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
|
||||
{
|
||||
__m512i zmm0, zmm1;
|
||||
|
@ -174,7 +173,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
|
|||
}
|
||||
}
|
||||
|
||||
static __rte_always_inline void *
|
||||
static force_inline void *
|
||||
rte_memcpy_generic(void *dst, const void *src, size_t n)
|
||||
{
|
||||
uintptr_t dstu = (uintptr_t)dst;
|
||||
|
@ -304,7 +303,7 @@ COPY_BLOCK_128_BACK63:
|
|||
* Copy 16 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov16(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
__m128i xmm0;
|
||||
|
@ -317,7 +316,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 32 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov32(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
__m256i ymm0;
|
||||
|
@ -330,7 +329,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 64 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov64(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
|
||||
|
@ -341,7 +340,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 128 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov128(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
|
||||
|
@ -354,7 +353,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 128-byte blocks from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
|
||||
{
|
||||
__m256i ymm0, ymm1, ymm2, ymm3;
|
||||
|
@ -374,7 +373,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
|
|||
}
|
||||
}
|
||||
|
||||
static __rte_always_inline void *
|
||||
static force_inline void *
|
||||
rte_memcpy_generic(void *dst, const void *src, size_t n)
|
||||
{
|
||||
uintptr_t dstu = (uintptr_t)dst;
|
||||
|
@ -497,7 +496,7 @@ COPY_BLOCK_128_BACK31:
|
|||
* Copy 16 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov16(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
__m128i xmm0;
|
||||
|
@ -510,7 +509,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 32 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov32(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
|
||||
|
@ -521,7 +520,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 64 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov64(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
|
||||
|
@ -534,7 +533,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
|
|||
* Copy 128 bytes from one location to another,
|
||||
* locations should not overlap.
|
||||
*/
|
||||
static __rte_always_inline void
|
||||
static force_inline void
|
||||
rte_mov128(uint8_t *dst, const uint8_t *src)
|
||||
{
|
||||
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
|
||||
|
@ -666,7 +665,7 @@ __extension__ ({ \
|
|||
} \
|
||||
})
|
||||
|
||||
static __rte_always_inline void *
|
||||
static force_inline void *
|
||||
rte_memcpy_generic(void *dst, const void *src, size_t n)
|
||||
{
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
|
||||
|
@ -811,7 +810,7 @@ COPY_BLOCK_64_BACK15:
|
|||
|
||||
#endif /* RTE_MACHINE_CPUFLAG */
|
||||
|
||||
static __rte_always_inline void *
|
||||
static force_inline void *
|
||||
rte_memcpy_aligned(void *dst, const void *src, size_t n)
|
||||
{
|
||||
void *ret = dst;
|
||||
|
@ -871,7 +870,7 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static __rte_always_inline void *
|
||||
static force_inline void *
|
||||
rte_memcpy(void *dst, const void *src, size_t n)
|
||||
{
|
||||
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
|
||||
|
@ -880,6 +879,23 @@ rte_memcpy(void *dst, const void *src, size_t n)
|
|||
return rte_memcpy_generic(dst, src, n);
|
||||
}
|
||||
|
||||
static inline uint64_t
|
||||
rte_rdtsc(void)
|
||||
{
|
||||
union {
|
||||
uint64_t tsc_64;
|
||||
struct {
|
||||
uint32_t lo_32;
|
||||
uint32_t hi_32;
|
||||
};
|
||||
} tsc;
|
||||
|
||||
asm volatile("rdtsc" :
|
||||
"=a" (tsc.lo_32),
|
||||
"=d" (tsc.hi_32));
|
||||
return tsc.tsc_64;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -7,11 +7,10 @@
|
|||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <rte_common.h>
|
||||
#include <rte_random.h>
|
||||
#include <rte_memcpy.h>
|
||||
#include "flow/rte_memcpy.h"
|
||||
#include "flow/IRandom.h"
|
||||
|
||||
#include "test.h"
|
||||
#include "flow/UnitTest.h"
|
||||
|
||||
/*
|
||||
* Set this to the maximum buffer size you want to test. If it is 0, then the
|
||||
|
@ -54,7 +53,7 @@ test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
|
|||
/* Setup buffers */
|
||||
for (i = 0; i < SMALL_BUFFER_SIZE + ALIGNMENT_UNIT; i++) {
|
||||
dest[i] = 0;
|
||||
src[i] = (uint8_t) rte_rand();
|
||||
src[i] = (uint8_t) deterministicRandom()->randomUInt32();
|
||||
}
|
||||
|
||||
/* Do the copy */
|
||||
|
@ -99,9 +98,7 @@ test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
|
|||
/*
|
||||
* Check functionality for various buffer sizes and data offsets/alignments.
|
||||
*/
|
||||
static int
|
||||
func_test(void)
|
||||
{
|
||||
TEST_CASE("/rte/memcpy") {
|
||||
unsigned int off_src, off_dst, i;
|
||||
unsigned int num_buf_sizes = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
|
||||
int ret;
|
||||
|
@ -111,23 +108,11 @@ func_test(void)
|
|||
for (i = 0; i < num_buf_sizes; i++) {
|
||||
ret = test_single_memcpy(off_src, off_dst,
|
||||
buf_sizes[i]);
|
||||
if (ret != 0)
|
||||
return -1;
|
||||
ASSERT(ret == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
return Void();
|
||||
}
|
||||
|
||||
static int
|
||||
test_memcpy(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = func_test();
|
||||
if (ret != 0)
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_TEST_COMMAND(memcpy_autotest, test_memcpy);
|
||||
void forceLinkMemcpyTests() { }
|
|
@ -8,14 +8,9 @@
|
|||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <rte_common.h>
|
||||
#include <rte_cycles.h>
|
||||
#include <rte_random.h>
|
||||
#include <rte_malloc.h>
|
||||
|
||||
#include <rte_memcpy.h>
|
||||
|
||||
#include "test.h"
|
||||
#include "flow/rte_memcpy.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/UnitTest.h"
|
||||
|
||||
/*
|
||||
* Set this to the maximum buffer size you want to test. If it is 0, then the
|
||||
|
@ -67,6 +62,20 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
|
|||
static uint8_t *large_buf_read, *large_buf_write;
|
||||
static uint8_t *small_buf_read, *small_buf_write;
|
||||
|
||||
static size_t round_up(size_t sz, size_t alignment) {
|
||||
return (((sz - 1) / alignment) + 1) * alignment;
|
||||
}
|
||||
|
||||
static uint8_t * rte_malloc(char const * ignored, size_t sz, size_t align) {
|
||||
return (uint8_t*) aligned_alloc(align, round_up(sz, align));
|
||||
}
|
||||
|
||||
static void rte_free(void * ptr) {
|
||||
if (!!ptr) {
|
||||
free(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Initialise data buffers. */
|
||||
static int
|
||||
init_buffers(void)
|
||||
|
@ -90,9 +99,9 @@ init_buffers(void)
|
|||
goto error_small_buf_write;
|
||||
|
||||
for (i = 0; i < LARGE_BUFFER_SIZE; i++)
|
||||
large_buf_read[i] = rte_rand();
|
||||
large_buf_read[i] = deterministicRandom()->randomUInt32();
|
||||
for (i = 0; i < SMALL_BUFFER_SIZE; i++)
|
||||
small_buf_read[i] = rte_rand();
|
||||
small_buf_read[i] = deterministicRandom()->randomUInt32();
|
||||
|
||||
return 0;
|
||||
|
||||
|
@ -124,7 +133,7 @@ free_buffers(void)
|
|||
static inline size_t
|
||||
get_rand_offset(size_t uoffset)
|
||||
{
|
||||
return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
|
||||
return ((deterministicRandom()->randomUInt32() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
|
||||
~(ALIGNMENT_UNIT - 1)) + uoffset;
|
||||
}
|
||||
|
||||
|
@ -269,17 +278,14 @@ perf_test_variable_unaligned(void)
|
|||
}
|
||||
|
||||
/* Run all memcpy tests */
|
||||
static int
|
||||
perf_test(void)
|
||||
{
|
||||
TEST_CASE("performance/memcpy/rte") {
|
||||
int ret;
|
||||
struct timeval tv_begin, tv_end;
|
||||
double time_aligned, time_unaligned;
|
||||
double time_aligned_const, time_unaligned_const;
|
||||
|
||||
ret = init_buffers();
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
ASSERT(ret == 0);
|
||||
|
||||
#if TEST_VALUE_RANGE != 0
|
||||
/* Set up buf_sizes array, if required */
|
||||
|
@ -335,18 +341,7 @@ perf_test(void)
|
|||
printf("Unaligned constant copy size = %8.3f\n", time_unaligned_const);
|
||||
free_buffers();
|
||||
|
||||
return 0;
|
||||
return Void();
|
||||
}
|
||||
|
||||
static int
|
||||
test_memcpy_perf(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = perf_test();
|
||||
if (ret != 0)
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_TEST_COMMAND(memcpy_perf_autotest, test_memcpy_perf);
|
||||
void forceLinkMemcpyPerfTests() {}
|
Loading…
Reference in New Issue