Merge pull request #3089 from sears/memcpy

Memcpy
Russell Sears 2020-06-04 15:50:23 -07:00 committed by GitHub
commit e7d72f458c
10 changed files with 1695 additions and 2 deletions


@@ -536,3 +536,51 @@ sse2neon Authors (sse2neon)
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
rte_memcpy.h (from DPDK):
SPDX-License-Identifier: BSD-3-Clause
Copyright(c) 2010-2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
folly_memcpy:
Copyright (c) Facebook, Inc. and its affiliates.
Author: Bin Liu <binliu@fb.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -209,6 +209,25 @@ else()
# -mavx
# -msse4.2)
# Tentatively re-enabling vector instructions
set(USE_AVX512F OFF CACHE BOOL "Enable AVX 512F instructions")
if (USE_AVX512F)
add_compile_options(-mavx512f)
endif()
set(USE_AVX ON CACHE BOOL "Enable AVX instructions")
if (USE_AVX)
add_compile_options(-mavx)
endif()
# Intentionally using builtin memcpy. G++ does a good job on small memcpy calls when the size is known at compile time.
# If the size is not known, then it falls back on the memcpy that's available at runtime (rte_memcpy, as of this
# writing; see flow.cpp).
#
# The downside of the builtin memcpy is that it's slower at large copies, so if we spend a lot of time on large
# copies of sizes that are known at compile time, this might not be a win. See the output of performance/memcpy
# for more information.
#add_compile_options(-fno-builtin-memcpy)
if (USE_VALGRIND)
add_compile_options(-DVALGRIND -DUSE_VALGRIND)
endif()
@@ -254,7 +273,6 @@ else()
endif()
if (GCC)
add_compile_options(-Wno-pragmas)
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
add_compile_options(-Wno-attributes)
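
The builtin-memcpy note in the hunk above comes down to two cases: when the compiler can see a constant size it expands the copy inline, and when the size is only known at runtime the call goes to whatever memcpy symbol is linked into the binary (rte_memcpy here, via the override in flow.cpp). A minimal sketch of the two cases, assuming GCC/Clang on x86-64; the names are illustrative only:

#include <cstddef>
#include <cstring>

struct Header { char bytes[16]; };

void copyHeader(Header* dst, const Header* src) {
	// sizeof(Header) is a compile-time constant, so the builtin memcpy
	// typically lowers this to two 8-byte loads and stores, with no call.
	memcpy(dst, src, sizeof(Header));
}

void copyValue(char* dst, const char* src, size_t len) {
	// len is only known at runtime, so this compiles to a call to the
	// memcpy symbol chosen at link time (rte_memcpy in this build).
	memcpy(dst, src, len);
}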


@@ -26,6 +26,8 @@ void forceLinkIndexedSetTests();
void forceLinkDequeTests();
void forceLinkFlowTests();
void forceLinkVersionedMapTests();
void forceLinkMemcpyTests();
void forceLinkMemcpyPerfTests();
struct UnitTestWorkload : TestWorkload {
bool enabled;
@@ -45,6 +47,8 @@ struct UnitTestWorkload : TestWorkload {
forceLinkDequeTests();
forceLinkFlowTests();
forceLinkVersionedMapTests();
forceLinkMemcpyTests();
forceLinkMemcpyPerfTests();
}
virtual std::string description() { return "UnitTests"; }


@@ -67,7 +67,7 @@ set(FLOW_SRCS
XmlTraceLogFormatter.cpp
XmlTraceLogFormatter.h
actorcompiler.h
crc32c.h
crc32c.cpp
error_definitions.h
${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h
@@ -75,14 +75,18 @@ set(FLOW_SRCS
flat_buffers.h
flow.cpp
flow.h
folly_memcpy.S
genericactors.actor.cpp
genericactors.actor.h
network.cpp
network.h
rte_memcpy.h
serialize.cpp
serialize.h
stacktrace.amalgamation.cpp
stacktrace.h
test_memcpy.cpp
test_memcpy_perf.cpp
version.cpp)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)


@@ -21,9 +21,28 @@
#include "flow/flow.h"
#include "flow/DeterministicRandom.h"
#include "flow/UnitTest.h"
#include "flow/rte_memcpy.h"
#include "flow/folly_memcpy.h"
#include <stdarg.h>
#include <cinttypes>
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
// For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test.
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) {
return rte_memcpy(__dest, __src, __n);
}
// This compilation unit will be linked into the main binary, so this should override glibc memcpy
__attribute__((visibility ("default"))) void *memcpy (void *__restrict __dest, const void *__restrict __src, size_t __n) {
// folly_memcpy is faster for small copies, but rte seems to win out in most other circumstances
return rte_memcpy(__dest, __src, __n);
}
#else
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) {
return memcpy(__dest, __src, __n);
}
#endif // (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
INetwork *g_network = 0;
FILE* randLog = 0;
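
The two functions added above give callers a strong memcpy definition (overriding glibc's at link time on the AVX Linux/FreeBSD builds) plus rte_memcpy_noinline, which the benchmarks use so the compiler cannot inline or fold the call away. A hypothetical caller, just to show the intended usage; the buffer arguments are made up:

#include <cstddef>
#include <cstring>

void* rte_memcpy_noinline(void* dst, const void* src, size_t length); // defined in flow.cpp

void copyBoth(char* dst, const char* src, size_t n) {
	memcpy(dst, src, n);              // resolves to the overriding definition above
	rte_memcpy_noinline(dst, src, n); // always an out-of-line call, handy for timing
}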

flow/folly_memcpy.S Normal file

@@ -0,0 +1,178 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* memcpy: An optimized memcpy implementation for x86_64. It uses AVX when
* __AVX__ is defined, and uses SSE2 otherwise.
*
* @author Bin Liu <binliu@fb.com>
*/
#if defined(__x86_64__) && defined(__linux__) && !defined(__CYGWIN__)
.file "memcpy.S"
.text
/*
* _memcpy_short is a local helper used when length < 8. It cannot be called
* from outside, because it expects a non-standard calling convention:
*
* %rax: destination buffer address.
* %rsi: source buffer address.
* %edx: length, in the range of [0, 7]
*/
.type _memcpy_short, @function
_memcpy_short:
.LSHORT:
.cfi_startproc
// if (length == 0) return;
test %edx, %edx
jz .LEND
movzbl (%rsi), %ecx
// if (length - 4 < 0) goto LS4;
sub $4, %edx
jb .LS4
mov (%rsi), %ecx
mov (%rsi, %rdx), %edi
mov %ecx, (%rax)
mov %edi, (%rax, %rdx)
.LEND:
rep
ret
nop
.LS4:
// At this point, length can be 1 or 2 or 3, and $cl contains
// the first byte.
mov %cl, (%rax)
// if (length - 4 + 2 < 0) return;
add $2, %edx
jnc .LEND
// length is 2 or 3 here. In either case, just copy the last
// two bytes.
movzwl (%rsi, %rdx), %ecx
mov %cx, (%rax, %rdx)
ret
.cfi_endproc
.size _memcpy_short, .-_memcpy_short
/*
* void* folly_memcpy(void* dst, const void* src, uint32_t length);
*
*/
.align 16
.globl folly_memcpy
.type folly_memcpy, @function
folly_memcpy:
.cfi_startproc
mov %rdx, %rcx
mov %rdi, %rax
cmp $8, %rdx
jb .LSHORT
mov -8(%rsi, %rdx), %r8
mov (%rsi), %r9
mov %r8, -8(%rdi, %rdx)
and $24, %rcx
jz .L32
mov %r9, (%rdi)
mov %rcx, %r8
sub $16, %rcx
jb .LT32
#ifndef __AVX__
movdqu (%rsi, %rcx), %xmm1
movdqu %xmm1, (%rdi, %rcx)
#else
vmovdqu (%rsi, %rcx), %xmm1
vmovdqu %xmm1, (%rdi, %rcx)
#endif
// Test if there are 32-byte groups
.LT32:
add %r8, %rsi
and $-32, %rdx
jnz .L32_adjDI
ret
.align 16
.L32_adjDI:
add %r8, %rdi
.L32:
#ifndef __AVX__
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
#else
vmovdqu (%rsi), %ymm0
#endif
shr $6, %rdx
jnc .L64_32read
#ifndef __AVX__
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
#else
vmovdqu %ymm0, (%rdi)
#endif
lea 32(%rsi), %rsi
jnz .L64_adjDI
#ifdef __AVX__
vzeroupper
#endif
ret
.L64_adjDI:
add $32, %rdi
.L64:
#ifndef __AVX__
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
#else
vmovdqu (%rsi), %ymm0
#endif
.L64_32read:
#ifndef __AVX__
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
add $64, %rsi
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
#else
vmovdqu 32(%rsi), %ymm1
add $64, %rsi
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, 32(%rdi)
#endif
add $64, %rdi
dec %rdx
jnz .L64
#ifdef __AVX__
vzeroupper
#endif
ret
.cfi_endproc
.size folly_memcpy, .-folly_memcpy
#endif
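
For readers who prefer C, the short-copy path above (_memcpy_short, n < 8) amounts to overlapping head and tail copies rather than a byte loop. A rough C rendering, for illustration only and not a drop-in replacement for the assembly:

#include <stdint.h>
#include <string.h>

/* Sketch of _memcpy_short: n is in [0, 7]. */
static void* memcpy_short_sketch(void* dst, const void* src, size_t n) {
	unsigned char* d = (unsigned char*)dst;
	const unsigned char* s = (const unsigned char*)src;
	if (n == 0) return dst;
	if (n >= 4) {
		uint32_t head, tail;
		memcpy(&head, s, 4);
		memcpy(&tail, s + n - 4, 4); /* overlaps head when n < 8; that's fine */
		memcpy(d, &head, 4);
		memcpy(d + n - 4, &tail, 4);
		return dst;
	}
	d[0] = s[0]; /* n is 1, 2, or 3 */
	if (n > 1) {
		uint16_t tail;
		memcpy(&tail, s + n - 2, 2); /* last two bytes; overlap with d[0] is fine */
		memcpy(d + n - 2, &tail, 2);
	}
	return dst;
}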

flow/folly_memcpy.h Normal file

@@ -0,0 +1,33 @@
/*
* folly_memcpy.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLOW_FOLLY_MEMCPY_H
#define FLOW_FOLLY_MEMCPY_H
#pragma once
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
extern "C" {
void* folly_memcpy(void* dst, const void* src, uint32_t length);
}
#endif // linux or bsd and avx
#endif

flow/rte_memcpy.h Normal file

@@ -0,0 +1,913 @@
/*
SPDX-License-Identifier: BSD-3-Clause
Copyright(c) 2010-2014 Intel Corporation
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_
/**
* @file
*
* Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <flow/Platform.h>
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
#ifdef __cplusplus
extern "C" {
#endif
/**
* Copy bytes from one location to another. The locations must not overlap.
*
* @note This is implemented as a macro, so its address should not be taken
* and care is needed as parameter expressions may be evaluated multiple times.
*
* @param dst
* Pointer to the destination of the data.
* @param src
* Pointer to the source data.
* @param n
* Number of bytes to copy.
* @return
* Pointer to the destination data.
*/
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
#ifdef __AVX512F__
#define RTE_MACHINE_CPUFLAG_AVX512F
#elif defined(__AVX__)
#define RTE_MACHINE_CPUFLAG_AVX2
#endif
#ifdef RTE_MACHINE_CPUFLAG_AVX512F
#define ALIGNMENT_MASK 0x3F
/**
* AVX512 implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
__m256i ymm0;
ymm0 = _mm256_loadu_si256((const __m256i *)src);
_mm256_storeu_si256((__m256i *)dst, ymm0);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
__m512i zmm0;
zmm0 = _mm512_loadu_si512((const void *)src);
_mm512_storeu_si512((void *)dst, zmm0);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov64(dst + 0 * 64, src + 0 * 64);
rte_mov64(dst + 1 * 64, src + 1 * 64);
}
/**
* Copy 256 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
rte_mov64(dst + 0 * 64, src + 0 * 64);
rte_mov64(dst + 1 * 64, src + 1 * 64);
rte_mov64(dst + 2 * 64, src + 2 * 64);
rte_mov64(dst + 3 * 64, src + 3 * 64);
}
/**
* Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m512i zmm0, zmm1;
while (n >= 128) {
zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
n -= 128;
zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
src = src + 128;
_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
dst = dst + 128;
}
}
/**
* Copy 512-byte blocks from one location to another,
* locations should not overlap.
*/
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
while (n >= 512) {
zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
n -= 512;
zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
src = src + 512;
_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
dst = dst + 512;
}
}
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08)
*(uint64_t *)dstu = *(const uint64_t *)srcu;
return ret;
}
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n <= 512) {
if (n >= 256) {
n -= 256;
rte_mov256((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 256;
dst = (uint8_t *)dst + 256;
}
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK63:
if (n > 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
if (n > 0)
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
/**
* Make store aligned when copy size exceeds 512 bytes
*/
dstofss = ((uintptr_t)dst & 0x3F);
if (dstofss > 0) {
dstofss = 64 - dstofss;
n -= dstofss;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
/**
* Copy 512-byte blocks.
* Use copy block function for better instruction order control,
* which is important when load is unaligned.
*/
rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 511;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
/**
* Copy 128-byte blocks.
* Use copy block function for better instruction order control,
* which is important when load is unaligned.
*/
if (n >= 128) {
rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 127;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
}
/**
* Copy whatever left
*/
goto COPY_BLOCK_128_BACK63;
}
#elif defined RTE_MACHINE_CPUFLAG_AVX2
#define ALIGNMENT_MASK 0x1F
/**
* AVX2 implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
__m256i ymm0;
ymm0 = _mm256_loadu_si256((const __m256i *)src);
_mm256_storeu_si256((__m256i *)dst, ymm0);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}
/**
* Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m256i ymm0, ymm1, ymm2, ymm3;
while (n >= 128) {
ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
n -= 128;
ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
src = (const uint8_t *)src + 128;
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
dst = (uint8_t *)dst + 128;
}
}
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08) {
*(uint64_t *)dstu = *(const uint64_t *)srcu;
}
return ret;
}
/**
* Fast way when copy size doesn't exceed 256 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 48) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n <= 256) {
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK31:
if (n >= 64) {
n -= 64;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 64;
dst = (uint8_t *)dst + 64;
}
if (n > 32) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n > 0) {
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
}
return ret;
}
/**
* Make store aligned when copy size exceeds 256 bytes
*/
dstofss = (uintptr_t)dst & 0x1F;
if (dstofss > 0) {
dstofss = 32 - dstofss;
n -= dstofss;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
/**
* Copy 128-byte blocks
*/
rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 127;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
/**
* Copy whatever left
*/
goto COPY_BLOCK_128_BACK31;
}
#else /* RTE_MACHINE_CPUFLAG */
#define ALIGNMENT_MASK 0x0F
/**
* SSE & AVX implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}
/**
* Copy 256 bytes from one location to another,
* locations should not overlap.
*/
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}
/**
* Macro for copying unaligned block from one location to another with constant load offset,
* 47 bytes leftover maximum,
* locations should not overlap.
* Requirements:
* - Store is aligned
* - Load offset is <offset>, which must be immediate value within [1, 15]
* - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
* - <dst>, <src>, <len> must be variables
* - __m128i <xmm0> ~ <xmm8> must be pre-defined
*/
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
size_t tmp; \
while (len >= 128 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
len -= 128; \
xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
src = (const uint8_t *)src + 128; \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
dst = (uint8_t *)dst + 128; \
} \
tmp = len; \
len = ((len - 16 + offset) & 127) + 16 - offset; \
tmp -= len; \
src = (const uint8_t *)src + tmp; \
dst = (uint8_t *)dst + tmp; \
if (len >= 32 + 16 - offset) { \
while (len >= 32 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
len -= 32; \
xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
src = (const uint8_t *)src + 32; \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
dst = (uint8_t *)dst + 32; \
} \
tmp = len; \
len = ((len - 16 + offset) & 31) + 16 - offset; \
tmp -= len; \
src = (const uint8_t *)src + tmp; \
dst = (uint8_t *)dst + tmp; \
} \
})
/**
* Macro for copying unaligned block from one location to another,
* 47 bytes leftover maximum,
* locations should not overlap.
* Use switch here because the aligning instruction requires immediate value for shift count.
* Requirements:
* - Store is aligned
* - Load offset is <offset>, which must be within [1, 15]
* - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
* - <dst>, <src>, <len> must be variables
* - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
*/
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
switch (offset) { \
case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
default:; \
} \
})
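/*
 * Illustrative aside (not part of the original DPDK code): _mm_alignr_epi8
 * takes its byte-shift count as an immediate, which is why
 * MOVEUNALIGNED_LEFT47 has to switch on the offset rather than pass it as a
 * variable. Assuming SSSE3 and a pointer p to at least 32 readable bytes:
 *
 *   const uint8_t *p = ...;
 *   __m128i lo = _mm_loadu_si128((const __m128i *)p);        // bytes 0..15
 *   __m128i hi = _mm_loadu_si128((const __m128i *)(p + 16)); // bytes 16..31
 *   __m128i w  = _mm_alignr_epi8(hi, lo, 5);                 // bytes 5..20
 *
 * This lets the copy issue aligned 16-byte stores while reading from an
 * unaligned source window.
 */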
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t srcofs;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08) {
*(uint64_t *)dstu = *(const uint64_t *)srcu;
}
return ret;
}
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 48) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 128) {
goto COPY_BLOCK_128_BACK15;
}
if (n <= 512) {
if (n >= 256) {
n -= 256;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
src = (const uint8_t *)src + 256;
dst = (uint8_t *)dst + 256;
}
COPY_BLOCK_255_BACK15:
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK15:
if (n >= 64) {
n -= 64;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 64;
dst = (uint8_t *)dst + 64;
}
COPY_BLOCK_64_BACK15:
if (n >= 32) {
n -= 32;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 32;
dst = (uint8_t *)dst + 32;
}
if (n > 16) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n > 0) {
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
}
return ret;
}
/**
* Make store aligned when copy size exceeds 512 bytes,
* and make sure the first 15 bytes are copied, because
* unaligned copy functions require up to 15 bytes
* backwards access.
*/
dstofss = (uintptr_t)dst & 0x0F;
if (dstofss > 0) {
dstofss = 16 - dstofss + 16;
n -= dstofss;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
srcofs = ((uintptr_t)src & 0x0F);
/**
* For aligned copy
*/
if (srcofs == 0) {
/**
* Copy 256-byte blocks
*/
for (; n >= 256; n -= 256) {
rte_mov256((uint8_t *)dst, (const uint8_t *)src);
dst = (uint8_t *)dst + 256;
src = (const uint8_t *)src + 256;
}
/**
* Copy whatever left
*/
goto COPY_BLOCK_255_BACK15;
}
/**
* For copy with unaligned load
*/
MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
/**
* Copy whatever left
*/
goto COPY_BLOCK_64_BACK15;
}
#endif /* RTE_MACHINE_CPUFLAG */
static force_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
void *ret = dst;
/* Copy size <= 16 bytes */
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dst = *(const uint8_t *)src;
src = (const uint8_t *)src + 1;
dst = (uint8_t *)dst + 1;
}
if (n & 0x02) {
*(uint16_t *)dst = *(const uint16_t *)src;
src = (const uint16_t *)src + 1;
dst = (uint16_t *)dst + 1;
}
if (n & 0x04) {
*(uint32_t *)dst = *(const uint32_t *)src;
src = (const uint32_t *)src + 1;
dst = (uint32_t *)dst + 1;
}
if (n & 0x08)
*(uint64_t *)dst = *(const uint64_t *)src;
return ret;
}
/* Copy 16 <= size <= 32 bytes */
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
/* Copy 32 < size <= 64 bytes */
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
/* Copy 64 bytes blocks */
for (; n >= 64; n -= 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
dst = (uint8_t *)dst + 64;
src = (const uint8_t *)src + 64;
}
/* Copy whatever left */
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
return rte_memcpy_aligned(dst, src, n);
else
return rte_memcpy_generic(dst, src, n);
}
static inline uint64_t
rte_rdtsc(void)
{
union {
uint64_t tsc_64;
struct {
uint32_t lo_32;
uint32_t hi_32;
};
} tsc;
asm volatile("rdtsc" :
"=a" (tsc.lo_32),
"=d" (tsc.hi_32));
return tsc.tsc_64;
}
#ifdef __cplusplus
}
#endif
#endif /* (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__) */
#endif /* _RTE_MEMCPY_X86_64_H_ */
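
Putting the header together: rte_memcpy checks both pointers against ALIGNMENT_MASK and takes rte_memcpy_aligned when they are suitably aligned, otherwise rte_memcpy_generic; rte_rdtsc exposes the cycle counter used by the perf test below. A hypothetical caller, valid only on the Linux/FreeBSD AVX builds that enable this header; the buffer names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include "flow/rte_memcpy.h"

void timeOneCopy(uint8_t* dst, const uint8_t* src, size_t n) {
	uint64_t t0 = rte_rdtsc();
	rte_memcpy(dst, src, n); // aligned or generic path, picked via ALIGNMENT_MASK
	uint64_t t1 = rte_rdtsc();
	printf("%zu bytes in %llu cycles\n", n, (unsigned long long)(t1 - t0));
}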

flow/test_memcpy.cpp Normal file

@@ -0,0 +1,119 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "flow/folly_memcpy.h"
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
* values in the buf_sizes[] array below will be used.
*/
#define TEST_VALUE_RANGE 0
/* List of buffer sizes to test */
#if TEST_VALUE_RANGE == 0
static size_t buf_sizes[] = {
0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
2048, 3072, 4096, 5120, 6144, 7168, 8192
};
/* MUST be as large as largest packet size above */
#define SMALL_BUFFER_SIZE 8192
#else /* TEST_VALUE_RANGE != 0 */
static size_t buf_sizes[TEST_VALUE_RANGE];
#define SMALL_BUFFER_SIZE TEST_VALUE_RANGE
#endif /* TEST_VALUE_RANGE == 0 */
/* Data is aligned on this many bytes (power of 2) */
#define ALIGNMENT_UNIT 32
/*
* Create two buffers, and initialise one with random values. These are copied
* to the second buffer and then compared to see if the copy was successful.
* The bytes outside the copied area are also checked to make sure they were not
* changed.
*/
static int
test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
{
unsigned int i;
uint8_t dest[SMALL_BUFFER_SIZE + ALIGNMENT_UNIT];
uint8_t src[SMALL_BUFFER_SIZE + ALIGNMENT_UNIT];
void * ret;
/* Setup buffers */
for (i = 0; i < SMALL_BUFFER_SIZE + ALIGNMENT_UNIT; i++) {
dest[i] = 0;
src[i] = (uint8_t) deterministicRandom()->randomUInt32();
}
/* Do the copy */
ret = memcpy(dest + off_dst, src + off_src, size);
if (ret != (dest + off_dst)) {
printf("memcpy() returned %p, not %p\n",
ret, dest + off_dst);
}
/* Check nothing before offset is affected */
for (i = 0; i < off_dst; i++) {
if (dest[i] != 0) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[modified before start of dst].\n",
(unsigned)size, off_src, off_dst);
return -1;
}
}
/* Check everything was copied */
for (i = 0; i < size; i++) {
if (dest[i + off_dst] != src[i + off_src]) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[didn't copy byte %u].\n",
(unsigned)size, off_src, off_dst, i);
return -1;
}
}
/* Check nothing after copy was affected */
for (i = size; i < SMALL_BUFFER_SIZE; i++) {
if (dest[i + off_dst] != 0) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[copied too many].\n",
(unsigned)size, off_src, off_dst);
return -1;
}
}
return 0;
}
/*
* Check functionality for various buffer sizes and data offsets/alignments.
*/
TEST_CASE("/rte/memcpy") {
unsigned int off_src, off_dst, i;
unsigned int num_buf_sizes = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
int ret;
for (off_src = 0; off_src < ALIGNMENT_UNIT; off_src++) {
for (off_dst = 0; off_dst < ALIGNMENT_UNIT; off_dst++) {
for (i = 0; i < num_buf_sizes; i++) {
ret = test_single_memcpy(off_src, off_dst,
buf_sizes[i]);
ASSERT(ret == 0);
}
}
}
return Void();
}
void forceLinkMemcpyTests() { }

flow/test_memcpy_perf.cpp Normal file

@@ -0,0 +1,357 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
#include "flow/flow.h"
#if (defined (__linux__) || defined (__FreeBSD__)) && defined (__AVX__)
extern "C" {
void* folly_memcpy(void* dst, const void* src, uint32_t length);
}
void * rte_memcpy_noinline(void* dst, const void* src, size_t length); // for performance comparisons
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
* values in the buf_sizes[] array below will be used.
*/
#define TEST_VALUE_RANGE 0
/* List of buffer sizes to test */
#if TEST_VALUE_RANGE == 0
static size_t buf_sizes[] = {
1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
};
/* MUST be as large as largest packet size above */
#define SMALL_BUFFER_SIZE 8192
#else /* TEST_VALUE_RANGE != 0 */
static size_t buf_sizes[TEST_VALUE_RANGE];
#define SMALL_BUFFER_SIZE TEST_VALUE_RANGE
#endif /* TEST_VALUE_RANGE == 0 */
/*
* Arrays of this size are used for measuring uncached memory accesses by
* picking a random location within the buffer. Make this smaller if there are
* memory allocation errors.
*/
#define LARGE_BUFFER_SIZE (100 * 1024 * 1024)
/* How many times to run timing loop for performance tests */
#define TEST_ITERATIONS 1000000
#define TEST_BATCH_SIZE 100
/* Data is aligned on this many bytes (power of 2) */
// #ifdef RTE_MACHINE_CPUFLAG_AVX512F
#define ALIGNMENT_UNIT 64
// #elif defined RTE_MACHINE_CPUFLAG_AVX2
// #define ALIGNMENT_UNIT 32
// #else /* RTE_MACHINE_CPUFLAG */
// #define ALIGNMENT_UNIT 16
// #endif /* RTE_MACHINE_CPUFLAG */
/*
* Pointers used in performance tests. The two large buffers are for uncached
* access where random addresses within the buffer are used for each
* memcpy. The two small buffers are for cached access.
*/
static uint8_t *large_buf_read, *large_buf_write;
static uint8_t *small_buf_read, *small_buf_write;
static size_t round_up(size_t sz, size_t alignment) {
return (((sz - 1) / alignment) + 1) * alignment;
}
static uint8_t * rte_malloc(char const * ignored, size_t sz, size_t align) {
return (uint8_t*) aligned_alloc(align, round_up(sz, align));
}
static void rte_free(void * ptr) {
if (!!ptr) {
free(ptr);
}
}
/* Initialise data buffers. */
static int
init_buffers(void)
{
unsigned i;
large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_read == NULL)
goto error_large_buf_read;
large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_write == NULL)
goto error_large_buf_write;
small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_read == NULL)
goto error_small_buf_read;
small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_write == NULL)
goto error_small_buf_write;
for (i = 0; i < LARGE_BUFFER_SIZE; i++)
large_buf_read[i] = deterministicRandom()->randomUInt32();
for (i = 0; i < SMALL_BUFFER_SIZE; i++)
small_buf_read[i] = deterministicRandom()->randomUInt32();
return 0;
error_small_buf_write:
rte_free(small_buf_read);
error_small_buf_read:
rte_free(large_buf_write);
error_large_buf_write:
rte_free(large_buf_read);
error_large_buf_read:
printf("ERROR: not enough memory\n");
return -1;
}
/* Cleanup data buffers */
static void
free_buffers(void)
{
rte_free(large_buf_read);
rte_free(large_buf_write);
rte_free(small_buf_read);
rte_free(small_buf_write);
}
/*
* Get a random offset into large array, with enough space needed to perform
* max copy size. Offset is aligned, uoffset is used for unalignment setting.
*/
static inline size_t
get_rand_offset(size_t uoffset)
{
return ((deterministicRandom()->randomUInt32() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
~(ALIGNMENT_UNIT - 1)) + uoffset;
}
/* Fill in source and destination addresses. */
static inline void
fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset,
size_t *src_addr, int is_src_cached, size_t src_uoffset)
{
unsigned int i;
for (i = 0; i < TEST_BATCH_SIZE; i++) {
dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset);
src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset);
}
}
/*
* WORKAROUND: For some reason the first test doing an uncached write
* takes a very long time (~25 times longer than is expected). So we do
* it once without timing.
*/
static void
do_uncached_write(uint8_t *dst, int is_dst_cached,
const uint8_t *src, int is_src_cached, size_t size)
{
unsigned i, j;
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];
for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
fill_addr_arrays(dst_addrs, is_dst_cached, 0,
src_addrs, is_src_cached, 0);
for (j = 0; j < TEST_BATCH_SIZE; j++) {
memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
}
}
}
/*
* Run a single memcpy performance test. This is a macro to ensure that if
* the "size" parameter is a constant it won't be converted to a variable.
*/
#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, \
src, is_src_cached, src_uoffset, size) \
do { \
unsigned int iter, t; \
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; \
uint64_t start_time, total_time = 0; \
uint64_t total_time2 = 0; \
for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
src_addrs, is_src_cached, src_uoffset); \
start_time = rte_rdtsc(); \
for (t = 0; t < TEST_BATCH_SIZE; t++) \
rte_memcpy_noinline(dst+dst_addrs[t], src+src_addrs[t], size); \
total_time += rte_rdtsc() - start_time; \
} \
for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
src_addrs, is_src_cached, src_uoffset); \
start_time = rte_rdtsc(); \
for (t = 0; t < TEST_BATCH_SIZE; t++) \
memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \
total_time2 += rte_rdtsc() - start_time; \
} \
printf("%3.0f -", (double)total_time / TEST_ITERATIONS); \
printf("%3.0f", (double)total_time2 / TEST_ITERATIONS); \
printf("(%6.2f%%) ", ((double)total_time - total_time2)*100/total_time2); \
} while (0)
/* Run aligned memcpy tests for each cached/uncached permutation */
#define ALL_PERF_TESTS_FOR_SIZE(n) \
do { \
if (__builtin_constant_p(n)) \
printf("\nC%6u", (unsigned)n); \
else \
printf("\n%7u", (unsigned)n); \
SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, n); \
SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, n); \
} while (0)
/* Run unaligned memcpy tests for each cached/uncached permutation */
#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n) \
do { \
if (__builtin_constant_p(n)) \
printf("\nC%6u", (unsigned)n); \
else \
printf("\n%7u", (unsigned)n); \
SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, n); \
SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, n); \
} while (0)
/* Run memcpy tests for constant length */
#define ALL_PERF_TEST_FOR_CONSTANT \
do { \
TEST_CONSTANT(6U); TEST_CONSTANT(64U); TEST_CONSTANT(128U); \
TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U); \
TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \
} while (0)
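/*
 * Note on the "C" rows: SINGLE_PERF_TEST and the ALL_PERF_TESTS_* wrappers are
 * macros, so a literal such as 64U stays a compile-time constant all the way
 * down to the memcpy call. __builtin_constant_p(n) is then 1, the row is
 * printed with a leading 'C', and the builtin memcpy can specialize on the
 * length. Sizes taken from buf_sizes[i] are runtime values, so
 * __builtin_constant_p(n) is 0. (GCC/Clang-specific behaviour.)
 */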
/* Run all memcpy tests for aligned constant cases */
static inline void
perf_test_constant_aligned(void)
{
#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE
ALL_PERF_TEST_FOR_CONSTANT;
#undef TEST_CONSTANT
}
/* Run all memcpy tests for unaligned constant cases */
static inline void
perf_test_constant_unaligned(void)
{
#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED
ALL_PERF_TEST_FOR_CONSTANT;
#undef TEST_CONSTANT
}
/* Run all memcpy tests for aligned variable cases */
static inline void
perf_test_variable_aligned(void)
{
unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
unsigned i;
for (i = 0; i < n; i++) {
ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]);
}
}
/* Run all memcpy tests for unaligned variable cases */
static inline void
perf_test_variable_unaligned(void)
{
unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
unsigned i;
for (i = 0; i < n; i++) {
ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]);
}
}
/* Run all memcpy tests */
TEST_CASE("performance/memcpy/rte") {
int ret;
struct timeval tv_begin, tv_end;
double time_aligned, time_unaligned;
double time_aligned_const, time_unaligned_const;
ret = init_buffers();
ASSERT(ret == 0);
#if TEST_VALUE_RANGE != 0
/* Set up buf_sizes array, if required */
unsigned i;
for (i = 0; i < TEST_VALUE_RANGE; i++)
buf_sizes[i] = i;
#endif
/* See function comment */
do_uncached_write(large_buf_write, 0, small_buf_read, 1, SMALL_BUFFER_SIZE);
printf("\n** rte_memcpy() - memcpy perf. tests (C = compile-time constant) **\n"
"======= ================= ================= ================= =================\n"
" Size Cache to cache Cache to mem Mem to cache Mem to mem\n"
"(bytes) (ticks) (ticks) (ticks) (ticks)\n"
"------- ----------------- ----------------- ----------------- -----------------");
printf("\n================================= %2dB aligned =================================",
ALIGNMENT_UNIT);
/* Do aligned tests where size is a variable */
gettimeofday(&tv_begin, NULL);
perf_test_variable_aligned();
gettimeofday(&tv_end, NULL);
time_aligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n------- ----------------- ----------------- ----------------- -----------------");
/* Do aligned tests where size is a compile-time constant */
gettimeofday(&tv_begin, NULL);
perf_test_constant_aligned();
gettimeofday(&tv_end, NULL);
time_aligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n================================== Unaligned ==================================");
/* Do unaligned tests where size is a variable */
gettimeofday(&tv_begin, NULL);
perf_test_variable_unaligned();
gettimeofday(&tv_end, NULL);
time_unaligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n------- ----------------- ----------------- ----------------- -----------------");
/* Do unaligned tests where size is a compile-time constant */
gettimeofday(&tv_begin, NULL);
perf_test_constant_unaligned();
gettimeofday(&tv_end, NULL);
time_unaligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n======= ================= ================= ================= =================\n\n");
printf("Test Execution Time (seconds):\n");
printf("Aligned variable copy size = %8.3f\n", time_aligned);
printf("Aligned constant copy size = %8.3f\n", time_aligned_const);
printf("Unaligned variable copy size = %8.3f\n", time_unaligned);
printf("Unaligned constant copy size = %8.3f\n", time_unaligned_const);
free_buffers();
return Void();
}
#endif // (defined (__linux__) || defined (__FreeBSD__)) && defined (__AVX__)
void forceLinkMemcpyPerfTests() {}