Merge pull request #3089 from sears/memcpy

Memcpy
Russell Sears 2020-06-04 15:50:23 -07:00 committed by GitHub
commit e7d72f458c
10 changed files with 1695 additions and 2 deletions


@@ -536,3 +536,51 @@ sse2neon Authors (sse2neon)
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
rte_memcpy.h (from DPDK):
SPDX-License-Identifier: BSD-3-Clause
Copyright(c) 2010-2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
folly_memcpy:
Copyright (c) Facebook, Inc. and its affiliates.
Author: Bin Liu <binliu@fb.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -209,6 +209,25 @@ else()
# -mavx
# -msse4.2)
# Tentatively re-enabling vector instructions
set(USE_AVX512F OFF CACHE BOOL "Enable AVX 512F instructions")
if (USE_AVX512F)
add_compile_options(-mavx512f)
endif()
set(USE_AVX ON CACHE BOOL "Enable AVX instructions")
if (USE_AVX)
add_compile_options(-mavx)
endif()
# Intentionally using builtin memcpy. G++ does a good job on small memcpy calls when the size is known at compile time.
# If the size is not known, then it falls back on the memcpy that's available at runtime (rte_memcpy, as of this
# writing; see flow.cpp).
#
# The downside of the builtin memcpy is that it's slower at large copies, so if we spend a lot of time on large
# copies of sizes that are known at compile time, this might not be a win. See the output of performance/memcpy
# for more information.
#add_compile_options(-fno-builtin-memcpy)
if (USE_VALGRIND)
add_compile_options(-DVALGRIND -DUSE_VALGRIND)
endif()
@@ -254,7 +273,6 @@ else()
endif()
if (GCC)
add_compile_options(-Wno-pragmas)
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
add_compile_options(-Wno-attributes)
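
The builtin-memcpy note in the hunk above comes down to two cases: when the compiler can see a constant size it expands the copy inline, and when the size is only known at runtime the call goes to whatever memcpy symbol is linked into the binary (rte_memcpy here, via the override in flow.cpp). A minimal sketch of the two cases, assuming GCC/Clang on x86-64; the names are illustrative only:

#include <cstddef>
#include <cstring>

struct Header { char bytes[16]; };

void copyHeader(Header* dst, const Header* src) {
	// sizeof(Header) is a compile-time constant, so the builtin memcpy
	// typically lowers this to two 8-byte loads and stores, with no call.
	memcpy(dst, src, sizeof(Header));
}

void copyValue(char* dst, const char* src, size_t len) {
	// len is only known at runtime, so this compiles to a call to the
	// memcpy symbol chosen at link time (rte_memcpy in this build).
	memcpy(dst, src, len);
}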


@@ -26,6 +26,8 @@ void forceLinkIndexedSetTests();
void forceLinkDequeTests();
void forceLinkFlowTests();
void forceLinkVersionedMapTests();
void forceLinkMemcpyTests();
void forceLinkMemcpyPerfTests();
struct UnitTestWorkload : TestWorkload {
bool enabled;
@@ -45,6 +47,8 @@ struct UnitTestWorkload : TestWorkload {
forceLinkDequeTests();
forceLinkFlowTests();
forceLinkVersionedMapTests();
forceLinkMemcpyTests();
forceLinkMemcpyPerfTests();
}
virtual std::string description() { return "UnitTests"; }


@@ -67,7 +67,7 @@ set(FLOW_SRCS
XmlTraceLogFormatter.cpp
XmlTraceLogFormatter.h
actorcompiler.h
crc32c.h
crc32c.cpp
error_definitions.h
${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h
@@ -75,14 +75,18 @@ set(FLOW_SRCS
flat_buffers.h
flow.cpp
flow.h
folly_memcpy.S
genericactors.actor.cpp
genericactors.actor.h
network.cpp
network.h
rte_memcpy.h
serialize.cpp
serialize.h
stacktrace.amalgamation.cpp
stacktrace.h
test_memcpy.cpp
test_memcpy_perf.cpp
version.cpp)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)


@@ -21,9 +21,28 @@
#include "flow/flow.h"
#include "flow/DeterministicRandom.h"
#include "flow/UnitTest.h"
#include "flow/rte_memcpy.h"
#include "flow/folly_memcpy.h"
#include <stdarg.h>
#include <cinttypes>
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
// For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test.
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) {
return rte_memcpy(__dest, __src, __n);
}
// This compilation unit will be linked into the main binary, so this should override glibc memcpy
__attribute__((visibility ("default"))) void *memcpy (void *__restrict __dest, const void *__restrict __src, size_t __n) {
// folly_memcpy is faster for small copies, but rte seems to win out in most other circumstances
return rte_memcpy(__dest, __src, __n);
}
#else
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) {
return memcpy(__dest, __src, __n);
}
#endif // (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
INetwork *g_network = 0;
FILE* randLog = 0;
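
The two functions added above give callers a strong memcpy definition (overriding glibc's at link time on the AVX Linux/FreeBSD builds) plus rte_memcpy_noinline, which the benchmarks use so the compiler cannot inline or fold the call away. A hypothetical caller, just to show the intended usage; the buffer arguments are made up:

#include <cstddef>
#include <cstring>

void* rte_memcpy_noinline(void* dst, const void* src, size_t length); // defined in flow.cpp

void copyBoth(char* dst, const char* src, size_t n) {
	memcpy(dst, src, n);              // resolves to the overriding definition above
	rte_memcpy_noinline(dst, src, n); // always an out-of-line call, handy for timing
}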

flow/folly_memcpy.S Normal file

@@ -0,0 +1,178 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* memcpy: An optimized memcpy implementation for x86_64. It uses AVX when
* __AVX__ is defined, and uses SSE2 otherwise.
*
* @author Bin Liu <binliu@fb.com>
*/
#if defined(__x86_64__) && defined(__linux__) && !defined(__CYGWIN__)
.file "memcpy.S"
.text
/*
* _memcpy_short is a local helper used when length < 8. It cannot be called
* from outside, because it expects a non-standard calling convention:
*
* %rax: destination buffer address.
* %rsi: source buffer address.
* %edx: length, in the range of [0, 7]
*/
.type _memcpy_short, @function
_memcpy_short:
.LSHORT:
.cfi_startproc
// if (length == 0) return;
test %edx, %edx
jz .LEND
movzbl (%rsi), %ecx
// if (length - 4 < 0) goto LS4;
sub $4, %edx
jb .LS4
mov (%rsi), %ecx
mov (%rsi, %rdx), %edi
mov %ecx, (%rax)
mov %edi, (%rax, %rdx)
.LEND:
rep
ret
nop
.LS4:
// At this point, length can be 1 or 2 or 3, and $cl contains
// the first byte.
mov %cl, (%rax)
// if (length - 4 + 2 < 0) return;
add $2, %edx
jnc .LEND
// length is 2 or 3 here. In either case, just copy the last
// two bytes.
movzwl (%rsi, %rdx), %ecx
mov %cx, (%rax, %rdx)
ret
.cfi_endproc
.size _memcpy_short, .-_memcpy_short
/*
* void* folly_memcpy(void* dst, const void* src, uint32_t length);
*
*/
.align 16
.globl folly_memcpy
.type folly_memcpy, @function
folly_memcpy:
.cfi_startproc
mov %rdx, %rcx
mov %rdi, %rax
cmp $8, %rdx
jb .LSHORT
mov -8(%rsi, %rdx), %r8
mov (%rsi), %r9
mov %r8, -8(%rdi, %rdx)
and $24, %rcx
jz .L32
mov %r9, (%rdi)
mov %rcx, %r8
sub $16, %rcx
jb .LT32
#ifndef __AVX__
movdqu (%rsi, %rcx), %xmm1
movdqu %xmm1, (%rdi, %rcx)
#else
vmovdqu (%rsi, %rcx), %xmm1
vmovdqu %xmm1, (%rdi, %rcx)
#endif
// Test if there are 32-byte groups
.LT32:
add %r8, %rsi
and $-32, %rdx
jnz .L32_adjDI
ret
.align 16
.L32_adjDI:
add %r8, %rdi
.L32:
#ifndef __AVX__
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
#else
vmovdqu (%rsi), %ymm0
#endif
shr $6, %rdx
jnc .L64_32read
#ifndef __AVX__
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
#else
vmovdqu %ymm0, (%rdi)
#endif
lea 32(%rsi), %rsi
jnz .L64_adjDI
#ifdef __AVX__
vzeroupper
#endif
ret
.L64_adjDI:
add $32, %rdi
.L64:
#ifndef __AVX__
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
#else
vmovdqu (%rsi), %ymm0
#endif
.L64_32read:
#ifndef __AVX__
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
add $64, %rsi
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
#else
vmovdqu 32(%rsi), %ymm1
add $64, %rsi
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, 32(%rdi)
#endif
add $64, %rdi
dec %rdx
jnz .L64
#ifdef __AVX__
vzeroupper
#endif
ret
.cfi_endproc
.size folly_memcpy, .-folly_memcpy
#endif
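
For readers who prefer C, the short-copy path above (_memcpy_short, n < 8) amounts to overlapping head and tail copies rather than a byte loop. A rough C rendering, for illustration only and not a drop-in replacement for the assembly:

#include <stdint.h>
#include <string.h>

/* Sketch of _memcpy_short: n is in [0, 7]. */
static void* memcpy_short_sketch(void* dst, const void* src, size_t n) {
	unsigned char* d = (unsigned char*)dst;
	const unsigned char* s = (const unsigned char*)src;
	if (n == 0) return dst;
	if (n >= 4) {
		uint32_t head, tail;
		memcpy(&head, s, 4);
		memcpy(&tail, s + n - 4, 4); /* overlaps head when n < 8; that's fine */
		memcpy(d, &head, 4);
		memcpy(d + n - 4, &tail, 4);
		return dst;
	}
	d[0] = s[0]; /* n is 1, 2, or 3 */
	if (n > 1) {
		uint16_t tail;
		memcpy(&tail, s + n - 2, 2); /* last two bytes; overlap with d[0] is fine */
		memcpy(d + n - 2, &tail, 2);
	}
	return dst;
}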

flow/folly_memcpy.h Normal file

@@ -0,0 +1,33 @@
/*
* folly_memcpy.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLOW_FOLLY_MEMCPY_H
#define FLOW_FOLLY_MEMCPY_H
#pragma once
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
extern "C" {
void* folly_memcpy(void* dst, const void* src, uint32_t length);
}
#endif // linux or bsd and avx
#endif

flow/rte_memcpy.h Normal file

@@ -0,0 +1,913 @@
/*
SPDX-License-Identifier: BSD-3-Clause
Copyright(c) 2010-2014 Intel Corporation
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_
/**
* @file
*
* Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <flow/Platform.h>
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
#ifdef __cplusplus
extern "C" {
#endif
/**
* Copy bytes from one location to another. The locations must not overlap.
*
* @note This is implemented as a macro, so its address should not be taken
* and care is needed as parameter expressions may be evaluated multiple times.
*
* @param dst
* Pointer to the destination of the data.
* @param src
* Pointer to the source data.
* @param n
* Number of bytes to copy.
* @return
* Pointer to the destination data.
*/
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
#ifdef __AVX512F__
#define RTE_MACHINE_CPUFLAG_AVX512F
#elif defined(__AVX__)
#define RTE_MACHINE_CPUFLAG_AVX2
#endif
#ifdef RTE_MACHINE_CPUFLAG_AVX512F
#define ALIGNMENT_MASK 0x3F
/**
* AVX512 implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
__m256i ymm0;
ymm0 = _mm256_loadu_si256((const __m256i *)src);
_mm256_storeu_si256((__m256i *)dst, ymm0);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
__m512i zmm0;
zmm0 = _mm512_loadu_si512((const void *)src);
_mm512_storeu_si512((void *)dst, zmm0);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov64(dst + 0 * 64, src + 0 * 64);
rte_mov64(dst + 1 * 64, src + 1 * 64);
}
/**
* Copy 256 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
rte_mov64(dst + 0 * 64, src + 0 * 64);
rte_mov64(dst + 1 * 64, src + 1 * 64);
rte_mov64(dst + 2 * 64, src + 2 * 64);
rte_mov64(dst + 3 * 64, src + 3 * 64);
}
/**
* Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m512i zmm0, zmm1;
while (n >= 128) {
zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
n -= 128;
zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
src = src + 128;
_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
dst = dst + 128;
}
}
/**
* Copy 512-byte blocks from one location to another,
* locations should not overlap.
*/
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
while (n >= 512) {
zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
n -= 512;
zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
src = src + 512;
_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
dst = dst + 512;
}
}
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08)
*(uint64_t *)dstu = *(const uint64_t *)srcu;
return ret;
}
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n <= 512) {
if (n >= 256) {
n -= 256;
rte_mov256((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 256;
dst = (uint8_t *)dst + 256;
}
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK63:
if (n > 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
if (n > 0)
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
/**
* Make store aligned when copy size exceeds 512 bytes
*/
dstofss = ((uintptr_t)dst & 0x3F);
if (dstofss > 0) {
dstofss = 64 - dstofss;
n -= dstofss;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
/**
* Copy 512-byte blocks.
* Use copy block function for better instruction order control,
* which is important when load is unaligned.
*/
rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 511;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
/**
* Copy 128-byte blocks.
* Use copy block function for better instruction order control,
* which is important when load is unaligned.
*/
if (n >= 128) {
rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 127;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
}
/**
* Copy whatever left
*/
goto COPY_BLOCK_128_BACK63;
}
#elif defined RTE_MACHINE_CPUFLAG_AVX2
#define ALIGNMENT_MASK 0x1F
/**
* AVX2 implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
__m256i ymm0;
ymm0 = _mm256_loadu_si256((const __m256i *)src);
_mm256_storeu_si256((__m256i *)dst, ymm0);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}
/**
* Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m256i ymm0, ymm1, ymm2, ymm3;
while (n >= 128) {
ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
n -= 128;
ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
src = (const uint8_t *)src + 128;
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
dst = (uint8_t *)dst + 128;
}
}
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08) {
*(uint64_t *)dstu = *(const uint64_t *)srcu;
}
return ret;
}
/**
* Fast way when copy size doesn't exceed 256 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 48) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n <= 256) {
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK31:
if (n >= 64) {
n -= 64;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 64;
dst = (uint8_t *)dst + 64;
}
if (n > 32) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n > 0) {
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
}
return ret;
}
/**
* Make store aligned when copy size exceeds 256 bytes
*/
dstofss = (uintptr_t)dst & 0x1F;
if (dstofss > 0) {
dstofss = 32 - dstofss;
n -= dstofss;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
/**
* Copy 128-byte blocks
*/
rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 127;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
/**
* Copy whatever left
*/
goto COPY_BLOCK_128_BACK31;
}
#else /* RTE_MACHINE_CPUFLAG */
#define ALIGNMENT_MASK 0x0F
/**
* SSE & AVX implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}
/**
* Copy 256 bytes from one location to another,
* locations should not overlap.
*/
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}
/**
* Macro for copying unaligned block from one location to another with constant load offset,
* 47 bytes leftover maximum,
* locations should not overlap.
* Requirements:
* - Store is aligned
* - Load offset is <offset>, which must be immediate value within [1, 15]
* - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
* - <dst>, <src>, <len> must be variables
* - __m128i <xmm0> ~ <xmm8> must be pre-defined
*/
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
size_t tmp; \
while (len >= 128 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
len -= 128; \
xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
src = (const uint8_t *)src + 128; \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
dst = (uint8_t *)dst + 128; \
} \
tmp = len; \
len = ((len - 16 + offset) & 127) + 16 - offset; \
tmp -= len; \
src = (const uint8_t *)src + tmp; \
dst = (uint8_t *)dst + tmp; \
if (len >= 32 + 16 - offset) { \
while (len >= 32 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
len -= 32; \
xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
src = (const uint8_t *)src + 32; \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
dst = (uint8_t *)dst + 32; \
} \
tmp = len; \
len = ((len - 16 + offset) & 31) + 16 - offset; \
tmp -= len; \
src = (const uint8_t *)src + tmp; \
dst = (uint8_t *)dst + tmp; \
} \
})
/**
* Macro for copying unaligned block from one location to another,
* 47 bytes leftover maximum,
* locations should not overlap.
* Use switch here because the aligning instruction requires immediate value for shift count.
* Requirements:
* - Store is aligned
* - Load offset is <offset>, which must be within [1, 15]
* - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
* - <dst>, <src>, <len> must be variables
* - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
*/
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
switch (offset) { \
case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
default:; \
} \
})
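/*
 * Illustrative aside (not part of the original DPDK code): _mm_alignr_epi8
 * takes its byte-shift count as an immediate, which is why
 * MOVEUNALIGNED_LEFT47 has to switch on the offset rather than pass it as a
 * variable. Assuming SSSE3 and a pointer p to at least 32 readable bytes:
 *
 *   const uint8_t *p = ...;
 *   __m128i lo = _mm_loadu_si128((const __m128i *)p);        // bytes 0..15
 *   __m128i hi = _mm_loadu_si128((const __m128i *)(p + 16)); // bytes 16..31
 *   __m128i w  = _mm_alignr_epi8(hi, lo, 5);                 // bytes 5..20
 *
 * This lets the copy issue aligned 16-byte stores while reading from an
 * unaligned source window.
 */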
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t srcofs;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08) {
*(uint64_t *)dstu = *(const uint64_t *)srcu;
}
return ret;
}
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 48) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 128) {
goto COPY_BLOCK_128_BACK15;
}
if (n <= 512) {
if (n >= 256) {
n -= 256;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
src = (const uint8_t *)src + 256;
dst = (uint8_t *)dst + 256;
}
COPY_BLOCK_255_BACK15:
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK15:
if (n >= 64) {
n -= 64;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 64;
dst = (uint8_t *)dst + 64;
}
COPY_BLOCK_64_BACK15:
if (n >= 32) {
n -= 32;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 32;
dst = (uint8_t *)dst + 32;
}
if (n > 16) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n > 0) {
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
}
return ret;
}
/**
* Make store aligned when copy size exceeds 512 bytes,
* and make sure the first 15 bytes are copied, because
* unaligned copy functions require up to 15 bytes
* backwards access.
*/
dstofss = (uintptr_t)dst & 0x0F;
if (dstofss > 0) {
dstofss = 16 - dstofss + 16;
n -= dstofss;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
srcofs = ((uintptr_t)src & 0x0F);
/**
* For aligned copy
*/
if (srcofs == 0) {
/**
* Copy 256-byte blocks
*/
for (; n >= 256; n -= 256) {
rte_mov256((uint8_t *)dst, (const uint8_t *)src);
dst = (uint8_t *)dst + 256;
src = (const uint8_t *)src + 256;
}
/**
* Copy whatever left
*/
goto COPY_BLOCK_255_BACK15;
}
/**
* For copy with unaligned load
*/
MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
/**
* Copy whatever left
*/
goto COPY_BLOCK_64_BACK15;
}
#endif /* RTE_MACHINE_CPUFLAG */
static force_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
void *ret = dst;
/* Copy size <= 16 bytes */
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dst = *(const uint8_t *)src;
src = (const uint8_t *)src + 1;
dst = (uint8_t *)dst + 1;
}
if (n & 0x02) {
*(uint16_t *)dst = *(const uint16_t *)src;
src = (const uint16_t *)src + 1;
dst = (uint16_t *)dst + 1;
}
if (n & 0x04) {
*(uint32_t *)dst = *(const uint32_t *)src;
src = (const uint32_t *)src + 1;
dst = (uint32_t *)dst + 1;
}
if (n & 0x08)
*(uint64_t *)dst = *(const uint64_t *)src;
return ret;
}
/* Copy 16 <= size <= 32 bytes */
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
/* Copy 32 < size <= 64 bytes */
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
/* Copy 64 bytes blocks */
for (; n >= 64; n -= 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
dst = (uint8_t *)dst + 64;
src = (const uint8_t *)src + 64;
}
/* Copy whatever left */
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
return rte_memcpy_aligned(dst, src, n);
else
return rte_memcpy_generic(dst, src, n);
}
static inline uint64_t
rte_rdtsc(void)
{
union {
uint64_t tsc_64;
struct {
uint32_t lo_32;
uint32_t hi_32;
};
} tsc;
asm volatile("rdtsc" :
"=a" (tsc.lo_32),
"=d" (tsc.hi_32));
return tsc.tsc_64;
}
#ifdef __cplusplus
}
#endif
#endif /* (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__) */
#endif /* _RTE_MEMCPY_X86_64_H_ */
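
Putting the header together: rte_memcpy checks both pointers against ALIGNMENT_MASK and takes rte_memcpy_aligned when they are suitably aligned, otherwise rte_memcpy_generic; rte_rdtsc exposes the cycle counter used by the perf test below. A hypothetical caller, valid only on the Linux/FreeBSD AVX builds that enable this header; the buffer names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include "flow/rte_memcpy.h"

void timeOneCopy(uint8_t* dst, const uint8_t* src, size_t n) {
	uint64_t t0 = rte_rdtsc();
	rte_memcpy(dst, src, n); // aligned or generic path, picked via ALIGNMENT_MASK
	uint64_t t1 = rte_rdtsc();
	printf("%zu bytes in %llu cycles\n", n, (unsigned long long)(t1 - t0));
}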

flow/test_memcpy.cpp Normal file

@@ -0,0 +1,119 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "flow/folly_memcpy.h"
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
* values in the buf_sizes[] array below will be used.
*/
#define TEST_VALUE_RANGE 0
/* List of buffer sizes to test */
#if TEST_VALUE_RANGE == 0
static size_t buf_sizes[] = {
0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
2048, 3072, 4096, 5120, 6144, 7168, 8192
};
/* MUST be as large as largest packet size above */
#define SMALL_BUFFER_SIZE 8192
#else /* TEST_VALUE_RANGE != 0 */
static size_t buf_sizes[TEST_VALUE_RANGE];
#define SMALL_BUFFER_SIZE TEST_VALUE_RANGE
#endif /* TEST_VALUE_RANGE == 0 */
/* Data is aligned on this many bytes (power of 2) */
#define ALIGNMENT_UNIT 32
/*
* Create two buffers, and initialise one with random values. These are copied
* to the second buffer and then compared to see if the copy was successful.
* The bytes outside the copied area are also checked to make sure they were not
* changed.
*/
static int
test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
{
unsigned int i;
uint8_t dest[SMALL_BUFFER_SIZE + ALIGNMENT_UNIT];
uint8_t src[SMALL_BUFFER_SIZE + ALIGNMENT_UNIT];
void * ret;
/* Setup buffers */
for (i = 0; i < SMALL_BUFFER_SIZE + ALIGNMENT_UNIT; i++) {
dest[i] = 0;
src[i] = (uint8_t) deterministicRandom()->randomUInt32();
}
/* Do the copy */
ret = memcpy(dest + off_dst, src + off_src, size);
if (ret != (dest + off_dst)) {
printf("memcpy() returned %p, not %p\n",
ret, dest + off_dst);
}
/* Check nothing before offset is affected */
for (i = 0; i < off_dst; i++) {
if (dest[i] != 0) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[modified before start of dst].\n",
(unsigned)size, off_src, off_dst);
return -1;
}
}
/* Check everything was copied */
for (i = 0; i < size; i++) {
if (dest[i + off_dst] != src[i + off_src]) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[didn't copy byte %u].\n",
(unsigned)size, off_src, off_dst, i);
return -1;
}
}
/* Check nothing after copy was affected */
for (i = size; i < SMALL_BUFFER_SIZE; i++) {
if (dest[i + off_dst] != 0) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[copied too many].\n",
(unsigned)size, off_src, off_dst);
return -1;
}
}
return 0;
}
/*
* Check functionality for various buffer sizes and data offsets/alignments.
*/
TEST_CASE("/rte/memcpy") {
unsigned int off_src, off_dst, i;
unsigned int num_buf_sizes = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
int ret;
for (off_src = 0; off_src < ALIGNMENT_UNIT; off_src++) {
for (off_dst = 0; off_dst < ALIGNMENT_UNIT; off_dst++) {
for (i = 0; i < num_buf_sizes; i++) {
ret = test_single_memcpy(off_src, off_dst,
buf_sizes[i]);
ASSERT(ret == 0);
}
}
}
return Void();
}
void forceLinkMemcpyTests() { }

flow/test_memcpy_perf.cpp Normal file

@@ -0,0 +1,357 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
#include "flow/flow.h"
#if (defined (__linux__) || defined (__FreeBSD__)) && defined (__AVX__)
extern "C" {
void* folly_memcpy(void* dst, const void* src, uint32_t length);
}
void * rte_memcpy_noinline(void* dst, const void* src, size_t length); // for performance comparisons
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
* values in the buf_sizes[] array below will be used.
*/
#define TEST_VALUE_RANGE 0
/* List of buffer sizes to test */
#if TEST_VALUE_RANGE == 0
static size_t buf_sizes[] = {
1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
};
/* MUST be as large as largest packet size above */
#define SMALL_BUFFER_SIZE 8192
#else /* TEST_VALUE_RANGE != 0 */
static size_t buf_sizes[TEST_VALUE_RANGE];
#define SMALL_BUFFER_SIZE TEST_VALUE_RANGE
#endif /* TEST_VALUE_RANGE == 0 */
/*
* Arrays of this size are used for measuring uncached memory accesses by
* picking a random location within the buffer. Make this smaller if there are
* memory allocation errors.
*/
#define LARGE_BUFFER_SIZE (100 * 1024 * 1024)
/* How many times to run timing loop for performance tests */
#define TEST_ITERATIONS 1000000
#define TEST_BATCH_SIZE 100
/* Data is aligned on this many bytes (power of 2) */
// #ifdef RTE_MACHINE_CPUFLAG_AVX512F
#define ALIGNMENT_UNIT 64
// #elif defined RTE_MACHINE_CPUFLAG_AVX2
// #define ALIGNMENT_UNIT 32
// #else /* RTE_MACHINE_CPUFLAG */
// #define ALIGNMENT_UNIT 16
// #endif /* RTE_MACHINE_CPUFLAG */
/*
* Pointers used in performance tests. The two large buffers are for uncached
* access where random addresses within the buffer are used for each
* memcpy. The two small buffers are for cached access.
*/
static uint8_t *large_buf_read, *large_buf_write;
static uint8_t *small_buf_read, *small_buf_write;
static size_t round_up(size_t sz, size_t alignment) {
return (((sz - 1) / alignment) + 1) * alignment;
}
static uint8_t * rte_malloc(char const * ignored, size_t sz, size_t align) {
return (uint8_t*) aligned_alloc(align, round_up(sz, align));
}
static void rte_free(void * ptr) {
if (!!ptr) {
free(ptr);
}
}
/* Initialise data buffers. */
static int
init_buffers(void)
{
unsigned i;
large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_read == NULL)
goto error_large_buf_read;
large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_write == NULL)
goto error_large_buf_write;
small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_read == NULL)
goto error_small_buf_read;
small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_write == NULL)
goto error_small_buf_write;
for (i = 0; i < LARGE_BUFFER_SIZE; i++)
large_buf_read[i] = deterministicRandom()->randomUInt32();
for (i = 0; i < SMALL_BUFFER_SIZE; i++)
small_buf_read[i] = deterministicRandom()->randomUInt32();
return 0;
error_small_buf_write:
rte_free(small_buf_read);
error_small_buf_read:
rte_free(large_buf_write);
error_large_buf_write:
rte_free(large_buf_read);
error_large_buf_read:
printf("ERROR: not enough memory\n");
return -1;
}
/* Cleanup data buffers */
static void
free_buffers(void)
{
rte_free(large_buf_read);
rte_free(large_buf_write);
rte_free(small_buf_read);
rte_free(small_buf_write);
}
/*
* Get a random offset into large array, with enough space needed to perform
* max copy size. Offset is aligned, uoffset is used for unalignment setting.
*/
static inline size_t
get_rand_offset(size_t uoffset)
{
return ((deterministicRandom()->randomUInt32() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
~(ALIGNMENT_UNIT - 1)) + uoffset;
}
/* Fill in source and destination addresses. */
static inline void
fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset,
size_t *src_addr, int is_src_cached, size_t src_uoffset)
{
unsigned int i;
for (i = 0; i < TEST_BATCH_SIZE; i++) {
dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset);
src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset);
}
}
/*
* WORKAROUND: For some reason the first test doing an uncached write
* takes a very long time (~25 times longer than is expected). So we do
* it once without timing.
*/
static void
do_uncached_write(uint8_t *dst, int is_dst_cached,
const uint8_t *src, int is_src_cached, size_t size)
{
unsigned i, j;
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];
for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
fill_addr_arrays(dst_addrs, is_dst_cached, 0,
src_addrs, is_src_cached, 0);
for (j = 0; j < TEST_BATCH_SIZE; j++) {
memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
}
}
}
/*
* Run a single memcpy performance test. This is a macro to ensure that if
* the "size" parameter is a constant it won't be converted to a variable.
*/
#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, \
src, is_src_cached, src_uoffset, size) \
do { \
unsigned int iter, t; \
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; \
uint64_t start_time, total_time = 0; \
uint64_t total_time2 = 0; \
for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
src_addrs, is_src_cached, src_uoffset); \
start_time = rte_rdtsc(); \
for (t = 0; t < TEST_BATCH_SIZE; t++) \
rte_memcpy_noinline(dst+dst_addrs[t], src+src_addrs[t], size); \
total_time += rte_rdtsc() - start_time; \
} \
for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
src_addrs, is_src_cached, src_uoffset); \
start_time = rte_rdtsc(); \
for (t = 0; t < TEST_BATCH_SIZE; t++) \
memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \
total_time2 += rte_rdtsc() - start_time; \
} \
printf("%3.0f -", (double)total_time / TEST_ITERATIONS); \
printf("%3.0f", (double)total_time2 / TEST_ITERATIONS); \
printf("(%6.2f%%) ", ((double)total_time - total_time2)*100/total_time2); \
} while (0)
/* Run aligned memcpy tests for each cached/uncached permutation */
#define ALL_PERF_TESTS_FOR_SIZE(n) \
do { \
if (__builtin_constant_p(n)) \
printf("\nC%6u", (unsigned)n); \
else \
printf("\n%7u", (unsigned)n); \
SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, n); \
SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, n); \
} while (0)
/* Run unaligned memcpy tests for each cached/uncached permutation */
#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n) \
do { \
if (__builtin_constant_p(n)) \
printf("\nC%6u", (unsigned)n); \
else \
printf("\n%7u", (unsigned)n); \
SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, n); \
SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, n); \
} while (0)
/* Run memcpy tests for constant length */
#define ALL_PERF_TEST_FOR_CONSTANT \
do { \
TEST_CONSTANT(6U); TEST_CONSTANT(64U); TEST_CONSTANT(128U); \
TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U); \
TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \
} while (0)
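/*
 * Note on the "C" rows: SINGLE_PERF_TEST and the ALL_PERF_TESTS_* wrappers are
 * macros, so a literal such as 64U stays a compile-time constant all the way
 * down to the memcpy call. __builtin_constant_p(n) is then 1, the row is
 * printed with a leading 'C', and the builtin memcpy can specialize on the
 * length. Sizes taken from buf_sizes[i] are runtime values, so
 * __builtin_constant_p(n) is 0. (GCC/Clang-specific behaviour.)
 */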
/* Run all memcpy tests for aligned constant cases */
static inline void
perf_test_constant_aligned(void)
{
#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE
ALL_PERF_TEST_FOR_CONSTANT;
#undef TEST_CONSTANT
}
/* Run all memcpy tests for unaligned constant cases */
static inline void
perf_test_constant_unaligned(void)
{
#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED
ALL_PERF_TEST_FOR_CONSTANT;
#undef TEST_CONSTANT
}
/* Run all memcpy tests for aligned variable cases */
static inline void
perf_test_variable_aligned(void)
{
unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
unsigned i;
for (i = 0; i < n; i++) {
ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]);
}
}
/* Run all memcpy tests for unaligned variable cases */
static inline void
perf_test_variable_unaligned(void)
{
unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
unsigned i;
for (i = 0; i < n; i++) {
ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]);
}
}
/* Run all memcpy tests */
TEST_CASE("performance/memcpy/rte") {
int ret;
struct timeval tv_begin, tv_end;
double time_aligned, time_unaligned;
double time_aligned_const, time_unaligned_const;
ret = init_buffers();
ASSERT(ret == 0);
#if TEST_VALUE_RANGE != 0
/* Set up buf_sizes array, if required */
unsigned i;
for (i = 0; i < TEST_VALUE_RANGE; i++)
buf_sizes[i] = i;
#endif
/* See function comment */
do_uncached_write(large_buf_write, 0, small_buf_read, 1, SMALL_BUFFER_SIZE);
printf("\n** rte_memcpy() - memcpy perf. tests (C = compile-time constant) **\n"
"======= ================= ================= ================= =================\n"
" Size Cache to cache Cache to mem Mem to cache Mem to mem\n"
"(bytes) (ticks) (ticks) (ticks) (ticks)\n"
"------- ----------------- ----------------- ----------------- -----------------");
printf("\n================================= %2dB aligned =================================",
ALIGNMENT_UNIT);
/* Do aligned tests where size is a variable */
gettimeofday(&tv_begin, NULL);
perf_test_variable_aligned();
gettimeofday(&tv_end, NULL);
time_aligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n------- ----------------- ----------------- ----------------- -----------------");
/* Do aligned tests where size is a compile-time constant */
gettimeofday(&tv_begin, NULL);
perf_test_constant_aligned();
gettimeofday(&tv_end, NULL);
time_aligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n================================== Unaligned ==================================");
/* Do unaligned tests where size is a variable */
gettimeofday(&tv_begin, NULL);
perf_test_variable_unaligned();
gettimeofday(&tv_end, NULL);
time_unaligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n------- ----------------- ----------------- ----------------- -----------------");
/* Do unaligned tests where size is a compile-time constant */
gettimeofday(&tv_begin, NULL);
perf_test_constant_unaligned();
gettimeofday(&tv_end, NULL);
time_unaligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n======= ================= ================= ================= =================\n\n");
printf("Test Execution Time (seconds):\n");
printf("Aligned variable copy size = %8.3f\n", time_aligned);
printf("Aligned constant copy size = %8.3f\n", time_aligned_const);
printf("Unaligned variable copy size = %8.3f\n", time_unaligned);
printf("Unaligned constant copy size = %8.3f\n", time_unaligned_const);
free_buffers();
return Void();
}
#endif // (defined (__linux__) || defined (__FreeBSD__)) && defined (__AVX__)
void forceLinkMemcpyPerfTests() {}