forked from OSchip/llvm-project
[libc][NFC] Simplify memcpy implementation
Summary: This is a NFC, it aims at simplifying both the code and build files. Reviewers: abrachet, sivachandra Subscribers: mgorny, tschuett, ecnelises, libc-commits, courbet Tags: #libc-project Differential Revision: https://reviews.llvm.org/D80291
This commit is contained in:
parent
4b7812116d
commit
0d52a7d038
libc
src/string
test/src/string/memory_utils
|
@ -68,30 +68,17 @@ endfunction()
|
|||
|
||||
# include the relevant architecture specific implementations
|
||||
if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
|
||||
set(LIBC_STRING_TARGET_FOLDER "x86")
|
||||
set(LIBC_STRING_TARGET_ARCH "x86")
|
||||
else()
|
||||
set(LIBC_STRING_TARGET_FOLDER ${LIBC_TARGET_MACHINE})
|
||||
set(LIBC_STRING_TARGET_ARCH ${LIBC_TARGET_MACHINE})
|
||||
endif()
|
||||
|
||||
add_gen_header(
|
||||
memcpy_arch_specific
|
||||
DEF_FILE
|
||||
memcpy_arch_specific.h.def
|
||||
GEN_HDR
|
||||
memcpy_arch_specific.h
|
||||
PARAMS
|
||||
memcpy_arch_specific=${LIBC_STRING_TARGET_FOLDER}/memcpy_arch_specific.h.inc
|
||||
DATA_FILES
|
||||
${LIBC_STRING_TARGET_FOLDER}/memcpy_arch_specific.h.inc
|
||||
)
|
||||
|
||||
function(add_memcpy memcpy_name)
|
||||
add_implementation(memcpy ${memcpy_name}
|
||||
SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
|
||||
SRCS ${LIBC_SOURCE_DIR}/src/string/${LIBC_STRING_TARGET_ARCH}/memcpy.cpp
|
||||
HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
|
||||
DEPENDS
|
||||
.memory_utils.memory_utils
|
||||
.memcpy_arch_specific
|
||||
libc.include.string
|
||||
COMPILE_OPTIONS
|
||||
-fno-builtin-memcpy
|
||||
|
@ -144,4 +131,4 @@ add_bzero(bzero MARCH native)
|
|||
# Add all other relevant implementations for the native target.
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
include(${LIBC_STRING_TARGET_FOLDER}/CMakeLists.txt)
|
||||
include(${LIBC_STRING_TARGET_ARCH}/CMakeLists.txt)
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
//===-- Implementation of memcpy ------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/string/memcpy.h"
|
||||
#include "src/__support/common.h"
|
||||
#include "src/string/memcpy_arch_specific.h"
|
||||
|
||||
namespace __llvm_libc {
|
||||
|
||||
void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
|
||||
const void *__restrict src, size_t size) {
|
||||
memcpy_no_return(reinterpret_cast<char *>(dst),
|
||||
reinterpret_cast<const char *>(src), size);
|
||||
return dst;
|
||||
}
|
||||
|
||||
} // namespace __llvm_libc
|
|
@ -1,65 +0,0 @@
|
|||
//===-- Implementation of arch specific memcpy ----------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
|
||||
#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
|
||||
|
||||
%%include_file(${memcpy_arch_specific})
|
||||
|
||||
namespace __llvm_libc {
|
||||
|
||||
// Design rationale
|
||||
// ================
|
||||
//
|
||||
// Using a profiler to observe size distributions for calls into libc
|
||||
// functions, it was found most operations act on a small number of bytes.
|
||||
// This makes it important to favor small sizes.
|
||||
//
|
||||
// The tests for `count` are in ascending order so the cost of branching is
|
||||
// proportional to the cost of copying.
|
||||
//
|
||||
// The function is written in C++ for several reasons:
|
||||
// - The compiler can __see__ the code, this is useful when performing Profile
|
||||
// Guided Optimization as the optimized code can take advantage of branching
|
||||
// probabilities.
|
||||
// - It also allows for easier customization and favors testing multiple
|
||||
// implementation parameters.
|
||||
// - As compilers and processors get better, the generated code is improved
|
||||
// with little change on the code side.
|
||||
static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
if (count == 0)
|
||||
return;
|
||||
if (count == 1)
|
||||
return Copy<1>(dst, src);
|
||||
if (count == 2)
|
||||
return Copy<2>(dst, src);
|
||||
if (count == 3)
|
||||
return Copy<3>(dst, src);
|
||||
if (count == 4)
|
||||
return Copy<4>(dst, src);
|
||||
if (count < 8)
|
||||
return CopyOverlap<4>(dst, src, count);
|
||||
if (count == 8)
|
||||
return Copy<8>(dst, src);
|
||||
if (count < 16)
|
||||
return CopyOverlap<8>(dst, src, count);
|
||||
if (count == 16)
|
||||
return Copy<16>(dst, src);
|
||||
if (count < 32)
|
||||
return CopyOverlap<16>(dst, src, count);
|
||||
if (count < 64)
|
||||
return CopyOverlap<32>(dst, src, count);
|
||||
if (count < 128)
|
||||
return CopyOverlap<64>(dst, src, count);
|
||||
CopyGE128(dst, src, count);
|
||||
}
|
||||
|
||||
} // namespace __llvm_libc
|
||||
|
||||
#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
|
|
@ -32,7 +32,7 @@ extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
|
|||
|
||||
// Copies `kBlockSize` bytes from `src` to `dst`.
|
||||
template <size_t kBlockSize>
|
||||
static void Copy(char *__restrict dst, const char *__restrict src) {
|
||||
static void CopyBlock(char *__restrict dst, const char *__restrict src) {
|
||||
#if defined(LLVM_LIBC_MEMCPY_MONITOR)
|
||||
LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
|
||||
#elif defined(USE_BUILTIN_MEMCPY_INLINE)
|
||||
|
@ -52,7 +52,7 @@ template <size_t kBlockSize>
|
|||
static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
const size_t offset = count - kBlockSize;
|
||||
Copy<kBlockSize>(dst + offset, src + offset);
|
||||
CopyBlock<kBlockSize>(dst + offset, src + offset);
|
||||
}
|
||||
|
||||
// Copies `kBlockSize` bytes twice with an overlap between the two.
|
||||
|
@ -64,9 +64,9 @@ static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
|
|||
//
|
||||
// Precondition: `count >= kBlockSize && count <= kBlockSize`.
|
||||
template <size_t kBlockSize>
|
||||
static void CopyOverlap(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
Copy<kBlockSize>(dst, src);
|
||||
static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
CopyBlock<kBlockSize>(dst, src);
|
||||
CopyLastBlock<kBlockSize>(dst, src, count);
|
||||
}
|
||||
|
||||
|
@ -85,14 +85,14 @@ static void CopyOverlap(char *__restrict dst, const char *__restrict src,
|
|||
// Precondition: `count > 2 * kBlockSize` for efficiency.
|
||||
// `count >= kBlockSize` for correctness.
|
||||
template <size_t kBlockSize>
|
||||
static void CopyAligned(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
Copy<kBlockSize>(dst, src); // Copy first block
|
||||
static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
CopyBlock<kBlockSize>(dst, src); // Copy first block
|
||||
|
||||
// Copy aligned blocks
|
||||
size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
|
||||
for (; offset + kBlockSize < count; offset += kBlockSize)
|
||||
Copy<kBlockSize>(dst + offset, src + offset);
|
||||
CopyBlock<kBlockSize>(dst + offset, src + offset);
|
||||
|
||||
CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
|
||||
}
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
//===-- Implementation of memcpy ------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/string/memcpy.h"
|
||||
#include "src/__support/common.h"
|
||||
#include "src/string/memory_utils/memcpy_utils.h"
|
||||
|
||||
namespace __llvm_libc {
|
||||
|
||||
static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
// FIXME: Add MSVC support with
|
||||
// #include <intrin.h>
|
||||
// __movsb(reinterpret_cast<unsigned char *>(dst),
|
||||
// reinterpret_cast<const unsigned char *>(src), count);
|
||||
asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
|
||||
}
|
||||
|
||||
#if defined(__AVX__)
|
||||
#define BEST_SIZE 64
|
||||
#else
|
||||
#define BEST_SIZE 32
|
||||
#endif
|
||||
|
||||
// Design rationale
|
||||
// ================
|
||||
//
|
||||
// Using a profiler to observe size distributions for calls into libc
|
||||
// functions, it was found most operations act on a small number of bytes.
|
||||
// This makes it important to favor small sizes.
|
||||
//
|
||||
// The tests for `count` are in ascending order so the cost of branching is
|
||||
// proportional to the cost of copying.
|
||||
//
|
||||
// The function is written in C++ for several reasons:
|
||||
// - The compiler can __see__ the code, this is useful when performing Profile
|
||||
// Guided Optimization as the optimized code can take advantage of branching
|
||||
// probabilities.
|
||||
// - It also allows for easier customization and favors testing multiple
|
||||
// implementation parameters.
|
||||
// - As compilers and processors get better, the generated code is improved
|
||||
// with little change on the code side.
|
||||
static void memcpy_x86(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
if (count == 0)
|
||||
return;
|
||||
if (count == 1)
|
||||
return CopyBlock<1>(dst, src);
|
||||
if (count == 2)
|
||||
return CopyBlock<2>(dst, src);
|
||||
if (count == 3)
|
||||
return CopyBlock<3>(dst, src);
|
||||
if (count == 4)
|
||||
return CopyBlock<4>(dst, src);
|
||||
if (count < 8)
|
||||
return CopyBlockOverlap<4>(dst, src, count);
|
||||
if (count == 8)
|
||||
return CopyBlock<8>(dst, src);
|
||||
if (count < 16)
|
||||
return CopyBlockOverlap<8>(dst, src, count);
|
||||
if (count == 16)
|
||||
return CopyBlock<16>(dst, src);
|
||||
if (count < 32)
|
||||
return CopyBlockOverlap<16>(dst, src, count);
|
||||
if (count < 64)
|
||||
return CopyBlockOverlap<32>(dst, src, count);
|
||||
if (count < 128)
|
||||
return CopyBlockOverlap<64>(dst, src, count);
|
||||
#if defined(__AVX__)
|
||||
if (count < 256)
|
||||
return CopyBlockOverlap<128>(dst, src, count);
|
||||
#endif
|
||||
// kRepMovsBSize == -1 : Only CopyAligned is used.
|
||||
// kRepMovsBSize == 0 : Only RepMovsb is used.
|
||||
// else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
|
||||
constexpr size_t kRepMovsBSize = -1;
|
||||
if (count <= kRepMovsBSize)
|
||||
return CopyAlignedBlocks<BEST_SIZE>(dst, src, count);
|
||||
return CopyRepMovsb(dst, src, count);
|
||||
}
|
||||
|
||||
void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
|
||||
const void *__restrict src, size_t size) {
|
||||
memcpy_x86(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src),
|
||||
size);
|
||||
return dst;
|
||||
}
|
||||
|
||||
} // namespace __llvm_libc
|
|
@ -1,35 +0,0 @@
|
|||
#include "src/string/memory_utils/memcpy_utils.h"
|
||||
|
||||
namespace __llvm_libc {
|
||||
|
||||
static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
// FIXME: Add MSVC support with
|
||||
// #include <intrin.h>
|
||||
// __movsb(reinterpret_cast<unsigned char *>(dst),
|
||||
// reinterpret_cast<const unsigned char *>(src), count);
|
||||
asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
|
||||
}
|
||||
|
||||
#if defined(__AVX__)
|
||||
#define BEST_SIZE 64
|
||||
#else
|
||||
#define BEST_SIZE 32
|
||||
#endif
|
||||
|
||||
static void CopyGE128(char *__restrict dst, const char *__restrict src,
|
||||
size_t count) {
|
||||
#if defined(__AVX__)
|
||||
if (count < 256)
|
||||
return CopyOverlap<128>(dst, src, count);
|
||||
#endif
|
||||
// kRepMovsBSize == -1 : Only CopyAligned is used.
|
||||
// kRepMovsBSize == 0 : Only RepMovsb is used.
|
||||
// else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
|
||||
constexpr size_t kRepMovsBSize = -1;
|
||||
if (count <= kRepMovsBSize)
|
||||
return CopyAligned<BEST_SIZE>(dst, src, count);
|
||||
CopyRepMovsb(dst, src, count);
|
||||
}
|
||||
|
||||
} // namespace __llvm_libc
|
|
@ -83,37 +83,37 @@ TEST(MemcpyUtilsTest, CopyTrivial) {
|
|||
auto &trace = GetTrace();
|
||||
|
||||
trace.Clear();
|
||||
Copy<1>(I(0), I(0));
|
||||
CopyBlock<1>(I(0), I(0));
|
||||
EXPECT_STREQ(trace.Write(), "1");
|
||||
EXPECT_STREQ(trace.Read(), "1");
|
||||
|
||||
trace.Clear();
|
||||
Copy<2>(I(0), I(0));
|
||||
CopyBlock<2>(I(0), I(0));
|
||||
EXPECT_STREQ(trace.Write(), "11");
|
||||
EXPECT_STREQ(trace.Read(), "11");
|
||||
|
||||
trace.Clear();
|
||||
Copy<4>(I(0), I(0));
|
||||
CopyBlock<4>(I(0), I(0));
|
||||
EXPECT_STREQ(trace.Write(), "1111");
|
||||
EXPECT_STREQ(trace.Read(), "1111");
|
||||
|
||||
trace.Clear();
|
||||
Copy<8>(I(0), I(0));
|
||||
CopyBlock<8>(I(0), I(0));
|
||||
EXPECT_STREQ(trace.Write(), "11111111");
|
||||
EXPECT_STREQ(trace.Read(), "11111111");
|
||||
|
||||
trace.Clear();
|
||||
Copy<16>(I(0), I(0));
|
||||
CopyBlock<16>(I(0), I(0));
|
||||
EXPECT_STREQ(trace.Write(), "1111111111111111");
|
||||
EXPECT_STREQ(trace.Read(), "1111111111111111");
|
||||
|
||||
trace.Clear();
|
||||
Copy<32>(I(0), I(0));
|
||||
CopyBlock<32>(I(0), I(0));
|
||||
EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
|
||||
EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
|
||||
|
||||
trace.Clear();
|
||||
Copy<64>(I(0), I(0));
|
||||
CopyBlock<64>(I(0), I(0));
|
||||
EXPECT_STREQ(
|
||||
trace.Write(),
|
||||
"1111111111111111111111111111111111111111111111111111111111111111");
|
||||
|
@ -126,41 +126,41 @@ TEST(MemcpyUtilsTest, CopyOffset) {
|
|||
auto &trace = GetTrace();
|
||||
|
||||
trace.Clear();
|
||||
Copy<1>(I(3), I(1));
|
||||
CopyBlock<1>(I(3), I(1));
|
||||
EXPECT_STREQ(trace.Write(), "0001");
|
||||
EXPECT_STREQ(trace.Read(), "01");
|
||||
|
||||
trace.Clear();
|
||||
Copy<1>(I(2), I(1));
|
||||
CopyBlock<1>(I(2), I(1));
|
||||
EXPECT_STREQ(trace.Write(), "001");
|
||||
EXPECT_STREQ(trace.Read(), "01");
|
||||
}
|
||||
|
||||
TEST(MemcpyUtilsTest, CopyOverlap) {
|
||||
TEST(MemcpyUtilsTest, CopyBlockOverlap) {
|
||||
auto &trace = GetTrace();
|
||||
|
||||
trace.Clear();
|
||||
CopyOverlap<2>(I(0), I(0), 2);
|
||||
CopyBlockOverlap<2>(I(0), I(0), 2);
|
||||
EXPECT_STREQ(trace.Write(), "22");
|
||||
EXPECT_STREQ(trace.Read(), "22");
|
||||
|
||||
trace.Clear();
|
||||
CopyOverlap<2>(I(0), I(0), 3);
|
||||
CopyBlockOverlap<2>(I(0), I(0), 3);
|
||||
EXPECT_STREQ(trace.Write(), "121");
|
||||
EXPECT_STREQ(trace.Read(), "121");
|
||||
|
||||
trace.Clear();
|
||||
CopyOverlap<2>(I(0), I(0), 4);
|
||||
CopyBlockOverlap<2>(I(0), I(0), 4);
|
||||
EXPECT_STREQ(trace.Write(), "1111");
|
||||
EXPECT_STREQ(trace.Read(), "1111");
|
||||
|
||||
trace.Clear();
|
||||
CopyOverlap<4>(I(2), I(1), 7);
|
||||
CopyBlockOverlap<4>(I(2), I(1), 7);
|
||||
EXPECT_STREQ(trace.Write(), "001112111");
|
||||
EXPECT_STREQ(trace.Read(), "01112111");
|
||||
}
|
||||
|
||||
TEST(MemcpyUtilsTest, CopyAligned) {
|
||||
TEST(MemcpyUtilsTest, CopyAlignedBlocks) {
|
||||
auto &trace = GetTrace();
|
||||
// Destination is aligned already.
|
||||
// "1111000000000"
|
||||
|
@ -169,7 +169,7 @@ TEST(MemcpyUtilsTest, CopyAligned) {
|
|||
// + "0000000001111"
|
||||
// = "1111111112221"
|
||||
trace.Clear();
|
||||
CopyAligned<4>(I(0), I(0), 13);
|
||||
CopyAlignedBlocks<4>(I(0), I(0), 13);
|
||||
EXPECT_STREQ(trace.Write(), "1111111112221");
|
||||
EXPECT_STREQ(trace.Read(), "1111111112221");
|
||||
|
||||
|
@ -180,7 +180,7 @@ TEST(MemcpyUtilsTest, CopyAligned) {
|
|||
// + "00000000001111"
|
||||
// = "01112111112211"
|
||||
trace.Clear();
|
||||
CopyAligned<4>(I(1), I(0), 13);
|
||||
CopyAlignedBlocks<4>(I(1), I(0), 13);
|
||||
EXPECT_STREQ(trace.Write(), "01112111112211");
|
||||
EXPECT_STREQ(trace.Read(), "1112111112211");
|
||||
}
|
||||
|
@ -191,7 +191,7 @@ TEST(MemcpyUtilsTest, MaxReloads) {
|
|||
for (size_t count = 64; count < 768; ++count) {
|
||||
trace.Clear();
|
||||
// We should never reload more than twice when copying from count = 2x32.
|
||||
CopyAligned<32>(I(alignment), I(0), count);
|
||||
CopyAlignedBlocks<32>(I(alignment), I(0), count);
|
||||
const char *const written = trace.Write();
|
||||
// First bytes are untouched.
|
||||
for (size_t i = 0; i < alignment; ++i)
|
||||
|
|
Loading…
Reference in New Issue