Merge pull request #4920 from apple/anoyes/aarch64-memcmp

Use custom mem{cmp,cpy} impl on Arm
This commit is contained in:
Andrew Noyes 2021-08-26 14:30:05 -07:00 committed by GitHub
commit 05853b97c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 496 additions and 0 deletions

View File

@ -562,3 +562,28 @@ folly_memcpy:
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Arm Limited (optimized-routines)
MIT License
Copyright (c) 1999-2019, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -120,6 +120,10 @@ if(UNIX AND NOT APPLE)
list(APPEND FLOW_SRCS folly_memcpy.S)
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
list(APPEND FLOW_SRCS aarch64/memcmp.S aarch64/memcpy.S)
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)

98
flow/aarch64/asmdefs.h Normal file
View File

@ -0,0 +1,98 @@
/*
* Macros for asm code.
*
* Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
#ifndef _ASMDEFS_H
#define _ASMDEFS_H
#if defined(__aarch64__)
/* Branch Target Identitication support. */
#define BTI_C hint 34
#define BTI_J hint 36
/* Return address signing support (pac-ret). */
#define PACIASP hint 25; .cfi_window_save
#define AUTIASP hint 29; .cfi_window_save
/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
#define FEATURE_1_AND 0xc0000000
#define FEATURE_1_BTI 1
#define FEATURE_1_PAC 2
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 3; \
.word 4; \
.word 16; \
.word 5; \
.asciz "GNU"; \
.word type; \
.word 4; \
.word value; \
.word 0; \
.text
/* If set then the GNU Property Note section will be added to
mark objects to support BTI and PAC-RET. */
#ifndef WANT_GNU_PROPERTY
#define WANT_GNU_PROPERTY 1
#endif
#if WANT_GNU_PROPERTY
/* Add property note with supported features to all asm files. */
GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
#endif
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
.align alignment; \
name: \
.cfi_startproc; \
BTI_C;
#else
#define END_FILE
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
.align alignment; \
name: \
.cfi_startproc;
#endif
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
.global name; \
.type name,%function; \
name:
#define END(name) \
.cfi_endproc; \
.size name, .-name;
#define L(l) .L ## l
#ifdef __ILP32__
/* Sanitize padding bits of pointer arguments as per aapcs64 */
#define PTR_ARG(n) mov w##n, w##n
#else
#define PTR_ARG(n)
#endif
#ifdef __ILP32__
/* Sanitize padding bits of size arguments as per aapcs64 */
#define SIZE_ARG(n) mov w##n, w##n
#else
#define SIZE_ARG(n)
#endif
#endif

136
flow/aarch64/memcmp.S Normal file
View File

@ -0,0 +1,136 @@
/* memcmp - compare memory
*
* Copyright (c) 2013-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*/
#include "asmdefs.h"
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
/* Internal variables. */
#define data1 x3
#define data1w w3
#define data1h x4
#define data2 x5
#define data2w w5
#define data2h x6
#define tmp1 x7
#define tmp2 x8
ENTRY (memcmp)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
subs limit, limit, 8
b.lo L(less8)
ldr data1, [src1], 8
ldr data2, [src2], 8
cmp data1, data2
b.ne L(return)
subs limit, limit, 8
b.gt L(more16)
ldr data1, [src1, limit]
ldr data2, [src2, limit]
b L(return)
L(more16):
ldr data1, [src1], 8
ldr data2, [src2], 8
cmp data1, data2
bne L(return)
/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
strings. */
subs limit, limit, 16
b.ls L(last_bytes)
/* We overlap loads between 0-32 bytes at either side of SRC1 when we
try to align, so limit it only to strings larger than 128 bytes. */
cmp limit, 96
b.ls L(loop16)
/* Align src1 and adjust src2 with bytes not yet done. */
and tmp1, src1, 15
add limit, limit, tmp1
sub src1, src1, tmp1
sub src2, src2, tmp1
/* Loop performing 16 bytes per iteration using aligned src1.
Limit is pre-decremented by 16 and must be larger than zero.
Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
L(loop16):
ldp data1, data1h, [src1], 16
ldp data2, data2h, [src2], 16
subs limit, limit, 16
ccmp data1, data2, 0, hi
ccmp data1h, data2h, 0, eq
b.eq L(loop16)
cmp data1, data2
bne L(return)
mov data1, data1h
mov data2, data2h
cmp data1, data2
bne L(return)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
add src1, src1, limit
add src2, src2, limit
ldp data1, data1h, [src1]
ldp data2, data2h, [src2]
cmp data1, data2
bne L(return)
mov data1, data1h
mov data2, data2h
cmp data1, data2
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
L(ret_eq):
cset result, ne
cneg result, result, lo
ret
.p2align 4
/* Compare up to 8 bytes. Limit is [-8..-1]. */
L(less8):
adds limit, limit, 4
b.lo L(less4)
ldr data1w, [src1], 4
ldr data2w, [src2], 4
cmp data1w, data2w
b.ne L(return)
sub limit, limit, 4
L(less4):
adds limit, limit, 4
beq L(ret_eq)
L(byte_loop):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.eq L(byte_loop)
sub result, data1w, data2w
ret
END (memcmp)

206
flow/aarch64/memcpy.S Normal file
View File

@ -0,0 +1,206 @@
/*
* memcpy - copy memory area
*
* Copyright (c) 2019-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#include "asmdefs.h"
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_lw w10
#define tmp1 x14
#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
/* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small, simple and improve performance.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration.
The source pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end.
*/
ENTRY_ALIAS (memmove)
ENTRY (memcpy)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
b.hi L(copy_long)
cmp count, 32
b.hi L(copy32_128)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
ldr A_q, [src]
ldr B_q, [srcend, -16]
str A_q, [dstin]
str B_q, [dstend, -16]
ret
/* Copy 8-15 bytes. */
L(copy16):
tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
ldr A_lw, [src]
ldr B_lw, [srcend, -4]
str A_lw, [dstin]
str B_lw, [dstend, -4]
ret
/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend, -1]
L(copy0):
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
ldp A_q, B_q, [src]
ldp C_q, D_q, [srcend, -32]
cmp count, 64
b.hi L(copy128)
stp A_q, B_q, [dstin]
stp C_q, D_q, [dstend, -32]
ret
.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
ldp E_q, F_q, [src, 32]
cmp count, 96
b.ls L(copy96)
ldp G_q, H_q, [srcend, -64]
stp G_q, H_q, [dstend, -64]
L(copy96):
stp A_q, B_q, [dstin]
stp E_q, F_q, [dstin, 32]
stp C_q, D_q, [dstend, -32]
ret
/* Copy more than 128 bytes. */
L(copy_long):
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cmp tmp1, count
b.lo L(copy_long_backwards)
/* Copy 16 bytes and then align src to 16-byte alignment. */
ldr D_q, [src]
and tmp1, src, 15
bic src, src, 15
sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_q, B_q, [src, 16]
str D_q, [dstin]
ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
L(loop64):
stp A_q, B_q, [dst, 16]
ldp A_q, B_q, [src, 80]
stp C_q, D_q, [dst, 48]
ldp C_q, D_q, [src, 112]
add src, src, 64
add dst, dst, 64
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_q, F_q, [srcend, -64]
stp A_q, B_q, [dst, 16]
ldp A_q, B_q, [srcend, -32]
stp C_q, D_q, [dst, 48]
stp E_q, F_q, [dstend, -64]
stp A_q, B_q, [dstend, -32]
ret
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
cbz tmp1, L(copy0)
ldr D_q, [srcend, -16]
and tmp1, srcend, 15
bic srcend, srcend, 15
sub count, count, tmp1
ldp A_q, B_q, [srcend, -32]
str D_q, [dstend, -16]
ldp C_q, D_q, [srcend, -64]
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
str B_q, [dstend, -16]
str A_q, [dstend, -32]
ldp A_q, B_q, [srcend, -96]
str D_q, [dstend, -48]
str C_q, [dstend, -64]!
ldp C_q, D_q, [srcend, -128]
sub srcend, srcend, 64
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp E_q, F_q, [src, 32]
stp A_q, B_q, [dstend, -32]
ldp A_q, B_q, [src]
stp C_q, D_q, [dstend, -64]
stp E_q, F_q, [dstin, 32]
stp A_q, B_q, [dstin]
ret
END (memcpy)

View File

@ -67,3 +67,30 @@ static void bench_hash(benchmark::State& state) {
BENCHMARK_TEMPLATE(bench_hash, HashType::CRC32C)->DenseRange(2, 18)->ReportAggregatesOnly(true);
BENCHMARK_TEMPLATE(bench_hash, HashType::HashLittle2)->DenseRange(2, 18)->ReportAggregatesOnly(true);
BENCHMARK_TEMPLATE(bench_hash, HashType::XXHash3)->DenseRange(2, 18)->ReportAggregatesOnly(true);
static void bench_memcmp(benchmark::State& state) {
constexpr int kLength = 10000;
std::unique_ptr<char[]> b1{ new char[kLength] };
std::unique_ptr<char[]> b2{ new char[kLength] };
memset(b1.get(), 0, kLength);
memset(b2.get(), 0, kLength);
b2.get()[kLength - 1] = 1;
while (state.KeepRunning()) {
benchmark::DoNotOptimize(memcmp(b1.get(), b2.get(), kLength));
}
}
static void bench_memcpy(benchmark::State& state) {
constexpr int kLength = 10000;
std::unique_ptr<char[]> b1{ new char[kLength] };
std::unique_ptr<char[]> b2{ new char[kLength] };
memset(b1.get(), 0, kLength);
while (state.KeepRunning()) {
benchmark::DoNotOptimize(memcpy(b2.get(), b1.get(), kLength));
}
}
BENCHMARK(bench_memcmp);
BENCHMARK(bench_memcpy);