crypto: arm64/crc-t10dif - move NEON yield to C code
Instead of yielding from the bowels of the asm routine if a reschedule is needed, divide up the input into 4 KB chunks in the C glue. This simplifies the code substantially, and avoids scheduling out the task with the asm routine on the call stack, which is undesirable from a CFI/instrumentation point of view.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
f0070f4a79
commit
fc754c024a
|
@ -68,10 +68,10 @@
|
|||
.text
|
||||
.arch armv8-a+crypto
|
||||
|
||||
init_crc .req w19
|
||||
buf .req x20
|
||||
len .req x21
|
||||
fold_consts_ptr .req x22
|
||||
init_crc .req w0
|
||||
buf .req x1
|
||||
len .req x2
|
||||
fold_consts_ptr .req x3
|
||||
|
||||
fold_consts .req v10
|
||||
|
||||
|
@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
|
|||
.endm
|
||||
|
||||
.macro crc_t10dif_pmull, p
|
||||
frame_push 4, 128
|
||||
|
||||
mov init_crc, w0
|
||||
mov buf, x1
|
||||
mov len, x2
|
||||
|
||||
__pmull_init_\p
|
||||
|
||||
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
|
||||
|
@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
|
|||
fold_32_bytes \p, v6, v7
|
||||
|
||||
subs len, len, #128
|
||||
b.lt .Lfold_128_bytes_loop_done_\@
|
||||
|
||||
if_will_cond_yield_neon
|
||||
stp q0, q1, [sp, #.Lframe_local_offset]
|
||||
stp q2, q3, [sp, #.Lframe_local_offset + 32]
|
||||
stp q4, q5, [sp, #.Lframe_local_offset + 64]
|
||||
stp q6, q7, [sp, #.Lframe_local_offset + 96]
|
||||
do_cond_yield_neon
|
||||
ldp q0, q1, [sp, #.Lframe_local_offset]
|
||||
ldp q2, q3, [sp, #.Lframe_local_offset + 32]
|
||||
ldp q4, q5, [sp, #.Lframe_local_offset + 64]
|
||||
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
|
||||
ld1 {fold_consts.2d}, [fold_consts_ptr]
|
||||
__pmull_init_\p
|
||||
__pmull_pre_\p fold_consts
|
||||
endif_yield_neon
|
||||
|
||||
b .Lfold_128_bytes_loop_\@
|
||||
|
||||
.Lfold_128_bytes_loop_done_\@:
|
||||
b.ge .Lfold_128_bytes_loop_\@
|
||||
|
||||
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
|
||||
|
||||
|
@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
|
|||
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
|
||||
|
||||
umov w0, v0.h[0]
|
||||
frame_pop
|
||||
.ifc \p, p8
|
||||
ldp x29, x30, [sp], #16
|
||||
.endif
|
||||
ret
|
||||
|
||||
.Lless_than_256_bytes_\@:
|
||||
|
@ -489,7 +466,9 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
|
|||
// Assumes len >= 16.
|
||||
//
|
||||
SYM_FUNC_START(crc_t10dif_pmull_p8)
|
||||
crc_t10dif_pmull p8
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
crc_t10dif_pmull p8
|
||||
SYM_FUNC_END(crc_t10dif_pmull_p8)
|
||||
|
||||
.align 5
|
||||
|
|
|
@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
|
|||
u16 *crc = shash_desc_ctx(desc);
|
||||
|
||||
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
|
||||
kernel_neon_begin();
|
||||
*crc = crc_t10dif_pmull_p8(*crc, data, length);
|
||||
kernel_neon_end();
|
||||
do {
|
||||
unsigned int chunk = length;
|
||||
|
||||
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
|
||||
chunk = SZ_4K;
|
||||
|
||||
kernel_neon_begin();
|
||||
*crc = crc_t10dif_pmull_p8(*crc, data, chunk);
|
||||
kernel_neon_end();
|
||||
data += chunk;
|
||||
length -= chunk;
|
||||
} while (length);
|
||||
} else {
|
||||
*crc = crc_t10dif_generic(*crc, data, length);
|
||||
}
|
||||
|
@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
|
|||
u16 *crc = shash_desc_ctx(desc);
|
||||
|
||||
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
|
||||
kernel_neon_begin();
|
||||
*crc = crc_t10dif_pmull_p64(*crc, data, length);
|
||||
kernel_neon_end();
|
||||
do {
|
||||
unsigned int chunk = length;
|
||||
|
||||
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
|
||||
chunk = SZ_4K;
|
||||
|
||||
kernel_neon_begin();
|
||||
*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
|
||||
kernel_neon_end();
|
||||
data += chunk;
|
||||
length -= chunk;
|
||||
} while (length);
|
||||
} else {
|
||||
*crc = crc_t10dif_generic(*crc, data, length);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue