2019-11-08 20:22:25 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
/*
 * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
 *
 * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */
|
|
|
|
|
|
|
|
#include <asm/hwcap.h>
|
|
|
|
#include <asm/neon.h>
|
|
|
|
#include <asm/simd.h>
|
|
|
|
#include <asm/unaligned.h>
|
|
|
|
#include <crypto/algapi.h>
|
|
|
|
#include <crypto/internal/hash.h>
|
|
|
|
#include <crypto/internal/poly1305.h>
|
|
|
|
#include <crypto/internal/simd.h>
|
|
|
|
#include <linux/cpufeature.h>
|
|
|
|
#include <linux/crypto.h>
|
|
|
|
#include <linux/jump_label.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
|
|
|
|
void poly1305_init_arm(void *state, const u8 *key);
|
|
|
|
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
|
2020-08-25 09:23:00 +08:00
|
|
|
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
2020-01-06 11:40:49 +08:00
|
|
|
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
|
2019-11-08 20:22:25 +08:00
|
|
|
|
|
|
|
/*
 * Weak, do-nothing definition of the NEON block routine.  Being __weak, it is
 * replaced at link time by any non-weak definition of the same symbol.
 * NOTE(review): presumably this exists so the file still links when the NEON
 * implementation is not built -- confirm against the Makefile.
 */
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
{
	/* intentionally empty */
}
|
|
|
|
|
|
|
|
/*
 * Static key, initially false.  arm_poly1305_mod_init() enables it when
 * CONFIG_KERNEL_MODE_NEON is set and elf_hwcap advertises HWCAP_NEON; the
 * update paths test it before calling poly1305_blocks_neon().
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
|
|
|
|
|
|
|
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
|
|
|
{
|
|
|
|
poly1305_init_arm(&dctx->h, key);
|
|
|
|
dctx->s[0] = get_unaligned_le32(key + 16);
|
|
|
|
dctx->s[1] = get_unaligned_le32(key + 20);
|
|
|
|
dctx->s[2] = get_unaligned_le32(key + 24);
|
|
|
|
dctx->s[3] = get_unaligned_le32(key + 28);
|
|
|
|
dctx->buflen = 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(poly1305_init_arch);
|
|
|
|
|
|
|
|
static int arm_poly1305_init(struct shash_desc *desc)
|
|
|
|
{
|
|
|
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
|
|
|
|
|
|
|
dctx->buflen = 0;
|
|
|
|
dctx->rset = 0;
|
|
|
|
dctx->sset = false;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Feed whole blocks from @src into the accumulator, first peeling off any key
 * material that has not been loaded yet.
 *
 * @dctx:    descriptor context (state in dctx->h, key flags rset/sset)
 * @src:     input data
 * @len:     number of input bytes; both callers in this file pass at least
 *           POLY1305_BLOCK_SIZE.  Any trailing partial block is ignored here.
 * @hibit:   passed through unchanged to the block routine
 * @do_neon: caller allows poly1305_blocks_neon(); it is only used when the
 *           have_neon static key is also enabled
 *
 * While dctx->sset is false the leading input is consumed as the key: one
 * block for poly1305_init_arm() (if rset is clear), then one block stored in
 * dctx->s as four little-endian words.
 */
static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
				 u32 len, u32 hibit, bool do_neon)
{
	if (unlikely(!dctx->sset)) {
		unsigned int word;

		if (!dctx->rset) {
			poly1305_init_arm(&dctx->h, src);
			dctx->rset = 1;
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
		}

		/* Not enough input left for the second key half yet. */
		if (len < POLY1305_BLOCK_SIZE)
			return;

		for (word = 0; word < 4; word++)
			dctx->s[word] = get_unaligned_le32(src + 4 * word);
		dctx->sset = true;
		src += POLY1305_BLOCK_SIZE;
		len -= POLY1305_BLOCK_SIZE;

		/* The key used up everything except (at most) a partial block. */
		if (len < POLY1305_BLOCK_SIZE)
			return;
	}

	/* Only whole blocks are processed. */
	len = round_down(len, POLY1305_BLOCK_SIZE);

	if (static_branch_likely(&have_neon) && likely(do_neon)) {
		poly1305_blocks_neon(&dctx->h, src, len, hibit);
		return;
	}

	poly1305_blocks_arm(&dctx->h, src, len, hibit);
}
|
|
|
|
|
|
|
|
static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
|
|
|
const u8 *src, u32 len, bool do_neon)
|
|
|
|
{
|
|
|
|
if (unlikely(dctx->buflen)) {
|
|
|
|
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
|
|
|
|
|
|
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
|
|
|
src += bytes;
|
|
|
|
len -= bytes;
|
|
|
|
dctx->buflen += bytes;
|
|
|
|
|
|
|
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
|
|
|
arm_poly1305_blocks(dctx, dctx->buf,
|
|
|
|
POLY1305_BLOCK_SIZE, 1, false);
|
|
|
|
dctx->buflen = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
|
|
|
arm_poly1305_blocks(dctx, src, len, 1, do_neon);
|
|
|
|
src += round_down(len, POLY1305_BLOCK_SIZE);
|
|
|
|
len %= POLY1305_BLOCK_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(len)) {
|
|
|
|
dctx->buflen = len;
|
|
|
|
memcpy(dctx->buf, src, len);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int arm_poly1305_update(struct shash_desc *desc,
|
|
|
|
const u8 *src, unsigned int srclen)
|
|
|
|
{
|
|
|
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
|
|
|
|
|
|
|
arm_poly1305_do_update(dctx, src, srclen, false);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * shash .update hook for the "poly1305-neon" driver.
 *
 * @desc:   shash descriptor whose context is a struct poly1305_desc_ctx
 * @src:    input data
 * @srclen: number of input bytes
 *
 * NEON is requested only when SIMD is usable in the current context and the
 * request is longer than 128 bytes; otherwise the data goes through the
 * scalar routine.  Always returns 0.
 *
 * On the NEON path the input is fed in chunks of at most SZ_4K, each inside
 * its own kernel_neon_begin()/kernel_neon_end() section, so that a single
 * large request cannot keep the NEON section open for an unbounded time.
 * This matches the chunking done by poly1305_update_arch().  Splitting is
 * safe because arm_poly1305_do_update() is a streaming update that carries
 * partial blocks (and not-yet-loaded key material) across calls in @dctx.
 */
static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
						   const u8 *src,
						   unsigned int srclen)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
	bool do_neon = crypto_simd_usable() && srclen > 128;

	if (!static_branch_likely(&have_neon) || !do_neon) {
		/*
		 * do_neon is still forwarded as-is; arm_poly1305_blocks()
		 * re-checks have_neon before touching the NEON routine.
		 */
		arm_poly1305_do_update(dctx, src, srclen, do_neon);
		return 0;
	}

	/* srclen > 128 here, so the loop body runs at least once. */
	do {
		unsigned int todo = min_t(unsigned int, srclen, SZ_4K);

		kernel_neon_begin();
		arm_poly1305_do_update(dctx, src, todo, true);
		kernel_neon_end();

		src += todo;
		srclen -= todo;
	} while (srclen);

	return 0;
}
|
|
|
|
|
|
|
|
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
|
|
|
unsigned int nbytes)
|
|
|
|
{
|
|
|
|
bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
|
|
|
|
crypto_simd_usable();
|
|
|
|
|
|
|
|
if (unlikely(dctx->buflen)) {
|
|
|
|
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
|
|
|
|
|
|
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
|
|
|
src += bytes;
|
|
|
|
nbytes -= bytes;
|
|
|
|
dctx->buflen += bytes;
|
|
|
|
|
|
|
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
|
|
|
poly1305_blocks_arm(&dctx->h, dctx->buf,
|
|
|
|
POLY1305_BLOCK_SIZE, 1);
|
|
|
|
dctx->buflen = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
|
|
|
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
|
|
|
|
|
|
|
if (static_branch_likely(&have_neon) && do_neon) {
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-23 07:18:53 +08:00
|
|
|
do {
|
|
|
|
unsigned int todo = min_t(unsigned int, len, SZ_4K);
|
|
|
|
|
|
|
|
kernel_neon_begin();
|
|
|
|
poly1305_blocks_neon(&dctx->h, src, todo, 1);
|
|
|
|
kernel_neon_end();
|
|
|
|
|
|
|
|
len -= todo;
|
|
|
|
src += todo;
|
|
|
|
} while (len);
|
2019-11-08 20:22:25 +08:00
|
|
|
} else {
|
|
|
|
poly1305_blocks_arm(&dctx->h, src, len, 1);
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-23 07:18:53 +08:00
|
|
|
src += len;
|
2019-11-08 20:22:25 +08:00
|
|
|
}
|
|
|
|
nbytes %= POLY1305_BLOCK_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(nbytes)) {
|
|
|
|
dctx->buflen = nbytes;
|
|
|
|
memcpy(dctx->buf, src, nbytes);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(poly1305_update_arch);
|
|
|
|
|
|
|
|
/*
 * poly1305_final_arch - finish the computation and write the tag
 * @dctx: descriptor context; zeroed on return
 * @dst:  output buffer handed to poly1305_emit_arm()
 *
 * If a partial block is pending it is terminated with a single 0x01 byte,
 * zero-filled up to POLY1305_BLOCK_SIZE and processed with hibit == 0.  The
 * tag is then emitted using dctx->s, and the whole context is cleared.
 */
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	if (unlikely(dctx->buflen)) {
		unsigned int used = dctx->buflen;

		dctx->buf[used++] = 1;
		memset(dctx->buf + used, 0, POLY1305_BLOCK_SIZE - used);
		dctx->buflen = used;

		poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_emit_arm(&dctx->h, dst, dctx->s);

	/* Wipe state and key material from the context. */
	*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
|
|
|
|
|
|
|
|
/*
 * shash .final hook.
 *
 * Returns -ENOKEY if the second key half was never loaded from the input
 * stream (dctx->sset still false); otherwise finalises through
 * poly1305_final_arch(), writing the tag to @dst, and returns 0.
 */
static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *ctx = shash_desc_ctx(desc);

	if (likely(ctx->sset)) {
		poly1305_final_arch(ctx, dst);
		return 0;
	}

	return -ENOKEY;
}
|
|
|
|
|
|
|
|
static struct shash_alg arm_poly1305_algs[] = {{
|
|
|
|
.init = arm_poly1305_init,
|
|
|
|
.update = arm_poly1305_update,
|
|
|
|
.final = arm_poly1305_final,
|
|
|
|
.digestsize = POLY1305_DIGEST_SIZE,
|
|
|
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
|
|
|
|
|
|
|
.base.cra_name = "poly1305",
|
|
|
|
.base.cra_driver_name = "poly1305-arm",
|
|
|
|
.base.cra_priority = 150,
|
|
|
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
|
|
|
.base.cra_module = THIS_MODULE,
|
|
|
|
#ifdef CONFIG_KERNEL_MODE_NEON
|
|
|
|
}, {
|
|
|
|
.init = arm_poly1305_init,
|
|
|
|
.update = arm_poly1305_update_neon,
|
|
|
|
.final = arm_poly1305_final,
|
|
|
|
.digestsize = POLY1305_DIGEST_SIZE,
|
|
|
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
|
|
|
|
|
|
|
.base.cra_name = "poly1305",
|
|
|
|
.base.cra_driver_name = "poly1305-neon",
|
|
|
|
.base.cra_priority = 200,
|
|
|
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
|
|
|
.base.cra_module = THIS_MODULE,
|
|
|
|
#endif
|
|
|
|
}};
|
|
|
|
|
|
|
|
/*
 * Module init.
 *
 * Enables the have_neon static key when kernel-mode NEON is configured and
 * the CPU advertises HWCAP_NEON.  If the shash API is reachable, registers
 * either the whole algorithm table (NEON present) or only its first entry
 * (no NEON); if it is not reachable, nothing is registered and 0 is returned.
 *
 * Return: 0 on success or the error from the crypto registration call.
 */
static int __init arm_poly1305_mod_init(void)
{
	bool neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
		    (elf_hwcap & HWCAP_NEON);

	if (neon)
		static_branch_enable(&have_neon);

	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return 0;

	/* Without NEON, register only the first entry. */
	if (!neon)
		return crypto_register_shash(&arm_poly1305_algs[0]);

	return crypto_register_shashes(arm_poly1305_algs,
				       ARRAY_SIZE(arm_poly1305_algs));
}
|
|
|
|
|
|
|
|
/*
 * Module exit: undo whatever arm_poly1305_mod_init() registered.  Nothing was
 * registered if the shash API is not reachable; otherwise the have_neon key
 * tells whether the full table or just its first entry is live.
 */
static void __exit arm_poly1305_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return;

	if (static_branch_likely(&have_neon))
		crypto_unregister_shashes(arm_poly1305_algs,
					  ARRAY_SIZE(arm_poly1305_algs));
	else
		crypto_unregister_shash(&arm_poly1305_algs[0]);
}
|
|
|
|
|
|
|
|
module_init(arm_poly1305_mod_init);
|
|
|
|
module_exit(arm_poly1305_mod_exit);
|
|
|
|
|
|
|
|
MODULE_LICENSE("GPL v2");
|
|
|
|
MODULE_ALIAS_CRYPTO("poly1305");
|
|
|
|
MODULE_ALIAS_CRYPTO("poly1305-arm");
|
|
|
|
MODULE_ALIAS_CRYPTO("poly1305-neon");
|