2019-11-08 20:22:31 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
|
|
|
/*
|
|
|
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <crypto/internal/blake2s.h>
|
|
|
|
#include <crypto/internal/simd.h>
|
|
|
|
#include <crypto/internal/hash.h>
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/jump_label.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/module.h>
|
2020-08-19 19:58:20 +08:00
|
|
|
#include <linux/sizes.h>
|
2019-11-08 20:22:31 +08:00
|
|
|
|
|
|
|
#include <asm/cpufeature.h>
|
|
|
|
#include <asm/fpu/api.h>
|
|
|
|
#include <asm/processor.h>
|
|
|
|
#include <asm/simd.h>
|
|
|
|
|
|
|
|
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
|
|
|
|
const u8 *block, const size_t nblocks,
|
|
|
|
const u32 inc);
|
|
|
|
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
|
|
|
|
const u8 *block, const size_t nblocks,
|
|
|
|
const u32 inc);
|
|
|
|
|
|
|
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
|
|
|
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
|
|
|
|
|
|
|
|
void blake2s_compress_arch(struct blake2s_state *state,
|
|
|
|
const u8 *block, size_t nblocks,
|
|
|
|
const u32 inc)
|
|
|
|
{
|
|
|
|
/* SIMD disables preemption, so relax after processing each page. */
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-23 07:18:53 +08:00
|
|
|
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
|
2019-11-08 20:22:31 +08:00
|
|
|
|
|
|
|
if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
|
|
|
|
blake2s_compress_generic(state, block, nblocks, inc);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-23 07:18:53 +08:00
|
|
|
do {
|
2019-11-08 20:22:31 +08:00
|
|
|
const size_t blocks = min_t(size_t, nblocks,
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-23 07:18:53 +08:00
|
|
|
SZ_4K / BLAKE2S_BLOCK_SIZE);
|
2019-11-08 20:22:31 +08:00
|
|
|
|
|
|
|
kernel_fpu_begin();
|
|
|
|
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
|
|
|
static_branch_likely(&blake2s_use_avx512))
|
|
|
|
blake2s_compress_avx512(state, block, blocks, inc);
|
|
|
|
else
|
|
|
|
blake2s_compress_ssse3(state, block, blocks, inc);
|
|
|
|
kernel_fpu_end();
|
|
|
|
|
|
|
|
nblocks -= blocks;
|
|
|
|
block += blocks * BLAKE2S_BLOCK_SIZE;
|
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-23 07:18:53 +08:00
|
|
|
} while (nblocks);
|
2019-11-08 20:22:31 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(blake2s_compress_arch);
|
|
|
|
|
crypto: blake2s - share the "shash" API boilerplate code
Add helper functions for shash implementations of BLAKE2s to
include/crypto/internal/blake2s.h, taking advantage of
__blake2s_update() and __blake2s_final() that were added by the previous
patch to share more code between the library and shash implementations.
crypto_blake2s_setkey() and crypto_blake2s_init() are usable as
shash_alg::setkey and shash_alg::init directly, while
crypto_blake2s_update() and crypto_blake2s_final() take an extra
'blake2s_compress_t' function pointer parameter. This allows the
implementation of the compression function to be overridden, which is
the only part that optimized implementations really care about.
The new functions are inline functions (similar to those in sha1_base.h,
sha256_base.h, and sm3_base.h) because this avoids needing to add a new
module blake2s_helpers.ko, they aren't *too* long, and this avoids
indirect calls which are expensive these days. Note that they can't go
in blake2s_generic.ko, as that would require selecting CRYPTO_BLAKE2S
from CRYPTO_BLAKE2S_X86, which would cause a recursive dependency.
Finally, use these new helper functions in the x86 implementation of
BLAKE2s. (This part should be a separate patch, but unfortunately the
x86 implementation used the exact same function names like
"crypto_blake2s_update()", so it had to be updated at the same time.)
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-12-23 16:09:54 +08:00
|
|
|
static int crypto_blake2s_update_x86(struct shash_desc *desc,
|
|
|
|
const u8 *in, unsigned int inlen)
|
2019-11-08 20:22:31 +08:00
|
|
|
{
|
crypto: blake2s - share the "shash" API boilerplate code
Add helper functions for shash implementations of BLAKE2s to
include/crypto/internal/blake2s.h, taking advantage of
__blake2s_update() and __blake2s_final() that were added by the previous
patch to share more code between the library and shash implementations.
crypto_blake2s_setkey() and crypto_blake2s_init() are usable as
shash_alg::setkey and shash_alg::init directly, while
crypto_blake2s_update() and crypto_blake2s_final() take an extra
'blake2s_compress_t' function pointer parameter. This allows the
implementation of the compression function to be overridden, which is
the only part that optimized implementations really care about.
The new functions are inline functions (similar to those in sha1_base.h,
sha256_base.h, and sm3_base.h) because this avoids needing to add a new
module blake2s_helpers.ko, they aren't *too* long, and this avoids
indirect calls which are expensive these days. Note that they can't go
in blake2s_generic.ko, as that would require selecting CRYPTO_BLAKE2S
from CRYPTO_BLAKE2S_X86, which would cause a recursive dependency.
Finally, use these new helper functions in the x86 implementation of
BLAKE2s. (This part should be a separate patch, but unfortunately the
x86 implementation used the exact same function names like
"crypto_blake2s_update()", so it had to be updated at the same time.)
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-12-23 16:09:54 +08:00
|
|
|
return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
|
2019-11-08 20:22:31 +08:00
|
|
|
}
|
|
|
|
|
crypto: blake2s - share the "shash" API boilerplate code
Add helper functions for shash implementations of BLAKE2s to
include/crypto/internal/blake2s.h, taking advantage of
__blake2s_update() and __blake2s_final() that were added by the previous
patch to share more code between the library and shash implementations.
crypto_blake2s_setkey() and crypto_blake2s_init() are usable as
shash_alg::setkey and shash_alg::init directly, while
crypto_blake2s_update() and crypto_blake2s_final() take an extra
'blake2s_compress_t' function pointer parameter. This allows the
implementation of the compression function to be overridden, which is
the only part that optimized implementations really care about.
The new functions are inline functions (similar to those in sha1_base.h,
sha256_base.h, and sm3_base.h) because this avoids needing to add a new
module blake2s_helpers.ko, they aren't *too* long, and this avoids
indirect calls which are expensive these days. Note that they can't go
in blake2s_generic.ko, as that would require selecting CRYPTO_BLAKE2S
from CRYPTO_BLAKE2S_X86, which would cause a recursive dependency.
Finally, use these new helper functions in the x86 implementation of
BLAKE2s. (This part should be a separate patch, but unfortunately the
x86 implementation used the exact same function names like
"crypto_blake2s_update()", so it had to be updated at the same time.)
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-12-23 16:09:54 +08:00
|
|
|
/*
 * shash ->final hook for the x86 BLAKE2s algorithms.
 *
 * Delegates to the shared crypto_blake2s_final() helper, passing
 * blake2s_compress_arch as the compression function, and returns the
 * helper's result unchanged.
 */
static int
crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
{
	return crypto_blake2s_final(desc, out, blake2s_compress_arch);
}
|
|
|
|
|
2020-12-23 16:09:51 +08:00
|
|
|
/*
 * Expands to one struct shash_alg initializer for a BLAKE2s variant.
 *   name         - value for cra_name
 *   driver_name  - value for cra_driver_name
 *   digest_size  - value for digestsize
 * Every variant shares priority 200, the CRYPTO_ALG_OPTIONAL_KEY flag,
 * the generic setkey/init helpers and the x86 update/final wrappers.
 */
#define BLAKE2S_ALG(name, driver_name, digest_size)			\
	{								\
		.base = {						\
			.cra_name	 = name,			\
			.cra_driver_name = driver_name,			\
			.cra_priority	 = 200,				\
			.cra_flags	 = CRYPTO_ALG_OPTIONAL_KEY,	\
			.cra_blocksize	 = BLAKE2S_BLOCK_SIZE,		\
			.cra_ctxsize	 = sizeof(struct blake2s_tfm_ctx), \
			.cra_module	 = THIS_MODULE,			\
		},							\
		.digestsize	= digest_size,				\
		.setkey		= crypto_blake2s_setkey,		\
		.init		= crypto_blake2s_init,			\
		.update		= crypto_blake2s_update_x86,		\
		.final		= crypto_blake2s_final_x86,		\
		.descsize	= sizeof(struct blake2s_state),		\
	}
|
|
|
|
|
|
|
|
static struct shash_alg blake2s_algs[] = {
|
|
|
|
BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
|
|
|
|
BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
|
|
|
|
BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
|
|
|
|
BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
|
|
|
|
};
|
2019-11-08 20:22:31 +08:00
|
|
|
|
|
|
|
/*
 * Module init: choose the compression routines for this CPU and register
 * the shash algorithms.
 *
 * Without SSSE3 nothing is enabled or registered and 0 is returned.
 * With SSSE3 the SSSE3 key is enabled; the AVX-512 key is enabled as well
 * when CONFIG_AS_AVX512 is set, the CPU reports AVX, AVX2, AVX512F and
 * AVX512VL, and cpu_has_xfeatures() accepts the SSE, YMM and AVX-512
 * xfeature mask.
 *
 * Returns the result of crypto_register_shashes() when CONFIG_CRYPTO_HASH
 * is reachable, otherwise 0.
 */
static int __init blake2s_mod_init(void)
{
	const u64 avx512_xfeatures = XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
				     XFEATURE_MASK_AVX512;
	bool avx512_ok;

	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	static_branch_enable(&blake2s_use_ssse3);

	avx512_ok = IS_ENABLED(CONFIG_AS_AVX512) &&
		    boot_cpu_has(X86_FEATURE_AVX) &&
		    boot_cpu_has(X86_FEATURE_AVX2) &&
		    boot_cpu_has(X86_FEATURE_AVX512F) &&
		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
		    cpu_has_xfeatures(avx512_xfeatures, NULL);
	if (avx512_ok)
		static_branch_enable(&blake2s_use_avx512);

	/* Without a reachable CONFIG_CRYPTO_HASH there is nothing to register. */
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return 0;

	return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
|
|
|
|
|
|
|
|
/*
 * Module exit: unregister the shash algorithms.
 *
 * Uses the same two conditions under which blake2s_mod_init() registers
 * them: CONFIG_CRYPTO_HASH reachable and SSSE3 present on the boot CPU.
 */
static void __exit blake2s_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH) ||
	    !boot_cpu_has(X86_FEATURE_SSSE3))
		return;

	crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
|
|
|
|
|
|
|
|
module_init(blake2s_mod_init);
|
|
|
|
module_exit(blake2s_mod_exit);
|
|
|
|
|
|
|
|
MODULE_ALIAS_CRYPTO("blake2s-128");
|
|
|
|
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
|
|
|
|
MODULE_ALIAS_CRYPTO("blake2s-160");
|
|
|
|
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
|
|
|
|
MODULE_ALIAS_CRYPTO("blake2s-224");
|
|
|
|
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
|
|
|
|
MODULE_ALIAS_CRYPTO("blake2s-256");
|
|
|
|
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
|
|
|
|
MODULE_LICENSE("GPL v2");
|