crypto: x86/chacha20 - add XChaCha20 support
Add an XChaCha20 implementation that is hooked up to the x86_64 SIMD implementations of ChaCha20. This can be used by Adiantum.

An SSSE3 implementation of single-block HChaCha20 is also added so that XChaCha20 can use it rather than the generic implementation. This required refactoring the ChaCha permutation into its own function.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
0f961f9f67
commit
4af7826187
|
@ -10,6 +10,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/frame.h>
|
||||||
|
|
||||||
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
||||||
.align 16
|
.align 16
|
||||||
|
@ -23,37 +24,24 @@ CTRINC: .octa 0x00000003000000020000000100000000
|
||||||
|
|
||||||
.text
|
.text
|
||||||
|
|
||||||
ENTRY(chacha20_block_xor_ssse3)
|
/*
|
||||||
# %rdi: Input state matrix, s
|
* chacha20_permute - permute one block
|
||||||
# %rsi: up to 1 data block output, o
|
*
|
||||||
# %rdx: up to 1 data block input, i
|
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
|
||||||
# %rcx: input/output length in bytes
|
* function performs matrix operations on four words in parallel, but requires
|
||||||
|
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
|
||||||
# This function encrypts one ChaCha20 block by loading the state matrix
|
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
|
||||||
# in four SSE registers. It performs matrix operation on four words in
|
* rotation uses traditional shift+OR.
|
||||||
# parallel, but requires shuffling to rearrange the words after each
|
*
|
||||||
# round. 8/16-bit word rotation is done with the slightly better
|
* Clobbers: %ecx, %xmm4-%xmm7
|
||||||
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
|
*/
|
||||||
# traditional shift+OR.
|
chacha20_permute:
|
||||||
|
|
||||||
# x0..3 = s0..3
|
|
||||||
movdqa 0x00(%rdi),%xmm0
|
|
||||||
movdqa 0x10(%rdi),%xmm1
|
|
||||||
movdqa 0x20(%rdi),%xmm2
|
|
||||||
movdqa 0x30(%rdi),%xmm3
|
|
||||||
movdqa %xmm0,%xmm8
|
|
||||||
movdqa %xmm1,%xmm9
|
|
||||||
movdqa %xmm2,%xmm10
|
|
||||||
movdqa %xmm3,%xmm11
|
|
||||||
|
|
||||||
movdqa ROT8(%rip),%xmm4
|
movdqa ROT8(%rip),%xmm4
|
||||||
movdqa ROT16(%rip),%xmm5
|
movdqa ROT16(%rip),%xmm5
|
||||||
|
|
||||||
mov %rcx,%rax
|
|
||||||
mov $10,%ecx
|
mov $10,%ecx
|
||||||
|
|
||||||
.Ldoubleround:
|
.Ldoubleround:
|
||||||
|
|
||||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
paddd %xmm1,%xmm0
|
paddd %xmm1,%xmm0
|
||||||
pxor %xmm0,%xmm3
|
pxor %xmm0,%xmm3
|
||||||
|
@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3)
|
||||||
dec %ecx
|
dec %ecx
|
||||||
jnz .Ldoubleround
|
jnz .Ldoubleround
|
||||||
|
|
||||||
|
ret
|
||||||
|
ENDPROC(chacha20_permute)
|
||||||
|
|
||||||
|
ENTRY(chacha20_block_xor_ssse3)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 1 data block output, o
|
||||||
|
# %rdx: up to 1 data block input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
FRAME_BEGIN
|
||||||
|
|
||||||
|
# x0..3 = s0..3
|
||||||
|
movdqa 0x00(%rdi),%xmm0
|
||||||
|
movdqa 0x10(%rdi),%xmm1
|
||||||
|
movdqa 0x20(%rdi),%xmm2
|
||||||
|
movdqa 0x30(%rdi),%xmm3
|
||||||
|
movdqa %xmm0,%xmm8
|
||||||
|
movdqa %xmm1,%xmm9
|
||||||
|
movdqa %xmm2,%xmm10
|
||||||
|
movdqa %xmm3,%xmm11
|
||||||
|
|
||||||
|
mov %rcx,%rax
|
||||||
|
call chacha20_permute
|
||||||
|
|
||||||
# o0 = i0 ^ (x0 + s0)
|
# o0 = i0 ^ (x0 + s0)
|
||||||
paddd %xmm8,%xmm0
|
paddd %xmm8,%xmm0
|
||||||
cmp $0x10,%rax
|
cmp $0x10,%rax
|
||||||
|
@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3)
|
||||||
movdqu %xmm0,0x30(%rsi)
|
movdqu %xmm0,0x30(%rsi)
|
||||||
|
|
||||||
.Ldone:
|
.Ldone:
|
||||||
|
FRAME_END
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.Lxorpart:
|
.Lxorpart:
|
||||||
|
@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3)
|
||||||
|
|
||||||
ENDPROC(chacha20_block_xor_ssse3)
|
ENDPROC(chacha20_block_xor_ssse3)
|
||||||
|
|
||||||
|
ENTRY(hchacha20_block_ssse3)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: output (8 32-bit words)
|
||||||
|
FRAME_BEGIN
|
||||||
|
|
||||||
|
movdqa 0x00(%rdi),%xmm0
|
||||||
|
movdqa 0x10(%rdi),%xmm1
|
||||||
|
movdqa 0x20(%rdi),%xmm2
|
||||||
|
movdqa 0x30(%rdi),%xmm3
|
||||||
|
|
||||||
|
call chacha20_permute
|
||||||
|
|
||||||
|
movdqu %xmm0,0x00(%rsi)
|
||||||
|
movdqu %xmm3,0x10(%rsi)
|
||||||
|
|
||||||
|
FRAME_END
|
||||||
|
ret
|
||||||
|
ENDPROC(hchacha20_block_ssse3)
|
||||||
|
|
||||||
ENTRY(chacha20_4block_xor_ssse3)
|
ENTRY(chacha20_4block_xor_ssse3)
|
||||||
# %rdi: Input state matrix, s
|
# %rdi: Input state matrix, s
|
||||||
# %rsi: up to 4 data blocks output, o
|
# %rsi: up to 4 data blocks output, o
|
||||||
|
|
|
@ -23,6 +23,7 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||||
unsigned int len);
|
unsigned int len);
|
||||||
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||||
unsigned int len);
|
unsigned int len);
|
||||||
|
asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out);
|
||||||
#ifdef CONFIG_AS_AVX2
|
#ifdef CONFIG_AS_AVX2
|
||||||
asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||||
unsigned int len);
|
unsigned int len);
|
||||||
|
@ -121,10 +122,9 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static int chacha20_simd(struct skcipher_request *req)
|
static int chacha20_simd_stream_xor(struct skcipher_request *req,
|
||||||
|
struct chacha_ctx *ctx, u8 *iv)
|
||||||
{
|
{
|
||||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
|
||||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
|
||||||
u32 *state, state_buf[16 + 2] __aligned(8);
|
u32 *state, state_buf[16 + 2] __aligned(8);
|
||||||
struct skcipher_walk walk;
|
struct skcipher_walk walk;
|
||||||
int err;
|
int err;
|
||||||
|
@ -132,14 +132,9 @@ static int chacha20_simd(struct skcipher_request *req)
|
||||||
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
|
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
|
||||||
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
|
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
|
||||||
|
|
||||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
|
||||||
return crypto_chacha_crypt(req);
|
|
||||||
|
|
||||||
err = skcipher_walk_virt(&walk, req, true);
|
err = skcipher_walk_virt(&walk, req, true);
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, walk.iv);
|
crypto_chacha_init(state, ctx, iv);
|
||||||
|
|
||||||
kernel_fpu_begin();
|
|
||||||
|
|
||||||
while (walk.nbytes > 0) {
|
while (walk.nbytes > 0) {
|
||||||
unsigned int nbytes = walk.nbytes;
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
@ -153,26 +148,85 @@ static int chacha20_simd(struct skcipher_request *req)
|
||||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha20_simd(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
int err;
|
||||||
|
|
||||||
|
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
|
||||||
|
return crypto_chacha_crypt(req);
|
||||||
|
|
||||||
|
kernel_fpu_begin();
|
||||||
|
err = chacha20_simd_stream_xor(req, ctx, req->iv);
|
||||||
|
kernel_fpu_end();
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha20_simd(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
struct chacha_ctx subctx;
|
||||||
|
u32 *state, state_buf[16 + 2] __aligned(8);
|
||||||
|
u8 real_iv[16];
|
||||||
|
int err;
|
||||||
|
|
||||||
|
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
|
||||||
|
return crypto_xchacha_crypt(req);
|
||||||
|
|
||||||
|
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
|
||||||
|
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
|
||||||
|
crypto_chacha_init(state, ctx, req->iv);
|
||||||
|
|
||||||
|
kernel_fpu_begin();
|
||||||
|
|
||||||
|
hchacha20_block_ssse3(state, subctx.key);
|
||||||
|
|
||||||
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||||
|
err = chacha20_simd_stream_xor(req, &subctx, real_iv);
|
||||||
|
|
||||||
kernel_fpu_end();
|
kernel_fpu_end();
|
||||||
|
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct skcipher_alg alg = {
|
static struct skcipher_alg algs[] = {
|
||||||
.base.cra_name = "chacha20",
|
{
|
||||||
.base.cra_driver_name = "chacha20-simd",
|
.base.cra_name = "chacha20",
|
||||||
.base.cra_priority = 300,
|
.base.cra_driver_name = "chacha20-simd",
|
||||||
.base.cra_blocksize = 1,
|
.base.cra_priority = 300,
|
||||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
.base.cra_blocksize = 1,
|
||||||
.base.cra_module = THIS_MODULE,
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
.min_keysize = CHACHA_KEY_SIZE,
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
.ivsize = CHACHA_IV_SIZE,
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
.setkey = crypto_chacha20_setkey,
|
.setkey = crypto_chacha20_setkey,
|
||||||
.encrypt = chacha20_simd,
|
.encrypt = chacha20_simd,
|
||||||
.decrypt = chacha20_simd,
|
.decrypt = chacha20_simd,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha20",
|
||||||
|
.base.cra_driver_name = "xchacha20-simd",
|
||||||
|
.base.cra_priority = 300,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = crypto_chacha20_setkey,
|
||||||
|
.encrypt = xchacha20_simd,
|
||||||
|
.decrypt = xchacha20_simd,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
static int __init chacha20_simd_mod_init(void)
|
static int __init chacha20_simd_mod_init(void)
|
||||||
|
@ -190,12 +244,12 @@ static int __init chacha20_simd_mod_init(void)
|
||||||
boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
|
boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
return crypto_register_skcipher(&alg);
|
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __exit chacha20_simd_mod_fini(void)
|
static void __exit chacha20_simd_mod_fini(void)
|
||||||
{
|
{
|
||||||
crypto_unregister_skcipher(&alg);
|
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
||||||
}
|
}
|
||||||
|
|
||||||
module_init(chacha20_simd_mod_init);
|
module_init(chacha20_simd_mod_init);
|
||||||
|
@ -206,3 +260,5 @@ MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
||||||
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
|
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
|
||||||
MODULE_ALIAS_CRYPTO("chacha20");
|
MODULE_ALIAS_CRYPTO("chacha20");
|
||||||
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20-simd");
|
||||||
|
|
|
@ -1468,19 +1468,13 @@ config CRYPTO_CHACHA20
|
||||||
in some performance-sensitive scenarios.
|
in some performance-sensitive scenarios.
|
||||||
|
|
||||||
config CRYPTO_CHACHA20_X86_64
|
config CRYPTO_CHACHA20_X86_64
|
||||||
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
|
tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
|
||||||
depends on X86 && 64BIT
|
depends on X86 && 64BIT
|
||||||
select CRYPTO_BLKCIPHER
|
select CRYPTO_BLKCIPHER
|
||||||
select CRYPTO_CHACHA20
|
select CRYPTO_CHACHA20
|
||||||
help
|
help
|
||||||
ChaCha20 cipher algorithm, RFC7539.
|
SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20
|
||||||
|
and XChaCha20 stream ciphers.
|
||||||
ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
|
|
||||||
Bernstein and further specified in RFC7539 for use in IETF protocols.
|
|
||||||
This is the x86_64 assembler implementation using SIMD instructions.
|
|
||||||
|
|
||||||
See also:
|
|
||||||
<http://cr.yp.to/chacha/chacha-20080128.pdf>
|
|
||||||
|
|
||||||
config CRYPTO_SEED
|
config CRYPTO_SEED
|
||||||
tristate "SEED cipher algorithm"
|
tristate "SEED cipher algorithm"
|
||||||
|
|
Loading…
Reference in New Issue