crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant

Add a length argument to the single block function for SSSE3, so the
block function may XOR only a partial length of the full block. Given
that the setup code is rather cheap, the function does not process more
than one block; this allows us to keep the block function selection in
the C glue code.

The required branching does not negatively affect performance for full
block sizes. The partial XORing uses a simple "rep movsb" to copy the
trailing bytes into a stack buffer before, and back out of it after, doing
the XOR in SSE. This is rather efficient on modern processors; movsw can be
slightly faster, but the additional complexity is probably not worth it.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Martin Willi 2018-11-11 10:36:25 +01:00 committed by Herbert Xu
parent 05ba88468b
commit e4e72063d3
2 changed files with 62 additions and 21 deletions

View File

@ -25,12 +25,13 @@ CTRINC: .octa 0x00000003000000020000000100000000
ENTRY(chacha20_block_xor_ssse3) ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s # %rdi: Input state matrix, s
# %rsi: 1 data block output, o # %rsi: up to 1 data block output, o
# %rdx: 1 data block input, i # %rdx: up to 1 data block input, i
# %rcx: input/output length in bytes
# This function encrypts one ChaCha20 block by loading the state matrix # This function encrypts one ChaCha20 block by loading the state matrix
# in four SSE registers. It performs matrix operation on four words in # in four SSE registers. It performs matrix operation on four words in
# parallel, but requireds shuffling to rearrange the words after each # parallel, but requires shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better # round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR. # traditional shift+OR.
@ -48,6 +49,7 @@ ENTRY(chacha20_block_xor_ssse3)
movdqa ROT8(%rip),%xmm4 movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5 movdqa ROT16(%rip),%xmm5
mov %rcx,%rax
mov $10,%ecx mov $10,%ecx
.Ldoubleround: .Ldoubleround:
@ -122,27 +124,69 @@ ENTRY(chacha20_block_xor_ssse3)
jnz .Ldoubleround jnz .Ldoubleround
# o0 = i0 ^ (x0 + s0) # o0 = i0 ^ (x0 + s0)
movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0 paddd %xmm8,%xmm0
cmp $0x10,%rax
jl .Lxorpart
movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0 pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi) movdqu %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1) # o1 = i1 ^ (x1 + s1)
movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1 paddd %xmm9,%xmm1
pxor %xmm5,%xmm1 movdqa %xmm1,%xmm0
movdqu %xmm1,0x10(%rsi) cmp $0x20,%rax
jl .Lxorpart
movdqu 0x10(%rdx),%xmm0
pxor %xmm1,%xmm0
movdqu %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2) # o2 = i2 ^ (x2 + s2)
movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2 paddd %xmm10,%xmm2
pxor %xmm6,%xmm2 movdqa %xmm2,%xmm0
movdqu %xmm2,0x20(%rsi) cmp $0x30,%rax
jl .Lxorpart
movdqu 0x20(%rdx),%xmm0
pxor %xmm2,%xmm0
movdqu %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3) # o3 = i3 ^ (x3 + s3)
movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3 paddd %xmm11,%xmm3
pxor %xmm7,%xmm3 movdqa %xmm3,%xmm0
movdqu %xmm3,0x30(%rsi) cmp $0x40,%rax
jl .Lxorpart
movdqu 0x30(%rdx),%xmm0
pxor %xmm3,%xmm0
movdqu %xmm0,0x30(%rsi)
.Ldone:
ret ret
.Lxorpart:
# xor remaining bytes from partial register into output
#
# Handles the trailing (length % 16) bytes of a partial block.
# On entry:
#   %rax  = input/output length in bytes
#   %rdx  = input pointer, %rsi = output pointer
#   %xmm0 = keystream (x + s) for the 16-byte chunk containing the tail
# The tail is bounced through a temporary 16-byte stack buffer so the
# XOR can be done with a full-width SSE operation without touching
# memory beyond the caller's buffers.
# Scratch: %r9, %r10, %r11, %rcx, %rsi, %rdi (and %rsp, restored below).
# %r9 = number of tail bytes (length % 16)
mov %rax,%r9
and $0x0f,%r9
# length is a multiple of 16: no tail to process
jz .Ldone
# %rax = byte offset of the partial chunk (length rounded down to 16)
and $~0x0f,%rax
# keep the output pointer; %rsi is needed as the "rep movsb" source
mov %rsi,%r11
# remember the incoming stack pointer (biased by 8) in %r10, then
# reserve 16 bytes and align %rsp down so the movdqa/pxor memory
# operands below are suitably aligned
lea 8(%rsp),%r10
sub $0x10,%rsp
and $~31,%rsp
# copy the %r9 tail bytes of input (input + offset) into the buffer
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
# XOR the buffered input with the keystream and store the result back;
# buffer bytes past %r9 were not written by the copy above, but only
# the first %r9 result bytes are copied out below
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
# copy the %r9 result bytes from the buffer to output + offset
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
# undo the bias applied when saving: %rsp = original stack pointer
lea -8(%r10),%rsp
jmp .Ldone
ENDPROC(chacha20_block_xor_ssse3) ENDPROC(chacha20_block_xor_ssse3)
ENTRY(chacha20_4block_xor_ssse3) ENTRY(chacha20_4block_xor_ssse3)

View File

@ -19,7 +19,8 @@
#define CHACHA20_STATE_ALIGN 16 #define CHACHA20_STATE_ALIGN 16
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
#ifdef CONFIG_AS_AVX2 #ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
@ -29,8 +30,6 @@ static bool chacha20_use_avx2;
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes) unsigned int bytes)
{ {
u8 buf[CHACHA20_BLOCK_SIZE];
#ifdef CONFIG_AS_AVX2 #ifdef CONFIG_AS_AVX2
if (chacha20_use_avx2) { if (chacha20_use_avx2) {
while (bytes >= CHACHA20_BLOCK_SIZE * 8) { while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
@ -50,16 +49,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
state[12] += 4; state[12] += 4;
} }
while (bytes >= CHACHA20_BLOCK_SIZE) { while (bytes >= CHACHA20_BLOCK_SIZE) {
chacha20_block_xor_ssse3(state, dst, src); chacha20_block_xor_ssse3(state, dst, src, bytes);
bytes -= CHACHA20_BLOCK_SIZE; bytes -= CHACHA20_BLOCK_SIZE;
src += CHACHA20_BLOCK_SIZE; src += CHACHA20_BLOCK_SIZE;
dst += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE;
state[12]++; state[12]++;
} }
if (bytes) { if (bytes) {
memcpy(buf, src, bytes); chacha20_block_xor_ssse3(state, dst, src, bytes);
chacha20_block_xor_ssse3(state, buf, buf);
memcpy(dst, buf, bytes);
} }
} }