crypto: blowfish-x86_64 - improve x86_64 blowfish 4-way performance

This patch adds an improved F-macro for the 4-way parallel functions. With
the new F-macro, the 4-way blowfish functions see a ~15% improvement in
speed tests on AMD Phenom II (~5% on Intel Xeon E7330).

However, when used in the 1-way blowfish function, the new macro would be
~10% slower than the original, so the old F-macro is kept for the 1-way
functions. The patch cleans up the old F-macro, as it is no longer needed
in the 4-way part.

The patch also renames register macros to reduce stack usage.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Jussi Kivilinna 2011-09-23 19:50:55 +03:00 committed by Herbert Xu
parent fad8fa4782
commit e827bb09c8
1 changed files with 96 additions and 98 deletions

View File

@ -56,38 +56,32 @@
#define RT0 %rbp #define RT0 %rbp
#define RT1 %rsi #define RT1 %rsi
#define RT2 %r8
#define RT3 %r9
#define RT0d %ebp #define RT0d %ebp
#define RT1d %esi #define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d
#define RK0 %r8 #define RKEY %r10
#define RK1 %r9
#define RK2 %r10
#define RK3 %r11
#define RK0d %r8d
#define RK1d %r9d
#define RK2d %r10d
#define RK3d %r11d
#define RKEY %r12
/*********************************************************************** /***********************************************************************
* 1-way blowfish * 1-way blowfish
***********************************************************************/ ***********************************************************************/
#define F(x, k) \ #define F() \
rorq $16, x; \ rorq $16, RX0; \
movzbl x ## bh, RT0d; \ movzbl RX0bh, RT0d; \
movzbl x ## bl, RT1d; \ movzbl RX0bl, RT1d; \
rolq $16, x; \ rolq $16, RX0; \
movl s0(CTX,RT0,4), k ## d; \ movl s0(CTX,RT0,4), RT0d; \
addl s1(CTX,RT1,4), k ## d; \ addl s1(CTX,RT1,4), RT0d; \
movzbl x ## bh, RT0d; \ movzbl RX0bh, RT1d; \
movzbl x ## bl, RT1d; \ movzbl RX0bl, RT2d; \
rolq $32, x; \ rolq $32, RX0; \
xorl s2(CTX,RT0,4), k ## d; \ xorl s2(CTX,RT1,4), RT0d; \
addl s3(CTX,RT1,4), k ## d; \ addl s3(CTX,RT2,4), RT0d; \
xorq k, x; xorq RT0, RX0;
#define add_roundkey_enc(n) \ #define add_roundkey_enc(n) \
xorq p+4*(n)(CTX), RX0; xorq p+4*(n)(CTX), RX0;
@ -95,11 +89,8 @@
#define round_enc(n) \ #define round_enc(n) \
add_roundkey_enc(n); \ add_roundkey_enc(n); \
\ \
F(RX0, RK0); \ F(); \
F(RX0, RK0); F();
#define round_final_enc(n) \
xorq p+4*(n)(CTX), RX0;
#define add_roundkey_dec(n) \ #define add_roundkey_dec(n) \
movq p+4*(n-1)(CTX), RT0; \ movq p+4*(n-1)(CTX), RT0; \
@ -109,8 +100,8 @@
#define round_dec(n) \ #define round_dec(n) \
add_roundkey_dec(n); \ add_roundkey_dec(n); \
\ \
F(RX0, RK0); \ F(); \
F(RX0, RK0); \ F(); \
#define read_block() \ #define read_block() \
movq (RIO), RX0; \ movq (RIO), RX0; \
@ -130,16 +121,15 @@
.type __blowfish_enc_blk,@function; .type __blowfish_enc_blk,@function;
__blowfish_enc_blk: __blowfish_enc_blk:
// input: /* input:
// %rdi: ctx, CTX * %rdi: ctx, CTX
// %rsi: dst * %rsi: dst
// %rdx: src * %rdx: src
// %rcx: bool xor * %rcx: bool, if true: xor output
pushq %rbp; */
pushq %rbx; movq %rbp, %r11;
pushq %rsi; movq %rsi, %r10;
pushq %rcx;
movq %rdx, RIO; movq %rdx, RIO;
read_block(); read_block();
@ -154,38 +144,31 @@ __blowfish_enc_blk:
round_enc(14); round_enc(14);
add_roundkey_enc(16); add_roundkey_enc(16);
popq %rbp; movq %r11, %rbp;
popq RIO;
test %bpl, %bpl; movq %r10, RIO;
test %cl, %cl;
jnz __enc_xor; jnz __enc_xor;
write_block(); write_block();
__enc_ret:
popq %rbx;
popq %rbp;
ret; ret;
__enc_xor: __enc_xor:
xor_block(); xor_block();
ret;
jmp __enc_ret;
.align 8 .align 8
.global blowfish_dec_blk .global blowfish_dec_blk
.type blowfish_dec_blk,@function; .type blowfish_dec_blk,@function;
blowfish_dec_blk: blowfish_dec_blk:
// input: /* input:
// %rdi: ctx, CTX * %rdi: ctx, CTX
// %rsi: dst * %rsi: dst
// %rdx: src * %rdx: src
pushq %rbp; */
pushq %rbx; movq %rbp, %r11;
pushq %rsi; movq %rsi, %r10;
movq %rdx, RIO; movq %rdx, RIO;
read_block(); read_block();
@ -200,17 +183,33 @@ blowfish_dec_blk:
round_dec(3); round_dec(3);
add_roundkey_dec(1); add_roundkey_dec(1);
popq RIO; movq %r10, RIO;
write_block(); write_block();
popq %rbx; movq %r11, %rbp;
popq %rbp;
ret; ret;
/********************************************************************** /**********************************************************************
4-way blowfish, four blocks parallel 4-way blowfish, four blocks parallel
**********************************************************************/ **********************************************************************/
/* F() for 4-way. Slower when used alone/1-way, but faster when used
* parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
*
* x is a register-name prefix: "x ## bl" and "x ## bh" paste it into the
* names of its two low byte registers.
*
* Steps:
*  - bits 8..15 of x -> RT1d, bits 0..7 of x -> RT3d
*  - rotate x right by 16, then the original bits 24..31 -> RT0d and the
*    original bits 16..23 -> RT2d
*  - rotate x right by another 16, so x ends up rotated by 32 in total
*    (its two 32-bit halves are swapped)
*  - RT0d = ((s0[RT0] + s1[RT2]) ^ s2[RT1]) + s3[RT3], where each table is
*    addressed relative to CTX with 4-byte entries
*  - xor RT0 into x; the 32-bit movl into RT0d zero-extended RT0, so only
*    the low 32 bits of x are changed by the xorq
*
* Clobbers RT0-RT3 and the flags.
*/
#define F4(x) \
movzbl x ## bh, RT1d; \
movzbl x ## bl, RT3d; \
rorq $16, x; \
movzbl x ## bh, RT0d; \
movzbl x ## bl, RT2d; \
rorq $16, x; \
movl s0(CTX,RT0,4), RT0d; \
addl s1(CTX,RT2,4), RT0d; \
xorl s2(CTX,RT1,4), RT0d; \
addl s3(CTX,RT3,4), RT0d; \
xorq RT0, x;
#define add_preloaded_roundkey4() \ #define add_preloaded_roundkey4() \
xorq RKEY, RX0; \ xorq RKEY, RX0; \
xorq RKEY, RX1; \ xorq RKEY, RX1; \
@ -227,15 +226,15 @@ blowfish_dec_blk:
#define round_enc4(n) \ #define round_enc4(n) \
add_roundkey_enc4(n); \ add_roundkey_enc4(n); \
\ \
F(RX0, RK0); \ F4(RX0); \
F(RX1, RK1); \ F4(RX1); \
F(RX2, RK2); \ F4(RX2); \
F(RX3, RK3); \ F4(RX3); \
\ \
F(RX0, RK0); \ F4(RX0); \
F(RX1, RK1); \ F4(RX1); \
F(RX2, RK2); \ F4(RX2); \
F(RX3, RK3); F4(RX3);
#define preload_roundkey_dec(n) \ #define preload_roundkey_dec(n) \
movq p+4*((n)-1)(CTX), RKEY; \ movq p+4*((n)-1)(CTX), RKEY; \
@ -248,15 +247,15 @@ blowfish_dec_blk:
#define round_dec4(n) \ #define round_dec4(n) \
add_roundkey_dec4(n); \ add_roundkey_dec4(n); \
\ \
F(RX0, RK0); \ F4(RX0); \
F(RX1, RK1); \ F4(RX1); \
F(RX2, RK2); \ F4(RX2); \
F(RX3, RK3); \ F4(RX3); \
\ \
F(RX0, RK0); \ F4(RX0); \
F(RX1, RK1); \ F4(RX1); \
F(RX2, RK2); \ F4(RX2); \
F(RX3, RK3); F4(RX3);
#define read_block4() \ #define read_block4() \
movq (RIO), RX0; \ movq (RIO), RX0; \
@ -306,18 +305,19 @@ blowfish_dec_blk:
.type __blowfish_enc_blk_4way,@function; .type __blowfish_enc_blk_4way,@function;
__blowfish_enc_blk_4way: __blowfish_enc_blk_4way:
// input: /* input:
// %rdi: ctx, CTX * %rdi: ctx, CTX
// %rsi: dst * %rsi: dst
// %rdx: src * %rdx: src
// %rcx: bool xor * %rcx: bool, if true: xor output
*/
pushq %rbp; pushq %rbp;
pushq %rbx; pushq %rbx;
pushq RKEY; pushq %rcx;
preload_roundkey_enc(0); preload_roundkey_enc(0);
pushq %rsi; movq %rsi, %r11;
pushq %rcx;
movq %rdx, RIO; movq %rdx, RIO;
read_block4(); read_block4();
@ -333,40 +333,39 @@ __blowfish_enc_blk_4way:
add_preloaded_roundkey4(); add_preloaded_roundkey4();
popq %rbp; popq %rbp;
popq RIO; movq %r11, RIO;
test %bpl, %bpl; test %bpl, %bpl;
jnz __enc_xor4; jnz __enc_xor4;
write_block4(); write_block4();
__enc_ret4:
popq RKEY;
popq %rbx; popq %rbx;
popq %rbp; popq %rbp;
ret; ret;
__enc_xor4: __enc_xor4:
xor_block4(); xor_block4();
jmp __enc_ret4; popq %rbx;
popq %rbp;
ret;
.align 8 .align 8
.global blowfish_dec_blk_4way .global blowfish_dec_blk_4way
.type blowfish_dec_blk_4way,@function; .type blowfish_dec_blk_4way,@function;
blowfish_dec_blk_4way: blowfish_dec_blk_4way:
// input: /* input:
// %rdi: ctx, CTX * %rdi: ctx, CTX
// %rsi: dst * %rsi: dst
// %rdx: src * %rdx: src
*/
pushq %rbp; pushq %rbp;
pushq %rbx; pushq %rbx;
pushq RKEY;
preload_roundkey_dec(17); preload_roundkey_dec(17);
pushq %rsi; movq %rsi, %r11;
movq %rdx, RIO; movq %rdx, RIO;
read_block4(); read_block4();
@ -381,10 +380,9 @@ blowfish_dec_blk_4way:
round_dec4(3); round_dec4(3);
add_preloaded_roundkey4(); add_preloaded_roundkey4();
popq RIO; movq %r11, RIO;
write_block4(); write_block4();
popq RKEY;
popq %rbx; popq %rbx;
popq %rbp; popq %rbp;