2014-05-15 for-3.16 pull request
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.14 (GNU/Linux)

iQF8BAABCgBmBQJTdT+DXxSAAAAAAC4AKGlzc3Vlci1mcHJAbm90YXRpb25zLm9w
ZW5wZ3AuZmlmdGhob3JzZW1hbi5uZXQ5Q0QyQTBEQTZBRDhGNzMzMDE3NUUyQkJD
MjM3MjA3RTk1NzRGQTdEAAoJEMI3IH6VdPp9jL8H/3kzXh+5rZycb5r48E6Cic/a
Gl0NmRjGtbsxLZcvd8NR3cDol1c9mEAelFrwSA3ar0W91hDf9gsEgxYSBcGKfX/b
sxqzhFoArMDitvu8QQ38SMIlXaGokW3sevj4B93ljw9DqhFR/BvJctmVfyNuPLpp
fFZh0JRyHnkMkXKMJKtYyiXRgfGiJ90rGHPRZLJW7zCIk0/oYd4LESpnqWC+cGKg
w8WIe0Vu4CsiQffkWRmMMJmcJIVayJXAtMHOyBNCKMnYziZSqs+JIUwhbPCxci51
vTxWOFs/CAeZp8mrlHR2TD3dHtzP8c/NSL7E3k4QA0tYReA8T3XHwhRmqtHCp7E=
=sllM
-----END PGP SIGNATURE-----

Merge tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm into upstream

FPSIMD register bank context switching and crypto algorithms
optimisations for arm64 from Ard Biesheuvel.

* tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm:
  arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions
  arm64: pull in <asm/simd.h> from asm-generic
  arm64/crypto: AES in CCM mode using ARMv8 Crypto Extensions
  arm64/crypto: AES using ARMv8 Crypto Extensions
  arm64/crypto: GHASH secure hash using ARMv8 Crypto Extensions
  arm64/crypto: SHA-224/SHA-256 using ARMv8 Crypto Extensions
  arm64/crypto: SHA-1 using ARMv8 Crypto Extensions
  arm64: add support for kernel mode NEON in interrupt context
  arm64: defer reloading a task's FPSIMD state to userland resume
  arm64: add abstractions for FPSIMD state manipulation
  asm-generic: allow generic unaligned access if the arch supports it

Conflicts:
	arch/arm64/include/asm/thread_info.h
commit cf5c95db57
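For context on how these implementations get picked at run time: each driver registers with a priority (the Crypto Extensions code uses 300 and the plain NEON fallback 200, as set in aes-glue.c below), so a request for the generic name "aes" resolves to the fastest driver available on the CPU. A minimal sketch of inspecting that choice from a throwaway test module; the module itself is illustrative only and not part of this merge:

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int __init which_aes_init(void)
{
        struct crypto_cipher *tfm;

        /* Ask for "aes"; the crypto core picks the highest-priority driver. */
        tfm = crypto_alloc_cipher("aes", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* e.g. "aes-ce" on a CPU that implements the AES instructions */
        pr_info("aes is backed by %s\n",
                crypto_tfm_alg_driver_name(crypto_cipher_tfm(tfm)));

        crypto_free_cipher(tfm);
        return 0;
}

static void __exit which_aes_exit(void)
{
}

module_init(which_aes_init);
module_exit(which_aes_exit);
MODULE_LICENSE("GPL");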
arch/arm64/Kconfig
@@ -343,5 +343,8 @@ source "arch/arm64/Kconfig.debug"
source "security/Kconfig"

source "crypto/Kconfig"
if CRYPTO
source "arch/arm64/crypto/Kconfig"
endif

source "lib/Kconfig"
arch/arm64/Makefile
@@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS
core-y += arch/arm64/kernel/ arch/arm64/mm/
core-$(CONFIG_KVM) += arch/arm64/kvm/
core-$(CONFIG_XEN) += arch/arm64/xen/
core-$(CONFIG_CRYPTO) += arch/arm64/crypto/
libs-y := arch/arm64/lib/ $(libs-y)
libs-y += $(LIBGCC)
arch/arm64/crypto/Kconfig
@@ -0,0 +1,53 @@

menuconfig ARM64_CRYPTO
        bool "ARM64 Accelerated Cryptographic Algorithms"
        depends on ARM64
        help
          Say Y here to choose from a selection of cryptographic algorithms
          implemented using ARM64 specific CPU features or instructions.

if ARM64_CRYPTO

config CRYPTO_SHA1_ARM64_CE
        tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
        depends on ARM64 && KERNEL_MODE_NEON
        select CRYPTO_HASH

config CRYPTO_SHA2_ARM64_CE
        tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
        depends on ARM64 && KERNEL_MODE_NEON
        select CRYPTO_HASH

config CRYPTO_GHASH_ARM64_CE
        tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
        depends on ARM64 && KERNEL_MODE_NEON
        select CRYPTO_HASH

config CRYPTO_AES_ARM64_CE
        tristate "AES core cipher using ARMv8 Crypto Extensions"
        depends on ARM64 && KERNEL_MODE_NEON
        select CRYPTO_ALGAPI
        select CRYPTO_AES

config CRYPTO_AES_ARM64_CE_CCM
        tristate "AES in CCM mode using ARMv8 Crypto Extensions"
        depends on ARM64 && KERNEL_MODE_NEON
        select CRYPTO_ALGAPI
        select CRYPTO_AES
        select CRYPTO_AEAD

config CRYPTO_AES_ARM64_CE_BLK
        tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
        depends on ARM64 && KERNEL_MODE_NEON
        select CRYPTO_BLKCIPHER
        select CRYPTO_AES
        select CRYPTO_ABLK_HELPER

config CRYPTO_AES_ARM64_NEON_BLK
        tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
        depends on ARM64 && KERNEL_MODE_NEON
        select CRYPTO_BLKCIPHER
        select CRYPTO_AES
        select CRYPTO_ABLK_HELPER

endif
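As a rough illustration (not part of the commit), a kernel configuration that builds all of the above as modules would contain something like the following fragment; ARM64_CRYPTO itself is a bool, the individual drivers are tristate:

CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_SHA1_ARM64_CE=m
CONFIG_CRYPTO_SHA2_ARM64_CE=m
CONFIG_CRYPTO_GHASH_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE_CCM=m
CONFIG_CRYPTO_AES_ARM64_CE_BLK=m
CONFIG_CRYPTO_AES_ARM64_NEON_BLK=m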
arch/arm64/crypto/Makefile
@@ -0,0 +1,38 @@
#
# linux/arch/arm64/crypto/Makefile
#
# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#

obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o

obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o

obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o

obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto

obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o

obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
aes-ce-blk-y := aes-glue-ce.o aes-ce.o

obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
aes-neon-blk-y := aes-glue-neon.o aes-neon.o

AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE
AFLAGS_aes-neon.o := -DINTERLEAVE=4

CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS

$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
        $(call if_changed_dep,cc_o_c)

arch/arm64/crypto/aes-ce-ccm-core.S
@@ -0,0 +1,222 @@
/*
|
||||
* aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
|
||||
*
|
||||
* Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.text
|
||||
.arch armv8-a+crypto
|
||||
|
||||
/*
|
||||
* void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
|
||||
* u32 *macp, u8 const rk[], u32 rounds);
|
||||
*/
|
||||
ENTRY(ce_aes_ccm_auth_data)
|
||||
ldr w8, [x3] /* leftover from prev round? */
|
||||
ld1 {v0.2d}, [x0] /* load mac */
|
||||
cbz w8, 1f
|
||||
sub w8, w8, #16
|
||||
eor v1.16b, v1.16b, v1.16b
|
||||
0: ldrb w7, [x1], #1 /* get 1 byte of input */
|
||||
subs w2, w2, #1
|
||||
add w8, w8, #1
|
||||
ins v1.b[0], w7
|
||||
ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
|
||||
beq 8f /* out of input? */
|
||||
cbnz w8, 0b
|
||||
eor v0.16b, v0.16b, v1.16b
|
||||
1: ld1 {v3.2d}, [x4] /* load first round key */
|
||||
prfm pldl1strm, [x1]
|
||||
cmp w5, #12 /* which key size? */
|
||||
add x6, x4, #16
|
||||
sub w7, w5, #2 /* modified # of rounds */
|
||||
bmi 2f
|
||||
bne 5f
|
||||
mov v5.16b, v3.16b
|
||||
b 4f
|
||||
2: mov v4.16b, v3.16b
|
||||
ld1 {v5.2d}, [x6], #16 /* load 2nd round key */
|
||||
3: aese v0.16b, v4.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
4: ld1 {v3.2d}, [x6], #16 /* load next round key */
|
||||
aese v0.16b, v5.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
5: ld1 {v4.2d}, [x6], #16 /* load next round key */
|
||||
subs w7, w7, #3
|
||||
aese v0.16b, v3.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
ld1 {v5.2d}, [x6], #16 /* load next round key */
|
||||
bpl 3b
|
||||
aese v0.16b, v4.16b
|
||||
subs w2, w2, #16 /* last data? */
|
||||
eor v0.16b, v0.16b, v5.16b /* final round */
|
||||
bmi 6f
|
||||
ld1 {v1.16b}, [x1], #16 /* load next input block */
|
||||
eor v0.16b, v0.16b, v1.16b /* xor with mac */
|
||||
bne 1b
|
||||
6: st1 {v0.2d}, [x0] /* store mac */
|
||||
beq 10f
|
||||
adds w2, w2, #16
|
||||
beq 10f
|
||||
mov w8, w2
|
||||
7: ldrb w7, [x1], #1
|
||||
umov w6, v0.b[0]
|
||||
eor w6, w6, w7
|
||||
strb w6, [x0], #1
|
||||
subs w2, w2, #1
|
||||
beq 10f
|
||||
ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
|
||||
b 7b
|
||||
8: mov w7, w8
|
||||
add w8, w8, #16
|
||||
9: ext v1.16b, v1.16b, v1.16b, #1
|
||||
adds w7, w7, #1
|
||||
bne 9b
|
||||
eor v0.16b, v0.16b, v1.16b
|
||||
st1 {v0.2d}, [x0]
|
||||
10: str w8, [x3]
|
||||
ret
|
||||
ENDPROC(ce_aes_ccm_auth_data)
|
||||
|
||||
/*
|
||||
* void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
|
||||
* u32 rounds);
|
||||
*/
|
||||
ENTRY(ce_aes_ccm_final)
|
||||
ld1 {v3.2d}, [x2], #16 /* load first round key */
|
||||
ld1 {v0.2d}, [x0] /* load mac */
|
||||
cmp w3, #12 /* which key size? */
|
||||
sub w3, w3, #2 /* modified # of rounds */
|
||||
ld1 {v1.2d}, [x1] /* load 1st ctriv */
|
||||
bmi 0f
|
||||
bne 3f
|
||||
mov v5.16b, v3.16b
|
||||
b 2f
|
||||
0: mov v4.16b, v3.16b
|
||||
1: ld1 {v5.2d}, [x2], #16 /* load next round key */
|
||||
aese v0.16b, v4.16b
|
||||
aese v1.16b, v4.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
aesmc v1.16b, v1.16b
|
||||
2: ld1 {v3.2d}, [x2], #16 /* load next round key */
|
||||
aese v0.16b, v5.16b
|
||||
aese v1.16b, v5.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
aesmc v1.16b, v1.16b
|
||||
3: ld1 {v4.2d}, [x2], #16 /* load next round key */
|
||||
subs w3, w3, #3
|
||||
aese v0.16b, v3.16b
|
||||
aese v1.16b, v3.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
aesmc v1.16b, v1.16b
|
||||
bpl 1b
|
||||
aese v0.16b, v4.16b
|
||||
aese v1.16b, v4.16b
|
||||
/* final round key cancels out */
|
||||
eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
|
||||
st1 {v0.2d}, [x0] /* store result */
|
||||
ret
|
||||
ENDPROC(ce_aes_ccm_final)
|
||||
|
||||
.macro aes_ccm_do_crypt,enc
|
||||
ldr x8, [x6, #8] /* load lower ctr */
|
||||
ld1 {v0.2d}, [x5] /* load mac */
|
||||
rev x8, x8 /* keep swabbed ctr in reg */
|
||||
0: /* outer loop */
|
||||
ld1 {v1.1d}, [x6] /* load upper ctr */
|
||||
prfm pldl1strm, [x1]
|
||||
add x8, x8, #1
|
||||
rev x9, x8
|
||||
cmp w4, #12 /* which key size? */
|
||||
sub w7, w4, #2 /* get modified # of rounds */
|
||||
ins v1.d[1], x9 /* no carry in lower ctr */
|
||||
ld1 {v3.2d}, [x3] /* load first round key */
|
||||
add x10, x3, #16
|
||||
bmi 1f
|
||||
bne 4f
|
||||
mov v5.16b, v3.16b
|
||||
b 3f
|
||||
1: mov v4.16b, v3.16b
|
||||
ld1 {v5.2d}, [x10], #16 /* load 2nd round key */
|
||||
2: /* inner loop: 3 rounds, 2x interleaved */
|
||||
aese v0.16b, v4.16b
|
||||
aese v1.16b, v4.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
aesmc v1.16b, v1.16b
|
||||
3: ld1 {v3.2d}, [x10], #16 /* load next round key */
|
||||
aese v0.16b, v5.16b
|
||||
aese v1.16b, v5.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
aesmc v1.16b, v1.16b
|
||||
4: ld1 {v4.2d}, [x10], #16 /* load next round key */
|
||||
subs w7, w7, #3
|
||||
aese v0.16b, v3.16b
|
||||
aese v1.16b, v3.16b
|
||||
aesmc v0.16b, v0.16b
|
||||
aesmc v1.16b, v1.16b
|
||||
ld1 {v5.2d}, [x10], #16 /* load next round key */
|
||||
bpl 2b
|
||||
aese v0.16b, v4.16b
|
||||
aese v1.16b, v4.16b
|
||||
subs w2, w2, #16
|
||||
bmi 6f /* partial block? */
|
||||
ld1 {v2.16b}, [x1], #16 /* load next input block */
|
||||
.if \enc == 1
|
||||
eor v2.16b, v2.16b, v5.16b /* final round enc+mac */
|
||||
eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */
|
||||
.else
|
||||
eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */
|
||||
eor v1.16b, v2.16b, v5.16b /* final round enc */
|
||||
.endif
|
||||
eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */
|
||||
st1 {v1.16b}, [x0], #16 /* write output block */
|
||||
bne 0b
|
||||
rev x8, x8
|
||||
st1 {v0.2d}, [x5] /* store mac */
|
||||
str x8, [x6, #8] /* store lsb end of ctr (BE) */
|
||||
5: ret
|
||||
|
||||
6: eor v0.16b, v0.16b, v5.16b /* final round mac */
|
||||
eor v1.16b, v1.16b, v5.16b /* final round enc */
|
||||
st1 {v0.2d}, [x5] /* store mac */
|
||||
add w2, w2, #16 /* process partial tail block */
|
||||
7: ldrb w9, [x1], #1 /* get 1 byte of input */
|
||||
umov w6, v1.b[0] /* get top crypted ctr byte */
|
||||
umov w7, v0.b[0] /* get top mac byte */
|
||||
.if \enc == 1
|
||||
eor w7, w7, w9
|
||||
eor w9, w9, w6
|
||||
.else
|
||||
eor w9, w9, w6
|
||||
eor w7, w7, w9
|
||||
.endif
|
||||
strb w9, [x0], #1 /* store out byte */
|
||||
strb w7, [x5], #1 /* store mac byte */
|
||||
subs w2, w2, #1
|
||||
beq 5b
|
||||
ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */
|
||||
ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */
|
||||
b 7b
|
||||
.endm
|
||||
|
||||
/*
|
||||
* void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
|
||||
* u8 const rk[], u32 rounds, u8 mac[],
|
||||
* u8 ctr[]);
|
||||
* void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
|
||||
* u8 const rk[], u32 rounds, u8 mac[],
|
||||
* u8 ctr[]);
|
||||
*/
|
||||
ENTRY(ce_aes_ccm_encrypt)
|
||||
aes_ccm_do_crypt 1
|
||||
ENDPROC(ce_aes_ccm_encrypt)
|
||||
|
||||
ENTRY(ce_aes_ccm_decrypt)
|
||||
aes_ccm_do_crypt 0
|
||||
ENDPROC(ce_aes_ccm_decrypt)
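One detail worth calling out in ce_aes_ccm_auth_data() and aes_ccm_do_crypt above: the low 64 bits of the CTR block are kept byte-swapped in a general-purpose register (the rev/ins pairs), so the counter can be bumped with an ordinary add and increments stay within the low 64 bits, away from the flags/nonce half. A rough C equivalent of that bookkeeping, using the kernel byte-order helpers (illustrative only, not part of the commit):

#include <asm/unaligned.h>
#include <linux/types.h>

/* ctr[] is the 16-byte counter block; its low 8 bytes are big-endian. */
static void ccm_ctr_inc(u8 ctr[16])
{
        u64 n = get_unaligned_be64(ctr + 8);    /* the "rev" in the assembly */

        put_unaligned_be64(n + 1, ctr + 8);     /* add, swab back, "ins" */
}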

arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -0,0 +1,297 @@
/*
|
||||
* aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions
|
||||
*
|
||||
* Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/aes.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/scatterwalk.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
static int num_rounds(struct crypto_aes_ctx *ctx)
|
||||
{
|
||||
/*
|
||||
* # of rounds specified by AES:
|
||||
* 128 bit key 10 rounds
|
||||
* 192 bit key 12 rounds
|
||||
* 256 bit key 14 rounds
|
||||
* => n byte key => 6 + (n/4) rounds
|
||||
*/
|
||||
return 6 + ctx->key_length / 4;
|
||||
}
|
||||
|
||||
asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
|
||||
u32 *macp, u32 const rk[], u32 rounds);
|
||||
|
||||
asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
|
||||
u32 const rk[], u32 rounds, u8 mac[],
|
||||
u8 ctr[]);
|
||||
|
||||
asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
|
||||
u32 const rk[], u32 rounds, u8 mac[],
|
||||
u8 ctr[]);
|
||||
|
||||
asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
|
||||
u32 rounds);
|
||||
|
||||
static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm);
|
||||
int ret;
|
||||
|
||||
ret = crypto_aes_expand_key(ctx, in_key, key_len);
|
||||
if (!ret)
|
||||
return 0;
|
||||
|
||||
tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
|
||||
{
|
||||
if ((authsize & 1) || authsize < 4)
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
|
||||
{
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
__be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8];
|
||||
u32 l = req->iv[0] + 1;
|
||||
|
||||
/* verify that CCM dimension 'L' is set correctly in the IV */
|
||||
if (l < 2 || l > 8)
|
||||
return -EINVAL;
|
||||
|
||||
/* verify that msglen can in fact be represented in L bytes */
|
||||
if (l < 4 && msglen >> (8 * l))
|
||||
return -EOVERFLOW;
|
||||
|
||||
/*
|
||||
* Even if the CCM spec allows L values of up to 8, the Linux cryptoapi
|
||||
* uses a u32 type to represent msglen so the top 4 bytes are always 0.
|
||||
*/
|
||||
n[0] = 0;
|
||||
n[1] = cpu_to_be32(msglen);
|
||||
|
||||
memcpy(maciv, req->iv, AES_BLOCK_SIZE - l);
|
||||
|
||||
/*
|
||||
* Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C)
|
||||
* - bits 0..2 : max # of bytes required to represent msglen, minus 1
|
||||
* (already set by caller)
|
||||
* - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc)
|
||||
* - bit 6 : indicates presence of authenticate-only data
|
||||
*/
|
||||
maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2;
|
||||
if (req->assoclen)
|
||||
maciv[0] |= 0x40;
|
||||
|
||||
memset(&req->iv[AES_BLOCK_SIZE - l], 0, l);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
|
||||
{
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
|
||||
struct __packed { __be16 l; __be32 h; u16 len; } ltag;
|
||||
struct scatter_walk walk;
|
||||
u32 len = req->assoclen;
|
||||
u32 macp = 0;
|
||||
|
||||
/* prepend the AAD with a length tag */
|
||||
if (len < 0xff00) {
|
||||
ltag.l = cpu_to_be16(len);
|
||||
ltag.len = 2;
|
||||
} else {
|
||||
ltag.l = cpu_to_be16(0xfffe);
|
||||
put_unaligned_be32(len, &ltag.h);
|
||||
ltag.len = 6;
|
||||
}
|
||||
|
||||
ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc,
|
||||
num_rounds(ctx));
|
||||
scatterwalk_start(&walk, req->assoc);
|
||||
|
||||
do {
|
||||
u32 n = scatterwalk_clamp(&walk, len);
|
||||
u8 *p;
|
||||
|
||||
if (!n) {
|
||||
scatterwalk_start(&walk, sg_next(walk.sg));
|
||||
n = scatterwalk_clamp(&walk, len);
|
||||
}
|
||||
p = scatterwalk_map(&walk);
|
||||
ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc,
|
||||
num_rounds(ctx));
|
||||
len -= n;
|
||||
|
||||
scatterwalk_unmap(p);
|
||||
scatterwalk_advance(&walk, n);
|
||||
scatterwalk_done(&walk, 0, len);
|
||||
} while (len);
|
||||
}
|
||||
|
||||
static int ccm_encrypt(struct aead_request *req)
|
||||
{
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
|
||||
struct blkcipher_desc desc = { .info = req->iv };
|
||||
struct blkcipher_walk walk;
|
||||
u8 __aligned(8) mac[AES_BLOCK_SIZE];
|
||||
u8 buf[AES_BLOCK_SIZE];
|
||||
u32 len = req->cryptlen;
|
||||
int err;
|
||||
|
||||
err = ccm_init_mac(req, mac, len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
kernel_neon_begin_partial(6);
|
||||
|
||||
if (req->assoclen)
|
||||
ccm_calculate_auth_mac(req, mac);
|
||||
|
||||
/* preserve the original iv for the final round */
|
||||
memcpy(buf, req->iv, AES_BLOCK_SIZE);
|
||||
|
||||
blkcipher_walk_init(&walk, req->dst, req->src, len);
|
||||
err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
|
||||
AES_BLOCK_SIZE);
|
||||
|
||||
while (walk.nbytes) {
|
||||
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
|
||||
|
||||
if (walk.nbytes == len)
|
||||
tail = 0;
|
||||
|
||||
ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
walk.nbytes - tail, ctx->key_enc,
|
||||
num_rounds(ctx), mac, walk.iv);
|
||||
|
||||
len -= walk.nbytes - tail;
|
||||
err = blkcipher_walk_done(&desc, &walk, tail);
|
||||
}
|
||||
if (!err)
|
||||
ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
|
||||
|
||||
kernel_neon_end();
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/* copy authtag to end of dst */
|
||||
scatterwalk_map_and_copy(mac, req->dst, req->cryptlen,
|
||||
crypto_aead_authsize(aead), 1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ccm_decrypt(struct aead_request *req)
|
||||
{
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
|
||||
unsigned int authsize = crypto_aead_authsize(aead);
|
||||
struct blkcipher_desc desc = { .info = req->iv };
|
||||
struct blkcipher_walk walk;
|
||||
u8 __aligned(8) mac[AES_BLOCK_SIZE];
|
||||
u8 buf[AES_BLOCK_SIZE];
|
||||
u32 len = req->cryptlen - authsize;
|
||||
int err;
|
||||
|
||||
err = ccm_init_mac(req, mac, len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
kernel_neon_begin_partial(6);
|
||||
|
||||
if (req->assoclen)
|
||||
ccm_calculate_auth_mac(req, mac);
|
||||
|
||||
/* preserve the original iv for the final round */
|
||||
memcpy(buf, req->iv, AES_BLOCK_SIZE);
|
||||
|
||||
blkcipher_walk_init(&walk, req->dst, req->src, len);
|
||||
err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
|
||||
AES_BLOCK_SIZE);
|
||||
|
||||
while (walk.nbytes) {
|
||||
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
|
||||
|
||||
if (walk.nbytes == len)
|
||||
tail = 0;
|
||||
|
||||
ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
walk.nbytes - tail, ctx->key_enc,
|
||||
num_rounds(ctx), mac, walk.iv);
|
||||
|
||||
len -= walk.nbytes - tail;
|
||||
err = blkcipher_walk_done(&desc, &walk, tail);
|
||||
}
|
||||
if (!err)
|
||||
ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
|
||||
|
||||
kernel_neon_end();
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/* compare calculated auth tag with the stored one */
|
||||
scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize,
|
||||
authsize, 0);
|
||||
|
||||
if (memcmp(mac, buf, authsize))
|
||||
return -EBADMSG;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct crypto_alg ccm_aes_alg = {
|
||||
.cra_name = "ccm(aes)",
|
||||
.cra_driver_name = "ccm-aes-ce",
|
||||
.cra_priority = 300,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_AEAD,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_aead_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_aead = {
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.maxauthsize = AES_BLOCK_SIZE,
|
||||
.setkey = ccm_setkey,
|
||||
.setauthsize = ccm_setauthsize,
|
||||
.encrypt = ccm_encrypt,
|
||||
.decrypt = ccm_decrypt,
|
||||
}
|
||||
};
|
||||
|
||||
static int __init aes_mod_init(void)
|
||||
{
|
||||
if (!(elf_hwcap & HWCAP_AES))
|
||||
return -ENODEV;
|
||||
return crypto_register_alg(&ccm_aes_alg);
|
||||
}
|
||||
|
||||
static void __exit aes_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_alg(&ccm_aes_alg);
|
||||
}
|
||||
|
||||
module_init(aes_mod_init);
|
||||
module_exit(aes_mod_exit);
|
||||
|
||||
MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS("ccm(aes)");
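The first block fed to the MAC (B0) is built in ccm_init_mac() above: byte 0 already carries L - 1 in bits 0..2 from the IV, the driver then ORs in (tag_len - 2) << 2 for bits 3..5 and sets bit 6 when there is associated data. A worked example outside the kernel that mirrors that logic; the helper name is made up for illustration:

#include <stdint.h>
#include <string.h>

/*
 * Build CCM block B0 from the nonce held in iv[] (iv[0] = L - 1, exactly as
 * the crypto API hands it to the driver), the message length and tag size.
 */
static void ccm_build_b0(uint8_t b0[16], const uint8_t iv[16],
                         uint32_t msglen, unsigned int taglen, int have_aad)
{
        unsigned int l = iv[0] + 1;     /* width of the length field, 2..8 */
        int i;

        /* write msglen big-endian into the last 8 bytes ... */
        memset(b0 + 8, 0, 4);
        for (i = 0; i < 4; i++)
                b0[12 + i] = (uint8_t)(msglen >> (24 - 8 * i));
        /* ... then copy flags byte + nonce over the first 16 - l bytes */
        memcpy(b0, iv, 16 - l);
        b0[0] |= (taglen - 2) << 2;     /* bits 3..5: tag size */
        if (have_aad)
                b0[0] |= 0x40;          /* bit 6: AAD present */
}
/* e.g. iv[0] == 1 (L == 2), 16-byte tag, AAD present: b0[0] ends up 0x79 */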

arch/arm64/crypto/aes-ce-cipher.c
@@ -0,0 +1,155 @@
/*
|
||||
* aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
|
||||
*
|
||||
* Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <crypto/aes.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
struct aes_block {
|
||||
u8 b[AES_BLOCK_SIZE];
|
||||
};
|
||||
|
||||
static int num_rounds(struct crypto_aes_ctx *ctx)
|
||||
{
|
||||
/*
|
||||
* # of rounds specified by AES:
|
||||
* 128 bit key 10 rounds
|
||||
* 192 bit key 12 rounds
|
||||
* 256 bit key 14 rounds
|
||||
* => n byte key => 6 + (n/4) rounds
|
||||
*/
|
||||
return 6 + ctx->key_length / 4;
|
||||
}
|
||||
|
||||
static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
struct aes_block *out = (struct aes_block *)dst;
|
||||
struct aes_block const *in = (struct aes_block *)src;
|
||||
void *dummy0;
|
||||
int dummy1;
|
||||
|
||||
kernel_neon_begin_partial(4);
|
||||
|
||||
__asm__(" ld1 {v0.16b}, %[in] ;"
|
||||
" ld1 {v1.2d}, [%[key]], #16 ;"
|
||||
" cmp %w[rounds], #10 ;"
|
||||
" bmi 0f ;"
|
||||
" bne 3f ;"
|
||||
" mov v3.16b, v1.16b ;"
|
||||
" b 2f ;"
|
||||
"0: mov v2.16b, v1.16b ;"
|
||||
" ld1 {v3.2d}, [%[key]], #16 ;"
|
||||
"1: aese v0.16b, v2.16b ;"
|
||||
" aesmc v0.16b, v0.16b ;"
|
||||
"2: ld1 {v1.2d}, [%[key]], #16 ;"
|
||||
" aese v0.16b, v3.16b ;"
|
||||
" aesmc v0.16b, v0.16b ;"
|
||||
"3: ld1 {v2.2d}, [%[key]], #16 ;"
|
||||
" subs %w[rounds], %w[rounds], #3 ;"
|
||||
" aese v0.16b, v1.16b ;"
|
||||
" aesmc v0.16b, v0.16b ;"
|
||||
" ld1 {v3.2d}, [%[key]], #16 ;"
|
||||
" bpl 1b ;"
|
||||
" aese v0.16b, v2.16b ;"
|
||||
" eor v0.16b, v0.16b, v3.16b ;"
|
||||
" st1 {v0.16b}, %[out] ;"
|
||||
|
||||
: [out] "=Q"(*out),
|
||||
[key] "=r"(dummy0),
|
||||
[rounds] "=r"(dummy1)
|
||||
: [in] "Q"(*in),
|
||||
"1"(ctx->key_enc),
|
||||
"2"(num_rounds(ctx) - 2)
|
||||
: "cc");
|
||||
|
||||
kernel_neon_end();
|
||||
}
|
||||
|
||||
static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
struct aes_block *out = (struct aes_block *)dst;
|
||||
struct aes_block const *in = (struct aes_block *)src;
|
||||
void *dummy0;
|
||||
int dummy1;
|
||||
|
||||
kernel_neon_begin_partial(4);
|
||||
|
||||
__asm__(" ld1 {v0.16b}, %[in] ;"
|
||||
" ld1 {v1.2d}, [%[key]], #16 ;"
|
||||
" cmp %w[rounds], #10 ;"
|
||||
" bmi 0f ;"
|
||||
" bne 3f ;"
|
||||
" mov v3.16b, v1.16b ;"
|
||||
" b 2f ;"
|
||||
"0: mov v2.16b, v1.16b ;"
|
||||
" ld1 {v3.2d}, [%[key]], #16 ;"
|
||||
"1: aesd v0.16b, v2.16b ;"
|
||||
" aesimc v0.16b, v0.16b ;"
|
||||
"2: ld1 {v1.2d}, [%[key]], #16 ;"
|
||||
" aesd v0.16b, v3.16b ;"
|
||||
" aesimc v0.16b, v0.16b ;"
|
||||
"3: ld1 {v2.2d}, [%[key]], #16 ;"
|
||||
" subs %w[rounds], %w[rounds], #3 ;"
|
||||
" aesd v0.16b, v1.16b ;"
|
||||
" aesimc v0.16b, v0.16b ;"
|
||||
" ld1 {v3.2d}, [%[key]], #16 ;"
|
||||
" bpl 1b ;"
|
||||
" aesd v0.16b, v2.16b ;"
|
||||
" eor v0.16b, v0.16b, v3.16b ;"
|
||||
" st1 {v0.16b}, %[out] ;"
|
||||
|
||||
: [out] "=Q"(*out),
|
||||
[key] "=r"(dummy0),
|
||||
[rounds] "=r"(dummy1)
|
||||
: [in] "Q"(*in),
|
||||
"1"(ctx->key_dec),
|
||||
"2"(num_rounds(ctx) - 2)
|
||||
: "cc");
|
||||
|
||||
kernel_neon_end();
|
||||
}
|
||||
|
||||
static struct crypto_alg aes_alg = {
|
||||
.cra_name = "aes",
|
||||
.cra_driver_name = "aes-ce",
|
||||
.cra_priority = 300,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_cipher = {
|
||||
.cia_min_keysize = AES_MIN_KEY_SIZE,
|
||||
.cia_max_keysize = AES_MAX_KEY_SIZE,
|
||||
.cia_setkey = crypto_aes_set_key,
|
||||
.cia_encrypt = aes_cipher_encrypt,
|
||||
.cia_decrypt = aes_cipher_decrypt
|
||||
}
|
||||
};
|
||||
|
||||
static int __init aes_mod_init(void)
|
||||
{
|
||||
return crypto_register_alg(&aes_alg);
|
||||
}
|
||||
|
||||
static void __exit aes_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_alg(&aes_alg);
|
||||
}
|
||||
|
||||
module_cpu_feature_match(AES, aes_mod_init);
|
||||
module_exit(aes_mod_exit);
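num_rounds() above is the AES key-size to round-count mapping (10/12/14 rounds for 16/24/32-byte keys, i.e. 6 + key_length/4); the glue then hands num_rounds(ctx) - 2 to the inline assembly to match its three-rounds-per-iteration loop structure, with the final AddRoundKey done by the trailing eor. A standalone check of the arithmetic (illustrative only):

#include <assert.h>

static int num_rounds(unsigned int key_length)  /* bytes: 16, 24 or 32 */
{
        return 6 + key_length / 4;
}

int main(void)
{
        assert(num_rounds(16) == 10);   /* AES-128 */
        assert(num_rounds(24) == 12);   /* AES-192 */
        assert(num_rounds(32) == 14);   /* AES-256 */
        return 0;
}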

arch/arm64/crypto/aes-ce.S
@@ -0,0 +1,133 @@
/*
|
||||
* linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
|
||||
* Crypto Extensions
|
||||
*
|
||||
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#define AES_ENTRY(func) ENTRY(ce_ ## func)
|
||||
#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
|
||||
|
||||
.arch armv8-a+crypto
|
||||
|
||||
/* preload all round keys */
|
||||
.macro load_round_keys, rounds, rk
|
||||
cmp \rounds, #12
|
||||
blo 2222f /* 128 bits */
|
||||
beq 1111f /* 192 bits */
|
||||
ld1 {v17.16b-v18.16b}, [\rk], #32
|
||||
1111: ld1 {v19.16b-v20.16b}, [\rk], #32
|
||||
2222: ld1 {v21.16b-v24.16b}, [\rk], #64
|
||||
ld1 {v25.16b-v28.16b}, [\rk], #64
|
||||
ld1 {v29.16b-v31.16b}, [\rk]
|
||||
.endm
|
||||
|
||||
/* prepare for encryption with key in rk[] */
|
||||
.macro enc_prepare, rounds, rk, ignore
|
||||
load_round_keys \rounds, \rk
|
||||
.endm
|
||||
|
||||
/* prepare for encryption (again) but with new key in rk[] */
|
||||
.macro enc_switch_key, rounds, rk, ignore
|
||||
load_round_keys \rounds, \rk
|
||||
.endm
|
||||
|
||||
/* prepare for decryption with key in rk[] */
|
||||
.macro dec_prepare, rounds, rk, ignore
|
||||
load_round_keys \rounds, \rk
|
||||
.endm
|
||||
|
||||
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
|
||||
aes\de \i0\().16b, \k\().16b
|
||||
.ifnb \i1
|
||||
aes\de \i1\().16b, \k\().16b
|
||||
.ifnb \i3
|
||||
aes\de \i2\().16b, \k\().16b
|
||||
aes\de \i3\().16b, \k\().16b
|
||||
.endif
|
||||
.endif
|
||||
aes\mc \i0\().16b, \i0\().16b
|
||||
.ifnb \i1
|
||||
aes\mc \i1\().16b, \i1\().16b
|
||||
.ifnb \i3
|
||||
aes\mc \i2\().16b, \i2\().16b
|
||||
aes\mc \i3\().16b, \i3\().16b
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/* up to 4 interleaved encryption rounds with the same round key */
|
||||
.macro round_Nx, enc, k, i0, i1, i2, i3
|
||||
.ifc \enc, e
|
||||
do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
|
||||
.else
|
||||
do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/* up to 4 interleaved final rounds */
|
||||
.macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
|
||||
aes\de \i0\().16b, \k\().16b
|
||||
.ifnb \i1
|
||||
aes\de \i1\().16b, \k\().16b
|
||||
.ifnb \i3
|
||||
aes\de \i2\().16b, \k\().16b
|
||||
aes\de \i3\().16b, \k\().16b
|
||||
.endif
|
||||
.endif
|
||||
eor \i0\().16b, \i0\().16b, \k2\().16b
|
||||
.ifnb \i1
|
||||
eor \i1\().16b, \i1\().16b, \k2\().16b
|
||||
.ifnb \i3
|
||||
eor \i2\().16b, \i2\().16b, \k2\().16b
|
||||
eor \i3\().16b, \i3\().16b, \k2\().16b
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/* up to 4 interleaved blocks */
|
||||
.macro do_block_Nx, enc, rounds, i0, i1, i2, i3
|
||||
cmp \rounds, #12
|
||||
blo 2222f /* 128 bits */
|
||||
beq 1111f /* 192 bits */
|
||||
round_Nx \enc, v17, \i0, \i1, \i2, \i3
|
||||
round_Nx \enc, v18, \i0, \i1, \i2, \i3
|
||||
1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
|
||||
round_Nx \enc, v20, \i0, \i1, \i2, \i3
|
||||
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
round_Nx \enc, \key, \i0, \i1, \i2, \i3
|
||||
.endr
|
||||
fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
|
||||
.endm
|
||||
|
||||
.macro encrypt_block, in, rounds, t0, t1, t2
|
||||
do_block_Nx e, \rounds, \in
|
||||
.endm
|
||||
|
||||
.macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
|
||||
do_block_Nx e, \rounds, \i0, \i1
|
||||
.endm
|
||||
|
||||
.macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
|
||||
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
|
||||
.endm
|
||||
|
||||
.macro decrypt_block, in, rounds, t0, t1, t2
|
||||
do_block_Nx d, \rounds, \in
|
||||
.endm
|
||||
|
||||
.macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
|
||||
do_block_Nx d, \rounds, \i0, \i1
|
||||
.endm
|
||||
|
||||
.macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
|
||||
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
|
||||
.endm
|
||||
|
||||
#include "aes-modes.S"
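aes-ce.S only defines the single, 2x and 4x block primitives and pulls in aes-modes.S at the end; the INTERLEAVE value chosen in the Makefile decides how many independent blocks each mode keeps in flight, so the aese/aesmc result latency can be hidden behind the other lanes. A rough C-level picture of the idea, with a hypothetical one-block helper standing in for the real NEON code (not a kernel API):

/* Hypothetical sketch: process blocks in groups of four, then a tail loop. */
static void ecb_encrypt_interleaved(unsigned char *dst, const unsigned char *src,
                                    int blocks,
                                    void (*aes_enc_block)(unsigned char *out,
                                                          const unsigned char *in))
{
        int i;

        for (i = 0; i + 4 <= blocks; i += 4) {
                /*
                 * Four independent blocks: no data dependency between them,
                 * so a pipelined implementation can overlap their rounds.
                 */
                aes_enc_block(dst + 16 * (i + 0), src + 16 * (i + 0));
                aes_enc_block(dst + 16 * (i + 1), src + 16 * (i + 1));
                aes_enc_block(dst + 16 * (i + 2), src + 16 * (i + 2));
                aes_enc_block(dst + 16 * (i + 3), src + 16 * (i + 3));
        }
        for (; i < blocks; i++)         /* leftover blocks, one at a time */
                aes_enc_block(dst + 16 * i, src + 16 * i);
}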

arch/arm64/crypto/aes-glue.c
@@ -0,0 +1,446 @@
/*
|
||||
* linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
|
||||
*
|
||||
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <asm/hwcap.h>
|
||||
#include <crypto/aes.h>
|
||||
#include <crypto/ablk_helper.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/cpufeature.h>
|
||||
|
||||
#ifdef USE_V8_CRYPTO_EXTENSIONS
|
||||
#define MODE "ce"
|
||||
#define PRIO 300
|
||||
#define aes_ecb_encrypt ce_aes_ecb_encrypt
|
||||
#define aes_ecb_decrypt ce_aes_ecb_decrypt
|
||||
#define aes_cbc_encrypt ce_aes_cbc_encrypt
|
||||
#define aes_cbc_decrypt ce_aes_cbc_decrypt
|
||||
#define aes_ctr_encrypt ce_aes_ctr_encrypt
|
||||
#define aes_xts_encrypt ce_aes_xts_encrypt
|
||||
#define aes_xts_decrypt ce_aes_xts_decrypt
|
||||
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
|
||||
#else
|
||||
#define MODE "neon"
|
||||
#define PRIO 200
|
||||
#define aes_ecb_encrypt neon_aes_ecb_encrypt
|
||||
#define aes_ecb_decrypt neon_aes_ecb_decrypt
|
||||
#define aes_cbc_encrypt neon_aes_cbc_encrypt
|
||||
#define aes_cbc_decrypt neon_aes_cbc_decrypt
|
||||
#define aes_ctr_encrypt neon_aes_ctr_encrypt
|
||||
#define aes_xts_encrypt neon_aes_xts_encrypt
|
||||
#define aes_xts_decrypt neon_aes_xts_decrypt
|
||||
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
|
||||
MODULE_ALIAS("ecb(aes)");
|
||||
MODULE_ALIAS("cbc(aes)");
|
||||
MODULE_ALIAS("ctr(aes)");
|
||||
MODULE_ALIAS("xts(aes)");
|
||||
#endif
|
||||
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
/* defined in aes-modes.S */
|
||||
asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, int first);
|
||||
asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, int first);
|
||||
|
||||
asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[], int first);
|
||||
asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[], int first);
|
||||
|
||||
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 ctr[], int first);
|
||||
|
||||
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
|
||||
int rounds, int blocks, u8 const rk2[], u8 iv[],
|
||||
int first);
|
||||
asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
|
||||
int rounds, int blocks, u8 const rk2[], u8 iv[],
|
||||
int first);
|
||||
|
||||
struct crypto_aes_xts_ctx {
|
||||
struct crypto_aes_ctx key1;
|
||||
struct crypto_aes_ctx __aligned(8) key2;
|
||||
};
|
||||
|
||||
static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
int ret;
|
||||
|
||||
ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2);
|
||||
if (!ret)
|
||||
ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2],
|
||||
key_len / 2);
|
||||
if (!ret)
|
||||
return 0;
|
||||
|
||||
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
int err, first, rounds = 6 + ctx->key_length / 4;
|
||||
struct blkcipher_walk walk;
|
||||
unsigned int blocks;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
|
||||
kernel_neon_begin();
|
||||
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
|
||||
aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
(u8 *)ctx->key_enc, rounds, blocks, first);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
int err, first, rounds = 6 + ctx->key_length / 4;
|
||||
struct blkcipher_walk walk;
|
||||
unsigned int blocks;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
|
||||
kernel_neon_begin();
|
||||
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
|
||||
aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
(u8 *)ctx->key_dec, rounds, blocks, first);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
return err;
|
||||
}
|
||||
|
||||
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
int err, first, rounds = 6 + ctx->key_length / 4;
|
||||
struct blkcipher_walk walk;
|
||||
unsigned int blocks;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
|
||||
kernel_neon_begin();
|
||||
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
|
||||
aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
(u8 *)ctx->key_enc, rounds, blocks, walk.iv,
|
||||
first);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
return err;
|
||||
}
|
||||
|
||||
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
int err, first, rounds = 6 + ctx->key_length / 4;
|
||||
struct blkcipher_walk walk;
|
||||
unsigned int blocks;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
|
||||
kernel_neon_begin();
|
||||
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
|
||||
aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
(u8 *)ctx->key_dec, rounds, blocks, walk.iv,
|
||||
first);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
int err, first, rounds = 6 + ctx->key_length / 4;
|
||||
struct blkcipher_walk walk;
|
||||
int blocks;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
|
||||
|
||||
first = 1;
|
||||
kernel_neon_begin();
|
||||
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
|
||||
aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
(u8 *)ctx->key_enc, rounds, blocks, walk.iv,
|
||||
first);
|
||||
first = 0;
|
||||
nbytes -= blocks * AES_BLOCK_SIZE;
|
||||
if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE)
|
||||
break;
|
||||
err = blkcipher_walk_done(desc, &walk,
|
||||
walk.nbytes % AES_BLOCK_SIZE);
|
||||
}
|
||||
if (nbytes) {
|
||||
u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
|
||||
u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
|
||||
u8 __aligned(8) tail[AES_BLOCK_SIZE];
|
||||
|
||||
/*
|
||||
* Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
|
||||
* to tell aes_ctr_encrypt() to only read half a block.
|
||||
*/
|
||||
blocks = (nbytes <= 8) ? -1 : 1;
|
||||
|
||||
aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
|
||||
blocks, walk.iv, first);
|
||||
memcpy(tdst, tail, nbytes);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
int err, first, rounds = 6 + ctx->key1.key_length / 4;
|
||||
struct blkcipher_walk walk;
|
||||
unsigned int blocks;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
|
||||
kernel_neon_begin();
|
||||
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
|
||||
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
(u8 *)ctx->key1.key_enc, rounds, blocks,
|
||||
(u8 *)ctx->key2.key_enc, walk.iv, first);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
int err, first, rounds = 6 + ctx->key1.key_length / 4;
|
||||
struct blkcipher_walk walk;
|
||||
unsigned int blocks;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
|
||||
kernel_neon_begin();
|
||||
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
|
||||
aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
(u8 *)ctx->key1.key_dec, rounds, blocks,
|
||||
(u8 *)ctx->key2.key_enc, walk.iv, first);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct crypto_alg aes_algs[] = { {
|
||||
.cra_name = "__ecb-aes-" MODE,
|
||||
.cra_driver_name = "__driver-ecb-aes-" MODE,
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_blkcipher = {
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = crypto_aes_set_key,
|
||||
.encrypt = ecb_encrypt,
|
||||
.decrypt = ecb_decrypt,
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__cbc-aes-" MODE,
|
||||
.cra_driver_name = "__driver-cbc-aes-" MODE,
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_blkcipher = {
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = crypto_aes_set_key,
|
||||
.encrypt = cbc_encrypt,
|
||||
.decrypt = cbc_decrypt,
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__ctr-aes-" MODE,
|
||||
.cra_driver_name = "__driver-ctr-aes-" MODE,
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_blkcipher = {
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = crypto_aes_set_key,
|
||||
.encrypt = ctr_encrypt,
|
||||
.decrypt = ctr_encrypt,
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__xts-aes-" MODE,
|
||||
.cra_driver_name = "__driver-xts-aes-" MODE,
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_blkcipher = {
|
||||
.min_keysize = 2 * AES_MIN_KEY_SIZE,
|
||||
.max_keysize = 2 * AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = xts_set_key,
|
||||
.encrypt = xts_encrypt,
|
||||
.decrypt = xts_decrypt,
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ecb(aes)",
|
||||
.cra_driver_name = "ecb-aes-" MODE,
|
||||
.cra_priority = PRIO,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_ablkcipher = {
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
}
|
||||
}, {
|
||||
.cra_name = "cbc(aes)",
|
||||
.cra_driver_name = "cbc-aes-" MODE,
|
||||
.cra_priority = PRIO,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_ablkcipher = {
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
}
|
||||
}, {
|
||||
.cra_name = "ctr(aes)",
|
||||
.cra_driver_name = "ctr-aes-" MODE,
|
||||
.cra_priority = PRIO,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_ablkcipher = {
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
}
|
||||
}, {
|
||||
.cra_name = "xts(aes)",
|
||||
.cra_driver_name = "xts-aes-" MODE,
|
||||
.cra_priority = PRIO,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_ablkcipher = {
|
||||
.min_keysize = 2 * AES_MIN_KEY_SIZE,
|
||||
.max_keysize = 2 * AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
}
|
||||
} };
|
||||
|
||||
static int __init aes_init(void)
|
||||
{
|
||||
return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
|
||||
}
|
||||
|
||||
static void __exit aes_exit(void)
|
||||
{
|
||||
crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
|
||||
}
|
||||
|
||||
#ifdef USE_V8_CRYPTO_EXTENSIONS
|
||||
module_cpu_feature_match(AES, aes_init);
|
||||
#else
|
||||
module_init(aes_init);
|
||||
#endif
|
||||
module_exit(aes_exit);
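ctr_encrypt() above handles a short final block by encrypting into an aligned scratch buffer and copying only nbytes back out, and the blocks = (nbytes <= 8) ? -1 : 1 trick tells the assembly to load just half a block so it never reads past the end of the source. A generic sketch of the tail XOR itself, with a hypothetical keystream helper (not the kernel API):

/* keystream_block() is a hypothetical helper producing 16 keystream bytes. */
static void ctr_xor_tail(unsigned char *dst, const unsigned char *src,
                         unsigned int nbytes /* < 16 */,
                         void (*keystream_block)(unsigned char out[16]))
{
        unsigned char ks[16];
        unsigned int i;

        keystream_block(ks);            /* E_K(counter) for the final block */
        for (i = 0; i < nbytes; i++)    /* only touch the bytes that exist */
                dst[i] = src[i] ^ ks[i];
}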

arch/arm64/crypto/aes-modes.S
@@ -0,0 +1,532 @@
/*
|
||||
* linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
|
||||
*
|
||||
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
/* included by aes-ce.S and aes-neon.S */
|
||||
|
||||
.text
|
||||
.align 4
|
||||
|
||||
/*
|
||||
* There are several ways to instantiate this code:
|
||||
* - no interleave, all inline
|
||||
* - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
|
||||
* - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
|
||||
* - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
|
||||
* - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
|
||||
*
|
||||
* Macros imported by this code:
|
||||
* - enc_prepare - setup NEON registers for encryption
|
||||
* - dec_prepare - setup NEON registers for decryption
|
||||
* - enc_switch_key - change to new key after having prepared for encryption
|
||||
* - encrypt_block - encrypt a single block
|
||||
* - decrypt block - decrypt a single block
|
||||
* - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
|
||||
* - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
|
||||
* - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
|
||||
* - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
|
||||
*/
|
||||
|
||||
#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
|
||||
#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
|
||||
#define FRAME_POP ldp x29, x30, [sp],#16
|
||||
|
||||
#if INTERLEAVE == 2
|
||||
|
||||
aes_encrypt_block2x:
|
||||
encrypt_block2x v0, v1, w3, x2, x6, w7
|
||||
ret
|
||||
ENDPROC(aes_encrypt_block2x)
|
||||
|
||||
aes_decrypt_block2x:
|
||||
decrypt_block2x v0, v1, w3, x2, x6, w7
|
||||
ret
|
||||
ENDPROC(aes_decrypt_block2x)
|
||||
|
||||
#elif INTERLEAVE == 4
|
||||
|
||||
aes_encrypt_block4x:
|
||||
encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
|
||||
ret
|
||||
ENDPROC(aes_encrypt_block4x)
|
||||
|
||||
aes_decrypt_block4x:
|
||||
decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
|
||||
ret
|
||||
ENDPROC(aes_decrypt_block4x)
|
||||
|
||||
#else
|
||||
#error INTERLEAVE should equal 2 or 4
|
||||
#endif
|
||||
|
||||
.macro do_encrypt_block2x
|
||||
bl aes_encrypt_block2x
|
||||
.endm
|
||||
|
||||
.macro do_decrypt_block2x
|
||||
bl aes_decrypt_block2x
|
||||
.endm
|
||||
|
||||
.macro do_encrypt_block4x
|
||||
bl aes_encrypt_block4x
|
||||
.endm
|
||||
|
||||
.macro do_decrypt_block4x
|
||||
bl aes_decrypt_block4x
|
||||
.endm
|
||||
|
||||
#else
|
||||
#define FRAME_PUSH
|
||||
#define FRAME_POP
|
||||
|
||||
.macro do_encrypt_block2x
|
||||
encrypt_block2x v0, v1, w3, x2, x6, w7
|
||||
.endm
|
||||
|
||||
.macro do_decrypt_block2x
|
||||
decrypt_block2x v0, v1, w3, x2, x6, w7
|
||||
.endm
|
||||
|
||||
.macro do_encrypt_block4x
|
||||
encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
|
||||
.endm
|
||||
|
||||
.macro do_decrypt_block4x
|
||||
decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks, int first)
|
||||
* aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks, int first)
|
||||
*/
|
||||
|
||||
AES_ENTRY(aes_ecb_encrypt)
|
||||
FRAME_PUSH
|
||||
cbz w5, .LecbencloopNx
|
||||
|
||||
enc_prepare w3, x2, x5
|
||||
|
||||
.LecbencloopNx:
|
||||
#if INTERLEAVE >= 2
|
||||
subs w4, w4, #INTERLEAVE
|
||||
bmi .Lecbenc1x
|
||||
#if INTERLEAVE == 2
|
||||
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
|
||||
do_encrypt_block2x
|
||||
st1 {v0.16b-v1.16b}, [x0], #32
|
||||
#else
|
||||
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
|
||||
do_encrypt_block4x
|
||||
st1 {v0.16b-v3.16b}, [x0], #64
|
||||
#endif
|
||||
b .LecbencloopNx
|
||||
.Lecbenc1x:
|
||||
adds w4, w4, #INTERLEAVE
|
||||
beq .Lecbencout
|
||||
#endif
|
||||
.Lecbencloop:
|
||||
ld1 {v0.16b}, [x1], #16 /* get next pt block */
|
||||
encrypt_block v0, w3, x2, x5, w6
|
||||
st1 {v0.16b}, [x0], #16
|
||||
subs w4, w4, #1
|
||||
bne .Lecbencloop
|
||||
.Lecbencout:
|
||||
FRAME_POP
|
||||
ret
|
||||
AES_ENDPROC(aes_ecb_encrypt)
|
||||
|
||||
|
||||
AES_ENTRY(aes_ecb_decrypt)
|
||||
FRAME_PUSH
|
||||
cbz w5, .LecbdecloopNx
|
||||
|
||||
dec_prepare w3, x2, x5
|
||||
|
||||
.LecbdecloopNx:
|
||||
#if INTERLEAVE >= 2
|
||||
subs w4, w4, #INTERLEAVE
|
||||
bmi .Lecbdec1x
|
||||
#if INTERLEAVE == 2
|
||||
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
|
||||
do_decrypt_block2x
|
||||
st1 {v0.16b-v1.16b}, [x0], #32
|
||||
#else
|
||||
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
|
||||
do_decrypt_block4x
|
||||
st1 {v0.16b-v3.16b}, [x0], #64
|
||||
#endif
|
||||
b .LecbdecloopNx
|
||||
.Lecbdec1x:
|
||||
adds w4, w4, #INTERLEAVE
|
||||
beq .Lecbdecout
|
||||
#endif
|
||||
.Lecbdecloop:
|
||||
ld1 {v0.16b}, [x1], #16 /* get next ct block */
|
||||
decrypt_block v0, w3, x2, x5, w6
|
||||
st1 {v0.16b}, [x0], #16
|
||||
subs w4, w4, #1
|
||||
bne .Lecbdecloop
|
||||
.Lecbdecout:
|
||||
FRAME_POP
|
||||
ret
|
||||
AES_ENDPROC(aes_ecb_decrypt)
|
||||
|
||||
|
||||
/*
|
||||
* aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks, u8 iv[], int first)
|
||||
* aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks, u8 iv[], int first)
|
||||
*/
|
||||
|
||||
AES_ENTRY(aes_cbc_encrypt)
|
||||
cbz w6, .Lcbcencloop
|
||||
|
||||
ld1 {v0.16b}, [x5] /* get iv */
|
||||
enc_prepare w3, x2, x5
|
||||
|
||||
.Lcbcencloop:
|
||||
ld1 {v1.16b}, [x1], #16 /* get next pt block */
|
||||
eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
|
||||
encrypt_block v0, w3, x2, x5, w6
|
||||
st1 {v0.16b}, [x0], #16
|
||||
subs w4, w4, #1
|
||||
bne .Lcbcencloop
|
||||
ret
|
||||
AES_ENDPROC(aes_cbc_encrypt)
|
||||
|
||||
|
||||
AES_ENTRY(aes_cbc_decrypt)
|
||||
FRAME_PUSH
|
||||
cbz w6, .LcbcdecloopNx
|
||||
|
||||
ld1 {v7.16b}, [x5] /* get iv */
|
||||
dec_prepare w3, x2, x5
|
||||
|
||||
.LcbcdecloopNx:
|
||||
#if INTERLEAVE >= 2
|
||||
subs w4, w4, #INTERLEAVE
|
||||
bmi .Lcbcdec1x
|
||||
#if INTERLEAVE == 2
|
||||
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
|
||||
mov v2.16b, v0.16b
|
||||
mov v3.16b, v1.16b
|
||||
do_decrypt_block2x
|
||||
eor v0.16b, v0.16b, v7.16b
|
||||
eor v1.16b, v1.16b, v2.16b
|
||||
mov v7.16b, v3.16b
|
||||
st1 {v0.16b-v1.16b}, [x0], #32
|
||||
#else
|
||||
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
|
||||
mov v4.16b, v0.16b
|
||||
mov v5.16b, v1.16b
|
||||
mov v6.16b, v2.16b
|
||||
do_decrypt_block4x
|
||||
sub x1, x1, #16
|
||||
eor v0.16b, v0.16b, v7.16b
|
||||
eor v1.16b, v1.16b, v4.16b
|
||||
ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
|
||||
eor v2.16b, v2.16b, v5.16b
|
||||
eor v3.16b, v3.16b, v6.16b
|
||||
st1 {v0.16b-v3.16b}, [x0], #64
|
||||
#endif
|
||||
b .LcbcdecloopNx
|
||||
.Lcbcdec1x:
|
||||
adds w4, w4, #INTERLEAVE
|
||||
beq .Lcbcdecout
|
||||
#endif
|
||||
.Lcbcdecloop:
|
||||
ld1 {v1.16b}, [x1], #16 /* get next ct block */
|
||||
mov v0.16b, v1.16b /* ...and copy to v0 */
|
||||
decrypt_block v0, w3, x2, x5, w6
|
||||
eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
|
||||
mov v7.16b, v1.16b /* ct is next iv */
|
||||
st1 {v0.16b}, [x0], #16
|
||||
subs w4, w4, #1
|
||||
bne .Lcbcdecloop
|
||||
.Lcbcdecout:
|
||||
FRAME_POP
|
||||
ret
|
||||
AES_ENDPROC(aes_cbc_decrypt)
|
||||
|
||||
|
||||
/*
|
||||
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks, u8 ctr[], int first)
|
||||
*/
|
||||
|
||||
AES_ENTRY(aes_ctr_encrypt)
|
||||
FRAME_PUSH
|
||||
cbnz w6, .Lctrfirst /* 1st time around? */
|
||||
umov x5, v4.d[1] /* keep swabbed ctr in reg */
|
||||
rev x5, x5
|
||||
#if INTERLEAVE >= 2
|
||||
cmn w5, w4 /* 32 bit overflow? */
|
||||
bcs .Lctrinc
|
||||
add x5, x5, #1 /* increment BE ctr */
|
||||
b .LctrincNx
|
||||
#else
|
||||
b .Lctrinc
|
||||
#endif
|
||||
.Lctrfirst:
|
||||
enc_prepare w3, x2, x6
|
||||
ld1 {v4.16b}, [x5]
|
||||
umov x5, v4.d[1] /* keep swabbed ctr in reg */
|
||||
rev x5, x5
|
||||
#if INTERLEAVE >= 2
|
||||
cmn w5, w4 /* 32 bit overflow? */
|
||||
bcs .Lctrloop
|
||||
.LctrloopNx:
|
||||
subs w4, w4, #INTERLEAVE
|
||||
bmi .Lctr1x
|
||||
#if INTERLEAVE == 2
|
||||
mov v0.8b, v4.8b
|
||||
mov v1.8b, v4.8b
|
||||
rev x7, x5
|
||||
add x5, x5, #1
|
||||
ins v0.d[1], x7
|
||||
rev x7, x5
|
||||
add x5, x5, #1
|
||||
ins v1.d[1], x7
|
||||
ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
|
||||
do_encrypt_block2x
|
||||
eor v0.16b, v0.16b, v2.16b
|
||||
eor v1.16b, v1.16b, v3.16b
|
||||
st1 {v0.16b-v1.16b}, [x0], #32
|
||||
#else
|
||||
ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
|
||||
dup v7.4s, w5
|
||||
mov v0.16b, v4.16b
|
||||
add v7.4s, v7.4s, v8.4s
|
||||
mov v1.16b, v4.16b
|
||||
rev32 v8.16b, v7.16b
|
||||
mov v2.16b, v4.16b
|
||||
mov v3.16b, v4.16b
|
||||
mov v1.s[3], v8.s[0]
|
||||
mov v2.s[3], v8.s[1]
|
||||
mov v3.s[3], v8.s[2]
|
||||
ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
|
||||
do_encrypt_block4x
|
||||
eor v0.16b, v5.16b, v0.16b
|
||||
ld1 {v5.16b}, [x1], #16 /* get 1 input block */
|
||||
eor v1.16b, v6.16b, v1.16b
|
||||
eor v2.16b, v7.16b, v2.16b
|
||||
eor v3.16b, v5.16b, v3.16b
|
||||
st1 {v0.16b-v3.16b}, [x0], #64
|
||||
add x5, x5, #INTERLEAVE
|
||||
#endif
|
||||
cbz w4, .LctroutNx
|
||||
.LctrincNx:
|
||||
rev x7, x5
|
||||
ins v4.d[1], x7
|
||||
b .LctrloopNx
|
||||
.LctroutNx:
|
||||
sub x5, x5, #1
|
||||
rev x7, x5
|
||||
ins v4.d[1], x7
|
||||
b .Lctrout
|
||||
.Lctr1x:
|
||||
adds w4, w4, #INTERLEAVE
|
||||
beq .Lctrout
|
||||
#endif
|
||||
.Lctrloop:
|
||||
mov v0.16b, v4.16b
|
||||
encrypt_block v0, w3, x2, x6, w7
|
||||
subs w4, w4, #1
|
||||
bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
|
||||
ld1 {v3.16b}, [x1], #16
|
||||
eor v3.16b, v0.16b, v3.16b
|
||||
st1 {v3.16b}, [x0], #16
|
||||
beq .Lctrout
|
||||
.Lctrinc:
|
||||
adds x5, x5, #1 /* increment BE ctr */
|
||||
rev x7, x5
|
||||
ins v4.d[1], x7
|
||||
bcc .Lctrloop /* no overflow? */
|
||||
umov x7, v4.d[0] /* load upper word of ctr */
|
||||
rev x7, x7 /* ... to handle the carry */
|
||||
add x7, x7, #1
|
||||
rev x7, x7
|
||||
ins v4.d[0], x7
|
||||
b .Lctrloop
|
||||
.Lctrhalfblock:
|
||||
ld1 {v3.8b}, [x1]
|
||||
eor v3.8b, v0.8b, v3.8b
|
||||
st1 {v3.8b}, [x0]
|
||||
.Lctrout:
|
||||
FRAME_POP
|
||||
ret
|
||||
AES_ENDPROC(aes_ctr_encrypt)
|
||||
.ltorg
|
||||
|
||||
|
||||
/*
|
||||
* aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
|
||||
* int blocks, u8 const rk2[], u8 iv[], int first)
|
||||
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
|
||||
* int blocks, u8 const rk2[], u8 iv[], int first)
|
||||
*/
|
||||
|
||||
.macro next_tweak, out, in, const, tmp
|
||||
sshr \tmp\().2d, \in\().2d, #63
|
||||
and \tmp\().16b, \tmp\().16b, \const\().16b
|
||||
add \out\().2d, \in\().2d, \in\().2d
|
||||
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
|
||||
eor \out\().16b, \out\().16b, \tmp\().16b
|
||||
.endm
|
||||
|
||||
.Lxts_mul_x:
|
||||
.word 1, 0, 0x87, 0
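(Aside, not part of the patch: the next_tweak macro above is the usual XTS tweak update, i.e. multiplying the 128-bit tweak by x in GF(2^128), with .Lxts_mul_x supplying the { 1, 0x87 } lane constants. A minimal scalar sketch of the same step, assuming the tweak is held as two little-endian 64-bit halves exactly as the NEON code treats its d-lanes, and using the kernel's u64 type from <linux/types.h>; the helper name is made up for illustration.)

/* Sketch only: double the XTS tweak in GF(2^128), mirroring next_tweak. */
static void xts_next_tweak(u64 t[2])            /* t[0] = low 64 bits, t[1] = high 64 bits */
{
        u64 lo_msb = t[0] >> 63;                /* bit that carries into the high half */
        u64 hi_msb = t[1] >> 63;                /* bit that falls off the top */

        t[1] = (t[1] << 1) ^ lo_msb;
        t[0] = (t[0] << 1) ^ (hi_msb ? 0x87 : 0);       /* x^128 = x^7 + x^2 + x + 1 */
}

The sshr/and/add/ext/eor sequence in the macro computes the same two lanes in a single vector operation per step.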
|
||||
|
||||
AES_ENTRY(aes_xts_encrypt)
|
||||
FRAME_PUSH
|
||||
cbz w7, .LxtsencloopNx
|
||||
|
||||
ld1 {v4.16b}, [x6]
|
||||
enc_prepare w3, x5, x6
|
||||
encrypt_block v4, w3, x5, x6, w7 /* first tweak */
|
||||
enc_switch_key w3, x2, x6
|
||||
ldr q7, .Lxts_mul_x
|
||||
b .LxtsencNx
|
||||
|
||||
.LxtsencloopNx:
|
||||
ldr q7, .Lxts_mul_x
|
||||
next_tweak v4, v4, v7, v8
|
||||
.LxtsencNx:
|
||||
#if INTERLEAVE >= 2
|
||||
subs w4, w4, #INTERLEAVE
|
||||
bmi .Lxtsenc1x
|
||||
#if INTERLEAVE == 2
|
||||
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
|
||||
next_tweak v5, v4, v7, v8
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
do_encrypt_block2x
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
st1 {v0.16b-v1.16b}, [x0], #32
|
||||
cbz w4, .LxtsencoutNx
|
||||
next_tweak v4, v5, v7, v8
|
||||
b .LxtsencNx
|
||||
.LxtsencoutNx:
|
||||
mov v4.16b, v5.16b
|
||||
b .Lxtsencout
|
||||
#else
|
||||
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
|
||||
next_tweak v5, v4, v7, v8
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
next_tweak v6, v5, v7, v8
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
eor v2.16b, v2.16b, v6.16b
|
||||
next_tweak v7, v6, v7, v8
|
||||
eor v3.16b, v3.16b, v7.16b
|
||||
do_encrypt_block4x
|
||||
eor v3.16b, v3.16b, v7.16b
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
eor v2.16b, v2.16b, v6.16b
|
||||
st1 {v0.16b-v3.16b}, [x0], #64
|
||||
mov v4.16b, v7.16b
|
||||
cbz w4, .Lxtsencout
|
||||
b .LxtsencloopNx
|
||||
#endif
|
||||
.Lxtsenc1x:
|
||||
adds w4, w4, #INTERLEAVE
|
||||
beq .Lxtsencout
|
||||
#endif
|
||||
.Lxtsencloop:
|
||||
ld1 {v1.16b}, [x1], #16
|
||||
eor v0.16b, v1.16b, v4.16b
|
||||
encrypt_block v0, w3, x2, x6, w7
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
st1 {v0.16b}, [x0], #16
|
||||
subs w4, w4, #1
|
||||
beq .Lxtsencout
|
||||
next_tweak v4, v4, v7, v8
|
||||
b .Lxtsencloop
|
||||
.Lxtsencout:
|
||||
FRAME_POP
|
||||
ret
|
||||
AES_ENDPROC(aes_xts_encrypt)
|
||||
|
||||
|
||||
AES_ENTRY(aes_xts_decrypt)
|
||||
FRAME_PUSH
|
||||
cbz w7, .LxtsdecloopNx
|
||||
|
||||
ld1 {v4.16b}, [x6]
|
||||
enc_prepare w3, x5, x6
|
||||
encrypt_block v4, w3, x5, x6, w7 /* first tweak */
|
||||
dec_prepare w3, x2, x6
|
||||
ldr q7, .Lxts_mul_x
|
||||
b .LxtsdecNx
|
||||
|
||||
.LxtsdecloopNx:
|
||||
ldr q7, .Lxts_mul_x
|
||||
next_tweak v4, v4, v7, v8
|
||||
.LxtsdecNx:
|
||||
#if INTERLEAVE >= 2
|
||||
subs w4, w4, #INTERLEAVE
|
||||
bmi .Lxtsdec1x
|
||||
#if INTERLEAVE == 2
|
||||
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
|
||||
next_tweak v5, v4, v7, v8
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
do_decrypt_block2x
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
st1 {v0.16b-v1.16b}, [x0], #32
|
||||
cbz w4, .LxtsdecoutNx
|
||||
next_tweak v4, v5, v7, v8
|
||||
b .LxtsdecNx
|
||||
.LxtsdecoutNx:
|
||||
mov v4.16b, v5.16b
|
||||
b .Lxtsdecout
|
||||
#else
|
||||
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
|
||||
next_tweak v5, v4, v7, v8
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
next_tweak v6, v5, v7, v8
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
eor v2.16b, v2.16b, v6.16b
|
||||
next_tweak v7, v6, v7, v8
|
||||
eor v3.16b, v3.16b, v7.16b
|
||||
do_decrypt_block4x
|
||||
eor v3.16b, v3.16b, v7.16b
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
eor v2.16b, v2.16b, v6.16b
|
||||
st1 {v0.16b-v3.16b}, [x0], #64
|
||||
mov v4.16b, v7.16b
|
||||
cbz w4, .Lxtsdecout
|
||||
b .LxtsdecloopNx
|
||||
#endif
|
||||
.Lxtsdec1x:
|
||||
adds w4, w4, #INTERLEAVE
|
||||
beq .Lxtsdecout
|
||||
#endif
|
||||
.Lxtsdecloop:
|
||||
ld1 {v1.16b}, [x1], #16
|
||||
eor v0.16b, v1.16b, v4.16b
|
||||
decrypt_block v0, w3, x2, x6, w7
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
st1 {v0.16b}, [x0], #16
|
||||
subs w4, w4, #1
|
||||
beq .Lxtsdecout
|
||||
next_tweak v4, v4, v7, v8
|
||||
b .Lxtsdecloop
|
||||
.Lxtsdecout:
|
||||
FRAME_POP
|
||||
ret
|
||||
AES_ENDPROC(aes_xts_decrypt)
|
|
@ -0,0 +1,382 @@
|
|||
/*
|
||||
* linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
|
||||
*
|
||||
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#define AES_ENTRY(func) ENTRY(neon_ ## func)
|
||||
#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
|
||||
|
||||
/* multiply by polynomial 'x' in GF(2^8) */
|
||||
.macro mul_by_x, out, in, temp, const
|
||||
sshr \temp, \in, #7
|
||||
add \out, \in, \in
|
||||
and \temp, \temp, \const
|
||||
eor \out, \out, \temp
|
||||
.endm
|
||||
|
||||
/* preload the entire Sbox */
|
||||
.macro prepare, sbox, shiftrows, temp
|
||||
adr \temp, \sbox
|
||||
movi v12.16b, #0x40
|
||||
ldr q13, \shiftrows
|
||||
movi v14.16b, #0x1b
|
||||
ld1 {v16.16b-v19.16b}, [\temp], #64
|
||||
ld1 {v20.16b-v23.16b}, [\temp], #64
|
||||
ld1 {v24.16b-v27.16b}, [\temp], #64
|
||||
ld1 {v28.16b-v31.16b}, [\temp]
|
||||
.endm
|
||||
|
||||
/* do preload for encryption */
|
||||
.macro enc_prepare, ignore0, ignore1, temp
|
||||
prepare .LForward_Sbox, .LForward_ShiftRows, \temp
|
||||
.endm
|
||||
|
||||
.macro enc_switch_key, ignore0, ignore1, temp
|
||||
/* do nothing */
|
||||
.endm
|
||||
|
||||
/* do preload for decryption */
|
||||
.macro dec_prepare, ignore0, ignore1, temp
|
||||
prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp
|
||||
.endm
|
||||
|
||||
/* apply SubBytes transformation using the preloaded Sbox */
|
||||
.macro sub_bytes, in
|
||||
sub v9.16b, \in\().16b, v12.16b
|
||||
tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
|
||||
sub v10.16b, v9.16b, v12.16b
|
||||
tbx \in\().16b, {v20.16b-v23.16b}, v9.16b
|
||||
sub v11.16b, v10.16b, v12.16b
|
||||
tbx \in\().16b, {v24.16b-v27.16b}, v10.16b
|
||||
tbx \in\().16b, {v28.16b-v31.16b}, v11.16b
|
||||
.endm
|
||||
|
||||
/* apply MixColumns transformation */
|
||||
.macro mix_columns, in
|
||||
mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b
|
||||
rev32 v8.8h, \in\().8h
|
||||
eor \in\().16b, v10.16b, \in\().16b
|
||||
shl v9.4s, v8.4s, #24
|
||||
shl v11.4s, \in\().4s, #24
|
||||
sri v9.4s, v8.4s, #8
|
||||
sri v11.4s, \in\().4s, #8
|
||||
eor v9.16b, v9.16b, v8.16b
|
||||
eor v10.16b, v10.16b, v9.16b
|
||||
eor \in\().16b, v10.16b, v11.16b
|
||||
.endm
|
||||
|
||||
/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
|
||||
.macro inv_mix_columns, in
|
||||
mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b
|
||||
mul_by_x v11.16b, v11.16b, v10.16b, v14.16b
|
||||
eor \in\().16b, \in\().16b, v11.16b
|
||||
rev32 v11.8h, v11.8h
|
||||
eor \in\().16b, \in\().16b, v11.16b
|
||||
mix_columns \in
|
||||
.endm
|
||||
|
||||
.macro do_block, enc, in, rounds, rk, rkp, i
|
||||
ld1 {v15.16b}, [\rk]
|
||||
add \rkp, \rk, #16
|
||||
mov \i, \rounds
|
||||
1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
|
||||
tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
|
||||
sub_bytes \in
|
||||
ld1 {v15.16b}, [\rkp], #16
|
||||
subs \i, \i, #1
|
||||
beq 2222f
|
||||
.if \enc == 1
|
||||
mix_columns \in
|
||||
.else
|
||||
inv_mix_columns \in
|
||||
.endif
|
||||
b 1111b
|
||||
2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
|
||||
.endm
|
||||
|
||||
.macro encrypt_block, in, rounds, rk, rkp, i
|
||||
do_block 1, \in, \rounds, \rk, \rkp, \i
|
||||
.endm
|
||||
|
||||
.macro decrypt_block, in, rounds, rk, rkp, i
|
||||
do_block 0, \in, \rounds, \rk, \rkp, \i
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Interleaved versions: functionally equivalent to the
|
||||
* ones above, but applied to 2 or 4 AES states in parallel.
|
||||
*/
|
||||
|
||||
.macro sub_bytes_2x, in0, in1
|
||||
sub v8.16b, \in0\().16b, v12.16b
|
||||
sub v9.16b, \in1\().16b, v12.16b
|
||||
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
|
||||
tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
|
||||
sub v10.16b, v8.16b, v12.16b
|
||||
sub v11.16b, v9.16b, v12.16b
|
||||
tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
|
||||
tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
|
||||
sub v8.16b, v10.16b, v12.16b
|
||||
sub v9.16b, v11.16b, v12.16b
|
||||
tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
|
||||
tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
|
||||
tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
|
||||
tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
|
||||
.endm
|
||||
|
||||
.macro sub_bytes_4x, in0, in1, in2, in3
|
||||
sub v8.16b, \in0\().16b, v12.16b
|
||||
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
|
||||
sub v9.16b, \in1\().16b, v12.16b
|
||||
tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
|
||||
sub v10.16b, \in2\().16b, v12.16b
|
||||
tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
|
||||
sub v11.16b, \in3\().16b, v12.16b
|
||||
tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
|
||||
tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
|
||||
tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
|
||||
sub v8.16b, v8.16b, v12.16b
|
||||
tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b
|
||||
sub v9.16b, v9.16b, v12.16b
|
||||
tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b
|
||||
sub v10.16b, v10.16b, v12.16b
|
||||
tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b
|
||||
sub v11.16b, v11.16b, v12.16b
|
||||
tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b
|
||||
sub v8.16b, v8.16b, v12.16b
|
||||
tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b
|
||||
sub v9.16b, v9.16b, v12.16b
|
||||
tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b
|
||||
sub v10.16b, v10.16b, v12.16b
|
||||
tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
|
||||
sub v11.16b, v11.16b, v12.16b
|
||||
tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
|
||||
tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b
|
||||
tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b
|
||||
.endm
|
||||
|
||||
.macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
|
||||
sshr \tmp0\().16b, \in0\().16b, #7
|
||||
add \out0\().16b, \in0\().16b, \in0\().16b
|
||||
sshr \tmp1\().16b, \in1\().16b, #7
|
||||
and \tmp0\().16b, \tmp0\().16b, \const\().16b
|
||||
add \out1\().16b, \in1\().16b, \in1\().16b
|
||||
and \tmp1\().16b, \tmp1\().16b, \const\().16b
|
||||
eor \out0\().16b, \out0\().16b, \tmp0\().16b
|
||||
eor \out1\().16b, \out1\().16b, \tmp1\().16b
|
||||
.endm
|
||||
|
||||
.macro mix_columns_2x, in0, in1
|
||||
mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
|
||||
rev32 v10.8h, \in0\().8h
|
||||
rev32 v11.8h, \in1\().8h
|
||||
eor \in0\().16b, v8.16b, \in0\().16b
|
||||
eor \in1\().16b, v9.16b, \in1\().16b
|
||||
shl v12.4s, v10.4s, #24
|
||||
shl v13.4s, v11.4s, #24
|
||||
eor v8.16b, v8.16b, v10.16b
|
||||
sri v12.4s, v10.4s, #8
|
||||
shl v10.4s, \in0\().4s, #24
|
||||
eor v9.16b, v9.16b, v11.16b
|
||||
sri v13.4s, v11.4s, #8
|
||||
shl v11.4s, \in1\().4s, #24
|
||||
sri v10.4s, \in0\().4s, #8
|
||||
eor \in0\().16b, v8.16b, v12.16b
|
||||
sri v11.4s, \in1\().4s, #8
|
||||
eor \in1\().16b, v9.16b, v13.16b
|
||||
eor \in0\().16b, v10.16b, \in0\().16b
|
||||
eor \in1\().16b, v11.16b, \in1\().16b
|
||||
.endm
|
||||
|
||||
.macro inv_mix_cols_2x, in0, in1
|
||||
mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
|
||||
mul_by_x_2x v8, v9, v8, v9, v10, v11, v14
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
rev32 v8.8h, v8.8h
|
||||
rev32 v9.8h, v9.8h
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
mix_columns_2x \in0, \in1
|
||||
.endm
|
||||
|
||||
.macro inv_mix_cols_4x, in0, in1, in2, in3
|
||||
mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
|
||||
mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14
|
||||
mul_by_x_2x v8, v9, v8, v9, v12, v13, v14
|
||||
mul_by_x_2x v10, v11, v10, v11, v12, v13, v14
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
eor \in2\().16b, \in2\().16b, v10.16b
|
||||
eor \in3\().16b, \in3\().16b, v11.16b
|
||||
rev32 v8.8h, v8.8h
|
||||
rev32 v9.8h, v9.8h
|
||||
rev32 v10.8h, v10.8h
|
||||
rev32 v11.8h, v11.8h
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
eor \in2\().16b, \in2\().16b, v10.16b
|
||||
eor \in3\().16b, \in3\().16b, v11.16b
|
||||
mix_columns_2x \in0, \in1
|
||||
mix_columns_2x \in2, \in3
|
||||
.endm
|
||||
|
||||
.macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
|
||||
ld1 {v15.16b}, [\rk]
|
||||
add \rkp, \rk, #16
|
||||
mov \i, \rounds
|
||||
1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
|
||||
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
|
||||
sub_bytes_2x \in0, \in1
|
||||
tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
|
||||
tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
|
||||
ld1 {v15.16b}, [\rkp], #16
|
||||
subs \i, \i, #1
|
||||
beq 2222f
|
||||
.if \enc == 1
|
||||
mix_columns_2x \in0, \in1
|
||||
ldr q13, .LForward_ShiftRows
|
||||
.else
|
||||
inv_mix_cols_2x \in0, \in1
|
||||
ldr q13, .LReverse_ShiftRows
|
||||
.endif
|
||||
movi v12.16b, #0x40
|
||||
b 1111b
|
||||
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
|
||||
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
|
||||
.endm
|
||||
|
||||
.macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
|
||||
ld1 {v15.16b}, [\rk]
|
||||
add \rkp, \rk, #16
|
||||
mov \i, \rounds
|
||||
1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
|
||||
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
|
||||
eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
|
||||
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
|
||||
sub_bytes_4x \in0, \in1, \in2, \in3
|
||||
tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
|
||||
tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
|
||||
tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
|
||||
tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
|
||||
ld1 {v15.16b}, [\rkp], #16
|
||||
subs \i, \i, #1
|
||||
beq 2222f
|
||||
.if \enc == 1
|
||||
mix_columns_2x \in0, \in1
|
||||
mix_columns_2x \in2, \in3
|
||||
ldr q13, .LForward_ShiftRows
|
||||
.else
|
||||
inv_mix_cols_4x \in0, \in1, \in2, \in3
|
||||
ldr q13, .LReverse_ShiftRows
|
||||
.endif
|
||||
movi v12.16b, #0x40
|
||||
b 1111b
|
||||
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
|
||||
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
|
||||
eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
|
||||
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
|
||||
.endm
|
||||
|
||||
.macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
|
||||
do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
|
||||
.endm
|
||||
|
||||
.macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
|
||||
do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
|
||||
.endm
|
||||
|
||||
.macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
|
||||
do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
|
||||
.endm
|
||||
|
||||
.macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
|
||||
do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
|
||||
.endm
|
||||
|
||||
#include "aes-modes.S"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.LForward_ShiftRows:
|
||||
.byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
|
||||
.byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb
|
||||
|
||||
.LReverse_ShiftRows:
|
||||
.byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
|
||||
.byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3
|
||||
|
||||
.LForward_Sbox:
|
||||
.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
|
||||
.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
|
||||
.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
|
||||
.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
|
||||
.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
|
||||
.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
|
||||
.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
|
||||
.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
|
||||
.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
|
||||
.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
|
||||
.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
|
||||
.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
|
||||
.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
|
||||
.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
|
||||
.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
|
||||
.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
|
||||
.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
|
||||
.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
|
||||
.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
|
||||
.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
|
||||
.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
|
||||
.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
|
||||
.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
|
||||
.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
|
||||
.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
|
||||
.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
|
||||
.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
|
||||
.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
|
||||
.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
|
||||
.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
|
||||
.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
|
||||
.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
|
||||
|
||||
.LReverse_Sbox:
|
||||
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
|
||||
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
|
||||
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
|
||||
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
|
||||
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
|
||||
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
|
||||
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
|
||||
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
|
||||
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
|
||||
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
|
||||
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
|
||||
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
|
||||
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
|
||||
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
|
||||
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
|
||||
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
|
||||
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
|
||||
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
|
||||
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
|
||||
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
|
||||
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
|
||||
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
|
||||
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
|
||||
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
|
||||
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
|
||||
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
|
||||
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
|
||||
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
|
||||
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
|
||||
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
|
||||
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
|
||||
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
|
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* Based on arch/x86/crypto/ghash-clmulni-intel_asm.S
|
||||
*
|
||||
* Copyright (c) 2009 Intel Corp.
|
||||
* Author: Huang Ying <ying.huang@intel.com>
|
||||
* Vinodh Gopal
|
||||
* Erdinc Ozturk
|
||||
* Deniz Karakoyunlu
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published
|
||||
* by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
DATA .req v0
|
||||
SHASH .req v1
|
||||
IN1 .req v2
|
||||
T1 .req v2
|
||||
T2 .req v3
|
||||
T3 .req v4
|
||||
VZR .req v5
|
||||
|
||||
.text
|
||||
.arch armv8-a+crypto
|
||||
|
||||
/*
|
||||
* void pmull_ghash_update(int blocks, u64 dg[], const char *src,
|
||||
* struct ghash_key const *k, const char *head)
|
||||
*/
|
||||
ENTRY(pmull_ghash_update)
|
||||
ld1 {DATA.16b}, [x1]
|
||||
ld1 {SHASH.16b}, [x3]
|
||||
eor VZR.16b, VZR.16b, VZR.16b
|
||||
|
||||
/* do the head block first, if supplied */
|
||||
cbz x4, 0f
|
||||
ld1 {IN1.2d}, [x4]
|
||||
b 1f
|
||||
|
||||
0: ld1 {IN1.2d}, [x2], #16
|
||||
sub w0, w0, #1
|
||||
1: ext IN1.16b, IN1.16b, IN1.16b, #8
|
||||
CPU_LE( rev64 IN1.16b, IN1.16b )
|
||||
eor DATA.16b, DATA.16b, IN1.16b
|
||||
|
||||
/* multiply DATA by SHASH in GF(2^128) */
|
||||
ext T2.16b, DATA.16b, DATA.16b, #8
|
||||
ext T3.16b, SHASH.16b, SHASH.16b, #8
|
||||
eor T2.16b, T2.16b, DATA.16b
|
||||
eor T3.16b, T3.16b, SHASH.16b
|
||||
|
||||
pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1
|
||||
pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0
|
||||
pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0)
|
||||
eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0)
|
||||
eor T2.16b, T2.16b, DATA.16b
|
||||
|
||||
ext T3.16b, VZR.16b, T2.16b, #8
|
||||
ext T2.16b, T2.16b, VZR.16b, #8
|
||||
eor DATA.16b, DATA.16b, T3.16b
|
||||
eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of
|
||||
// carry-less multiplication
|
||||
|
||||
/* first phase of the reduction */
|
||||
shl T3.2d, DATA.2d, #1
|
||||
eor T3.16b, T3.16b, DATA.16b
|
||||
shl T3.2d, T3.2d, #5
|
||||
eor T3.16b, T3.16b, DATA.16b
|
||||
shl T3.2d, T3.2d, #57
|
||||
ext T2.16b, VZR.16b, T3.16b, #8
|
||||
ext T3.16b, T3.16b, VZR.16b, #8
|
||||
eor DATA.16b, DATA.16b, T2.16b
|
||||
eor T1.16b, T1.16b, T3.16b
|
||||
|
||||
/* second phase of the reduction */
|
||||
ushr T2.2d, DATA.2d, #5
|
||||
eor T2.16b, T2.16b, DATA.16b
|
||||
ushr T2.2d, T2.2d, #1
|
||||
eor T2.16b, T2.16b, DATA.16b
|
||||
ushr T2.2d, T2.2d, #1
|
||||
eor T1.16b, T1.16b, T2.16b
|
||||
eor DATA.16b, DATA.16b, T1.16b
|
||||
|
||||
cbnz w0, 0b
|
||||
|
||||
st1 {DATA.16b}, [x1]
|
||||
ret
|
||||
ENDPROC(pmull_ghash_update)
|
|
@ -0,0 +1,155 @@
|
|||
/*
|
||||
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published
|
||||
* by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
#define GHASH_BLOCK_SIZE 16
|
||||
#define GHASH_DIGEST_SIZE 16
|
||||
|
||||
struct ghash_key {
|
||||
u64 a;
|
||||
u64 b;
|
||||
};
|
||||
|
||||
struct ghash_desc_ctx {
|
||||
u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)];
|
||||
u8 buf[GHASH_BLOCK_SIZE];
|
||||
u32 count;
|
||||
};
|
||||
|
||||
asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
|
||||
struct ghash_key const *k, const char *head);
|
||||
|
||||
static int ghash_init(struct shash_desc *desc)
|
||||
{
|
||||
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
*ctx = (struct ghash_desc_ctx){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ghash_update(struct shash_desc *desc, const u8 *src,
|
||||
unsigned int len)
|
||||
{
|
||||
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
|
||||
|
||||
ctx->count += len;
|
||||
|
||||
if ((partial + len) >= GHASH_BLOCK_SIZE) {
|
||||
struct ghash_key *key = crypto_shash_ctx(desc->tfm);
|
||||
int blocks;
|
||||
|
||||
if (partial) {
|
||||
int p = GHASH_BLOCK_SIZE - partial;
|
||||
|
||||
memcpy(ctx->buf + partial, src, p);
|
||||
src += p;
|
||||
len -= p;
|
||||
}
|
||||
|
||||
blocks = len / GHASH_BLOCK_SIZE;
|
||||
len %= GHASH_BLOCK_SIZE;
|
||||
|
||||
kernel_neon_begin_partial(6);
|
||||
pmull_ghash_update(blocks, ctx->digest, src, key,
|
||||
partial ? ctx->buf : NULL);
|
||||
kernel_neon_end();
|
||||
src += blocks * GHASH_BLOCK_SIZE;
|
||||
}
|
||||
if (len)
|
||||
memcpy(ctx->buf + partial, src, len);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ghash_final(struct shash_desc *desc, u8 *dst)
|
||||
{
|
||||
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
|
||||
|
||||
if (partial) {
|
||||
struct ghash_key *key = crypto_shash_ctx(desc->tfm);
|
||||
|
||||
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
|
||||
|
||||
kernel_neon_begin_partial(6);
|
||||
pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
|
||||
kernel_neon_end();
|
||||
}
|
||||
put_unaligned_be64(ctx->digest[1], dst);
|
||||
put_unaligned_be64(ctx->digest[0], dst + 8);
|
||||
|
||||
*ctx = (struct ghash_desc_ctx){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ghash_setkey(struct crypto_shash *tfm,
|
||||
const u8 *inkey, unsigned int keylen)
|
||||
{
|
||||
struct ghash_key *key = crypto_shash_ctx(tfm);
|
||||
u64 a, b;
|
||||
|
||||
if (keylen != GHASH_BLOCK_SIZE) {
|
||||
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* perform multiplication by 'x' in GF(2^128) */
|
||||
b = get_unaligned_be64(inkey);
|
||||
a = get_unaligned_be64(inkey + 8);
|
||||
|
||||
key->a = (a << 1) | (b >> 63);
|
||||
key->b = (b << 1) | (a >> 63);
|
||||
|
||||
if (b >> 63)
|
||||
key->b ^= 0xc200000000000000UL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg ghash_alg = {
|
||||
.digestsize = GHASH_DIGEST_SIZE,
|
||||
.init = ghash_init,
|
||||
.update = ghash_update,
|
||||
.final = ghash_final,
|
||||
.setkey = ghash_setkey,
|
||||
.descsize = sizeof(struct ghash_desc_ctx),
|
||||
.base = {
|
||||
.cra_name = "ghash",
|
||||
.cra_driver_name = "ghash-ce",
|
||||
.cra_priority = 200,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.cra_blocksize = GHASH_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct ghash_key),
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
};
|
||||
|
||||
static int __init ghash_ce_mod_init(void)
|
||||
{
|
||||
return crypto_register_shash(&ghash_alg);
|
||||
}
|
||||
|
||||
static void __exit ghash_ce_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_shash(&ghash_alg);
|
||||
}
|
||||
|
||||
module_cpu_feature_match(PMULL, ghash_ce_mod_init);
|
||||
module_exit(ghash_ce_mod_exit);
|
|
@ -0,0 +1,153 @@
|
|||
/*
|
||||
* sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
.text
|
||||
.arch armv8-a+crypto
|
||||
|
||||
k0 .req v0
|
||||
k1 .req v1
|
||||
k2 .req v2
|
||||
k3 .req v3
|
||||
|
||||
t0 .req v4
|
||||
t1 .req v5
|
||||
|
||||
dga .req q6
|
||||
dgav .req v6
|
||||
dgb .req s7
|
||||
dgbv .req v7
|
||||
|
||||
dg0q .req q12
|
||||
dg0s .req s12
|
||||
dg0v .req v12
|
||||
dg1s .req s13
|
||||
dg1v .req v13
|
||||
dg2s .req s14
|
||||
|
||||
.macro add_only, op, ev, rc, s0, dg1
|
||||
.ifc \ev, ev
|
||||
add t1.4s, v\s0\().4s, \rc\().4s
|
||||
sha1h dg2s, dg0s
|
||||
.ifnb \dg1
|
||||
sha1\op dg0q, \dg1, t0.4s
|
||||
.else
|
||||
sha1\op dg0q, dg1s, t0.4s
|
||||
.endif
|
||||
.else
|
||||
.ifnb \s0
|
||||
add t0.4s, v\s0\().4s, \rc\().4s
|
||||
.endif
|
||||
sha1h dg1s, dg0s
|
||||
sha1\op dg0q, dg2s, t1.4s
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
|
||||
sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
|
||||
add_only \op, \ev, \rc, \s1, \dg1
|
||||
sha1su1 v\s0\().4s, v\s3\().4s
|
||||
.endm
|
||||
|
||||
/*
|
||||
* The SHA1 round constants
|
||||
*/
|
||||
.align 4
|
||||
.Lsha1_rcon:
|
||||
.word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
|
||||
|
||||
/*
|
||||
* void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
|
||||
* u8 *head, long bytes)
|
||||
*/
|
||||
ENTRY(sha1_ce_transform)
|
||||
/* load round constants */
|
||||
adr x6, .Lsha1_rcon
|
||||
ld1r {k0.4s}, [x6], #4
|
||||
ld1r {k1.4s}, [x6], #4
|
||||
ld1r {k2.4s}, [x6], #4
|
||||
ld1r {k3.4s}, [x6]
|
||||
|
||||
/* load state */
|
||||
ldr dga, [x2]
|
||||
ldr dgb, [x2, #16]
|
||||
|
||||
/* load partial state (if supplied) */
|
||||
cbz x3, 0f
|
||||
ld1 {v8.4s-v11.4s}, [x3]
|
||||
b 1f
|
||||
|
||||
/* load input */
|
||||
0: ld1 {v8.4s-v11.4s}, [x1], #64
|
||||
sub w0, w0, #1
|
||||
|
||||
1:
|
||||
CPU_LE( rev32 v8.16b, v8.16b )
|
||||
CPU_LE( rev32 v9.16b, v9.16b )
|
||||
CPU_LE( rev32 v10.16b, v10.16b )
|
||||
CPU_LE( rev32 v11.16b, v11.16b )
|
||||
|
||||
2: add t0.4s, v8.4s, k0.4s
|
||||
mov dg0v.16b, dgav.16b
|
||||
|
||||
add_update c, ev, k0, 8, 9, 10, 11, dgb
|
||||
add_update c, od, k0, 9, 10, 11, 8
|
||||
add_update c, ev, k0, 10, 11, 8, 9
|
||||
add_update c, od, k0, 11, 8, 9, 10
|
||||
add_update c, ev, k1, 8, 9, 10, 11
|
||||
|
||||
add_update p, od, k1, 9, 10, 11, 8
|
||||
add_update p, ev, k1, 10, 11, 8, 9
|
||||
add_update p, od, k1, 11, 8, 9, 10
|
||||
add_update p, ev, k1, 8, 9, 10, 11
|
||||
add_update p, od, k2, 9, 10, 11, 8
|
||||
|
||||
add_update m, ev, k2, 10, 11, 8, 9
|
||||
add_update m, od, k2, 11, 8, 9, 10
|
||||
add_update m, ev, k2, 8, 9, 10, 11
|
||||
add_update m, od, k2, 9, 10, 11, 8
|
||||
add_update m, ev, k3, 10, 11, 8, 9
|
||||
|
||||
add_update p, od, k3, 11, 8, 9, 10
|
||||
add_only p, ev, k3, 9
|
||||
add_only p, od, k3, 10
|
||||
add_only p, ev, k3, 11
|
||||
add_only p, od
|
||||
|
||||
/* update state */
|
||||
add dgbv.2s, dgbv.2s, dg1v.2s
|
||||
add dgav.4s, dgav.4s, dg0v.4s
|
||||
|
||||
cbnz w0, 0b
|
||||
|
||||
/*
|
||||
* Final block: add padding and total bit count.
|
||||
* Skip if we have no total byte count in x4. In that case, the input
|
||||
* size was not a round multiple of the block size, and the padding is
|
||||
* handled by the C code.
|
||||
*/
|
||||
cbz x4, 3f
|
||||
movi v9.2d, #0
|
||||
mov x8, #0x80000000
|
||||
movi v10.2d, #0
|
||||
ror x7, x4, #29 // ror(lsl(x4, 3), 32)
|
||||
fmov d8, x8
|
||||
mov x4, #0
|
||||
mov v11.d[0], xzr
|
||||
mov v11.d[1], x7
|
||||
b 2b
|
||||
|
||||
/* store new state */
|
||||
3: str dga, [x2]
|
||||
str dgb, [x2, #16]
|
||||
ret
|
||||
ENDPROC(sha1_ce_transform)
|
|
@ -0,0 +1,174 @@
|
|||
/*
|
||||
* sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/sha.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
|
||||
u8 *head, long bytes);
|
||||
|
||||
static int sha1_init(struct shash_desc *desc)
|
||||
{
|
||||
struct sha1_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
*sctx = (struct sha1_state){
|
||||
.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
|
||||
};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha1_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len)
|
||||
{
|
||||
struct sha1_state *sctx = shash_desc_ctx(desc);
|
||||
unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
|
||||
|
||||
sctx->count += len;
|
||||
|
||||
if ((partial + len) >= SHA1_BLOCK_SIZE) {
|
||||
int blocks;
|
||||
|
||||
if (partial) {
|
||||
int p = SHA1_BLOCK_SIZE - partial;
|
||||
|
||||
memcpy(sctx->buffer + partial, data, p);
|
||||
data += p;
|
||||
len -= p;
|
||||
}
|
||||
|
||||
blocks = len / SHA1_BLOCK_SIZE;
|
||||
len %= SHA1_BLOCK_SIZE;
|
||||
|
||||
kernel_neon_begin_partial(16);
|
||||
sha1_ce_transform(blocks, data, sctx->state,
|
||||
partial ? sctx->buffer : NULL, 0);
|
||||
kernel_neon_end();
|
||||
|
||||
data += blocks * SHA1_BLOCK_SIZE;
|
||||
partial = 0;
|
||||
}
|
||||
if (len)
|
||||
memcpy(sctx->buffer + partial, data, len);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha1_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
|
||||
|
||||
struct sha1_state *sctx = shash_desc_ctx(desc);
|
||||
__be64 bits = cpu_to_be64(sctx->count << 3);
|
||||
__be32 *dst = (__be32 *)out;
|
||||
int i;
|
||||
|
||||
u32 padlen = SHA1_BLOCK_SIZE
|
||||
- ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE);
|
||||
|
||||
sha1_update(desc, padding, padlen);
|
||||
sha1_update(desc, (const u8 *)&bits, sizeof(bits));
|
||||
|
||||
for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
|
||||
put_unaligned_be32(sctx->state[i], dst++);
|
||||
|
||||
*sctx = (struct sha1_state){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha1_finup(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len, u8 *out)
|
||||
{
|
||||
struct sha1_state *sctx = shash_desc_ctx(desc);
|
||||
__be32 *dst = (__be32 *)out;
|
||||
int blocks;
|
||||
int i;
|
||||
|
||||
if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) {
|
||||
sha1_update(desc, data, len);
|
||||
return sha1_final(desc, out);
|
||||
}
|
||||
|
||||
/*
|
||||
* Use a fast path if the input is a multiple of 64 bytes. In
|
||||
* this case, there is no need to copy data around, and we can
|
||||
* perform the entire digest calculation in a single invocation
|
||||
* of sha1_ce_transform()
|
||||
*/
|
||||
blocks = len / SHA1_BLOCK_SIZE;
|
||||
|
||||
kernel_neon_begin_partial(16);
|
||||
sha1_ce_transform(blocks, data, sctx->state, NULL, len);
|
||||
kernel_neon_end();
|
||||
|
||||
for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
|
||||
put_unaligned_be32(sctx->state[i], dst++);
|
||||
|
||||
*sctx = (struct sha1_state){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha1_export(struct shash_desc *desc, void *out)
|
||||
{
|
||||
struct sha1_state *sctx = shash_desc_ctx(desc);
|
||||
struct sha1_state *dst = out;
|
||||
|
||||
*dst = *sctx;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha1_import(struct shash_desc *desc, const void *in)
|
||||
{
|
||||
struct sha1_state *sctx = shash_desc_ctx(desc);
|
||||
struct sha1_state const *src = in;
|
||||
|
||||
*sctx = *src;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg alg = {
|
||||
.init = sha1_init,
|
||||
.update = sha1_update,
|
||||
.final = sha1_final,
|
||||
.finup = sha1_finup,
|
||||
.export = sha1_export,
|
||||
.import = sha1_import,
|
||||
.descsize = sizeof(struct sha1_state),
|
||||
.digestsize = SHA1_DIGEST_SIZE,
|
||||
.statesize = sizeof(struct sha1_state),
|
||||
.base = {
|
||||
.cra_name = "sha1",
|
||||
.cra_driver_name = "sha1-ce",
|
||||
.cra_priority = 200,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.cra_blocksize = SHA1_BLOCK_SIZE,
|
||||
.cra_module = THIS_MODULE,
|
||||
}
|
||||
};
|
||||
|
||||
static int __init sha1_ce_mod_init(void)
|
||||
{
|
||||
return crypto_register_shash(&alg);
|
||||
}
|
||||
|
||||
static void __exit sha1_ce_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_shash(&alg);
|
||||
}
|
||||
|
||||
module_cpu_feature_match(SHA1, sha1_ce_mod_init);
|
||||
module_exit(sha1_ce_mod_fini);
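(Aside, not part of the patch: a hypothetical caller reaches this driver through the generic shash API rather than calling sha1_ce_transform() directly; with a cra_priority of 200 the "sha1-ce" implementation should win the "sha1" lookup over the generic C version on CPUs that advertise the SHA1 feature. A rough usage sketch, assuming <crypto/hash.h>, <linux/slab.h> and <linux/err.h>; the function name is invented for illustration.)

static int sha1_digest_example(const u8 *data, unsigned int len, u8 *out)
{
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        int ret;

        tfm = crypto_alloc_shash("sha1", 0, 0);         /* resolves to sha1-ce if registered */
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
        if (!desc) {
                crypto_free_shash(tfm);
                return -ENOMEM;
        }
        desc->tfm = tfm;
        desc->flags = 0;

        ret = crypto_shash_digest(desc, data, len, out);        /* init + update + final */

        kfree(desc);
        crypto_free_shash(tfm);
        return ret;
}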
|
|
@ -0,0 +1,156 @@
|
|||
/*
|
||||
* sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
.text
|
||||
.arch armv8-a+crypto
|
||||
|
||||
dga .req q20
|
||||
dgav .req v20
|
||||
dgb .req q21
|
||||
dgbv .req v21
|
||||
|
||||
t0 .req v22
|
||||
t1 .req v23
|
||||
|
||||
dg0q .req q24
|
||||
dg0v .req v24
|
||||
dg1q .req q25
|
||||
dg1v .req v25
|
||||
dg2q .req q26
|
||||
dg2v .req v26
|
||||
|
||||
.macro add_only, ev, rc, s0
|
||||
mov dg2v.16b, dg0v.16b
|
||||
.ifeq \ev
|
||||
add t1.4s, v\s0\().4s, \rc\().4s
|
||||
sha256h dg0q, dg1q, t0.4s
|
||||
sha256h2 dg1q, dg2q, t0.4s
|
||||
.else
|
||||
.ifnb \s0
|
||||
add t0.4s, v\s0\().4s, \rc\().4s
|
||||
.endif
|
||||
sha256h dg0q, dg1q, t1.4s
|
||||
sha256h2 dg1q, dg2q, t1.4s
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro add_update, ev, rc, s0, s1, s2, s3
|
||||
sha256su0 v\s0\().4s, v\s1\().4s
|
||||
add_only \ev, \rc, \s1
|
||||
sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
|
||||
.endm
|
||||
|
||||
/*
|
||||
* The SHA-256 round constants
|
||||
*/
|
||||
.align 4
|
||||
.Lsha2_rcon:
|
||||
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
|
||||
.word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
|
||||
.word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
|
||||
.word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
|
||||
.word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
|
||||
.word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
|
||||
.word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
|
||||
.word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
|
||||
.word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
|
||||
.word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
|
||||
.word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
|
||||
.word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
|
||||
.word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
|
||||
.word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
|
||||
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
|
||||
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
|
||||
|
||||
/*
|
||||
* void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
|
||||
* u8 *head, long bytes)
|
||||
*/
|
||||
ENTRY(sha2_ce_transform)
|
||||
/* load round constants */
|
||||
adr x8, .Lsha2_rcon
|
||||
ld1 { v0.4s- v3.4s}, [x8], #64
|
||||
ld1 { v4.4s- v7.4s}, [x8], #64
|
||||
ld1 { v8.4s-v11.4s}, [x8], #64
|
||||
ld1 {v12.4s-v15.4s}, [x8]
|
||||
|
||||
/* load state */
|
||||
ldp dga, dgb, [x2]
|
||||
|
||||
/* load partial input (if supplied) */
|
||||
cbz x3, 0f
|
||||
ld1 {v16.4s-v19.4s}, [x3]
|
||||
b 1f
|
||||
|
||||
/* load input */
|
||||
0: ld1 {v16.4s-v19.4s}, [x1], #64
|
||||
sub w0, w0, #1
|
||||
|
||||
1:
|
||||
CPU_LE( rev32 v16.16b, v16.16b )
|
||||
CPU_LE( rev32 v17.16b, v17.16b )
|
||||
CPU_LE( rev32 v18.16b, v18.16b )
|
||||
CPU_LE( rev32 v19.16b, v19.16b )
|
||||
|
||||
2: add t0.4s, v16.4s, v0.4s
|
||||
mov dg0v.16b, dgav.16b
|
||||
mov dg1v.16b, dgbv.16b
|
||||
|
||||
add_update 0, v1, 16, 17, 18, 19
|
||||
add_update 1, v2, 17, 18, 19, 16
|
||||
add_update 0, v3, 18, 19, 16, 17
|
||||
add_update 1, v4, 19, 16, 17, 18
|
||||
|
||||
add_update 0, v5, 16, 17, 18, 19
|
||||
add_update 1, v6, 17, 18, 19, 16
|
||||
add_update 0, v7, 18, 19, 16, 17
|
||||
add_update 1, v8, 19, 16, 17, 18
|
||||
|
||||
add_update 0, v9, 16, 17, 18, 19
|
||||
add_update 1, v10, 17, 18, 19, 16
|
||||
add_update 0, v11, 18, 19, 16, 17
|
||||
add_update 1, v12, 19, 16, 17, 18
|
||||
|
||||
add_only 0, v13, 17
|
||||
add_only 1, v14, 18
|
||||
add_only 0, v15, 19
|
||||
add_only 1
|
||||
|
||||
/* update state */
|
||||
add dgav.4s, dgav.4s, dg0v.4s
|
||||
add dgbv.4s, dgbv.4s, dg1v.4s
|
||||
|
||||
/* handled all input blocks? */
|
||||
cbnz w0, 0b
|
||||
|
||||
/*
|
||||
* Final block: add padding and total bit count.
|
||||
* Skip if we have no total byte count in x4. In that case, the input
|
||||
* size was not a round multiple of the block size, and the padding is
|
||||
* handled by the C code.
|
||||
*/
|
||||
cbz x4, 3f
|
||||
movi v17.2d, #0
|
||||
mov x8, #0x80000000
|
||||
movi v18.2d, #0
|
||||
ror x7, x4, #29 // ror(lsl(x4, 3), 32)
|
||||
fmov d16, x8
|
||||
mov x4, #0
|
||||
mov v19.d[0], xzr
|
||||
mov v19.d[1], x7
|
||||
b 2b
|
||||
|
||||
/* store new state */
|
||||
3: stp dga, dgb, [x2]
|
||||
ret
|
||||
ENDPROC(sha2_ce_transform)
|
|
@ -0,0 +1,255 @@
|
|||
/*
|
||||
* sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/sha.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state,
|
||||
u8 *head, long bytes);
|
||||
|
||||
static int sha224_init(struct shash_desc *desc)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
*sctx = (struct sha256_state){
|
||||
.state = {
|
||||
SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
|
||||
SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
|
||||
}
|
||||
};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha256_init(struct shash_desc *desc)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
*sctx = (struct sha256_state){
|
||||
.state = {
|
||||
SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
|
||||
SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
|
||||
}
|
||||
};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha2_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
|
||||
|
||||
sctx->count += len;
|
||||
|
||||
if ((partial + len) >= SHA256_BLOCK_SIZE) {
|
||||
int blocks;
|
||||
|
||||
if (partial) {
|
||||
int p = SHA256_BLOCK_SIZE - partial;
|
||||
|
||||
memcpy(sctx->buf + partial, data, p);
|
||||
data += p;
|
||||
len -= p;
|
||||
}
|
||||
|
||||
blocks = len / SHA256_BLOCK_SIZE;
|
||||
len %= SHA256_BLOCK_SIZE;
|
||||
|
||||
kernel_neon_begin_partial(28);
|
||||
sha2_ce_transform(blocks, data, sctx->state,
|
||||
partial ? sctx->buf : NULL, 0);
|
||||
kernel_neon_end();
|
||||
|
||||
data += blocks * SHA256_BLOCK_SIZE;
|
||||
partial = 0;
|
||||
}
|
||||
if (len)
|
||||
memcpy(sctx->buf + partial, data, len);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void sha2_final(struct shash_desc *desc)
|
||||
{
|
||||
static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
|
||||
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
__be64 bits = cpu_to_be64(sctx->count << 3);
|
||||
u32 padlen = SHA256_BLOCK_SIZE
|
||||
- ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE);
|
||||
|
||||
sha2_update(desc, padding, padlen);
|
||||
sha2_update(desc, (const u8 *)&bits, sizeof(bits));
|
||||
}
|
||||
|
||||
static int sha224_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
__be32 *dst = (__be32 *)out;
|
||||
int i;
|
||||
|
||||
sha2_final(desc);
|
||||
|
||||
for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
|
||||
put_unaligned_be32(sctx->state[i], dst++);
|
||||
|
||||
*sctx = (struct sha256_state){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha256_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
__be32 *dst = (__be32 *)out;
|
||||
int i;
|
||||
|
||||
sha2_final(desc);
|
||||
|
||||
for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
|
||||
put_unaligned_be32(sctx->state[i], dst++);
|
||||
|
||||
*sctx = (struct sha256_state){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void sha2_finup(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
int blocks;
|
||||
|
||||
if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) {
|
||||
sha2_update(desc, data, len);
|
||||
sha2_final(desc);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use a fast path if the input is a multiple of 64 bytes. In
|
||||
* this case, there is no need to copy data around, and we can
|
||||
* perform the entire digest calculation in a single invocation
|
||||
* of sha2_ce_transform()
|
||||
*/
|
||||
blocks = len / SHA256_BLOCK_SIZE;
|
||||
|
||||
kernel_neon_begin_partial(28);
|
||||
sha2_ce_transform(blocks, data, sctx->state, NULL, len);
|
||||
kernel_neon_end();
|
||||
data += blocks * SHA256_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
static int sha224_finup(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len, u8 *out)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
__be32 *dst = (__be32 *)out;
|
||||
int i;
|
||||
|
||||
sha2_finup(desc, data, len);
|
||||
|
||||
for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
|
||||
put_unaligned_be32(sctx->state[i], dst++);
|
||||
|
||||
*sctx = (struct sha256_state){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha256_finup(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len, u8 *out)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
__be32 *dst = (__be32 *)out;
|
||||
int i;
|
||||
|
||||
sha2_finup(desc, data, len);
|
||||
|
||||
for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
|
||||
put_unaligned_be32(sctx->state[i], dst++);
|
||||
|
||||
*sctx = (struct sha256_state){};
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha2_export(struct shash_desc *desc, void *out)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
struct sha256_state *dst = out;
|
||||
|
||||
*dst = *sctx;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha2_import(struct shash_desc *desc, const void *in)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
struct sha256_state const *src = in;
|
||||
|
||||
*sctx = *src;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg algs[] = { {
|
||||
.init = sha224_init,
|
||||
.update = sha2_update,
|
||||
.final = sha224_final,
|
||||
.finup = sha224_finup,
|
||||
.export = sha2_export,
|
||||
.import = sha2_import,
|
||||
.descsize = sizeof(struct sha256_state),
|
||||
.digestsize = SHA224_DIGEST_SIZE,
|
||||
.statesize = sizeof(struct sha256_state),
|
||||
.base = {
|
||||
.cra_name = "sha224",
|
||||
.cra_driver_name = "sha224-ce",
|
||||
.cra_priority = 200,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.cra_blocksize = SHA256_BLOCK_SIZE,
|
||||
.cra_module = THIS_MODULE,
|
||||
}
|
||||
}, {
|
||||
.init = sha256_init,
|
||||
.update = sha2_update,
|
||||
.final = sha256_final,
|
||||
.finup = sha256_finup,
|
||||
.export = sha2_export,
|
||||
.import = sha2_import,
|
||||
.descsize = sizeof(struct sha256_state),
|
||||
.digestsize = SHA256_DIGEST_SIZE,
|
||||
.statesize = sizeof(struct sha256_state),
|
||||
.base = {
|
||||
.cra_name = "sha256",
|
||||
.cra_driver_name = "sha256-ce",
|
||||
.cra_priority = 200,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.cra_blocksize = SHA256_BLOCK_SIZE,
|
||||
.cra_module = THIS_MODULE,
|
||||
}
|
||||
} };
|
||||
|
||||
static int __init sha2_ce_mod_init(void)
|
||||
{
|
||||
return crypto_register_shashes(algs, ARRAY_SIZE(algs));
|
||||
}
|
||||
|
||||
static void __exit sha2_ce_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
|
||||
}
|
||||
|
||||
module_cpu_feature_match(SHA2, sha2_ce_mod_init);
|
||||
module_exit(sha2_ce_mod_fini);
|
|
@ -40,6 +40,7 @@ generic-y += segment.h
generic-y += sembuf.h
generic-y += serial.h
generic-y += shmbuf.h
generic-y += simd.h
generic-y += sizes.h
generic-y += socket.h
generic-y += sockios.h

@ -37,8 +37,21 @@ struct fpsimd_state {
                        u32 fpcr;
                };
        };
        /* the id of the last cpu to have restored this state */
        unsigned int cpu;
};

/*
 * Struct for stacking the bottom 'n' FP/SIMD registers.
 */
struct fpsimd_partial_state {
        u32 fpsr;
        u32 fpcr;
        u32 num_regs;
        __uint128_t vregs[32];
};


#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/* Masks for extracting the FPSR and FPCR from the FPSCR */
#define VFP_FPSCR_STAT_MASK 0xf800009f

@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state);
extern void fpsimd_thread_switch(struct task_struct *next);
extern void fpsimd_flush_thread(void);

extern void fpsimd_preserve_current_state(void);
extern void fpsimd_restore_current_state(void);
extern void fpsimd_update_current_state(struct fpsimd_state *state);

extern void fpsimd_flush_task_state(struct task_struct *target);

extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state,
                                      u32 num_regs);
extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state);

#endif

#endif

@ -62,3 +62,38 @@
        ldr w\tmpnr, [\state, #16 * 2 + 4]
        msr fpcr, x\tmpnr
.endm

.altmacro
.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2
        mrs x\tmpnr1, fpsr
        str w\numnr, [\state, #8]
        mrs x\tmpnr2, fpcr
        stp w\tmpnr1, w\tmpnr2, [\state]
        adr x\tmpnr1, 0f
        add \state, \state, x\numnr, lsl #4
        sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1
        br x\tmpnr1
        .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
        .irp qb, %(qa + 1)
        stp q\qa, q\qb, [\state, # -16 * \qa - 16]
        .endr
        .endr
0:
.endm

.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
        ldp w\tmpnr1, w\tmpnr2, [\state]
        msr fpsr, x\tmpnr1
        msr fpcr, x\tmpnr2
        adr x\tmpnr1, 0f
        ldr w\tmpnr2, [\state, #8]
        add \state, \state, x\tmpnr2, lsl #4
        sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
        br x\tmpnr1
        .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
        .irp qb, %(qa + 1)
        ldp q\qa, q\qb, [\state, # -16 * \qa - 16]
        .endr
        .endr
0:
.endm

@ -8,7 +8,11 @@
 * published by the Free Software Foundation.
 */

#include <linux/types.h>

#define cpu_has_neon() (1)

void kernel_neon_begin(void);
#define kernel_neon_begin() kernel_neon_begin_partial(32)

void kernel_neon_begin_partial(u32 num_regs);
void kernel_neon_end(void);

@ -103,6 +103,7 @@ static inline struct thread_info *current_thread_info(void)
#define TIF_SIGPENDING 0
#define TIF_NEED_RESCHED 1
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
#define TIF_SYSCALL_TRACE 8
#define TIF_SYSCALL_AUDIT 9
#define TIF_SYSCALL_TRACEPOINT 10

@ -118,6 +119,7 @@ static inline struct thread_info *current_thread_info(void)
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)

@ -125,7 +127,7 @@ static inline struct thread_info *current_thread_info(void)
#define _TIF_32BIT (1 << TIF_32BIT)

#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
                        _TIF_NOTIFY_RESUME)
                        _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)

#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
                           _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP)

@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state)
        fpsimd_restore x0, 8
        ret
ENDPROC(fpsimd_load_state)

#ifdef CONFIG_KERNEL_MODE_NEON

/*
 * Save the bottom n FP registers.
 *
 * x0 - pointer to struct fpsimd_partial_state
 */
ENTRY(fpsimd_save_partial_state)
        fpsimd_save_partial x0, 1, 8, 9
        ret
ENDPROC(fpsimd_save_partial_state)

/*
 * Load the bottom n FP registers.
 *
 * x0 - pointer to struct fpsimd_partial_state
 */
ENTRY(fpsimd_load_partial_state)
        fpsimd_restore_partial x0, 8, 9
        ret
ENDPROC(fpsimd_load_partial_state)

#endif

@ -562,7 +562,7 @@ fast_work_pending:
        str x0, [sp, #S_X0] // returned x0
work_pending:
        tbnz x1, #TIF_NEED_RESCHED, work_resched
        /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */
        /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
        ldr x2, [sp, #S_PSTATE]
        mov x0, sp // 'regs'
        tst x2, #PSR_MODE_MASK // user mode regs?

@ -34,6 +34,60 @@
#define FPEXC_IXF (1 << 4)
#define FPEXC_IDF (1 << 7)

/*
 * In order to reduce the number of times the FPSIMD state is needlessly saved
 * and restored, we need to keep track of two things:
 * (a) for each task, we need to remember which CPU was the last one to have
 *     the task's FPSIMD state loaded into its FPSIMD registers;
 * (b) for each CPU, we need to remember which task's userland FPSIMD state has
 *     been loaded into its FPSIMD registers most recently, or whether it has
 *     been used to perform kernel mode NEON in the meantime.
 *
 * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to
 * the id of the current CPU every time the state is loaded onto a CPU. For (b),
 * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the
 * address of the userland FPSIMD state of the task that was loaded onto the CPU
 * most recently, or NULL if kernel mode NEON has been performed after that.
 *
 * With this in place, we no longer have to restore the next FPSIMD state right
 * when switching between tasks. Instead, we can defer this check to userland
 * resume, at which time we verify whether the CPU's fpsimd_last_state and the
 * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we
 * can omit the FPSIMD restore.
 *
 * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to
 * indicate whether or not the userland FPSIMD state of the current task is
 * present in the registers. The flag is set unless the FPSIMD registers of this
 * CPU currently contain the most recent userland FPSIMD state of the current
 * task.
 *
 * For a certain task, the sequence may look something like this:
 * - the task gets scheduled in; if both the task's fpsimd_state.cpu field
 *   contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu
 *   variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is
 *   cleared, otherwise it is set;
 *
 * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's
 *   userland FPSIMD state is copied from memory to the registers, the task's
 *   fpsimd_state.cpu field is set to the id of the current CPU, the current
 *   CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the
 *   TIF_FOREIGN_FPSTATE flag is cleared;
 *
 * - the task executes an ordinary syscall; upon return to userland, the
 *   TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is
 *   restored;
 *
 * - the task executes a syscall which executes some NEON instructions; this is
 *   preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD
 *   register contents to memory, clears the fpsimd_last_state per-cpu variable
 *   and sets the TIF_FOREIGN_FPSTATE flag;
 *
 * - the task gets preempted after kernel_neon_end() is called; as we have not
 *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
 *   whatever is in the FPSIMD registers is not saved to memory, but discarded.
 */
|
||||
static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
|
||||
|
||||
/*
|
||||
* Trapped FP/ASIMD access.
|
||||
*/
|
||||
|
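The bookkeeping described in the comment above can be modelled in isolation. The sketch below only illustrates invariants (a) and (b) and when TIF_FOREIGN_FPSTATE would be set; the *_model names and the plain boolean flag are simplified stand-ins invented for this example, not the kernel's actual types or API.

/* Stand-alone model of the deferred FPSIMD restore bookkeeping (illustration only). */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct fpsimd_state_model {
	int cpu;			/* last CPU this state was loaded on */
};

struct task_model {
	struct fpsimd_state_model fpsimd;
	bool tif_foreign_fpstate;	/* models TIF_FOREIGN_FPSTATE */
};

/* models the per-cpu fpsimd_last_state pointer */
static struct fpsimd_state_model *last_state[NR_CPUS];

/* scheduling 'next' in: decide whether its registers are still live on this CPU */
static void model_thread_switch(struct task_model *next, int cpu)
{
	struct fpsimd_state_model *st = &next->fpsimd;

	next->tif_foreign_fpstate = !(last_state[cpu] == st && st->cpu == cpu);
}

/* returning to userland: reload only if the flag says the registers are stale */
static void model_restore_current_state(struct task_model *tsk, int cpu)
{
	if (tsk->tif_foreign_fpstate) {
		/* ...reload registers from memory here... */
		last_state[cpu] = &tsk->fpsimd;
		tsk->fpsimd.cpu = cpu;
		tsk->tif_foreign_fpstate = false;
	}
}

/* kernel-mode NEON: the registers no longer hold any task's userland state */
static void model_kernel_neon_begin(struct task_model *tsk, int cpu)
{
	if (!tsk->tif_foreign_fpstate) {
		/* ...save userland state to memory here... */
		tsk->tif_foreign_fpstate = true;
	}
	last_state[cpu] = NULL;
}

int main(void)
{
	struct task_model t = { .fpsimd = { .cpu = NR_CPUS } };

	model_thread_switch(&t, 0);		/* first run: state is foreign */
	model_restore_current_state(&t, 0);	/* loads state, clears flag */
	model_thread_switch(&t, 0);		/* same CPU, still live: no reload */
	printf("reload needed: %d\n", t.tif_foreign_fpstate);	/* prints 0 */
	model_kernel_neon_begin(&t, 0);
	model_thread_switch(&t, 0);		/* NEON clobbered it: reload */
	printf("reload needed: %d\n", t.tif_foreign_fpstate);	/* prints 1 */
	return 0;
}
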
@@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)

void fpsimd_thread_switch(struct task_struct *next)
{
	/* check if not kernel threads */
	if (current->mm)
	/*
	 * Save the current FPSIMD state to memory, but only if whatever is in
	 * the registers is in fact the most recent userland FPSIMD state of
	 * 'current'.
	 */
	if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
		fpsimd_save_state(&current->thread.fpsimd_state);
	if (next->mm)
		fpsimd_load_state(&next->thread.fpsimd_state);

	if (next->mm) {
		/*
		 * If we are switching to a task whose most recent userland
		 * FPSIMD state is already in the registers of *this* cpu,
		 * we can skip loading the state from memory. Otherwise, set
		 * the TIF_FOREIGN_FPSTATE flag so the state will be loaded
		 * upon the next return to userland.
		 */
		struct fpsimd_state *st = &next->thread.fpsimd_state;

		if (__this_cpu_read(fpsimd_last_state) == st
		    && st->cpu == smp_processor_id())
			clear_ti_thread_flag(task_thread_info(next),
					     TIF_FOREIGN_FPSTATE);
		else
			set_ti_thread_flag(task_thread_info(next),
					   TIF_FOREIGN_FPSTATE);
	}
}

void fpsimd_flush_thread(void)
{
	preempt_disable();
	memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
	fpsimd_load_state(&current->thread.fpsimd_state);
	set_thread_flag(TIF_FOREIGN_FPSTATE);
}

/*
 * Save the userland FPSIMD state of 'current' to memory, but only if the state
 * currently held in the registers does in fact belong to 'current'
 */
void fpsimd_preserve_current_state(void)
{
	preempt_disable();
	if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
		fpsimd_save_state(&current->thread.fpsimd_state);
	preempt_enable();
}

/*
 * Load the userland FPSIMD state of 'current' from memory, but only if the
 * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
 * state of 'current'
 */
void fpsimd_restore_current_state(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
		struct fpsimd_state *st = &current->thread.fpsimd_state;

		fpsimd_load_state(st);
		this_cpu_write(fpsimd_last_state, st);
		st->cpu = smp_processor_id();
	}
	preempt_enable();
}

/*
 * Load an updated userland FPSIMD state for 'current' from memory and set the
 * flag that indicates that the FPSIMD register contents are the most recent
 * FPSIMD state of 'current'
 */
void fpsimd_update_current_state(struct fpsimd_state *state)
{
	preempt_disable();
	fpsimd_load_state(state);
	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
		struct fpsimd_state *st = &current->thread.fpsimd_state;

		this_cpu_write(fpsimd_last_state, st);
		st->cpu = smp_processor_id();
	}
	preempt_enable();
}

/*
 * Invalidate live CPU copies of task t's FPSIMD state
 */
void fpsimd_flush_task_state(struct task_struct *t)
{
	t->thread.fpsimd_state.cpu = NR_CPUS;
}

#ifdef CONFIG_KERNEL_MODE_NEON

static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate);
static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate);

/*
 * Kernel-side NEON support functions
 */
void kernel_neon_begin(void)
void kernel_neon_begin_partial(u32 num_regs)
{
	/* Avoid using the NEON in interrupt context */
	BUG_ON(in_interrupt());
	preempt_disable();
	if (in_interrupt()) {
		struct fpsimd_partial_state *s = this_cpu_ptr(
			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);

	if (current->mm)
		fpsimd_save_state(&current->thread.fpsimd_state);
		BUG_ON(num_regs > 32);
		fpsimd_save_partial_state(s, roundup(num_regs, 2));
	} else {
		/*
		 * Save the userland FPSIMD state if we have one and if we
		 * haven't done so already. Clear fpsimd_last_state to indicate
		 * that there is no longer userland FPSIMD state in the
		 * registers.
		 */
		preempt_disable();
		if (current->mm &&
		    !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
			fpsimd_save_state(&current->thread.fpsimd_state);
		this_cpu_write(fpsimd_last_state, NULL);
	}
}
EXPORT_SYMBOL(kernel_neon_begin);
EXPORT_SYMBOL(kernel_neon_begin_partial);

void kernel_neon_end(void)
{
	if (current->mm)
		fpsimd_load_state(&current->thread.fpsimd_state);

	preempt_enable();
	if (in_interrupt()) {
		struct fpsimd_partial_state *s = this_cpu_ptr(
			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
		fpsimd_load_partial_state(s);
	} else {
		preempt_enable();
	}
}
EXPORT_SYMBOL(kernel_neon_end);

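With the rework above, kernel mode NEON becomes usable in hard and soft interrupt context, provided the caller declares through kernel_neon_begin_partial() how many of the bottom NEON registers it will touch. A minimal caller might look like the sketch below; example_neon_transform() and the commented-out NEON helper are hypothetical placeholders, only kernel_neon_begin_partial()/kernel_neon_end() come from this series.

#include <asm/neon.h>
#include <linux/types.h>

/* Hypothetical user of the new interface (illustration only). */
static void example_neon_transform(u8 *dst, const u8 *src, int blocks)
{
	/*
	 * Declare that at most the bottom four NEON registers will be used,
	 * so only those need to be preserved. Unlike the old
	 * kernel_neon_begin(), this may also be called from IRQ context.
	 */
	kernel_neon_begin_partial(4);
	while (blocks--) {
		/* a NEON assembly helper would process one 16-byte block here */
		dst += 16;
		src += 16;
	}
	kernel_neon_end();
}
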
@@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self,
{
	switch (cmd) {
	case CPU_PM_ENTER:
		if (current->mm)
		if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
			fpsimd_save_state(&current->thread.fpsimd_state);
		break;
	case CPU_PM_EXIT:
		if (current->mm)
			fpsimd_load_state(&current->thread.fpsimd_state);
			set_thread_flag(TIF_FOREIGN_FPSTATE);
		break;
	case CPU_PM_ENTER_FAILED:
	default:

@@ -206,7 +206,7 @@ void release_thread(struct task_struct *dead_task)

int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	fpsimd_save_state(&current->thread.fpsimd_state);
	fpsimd_preserve_current_state();
	*dst = *src;
	return 0;
}

@@ -518,6 +518,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
		return ret;

	target->thread.fpsimd_state.user_fpsimd = newstate;
	fpsimd_flush_task_state(target);
	return ret;
}

@@ -765,6 +766,7 @@ static int compat_vfp_set(struct task_struct *target,
		uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
	}

	fpsimd_flush_task_state(target);
	return ret;
}

@@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
	int err;

	/* dump the hardware registers to the fpsimd_state structure */
	fpsimd_save_state(fpsimd);
	fpsimd_preserve_current_state();

	/* copy the FP and status/control registers */
	err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));

@@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx)
	__get_user_error(fpsimd.fpcr, &ctx->fpcr, err);

	/* load the hardware registers from the fpsimd_state structure */
	if (!err) {
		preempt_disable();
		fpsimd_load_state(&fpsimd);
		preempt_enable();
	}
	if (!err)
		fpsimd_update_current_state(&fpsimd);

	return err ? -EFAULT : 0;
}

@@ -433,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
		clear_thread_flag(TIF_NOTIFY_RESUME);
		tracehook_notify_resume(regs);
	}

	if (thread_flags & _TIF_FOREIGN_FPSTATE)
		fpsimd_restore_current_state();

}

@@ -222,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
	 * Note that this also saves V16-31, which aren't visible
	 * in AArch32.
	 */
	fpsimd_save_state(fpsimd);
	fpsimd_preserve_current_state();

	/* Place structure header on the stack */
	__put_user_error(magic, &frame->magic, err);

@@ -285,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
	 * We don't need to touch the exception register, so
	 * reload the hardware state.
	 */
	if (!err) {
		preempt_disable();
		fpsimd_load_state(&fpsimd);
		preempt_enable();
	}
	if (!err)
		fpsimd_update_current_state(&fpsimd);

	return err ? -EFAULT : 0;
}

@@ -4,22 +4,27 @@
/*
 * This is the most generic implementation of unaligned accesses
 * and should work almost anywhere.
 *
 * If an architecture can handle unaligned accesses in hardware,
 * it may want to use the linux/unaligned/access_ok.h implementation
 * instead.
 */
#include <asm/byteorder.h>

/* Set by the arch if it can handle unaligned accesses in hardware. */
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
# include <linux/unaligned/access_ok.h>
#endif

#if defined(__LITTLE_ENDIAN)
# include <linux/unaligned/le_struct.h>
# include <linux/unaligned/be_byteshift.h>
# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#  include <linux/unaligned/le_struct.h>
#  include <linux/unaligned/be_byteshift.h>
# endif
# include <linux/unaligned/generic.h>
# define get_unaligned	__get_unaligned_le
# define put_unaligned	__put_unaligned_le
#elif defined(__BIG_ENDIAN)
# include <linux/unaligned/be_struct.h>
# include <linux/unaligned/le_byteshift.h>
# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#  include <linux/unaligned/be_struct.h>
#  include <linux/unaligned/le_byteshift.h>
# endif
# include <linux/unaligned/generic.h>
# define get_unaligned	__get_unaligned_be
# define put_unaligned	__put_unaligned_be

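The net effect of the change above: an architecture that defines CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS gets the plain load/store helpers from access_ok.h, while everyone else keeps the packed-struct and byte-shift fallbacks; callers see the same interface either way. A generic usage sketch (ordinary kernel-style usage, not taken from this patch set; the example_* helpers are made up):

#include <asm/unaligned.h>
#include <linux/types.h>

/* Read a little-endian 32-bit length field at an arbitrary alignment. */
static u32 example_read_len(const u8 *p)
{
	return get_unaligned_le32(p);
}

/* Write a big-endian 16-bit tag at an arbitrary alignment. */
static void example_write_tag(u8 *p, u16 tag)
{
	put_unaligned_be16(tag, p);
}
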