crypto: aesni-intel - Ported implementation to x86-32

The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.

To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:

x86:                   i586       aes-ni    delta
ECB, 256 bit:     93.8 MB/s   123.3 MB/s   +31.4%
CBC, 256 bit:     84.8 MB/s   262.3 MB/s  +209.3%
LRW, 256 bit:    108.6 MB/s   222.1 MB/s  +104.5%
XTS, 256 bit:    105.0 MB/s   205.5 MB/s   +95.7%

Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:

x86-64:           old impl.    new impl.    delta
ECB, 256 bit:    121.1 MB/s   123.0 MB/s    +1.5%
CBC, 256 bit:    285.3 MB/s   290.8 MB/s    +1.9%
LRW, 256 bit:    263.7 MB/s   265.3 MB/s    +0.6%
XTS, 256 bit:    251.1 MB/s   255.3 MB/s    +1.7%

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Mathias Krause 2010-11-27 16:34:46 +08:00 committed by Herbert Xu
parent 21ea28abcf
commit 0d258efb6a
3 changed files with 191 additions and 40 deletions

View file

@ -20,6 +20,9 @@
* Wajdi Feghali (wajdi.k.feghali@intel.com)
* Copyright (c) 2010, Intel Corporation.
*
* Ported x86_64 version to x86:
* Author: Mathias Krause <minipli@googlemail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -95,12 +98,16 @@ enc: .octa 0x2
#define IN IN1
#define KEY %xmm2
#define IV %xmm3
#define BSWAP_MASK %xmm10
#define CTR %xmm11
#define INC %xmm12
#ifdef __x86_64__
#define AREG %rax
#define KEYP %rdi
#define OUTP %rsi
#define UKEYP OUTP
#define INP %rdx
#define LEN %rcx
#define IVP %r8
@ -109,6 +116,18 @@ enc: .octa 0x2
#define TKEYP T1
#define T2 %r11
#define TCTR_LOW T2
#else
#define AREG %eax
#define KEYP %edi
#define OUTP AREG
#define UKEYP OUTP
#define INP %edx
#define LEN %esi
#define IVP %ebp
#define KLEN %ebx
#define T1 %ecx
#define TKEYP T1
#endif
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@ -1247,10 +1266,11 @@ _key_expansion_256a:
shufps $0b10001100, %xmm0, %xmm4
pxor %xmm4, %xmm0
pxor %xmm1, %xmm0
movaps %xmm0, (%rcx)
add $0x10, %rcx
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
ret
.align 4
_key_expansion_192a:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@ -1268,12 +1288,13 @@ _key_expansion_192a:
movaps %xmm0, %xmm1
shufps $0b01000100, %xmm0, %xmm6
movaps %xmm6, (%rcx)
movaps %xmm6, (TKEYP)
shufps $0b01001110, %xmm2, %xmm1
movaps %xmm1, 16(%rcx)
add $0x20, %rcx
movaps %xmm1, 0x10(TKEYP)
add $0x20, TKEYP
ret
.align 4
_key_expansion_192b:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@ -1288,10 +1309,11 @@ _key_expansion_192b:
pxor %xmm3, %xmm2
pxor %xmm5, %xmm2
movaps %xmm0, (%rcx)
add $0x10, %rcx
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
ret
.align 4
_key_expansion_256b:
pshufd $0b10101010, %xmm1, %xmm1
shufps $0b00010000, %xmm2, %xmm4
@ -1299,8 +1321,8 @@ _key_expansion_256b:
shufps $0b10001100, %xmm2, %xmm4
pxor %xmm4, %xmm2
pxor %xmm1, %xmm2
movaps %xmm2, (%rcx)
add $0x10, %rcx
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
ret
/*
@ -1308,17 +1330,23 @@ _key_expansion_256b:
* unsigned int key_len)
*/
ENTRY(aesni_set_key)
movups (%rsi), %xmm0 # user key (first 16 bytes)
movaps %xmm0, (%rdi)
lea 0x10(%rdi), %rcx # key addr
movl %edx, 480(%rdi)
#ifndef __x86_64__
pushl KEYP
movl 8(%esp), KEYP # ctx
movl 12(%esp), UKEYP # in_key
movl 16(%esp), %edx # key_len
#endif
movups (UKEYP), %xmm0 # user key (first 16 bytes)
movaps %xmm0, (KEYP)
lea 0x10(KEYP), TKEYP # key addr
movl %edx, 480(KEYP)
pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
cmp $24, %dl
jb .Lenc_key128
je .Lenc_key192
movups 0x10(%rsi), %xmm2 # other user key
movaps %xmm2, (%rcx)
add $0x10, %rcx
movups 0x10(UKEYP), %xmm2 # other user key
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_256a
AESKEYGENASSIST 0x1 %xmm0 %xmm1
@ -1347,7 +1375,7 @@ ENTRY(aesni_set_key)
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
movq 0x10(%rsi), %xmm2 # other user key
movq 0x10(UKEYP), %xmm2 # other user key
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_192a
AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@ -1387,33 +1415,47 @@ ENTRY(aesni_set_key)
AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
sub $0x10, %rcx
movaps (%rdi), %xmm0
movaps (%rcx), %xmm1
movaps %xmm0, 240(%rcx)
movaps %xmm1, 240(%rdi)
add $0x10, %rdi
lea 240-16(%rcx), %rsi
sub $0x10, TKEYP
movaps (KEYP), %xmm0
movaps (TKEYP), %xmm1
movaps %xmm0, 240(TKEYP)
movaps %xmm1, 240(KEYP)
add $0x10, KEYP
lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
movaps (%rdi), %xmm0
movaps (KEYP), %xmm0
AESIMC %xmm0 %xmm1
movaps %xmm1, (%rsi)
add $0x10, %rdi
sub $0x10, %rsi
cmp %rcx, %rdi
movaps %xmm1, (UKEYP)
add $0x10, KEYP
sub $0x10, UKEYP
cmp TKEYP, KEYP
jb .Ldec_key_loop
xor %rax, %rax
xor AREG, AREG
#ifndef __x86_64__
popl KEYP
#endif
ret
/*
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_enc)
#ifndef __x86_64__
	/*
	 * x86-32: save the registers that back KEYP and KLEN, then pull the
	 * three arguments off the stack (return address + two pushes = 12).
	 */
	push KEYP
	push KLEN
	mov 12(%esp), KEYP		# ctx
	mov 16(%esp), OUTP		# dst
	mov 20(%esp), INP		# src
#endif
	movups (INP), STATE		# fetch the input block (may be unaligned)
	mov 480(KEYP), KLEN		# key length kept at offset 480 of ctx
	call _aesni_enc1
	movups STATE, (OUTP)		# write the result block
#ifndef __x86_64__
	/* restore in reverse push order */
	pop KLEN
	pop KEYP
#endif
	ret
/*
@ -1428,6 +1470,7 @@ ENTRY(aesni_enc)
* KEY
* TKEYP (T1)
*/
.align 4
_aesni_enc1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@ -1490,6 +1533,7 @@ _aesni_enc1:
* KEY
* TKEYP (T1)
*/
.align 4
_aesni_enc4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@ -1583,11 +1627,22 @@ _aesni_enc4:
* void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_dec)
#ifndef __x86_64__
	/*
	 * x86-32: save the registers that back KEYP and KLEN, then pull the
	 * three arguments off the stack (return address + two pushes = 12).
	 */
	push KEYP
	push KLEN
	mov 12(%esp), KEYP		# ctx
	mov 16(%esp), OUTP		# dst
	mov 20(%esp), INP		# src
#endif
	movups (INP), STATE		# fetch the input block (may be unaligned)
	mov 480(KEYP), KLEN		# key length kept at offset 480 of ctx
	add $240, KEYP			# step to offset 240 of ctx before the call
	call _aesni_dec1
	movups STATE, (OUTP)		# write the result block
#ifndef __x86_64__
	/* restore in reverse push order */
	pop KLEN
	pop KEYP
#endif
	ret
/*
@ -1602,6 +1657,7 @@ ENTRY(aesni_dec)
* KEY
* TKEYP (T1)
*/
.align 4
_aesni_dec1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@ -1664,6 +1720,7 @@ _aesni_dec1:
* KEY
* TKEYP (T1)
*/
.align 4
_aesni_dec4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@ -1758,6 +1815,15 @@ _aesni_dec4:
* size_t len)
*/
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
pushl LEN
pushl KEYP
pushl KLEN
movl 16(%esp), KEYP
movl 20(%esp), OUTP
movl 24(%esp), INP
movl 28(%esp), LEN
#endif
test LEN, LEN # check length
jz .Lecb_enc_ret
mov 480(KEYP), KLEN
@ -1794,6 +1860,11 @@ ENTRY(aesni_ecb_enc)
cmp $16, LEN
jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
popl KLEN
popl KEYP
popl LEN
#endif
ret
/*
@ -1801,6 +1872,15 @@ ENTRY(aesni_ecb_enc)
* size_t len);
*/
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
pushl LEN
pushl KEYP
pushl KLEN
movl 16(%esp), KEYP
movl 20(%esp), OUTP
movl 24(%esp), INP
movl 28(%esp), LEN
#endif
test LEN, LEN
jz .Lecb_dec_ret
mov 480(KEYP), KLEN
@ -1838,6 +1918,11 @@ ENTRY(aesni_ecb_dec)
cmp $16, LEN
jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
popl KLEN
popl KEYP
popl LEN
#endif
ret
/*
@ -1845,6 +1930,17 @@ ENTRY(aesni_ecb_dec)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
pushl IVP
pushl LEN
pushl KEYP
pushl KLEN
movl 20(%esp), KEYP
movl 24(%esp), OUTP
movl 28(%esp), INP
movl 32(%esp), LEN
movl 36(%esp), IVP
#endif
cmp $16, LEN
jb .Lcbc_enc_ret
mov 480(KEYP), KLEN
@ -1862,6 +1958,12 @@ ENTRY(aesni_cbc_enc)
jge .Lcbc_enc_loop
movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
popl KLEN
popl KEYP
popl LEN
popl IVP
#endif
ret
/*
@ -1869,6 +1971,17 @@ ENTRY(aesni_cbc_enc)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
pushl IVP
pushl LEN
pushl KEYP
pushl KLEN
movl 20(%esp), KEYP
movl 24(%esp), OUTP
movl 28(%esp), INP
movl 32(%esp), LEN
movl 36(%esp), IVP
#endif
cmp $16, LEN
jb .Lcbc_dec_just_ret
mov 480(KEYP), KLEN
@ -1882,16 +1995,30 @@ ENTRY(aesni_cbc_dec)
movaps IN1, STATE1
movups 0x10(INP), IN2
movaps IN2, STATE2
#ifdef __x86_64__
movups 0x20(INP), IN3
movaps IN3, STATE3
movups 0x30(INP), IN4
movaps IN4, STATE4
#else
movups 0x20(INP), IN1
movaps IN1, STATE3
movups 0x30(INP), IN2
movaps IN2, STATE4
#endif
call _aesni_dec4
pxor IV, STATE1
#ifdef __x86_64__
pxor IN1, STATE2
pxor IN2, STATE3
pxor IN3, STATE4
movaps IN4, IV
#else
/*
 * x86-32 has only IN1/IN2 available; at this point they hold the
 * ciphertext blocks loaded from 0x20(INP) and 0x30(INP).  Consume them
 * first, then reload the blocks at (INP) and 0x10(INP) with movups.
 *
 * Do not use "pxor mem, xmm" here: the SSE form requires a 16-byte
 * aligned memory operand and INP is only ever accessed with unaligned
 * loads elsewhere in this function, so an unaligned source buffer would
 * fault.
 *
 * NOTE(review): IN1/IN2 are overwritten below; confirm that no later
 * code in this function expects them to still hold blocks 3 and 4.
 */
pxor IN1, STATE4
movaps IN2, IV
movups (INP), IN1
pxor IN1, STATE2
movups 0x10(INP), IN2
pxor IN2, STATE3
#endif
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
@ -1919,8 +2046,15 @@ ENTRY(aesni_cbc_dec)
.Lcbc_dec_ret:
movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
popl KLEN
popl KEYP
popl LEN
popl IVP
#endif
ret
#ifdef __x86_64__
.align 16
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@ -1936,6 +2070,7 @@ ENTRY(aesni_cbc_dec)
* INC: == 1, in little endian
* BSWAP_MASK == endian swapping mask
*/
.align 4
_aesni_inc_init:
movaps .Lbswap_mask, BSWAP_MASK
movaps IV, CTR
@ -1960,6 +2095,7 @@ _aesni_inc_init:
* CTR: == output IV, in little endian
* TCTR_LOW: == lower qword of CTR
*/
.align 4
_aesni_inc:
paddq INC, CTR
add $1, TCTR_LOW
@ -2031,3 +2167,4 @@ ENTRY(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
ret
#endif

View file

@ -94,8 +94,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
#ifdef CONFIG_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
#endif
/* asmlinkage void aesni_gcm_enc()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
@ -410,6 +412,7 @@ static struct crypto_alg blk_cbc_alg = {
},
};
#ifdef CONFIG_X86_64
static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
struct blkcipher_walk *walk)
{
@ -475,6 +478,7 @@ static struct crypto_alg blk_ctr_alg = {
},
},
};
#endif
static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int key_len)
@ -622,6 +626,7 @@ static struct crypto_alg ablk_cbc_alg = {
},
};
#ifdef CONFIG_X86_64
static int ablk_ctr_init(struct crypto_tfm *tfm)
{
struct cryptd_ablkcipher *cryptd_tfm;
@ -698,6 +703,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
},
};
#endif
#endif
#ifdef HAS_LRW
static int ablk_lrw_init(struct crypto_tfm *tfm)
@ -1249,18 +1255,20 @@ static int __init aesni_init(void)
goto blk_ecb_err;
if ((err = crypto_register_alg(&blk_cbc_alg)))
goto blk_cbc_err;
if ((err = crypto_register_alg(&blk_ctr_alg)))
goto blk_ctr_err;
if ((err = crypto_register_alg(&ablk_ecb_alg)))
goto ablk_ecb_err;
if ((err = crypto_register_alg(&ablk_cbc_alg)))
goto ablk_cbc_err;
#ifdef CONFIG_X86_64
if ((err = crypto_register_alg(&blk_ctr_alg)))
goto blk_ctr_err;
if ((err = crypto_register_alg(&ablk_ctr_alg)))
goto ablk_ctr_err;
#ifdef HAS_CTR
if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
goto ablk_rfc3686_ctr_err;
#endif
#endif
#ifdef HAS_LRW
if ((err = crypto_register_alg(&ablk_lrw_alg)))
goto ablk_lrw_err;
@ -1296,18 +1304,20 @@ ablk_pcbc_err:
crypto_unregister_alg(&ablk_lrw_alg);
ablk_lrw_err:
#endif
#ifdef CONFIG_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
ablk_rfc3686_ctr_err:
#endif
crypto_unregister_alg(&ablk_ctr_alg);
ablk_ctr_err:
crypto_unregister_alg(&blk_ctr_alg);
blk_ctr_err:
#endif
crypto_unregister_alg(&ablk_cbc_alg);
ablk_cbc_err:
crypto_unregister_alg(&ablk_ecb_alg);
ablk_ecb_err:
crypto_unregister_alg(&blk_ctr_alg);
blk_ctr_err:
crypto_unregister_alg(&blk_cbc_alg);
blk_cbc_err:
crypto_unregister_alg(&blk_ecb_alg);
@ -1332,13 +1342,15 @@ static void __exit aesni_exit(void)
#ifdef HAS_LRW
crypto_unregister_alg(&ablk_lrw_alg);
#endif
#ifdef CONFIG_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
#endif
crypto_unregister_alg(&ablk_ctr_alg);
crypto_unregister_alg(&blk_ctr_alg);
#endif
crypto_unregister_alg(&ablk_cbc_alg);
crypto_unregister_alg(&ablk_ecb_alg);
crypto_unregister_alg(&blk_ctr_alg);
crypto_unregister_alg(&blk_cbc_alg);
crypto_unregister_alg(&blk_ecb_alg);
crypto_unregister_alg(&__aesni_alg);

View file

@ -539,8 +539,9 @@ config CRYPTO_AES_X86_64
config CRYPTO_AES_NI_INTEL
tristate "AES cipher algorithms (AES-NI)"
depends on (X86 || UML_X86) && 64BIT
select CRYPTO_AES_X86_64
depends on (X86 || UML_X86)
select CRYPTO_AES_X86_64 if 64BIT
select CRYPTO_AES_586 if !64BIT
select CRYPTO_CRYPTD
select CRYPTO_ALGAPI
select CRYPTO_FPU
@ -563,9 +564,10 @@ config CRYPTO_AES_NI_INTEL
See <http://csrc.nist.gov/encryption/aes/> for more information.
In addition to AES cipher algorithm support, the
acceleration for some popular block cipher mode is supported
too, including ECB, CBC, CTR, LRW, PCBC, XTS.
In addition to AES cipher algorithm support, acceleration for
some popular block cipher modes is supported too, including
ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional
acceleration for CTR.
config CRYPTO_ANUBIS
tristate "Anubis cipher algorithm"