android_kernel_samsung_msm8976/arch/x86/crypto/cast5_avx_glue.c
Johannes Goetzfried 4d6d6a2c85 crypto: cast5 - add x86_64/avx assembler implementation
This patch adds a x86_64/avx assembler implementation of the Cast5 block
cipher. The implementation processes sixteen blocks in parallel (four 4 block
chunk AVX operations). The table-lookups are done in general-purpose registers.
For small blocksizes the functions from the generic module are called. A good
performance increase is provided for blocksizes greater or equal to 128B.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

cast5-avx-x86_64 vs. cast5-generic
64bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.02x   1.01x
64B     1.00x   1.00x   0.98x   1.00x   1.01x   1.02x
256B    2.03x   2.01x   0.95x   2.11x   2.12x   2.13x
1024B   2.30x   2.24x   0.95x   2.29x   2.35x   2.35x
8192B   2.31x   2.27x   0.95x   2.31x   2.39x   2.39x

128bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.01x   1.01x
64B     1.00x   1.00x   0.98x   1.01x   1.02x   1.01x
256B    2.17x   2.13x   0.96x   2.19x   2.19x   2.19x
1024B   2.29x   2.32x   0.95x   2.34x   2.37x   2.38x
8192B   2.35x   2.32x   0.95x   2.35x   2.39x   2.39x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2012-08-01 17:47:30 +08:00

531 lines
13 KiB
C

/*
* Glue Code for the AVX assembler implemention of the Cast5 Cipher
*
* Copyright (C) 2012 Johannes Goetzfried
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
*/
#include <linux/module.h>
#include <linux/hardirq.h>
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/cast5.h>
#include <crypto/cryptd.h>
#include <crypto/ctr.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
#define CAST5_PARALLEL_BLOCKS 16
asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast5_enc_blk_16way(ctx, dst, src, false);
}
static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast5_enc_blk_16way(ctx, dst, src, true);
}
static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
const u8 *src)
{
cast5_dec_blk_16way(ctx, dst, src);
}
static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{
return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS,
NULL, fpu_enabled, nbytes);
}
static inline void cast5_fpu_end(bool fpu_enabled)
{
return glue_fpu_end(fpu_enabled);
}
static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
bool enc)
{
bool fpu_enabled = false;
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes;
int err;
err = blkcipher_walk_virt(desc, walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
while ((nbytes = walk->nbytes)) {
u8 *wsrc = walk->src.virt.addr;
u8 *wdst = walk->dst.virt.addr;
fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
if (enc)
cast5_enc_blk_xway(ctx, wdst, wsrc);
else
cast5_dec_blk_xway(ctx, wdst, wsrc);
wsrc += bsize * CAST5_PARALLEL_BLOCKS;
wdst += bsize * CAST5_PARALLEL_BLOCKS;
nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
} while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
if (nbytes < bsize)
goto done;
}
/* Handle leftovers */
do {
if (enc)
__cast5_encrypt(ctx, wdst, wsrc);
else
__cast5_decrypt(ctx, wdst, wsrc);
wsrc += bsize;
wdst += bsize;
nbytes -= bsize;
} while (nbytes >= bsize);
done:
err = blkcipher_walk_done(desc, walk, nbytes);
}
cast5_fpu_end(fpu_enabled);
return err;
}
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct blkcipher_walk walk;
blkcipher_walk_init(&walk, dst, src, nbytes);
return ecb_crypt(desc, &walk, true);
}
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct blkcipher_walk walk;
blkcipher_walk_init(&walk, dst, src, nbytes);
return ecb_crypt(desc, &walk, false);
}
static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
{
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
u64 *iv = (u64 *)walk->iv;
do {
*dst = *src ^ *iv;
__cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst);
iv = dst;
src += 1;
dst += 1;
nbytes -= bsize;
} while (nbytes >= bsize);
*(u64 *)walk->iv ^= *iv;
return nbytes;
}
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
while ((nbytes = walk.nbytes)) {
nbytes = __cbc_encrypt(desc, &walk);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
return err;
}
static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
{
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
u64 last_iv;
int i;
/* Start of the last block. */
src += nbytes / bsize - 1;
dst += nbytes / bsize - 1;
last_iv = *src;
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1);
src -= CAST5_PARALLEL_BLOCKS - 1;
dst -= CAST5_PARALLEL_BLOCKS - 1;
for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
ivs[i] = src[i];
cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
*(dst + (i + 1)) ^= *(ivs + i);
nbytes -= bsize;
if (nbytes < bsize)
goto done;
*dst ^= *(src - 1);
src -= 1;
dst -= 1;
} while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
if (nbytes < bsize)
goto done;
}
/* Handle leftovers */
for (;;) {
__cast5_decrypt(ctx, (u8 *)dst, (u8 *)src);
nbytes -= bsize;
if (nbytes < bsize)
break;
*dst ^= *(src - 1);
src -= 1;
dst -= 1;
}
done:
*dst ^= *(u64 *)walk->iv;
*(u64 *)walk->iv = last_iv;
return nbytes;
}
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
while ((nbytes = walk.nbytes)) {
fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
nbytes = __cbc_decrypt(desc, &walk);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
cast5_fpu_end(fpu_enabled);
return err;
}
static void ctr_crypt_final(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
{
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
u8 *ctrblk = walk->iv;
u8 keystream[CAST5_BLOCK_SIZE];
u8 *src = walk->src.virt.addr;
u8 *dst = walk->dst.virt.addr;
unsigned int nbytes = walk->nbytes;
__cast5_encrypt(ctx, keystream, ctrblk);
crypto_xor(keystream, src, nbytes);
memcpy(dst, keystream, nbytes);
crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
}
static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
struct blkcipher_walk *walk)
{
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
__be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
int i;
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
/* create ctrblks for parallel encrypt */
for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];
ctrblocks[i] = cpu_to_be64(ctrblk++);
}
cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
(u8 *)ctrblocks);
src += CAST5_PARALLEL_BLOCKS;
dst += CAST5_PARALLEL_BLOCKS;
nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
} while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
if (nbytes < bsize)
goto done;
}
/* Handle leftovers */
do {
if (dst != src)
*dst = *src;
ctrblocks[0] = cpu_to_be64(ctrblk++);
__cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
*dst ^= ctrblocks[0];
src += 1;
dst += 1;
nbytes -= bsize;
} while (nbytes >= bsize);
done:
*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
return nbytes;
}
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
bool fpu_enabled = false;
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, CAST5_BLOCK_SIZE);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
nbytes = __ctr_crypt(desc, &walk);
err = blkcipher_walk_done(desc, &walk, nbytes);
}
cast5_fpu_end(fpu_enabled);
if (walk.nbytes) {
ctr_crypt_final(desc, &walk);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static struct crypto_alg cast5_algs[6] = { {
.cra_name = "__ecb-cast5-avx",
.cra_driver_name = "__driver-ecb-cast5-avx",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = CAST5_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct cast5_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = CAST5_MIN_KEY_SIZE,
.max_keysize = CAST5_MAX_KEY_SIZE,
.setkey = cast5_setkey,
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
},
},
}, {
.cra_name = "__cbc-cast5-avx",
.cra_driver_name = "__driver-cbc-cast5-avx",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = CAST5_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct cast5_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = CAST5_MIN_KEY_SIZE,
.max_keysize = CAST5_MAX_KEY_SIZE,
.setkey = cast5_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
},
},
}, {
.cra_name = "__ctr-cast5-avx",
.cra_driver_name = "__driver-ctr-cast5-avx",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct cast5_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.min_keysize = CAST5_MIN_KEY_SIZE,
.max_keysize = CAST5_MAX_KEY_SIZE,
.ivsize = CAST5_BLOCK_SIZE,
.setkey = cast5_setkey,
.encrypt = ctr_crypt,
.decrypt = ctr_crypt,
},
},
}, {
.cra_name = "ecb(cast5)",
.cra_driver_name = "ecb-cast5-avx",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = CAST5_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = CAST5_MIN_KEY_SIZE,
.max_keysize = CAST5_MAX_KEY_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "cbc(cast5)",
.cra_driver_name = "cbc-cast5-avx",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = CAST5_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = CAST5_MIN_KEY_SIZE,
.max_keysize = CAST5_MAX_KEY_SIZE,
.ivsize = CAST5_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = __ablk_encrypt,
.decrypt = ablk_decrypt,
},
},
}, {
.cra_name = "ctr(cast5)",
.cra_driver_name = "ctr-cast5-avx",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 0,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_u = {
.ablkcipher = {
.min_keysize = CAST5_MIN_KEY_SIZE,
.max_keysize = CAST5_MAX_KEY_SIZE,
.ivsize = CAST5_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_encrypt,
.geniv = "chainiv",
},
},
} };
static int __init cast5_init(void)
{
u64 xcr0;
if (!cpu_has_avx || !cpu_has_osxsave) {
pr_info("AVX instructions are not detected.\n");
return -ENODEV;
}
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
pr_info("AVX detected but unusable.\n");
return -ENODEV;
}
return crypto_register_algs(cast5_algs, ARRAY_SIZE(cast5_algs));
}
static void __exit cast5_exit(void)
{
crypto_unregister_algs(cast5_algs, ARRAY_SIZE(cast5_algs));
}
module_init(cast5_init);
module_exit(cast5_exit);
MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
MODULE_ALIAS("cast5");