[PATCH 3/4] cast5: add three rounds parallel handling to generic C implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Mar 31 17:59:39 CEST 2019
* cipher/cast5.c (do_encrypt_block_3, do_decrypt_block_3): New.
(_gcry_cast5_ctr_enc, _gcry_cast5_cbc_dec, _gcry_cast5_cfb_dec): Use
new three block functions.
--
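The generic C path previously pushed one block at a time through the sixteen
Feistel rounds, so every round stalled on the previous round's result. The new
do_encrypt_block_3/do_decrypt_block_3 run three independent blocks through each
round back to back, letting the CPU overlap the S-box loads and rotates of one
block with the dependency chain of another. A minimal self-contained sketch of
that interleaving pattern (F and toy_encrypt_3 below are illustrative
stand-ins, not code from this patch):

  /* Illustration only: three independent Feistel chains interleaved
     per round.  F() is a placeholder mixing step, not CAST5's F1/F2/F3. */
  #include <stdint.h>

  static uint32_t
  F (uint32_t r, uint32_t km, unsigned kr)
  {
    uint32_t x = km + r;
    return (x << kr) | (x >> ((32 - kr) & 31));   /* rotate left by kr */
  }

  static void
  toy_encrypt_3 (uint32_t l[3], uint32_t r[3],
                 const uint32_t *Km, const unsigned char *Kr)
  {
    uint32_t t;
    int i, b;

    for (i = 0; i < 16; i++)
      for (b = 0; b < 3; b++)   /* the three blocks have no data
                                   dependencies on each other */
        {
          t = l[b];
          l[b] = r[b];
          r[b] = t ^ F (r[b], Km[i], Kr[i] & 31);
        }
  }

In the patch itself the rounds are fully unrolled and the rotation amounts are
fetched four at a time with buf_get_le32/buf_get_be32, but the dependency
structure is the same as in the sketch.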
Benchmark on aarch64 (cortex-a53, 816 MHz):
Before:
 CAST5          |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |     35.24 ns/B     27.07 MiB/s     28.75 c/B
        CFB dec |     34.62 ns/B     27.54 MiB/s     28.25 c/B
        CTR enc |     35.39 ns/B     26.95 MiB/s     28.88 c/B
After (~40%-50% faster):
 CAST5          |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |     23.05 ns/B     41.38 MiB/s     18.81 c/B
        CFB dec |     24.49 ns/B     38.94 MiB/s     19.98 c/B
        CTR enc |     24.57 ns/B     38.82 MiB/s     20.05 c/B
Benchmark on i386 (haswell, 4000 MHz):
Before:
 CAST5          |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |      6.92 ns/B     137.7 MiB/s     27.69 c/B
        CFB dec |      6.83 ns/B     139.7 MiB/s     27.32 c/B
        CTR enc |      7.01 ns/B     136.1 MiB/s     28.03 c/B
After (~70% faster):
 CAST5          |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |      3.97 ns/B     240.1 MiB/s     15.89 c/B
        CFB dec |      3.96 ns/B     241.0 MiB/s     15.83 c/B
        CTR enc |      4.01 ns/B     237.8 MiB/s     16.04 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/cast5.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 237 insertions(+), 8 deletions(-)
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 65485ba23..7219e3eaf 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -534,6 +534,97 @@ encrypt_block (void *context , byte *outbuf, const byte *inbuf)
}
+static void
+do_encrypt_block_3( CAST5_context *c, byte *outbuf, const byte *inbuf )
+{
+ u32 l0, r0, t0, l1, r1, t1, l2, r2, t2;
+ u32 I; /* used by the Fx macros */
+ u32 *Km;
+ u32 Kr;
+
+ Km = c->Km;
+ Kr = buf_get_le32(c->Kr + 0);
+
+ l0 = buf_get_be32(inbuf + 0);
+ r0 = buf_get_be32(inbuf + 4);
+ l1 = buf_get_be32(inbuf + 8);
+ r1 = buf_get_be32(inbuf + 12);
+ l2 = buf_get_be32(inbuf + 16);
+ r2 = buf_get_be32(inbuf + 20);
+
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 0], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 0], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 0], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 1], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 1], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 1], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 2], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 2], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 2], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 3], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 3], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 3], Kr & 31);
+ Kr = buf_get_le32(c->Kr + 4);
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 4], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 4], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 4], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 5], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 5], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 5], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 6], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 6], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 6], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 7], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 7], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 7], Kr & 31);
+ Kr = buf_get_le32(c->Kr + 8);
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 8], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 8], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 8], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 9], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 9], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 9], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[10], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[10], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[10], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[11], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[11], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[11], Kr & 31);
+ Kr = buf_get_le32(c->Kr + 12);
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[12], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[12], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[12], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[13], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[13], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[13], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[14], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[14], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[14], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[15], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[15], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[15], Kr & 31);
+
+ buf_put_be32(outbuf + 0, r0);
+ buf_put_be32(outbuf + 4, l0);
+ buf_put_be32(outbuf + 8, r1);
+ buf_put_be32(outbuf + 12, l1);
+ buf_put_be32(outbuf + 16, r2);
+ buf_put_be32(outbuf + 20, l2);
+}
+
+
static void
do_decrypt_block (CAST5_context *c, byte *outbuf, const byte *inbuf )
{
@@ -577,6 +668,97 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
return /*burn_stack*/ (20+4*sizeof(void*));
}
+
+static void
+do_decrypt_block_3 (CAST5_context *c, byte *outbuf, const byte *inbuf )
+{
+ u32 l0, r0, t0, l1, r1, t1, l2, r2, t2;
+ u32 I;
+ u32 *Km;
+ u32 Kr;
+
+ Km = c->Km;
+ Kr = buf_get_be32(c->Kr + 12);
+
+ l0 = buf_get_be32(inbuf + 0);
+ r0 = buf_get_be32(inbuf + 4);
+ l1 = buf_get_be32(inbuf + 8);
+ r1 = buf_get_be32(inbuf + 12);
+ l2 = buf_get_be32(inbuf + 16);
+ r2 = buf_get_be32(inbuf + 20);
+
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[15], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[15], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[15], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[14], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[14], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[14], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[13], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[13], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[13], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[12], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[12], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[12], Kr & 31);
+ Kr = buf_get_be32(c->Kr + 8);
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[11], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[11], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[11], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[10], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[10], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[10], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 9], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 9], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 9], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 8], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 8], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 8], Kr & 31);
+ Kr = buf_get_be32(c->Kr + 4);
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 7], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 7], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 7], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 6], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 6], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 6], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 5], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 5], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 5], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 4], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 4], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 4], Kr & 31);
+ Kr = buf_get_be32(c->Kr + 0);
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 3], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 3], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 3], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 2], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 2], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 2], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 1], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 1], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 1], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 0], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 0], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 0], Kr & 31);
+
+ buf_put_be32(outbuf + 0, r0);
+ buf_put_be32(outbuf + 4, l0);
+ buf_put_be32(outbuf + 8, r1);
+ buf_put_be32(outbuf + 12, l1);
+ buf_put_be32(outbuf + 16, r2);
+ buf_put_be32(outbuf + 20, l2);
+}
+
#endif /*!USE_ARM_ASM*/
@@ -590,9 +772,8 @@ _gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
CAST5_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char tmpbuf[CAST5_BLOCKSIZE];
- int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE;
-
+ unsigned char tmpbuf[CAST5_BLOCKSIZE * 3];
+ int burn_stack_depth = (20 + 4 * sizeof(void*)) + 4 * CAST5_BLOCKSIZE;
#ifdef USE_AMD64_ASM
{
@@ -610,7 +791,6 @@ _gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
}
/* Use generic code to handle smaller chunks... */
- /* TODO: use caching instead? */
}
#elif defined(USE_ARM_ASM)
{
@@ -625,10 +805,28 @@ _gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
}
/* Use generic code to handle smaller chunks... */
- /* TODO: use caching instead? */
}
#endif
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3)
+ {
+ /* Prepare the counter blocks. */
+ cipher_block_cpy (tmpbuf + 0, ctr, CAST5_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 8, ctr, CAST5_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 16, ctr, CAST5_BLOCKSIZE);
+ cipher_block_add (tmpbuf + 8, 1, CAST5_BLOCKSIZE);
+ cipher_block_add (tmpbuf + 16, 2, CAST5_BLOCKSIZE);
+ cipher_block_add (ctr, 3, CAST5_BLOCKSIZE);
+ /* Encrypt the counter. */
+ do_encrypt_block_3(ctx, tmpbuf, tmpbuf);
+ /* XOR the input with the encrypted counter and store in output. */
+ buf_xor(outbuf, tmpbuf, inbuf, CAST5_BLOCKSIZE * 3);
+ outbuf += CAST5_BLOCKSIZE * 3;
+ inbuf += CAST5_BLOCKSIZE * 3;
+ }
+#endif
+
for ( ;nblocks; nblocks-- )
{
/* Encrypt the counter. */
@@ -655,8 +853,8 @@ _gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
CAST5_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char savebuf[CAST5_BLOCKSIZE];
- int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE;
+ unsigned char savebuf[CAST5_BLOCKSIZE * 3];
+ int burn_stack_depth = (20 + 4 * sizeof(void*)) + 4 * CAST5_BLOCKSIZE;
#ifdef USE_AMD64_ASM
{
@@ -691,6 +889,22 @@ _gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
}
#endif
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3)
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ do_decrypt_block_3 (ctx, savebuf, inbuf);
+
+ cipher_block_xor_1 (savebuf + 0, iv, CAST5_BLOCKSIZE);
+ cipher_block_xor_1 (savebuf + 8, inbuf, CAST5_BLOCKSIZE * 2);
+ cipher_block_cpy (iv, inbuf + 16, CAST5_BLOCKSIZE);
+ buf_cpy (outbuf, savebuf, CAST5_BLOCKSIZE * 3);
+ inbuf += CAST5_BLOCKSIZE * 3;
+ outbuf += CAST5_BLOCKSIZE * 3;
+ }
+#endif
+
for ( ;nblocks; nblocks-- )
{
/* INBUF is needed later and it may be identical to OUTBUF, so store
@@ -715,7 +929,8 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
CAST5_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE;
+ unsigned char tmpbuf[CAST5_BLOCKSIZE * 3];
+ int burn_stack_depth = (20 + 4 * sizeof(void*)) + 4 * CAST5_BLOCKSIZE;
#ifdef USE_AMD64_ASM
{
@@ -750,6 +965,19 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
}
#endif
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3 )
+ {
+ cipher_block_cpy (tmpbuf + 0, iv, CAST5_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 8, inbuf + 0, CAST5_BLOCKSIZE * 2);
+ cipher_block_cpy (iv, inbuf + 16, CAST5_BLOCKSIZE);
+ do_encrypt_block_3 (ctx, tmpbuf, tmpbuf);
+ buf_xor (outbuf, inbuf, tmpbuf, CAST5_BLOCKSIZE * 3);
+ outbuf += CAST5_BLOCKSIZE * 3;
+ inbuf += CAST5_BLOCKSIZE * 3;
+ }
+#endif
+
for ( ;nblocks; nblocks-- )
{
do_encrypt_block(ctx, iv, iv);
@@ -758,6 +986,7 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
inbuf += CAST5_BLOCKSIZE;
}
+ wipememory(tmpbuf, sizeof(tmpbuf));
_gcry_burn_stack(burn_stack_depth);
}