[PATCH 07/10] Add parallelized AES-NI ECB encryption
Jussi Kivilinna
jussi.kivilinna at mbnet.fi
Fri Nov 23 18:22:25 CET 2012
* cipher/cipher-internal.h (struct gcry_cipher_handle): Add
bulk.ecb_enc.
* cipher/cipher.c (gcry_cipher_open) [USE_AES]: Set bulk.ecb_enc
to _gcry_aes_ecb_enc.
(do_ecb_encrypt): Redirect call into bulk.ecb_enc if non-null.
* src/cipher.h (_gcry_aes_ecb_enc): Add new function prototype.
* cipher/rijndael.c (_gcry_aes_ecb_enc): Add new function.
[USE_AESNI] (do_aesni_enc_vec4): Add new function.
--
Parallelized ECB encryption is ~2.0x faster on Intel Sandy-Bridge (x86-64).
Before:
Running each test 1000 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
AES 690ms 350ms 2130ms 470ms 1890ms 670ms 2220ms 2240ms 490ms 490ms
AES192 900ms 440ms 2460ms 560ms 2210ms 840ms 2550ms 2560ms 570ms 570ms
AES256 1040ms 520ms 2800ms 640ms 2550ms 970ms 2840ms 2850ms 660ms 650ms
After:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
AES 340ms 360ms 2130ms 470ms 1870ms 690ms 2200ms 2250ms 500ms 490ms
AES192 430ms 440ms 2460ms 550ms 2210ms 820ms 2540ms 2560ms 570ms 570ms
AES256 500ms 520ms 2790ms 640ms 2540ms 960ms 2830ms 2840ms 650ms 650ms
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
---
cipher/cipher-internal.h | 3 +
cipher/cipher.c | 8 ++
cipher/rijndael.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++
src/cipher.h | 2 +
4 files changed, 187 insertions(+)
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index dcce708..edd8e17 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -89,6 +89,9 @@ struct gcry_cipher_handle
void (*ctr_enc)(void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks);
+ void (*ecb_enc)(void *context, void *outbuf_arg,
+ const void *inbuf_arg,
+ unsigned int nblocks);
void (*ecb_dec)(void *context, void *outbuf_arg,
const void *inbuf_arg,
unsigned int nblocks);
diff --git a/cipher/cipher.c b/cipher/cipher.c
index b0f9773..edc84f7 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -716,6 +716,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
h->bulk.cbc_enc = _gcry_aes_cbc_enc;
h->bulk.cbc_dec = _gcry_aes_cbc_dec;
h->bulk.ctr_enc = _gcry_aes_ctr_enc;
+ h->bulk.ecb_enc = _gcry_aes_ecb_enc;
h->bulk.ecb_dec = _gcry_aes_ecb_dec;
break;
#endif /*USE_AES*/
@@ -859,6 +860,13 @@ do_ecb_encrypt (gcry_cipher_hd_t c,
nblocks = inbuflen / c->cipher->blocksize;
+ if (nblocks && c->bulk.ecb_enc)
+ {
+ c->bulk.ecb_enc (&c->context.c, outbuf, inbuf, nblocks);
+
+ return 0;
+ }
+
for (n=0; n < nblocks; n++ )
{
c->cipher->encrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf);
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 421b159..5110c72 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -822,6 +822,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
}
+/* Encrypt four blocks in parallel using the Intel AES-NI instructions.
+ * Blocks are input and output through SSE registers xmm1 to xmm4; the
+ * current round key is staged in xmm0.  The AESENC/AESENCLAST opcodes
+ * are emitted as raw .byte sequences so the code assembles even with
+ * binutils versions that lack AES-NI support. */
+static void
+do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
+#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
+#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
+#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
+#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
+#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
+#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
+#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                /* AES-128 (10 rounds) stops here; AES-192/256 continue. */
+                "cmp $10, %[rounds]\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                /* AES-192 (12 rounds) stops here; AES-256 continues. */
+                "cmp $12, %[rounds]\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                /* Final round uses AESENCLAST with the last round key.  */
+                ".Lenclast%=:\n\t"
+                aesenclast_xmm0_xmm1
+                aesenclast_xmm0_xmm2
+                aesenclast_xmm0_xmm3
+                aesenclast_xmm0_xmm4
+                : /* no output */
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+#undef aesenc_xmm0_xmm1
+#undef aesenc_xmm0_xmm2
+#undef aesenc_xmm0_xmm3
+#undef aesenc_xmm0_xmm4
+#undef aesenclast_xmm0_xmm1
+#undef aesenclast_xmm0_xmm2
+#undef aesenclast_xmm0_xmm3
+#undef aesenclast_xmm0_xmm4
+}
+
+
/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input
* and output through SSE registers xmm1 to xmm4. */
static void
@@ -1476,6 +1585,71 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
}
+/* Bulk encryption of complete blocks in ECB mode.  This function is only
+ * intended for the bulk encryption feature of cipher.c.
+ *
+ * CONTEXT points at an initialized RIJNDAEL_context; NBLOCKS full
+ * 16-byte blocks are read from INBUF_ARG and written to OUTBUF_ARG.
+ * On AES-NI capable CPUs blocks are processed four at a time; otherwise
+ * the generic C implementation is used one block at a time. */
+void
+_gcry_aes_ecb_enc (void *context, void *outbuf_arg,
+                   const void *inbuf_arg, unsigned int nblocks)
+{
+  RIJNDAEL_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+
+  if (0)
+    ;
+#ifdef USE_AESNI
+  else if (ctx->use_aesni)
+    {
+      aesni_prepare ();
+
+      /* Main loop: encrypt groups of four blocks in parallel.  The
+       * blocks are handed to do_aesni_enc_vec4 in xmm1..xmm4, so the
+       * load/store asm below must stay adjacent to that call.  Unaligned
+       * loads/stores (movdqu) are used, so no buffer alignment is
+       * required. */
+      for ( ;nblocks > 3 ; nblocks -= 4 )
+        {
+          asm volatile
+            ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
+             "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+             "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+             "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+             : /* No output */
+             : [inbuf] "r" (inbuf)
+             : "memory");
+
+          do_aesni_enc_vec4 (ctx);
+
+          asm volatile
+            ("movdqu %%xmm1, 0*16(%[outbuf])\n\t" /* store output blocks */
+             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+             : /* No output */
+             : [outbuf] "r" (outbuf)
+             : "memory");
+
+          outbuf += 4*BLOCKSIZE;
+          inbuf += 4*BLOCKSIZE;
+        }
+
+      /* Tail: handle the remaining 0..3 blocks one at a time. */
+      for ( ;nblocks; nblocks-- )
+        {
+          do_aesni_enc_aligned (ctx, outbuf, inbuf);
+
+          inbuf += BLOCKSIZE;
+          outbuf += BLOCKSIZE;
+        }
+
+      /* Wipe the SSE registers that held key material and plaintext
+       * (presumably xmm0/xmm1 plus xmm2..xmm5 — see the aesni_cleanup
+       * macro definitions). */
+      aesni_cleanup ();
+      aesni_cleanup_2_5 ();
+    }
+#endif
+  else
+    /* Generic fallback: one block at a time through the C implementation. */
+    for ( ;nblocks; nblocks-- )
+      {
+        rijndael_encrypt(context, outbuf, inbuf);
+        inbuf += BLOCKSIZE;
+        outbuf += BLOCKSIZE;
+      }
+}
+
+
/* Decrypt one block. A and B need to be aligned on a 4 byte boundary
and the decryption must have been prepared. A and B may be the
diff --git a/src/cipher.h b/src/cipher.h
index 6b34e90..66367c1 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -94,6 +94,8 @@ void _gcry_aes_cbc_dec (void *context, unsigned char *iv,
void _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks);
+void _gcry_aes_ecb_enc (void *context, void *outbuf_arg,
+ const void *inbuf_arg, unsigned int nblocks);
void _gcry_aes_ecb_dec (void *context, void *outbuf_arg,
const void *inbuf_arg, unsigned int nblocks);
More information about the Gcrypt-devel
mailing list