[PATCH 02/10] Improve parallelizability of CBC decryption for AES-NI
Jussi Kivilinna
jussi.kivilinna at mbnet.fi
Fri Nov 23 18:21:59 CET 2012
* cipher/rijndael.c (_gcry_aes_cbc_dec) [USE_AESNI]: Add AES-NI
specific CBC mode loop with temporary block and IV stored in free SSE
registers.
--
Benchmark results on Intel Core i5-2450M (x86-64) show ~2.5x improvement in CBC decryption speed:
Before:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
          ECB/Stream          CBC             CFB             OFB             CTR
        --------------- --------------- --------------- --------------- ---------------
AES       690ms   780ms  2940ms  2110ms  1880ms   670ms  2250ms  2250ms   490ms   500ms
AES192    890ms   930ms  3260ms  2390ms  2220ms   820ms  2580ms  2590ms   560ms   570ms
AES256   1040ms  1070ms  3590ms  2640ms  2540ms   970ms  2880ms  2890ms   650ms   650ms
After:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
          ECB/Stream          CBC             CFB             OFB             CTR
        --------------- --------------- --------------- --------------- ---------------
AES       670ms   770ms  2920ms   720ms  1900ms   660ms  2260ms  2250ms   480ms   500ms
AES192    860ms   930ms  3250ms   870ms  2210ms   830ms  2580ms  2580ms   570ms   570ms
AES256   1020ms  1080ms  3580ms  1030ms  2550ms   970ms  2880ms  2870ms   660ms   660ms
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
---
cipher/rijndael.c | 97 +++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 75 insertions(+), 22 deletions(-)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index d081b42..104f869 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1582,33 +1582,86 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
int i;
unsigned char savebuf[BLOCKSIZE];
- aesni_prepare ();
- for ( ;nblocks; nblocks-- )
+ if (0)
+ ;
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
{
- /* We need to save INBUF away because it may be identical to
- OUTBUF. */
- memcpy (savebuf, inbuf, BLOCKSIZE);
+ aesni_prepare ();
- if (0)
- ;
+ if (!ctx->decryption_prepared )
+ {
+ prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ /* As we avoid memcpy to/from stack by using xmm2 and xmm3 for temporary
+ storage, out-of-order CPUs see parallelism even over loop iterations
+ and see 2.5x to 2.9x speed up on Intel Sandy-Bridge. Further
+ improvements are possible with do_aesni_cbc_dec_4() when implemented.
+ */
+ asm volatile
+ ("movdqu %[iv], %%xmm3\n\t" /* use xmm3 as fast IV storage */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile
+ ("movdqu %[inbuf], %%xmm2\n\t" /* use xmm2 as savebuf */
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory");
+
+ /* uses only xmm0 and xmm1 */
+ do_aesni_dec_aligned (ctx, outbuf, inbuf);
+
+ asm volatile
+ ("movdqu %[outbuf], %%xmm0\n\t"
+ "pxor %%xmm3, %%xmm0\n\t" /* xor IV with output */
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ "movdqu %%xmm2, %%xmm3\n\t" /* store savebuf as new IV */
+ : /* No output */
+ : [outbuf] "m" (*outbuf)
+ : "memory");
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile
+ ("movdqu %%xmm3, %[iv]\n\t" /* store IV */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ aesni_cleanup ();
+ aesni_cleanup_2_4 ();
+ }
+#endif /*USE_AESNI*/
+ else
+ for ( ;nblocks; nblocks-- )
+ {
+ /* We need to save INBUF away because it may be identical to
+ OUTBUF. */
+ memcpy (savebuf, inbuf, BLOCKSIZE);
+
+ if (0)
+ ;
#ifdef USE_PADLOCK
- else if (ctx->use_padlock)
- do_padlock (ctx, 1, outbuf, inbuf);
+ else if (ctx->use_padlock)
+ do_padlock (ctx, 1, outbuf, inbuf);
#endif /*USE_PADLOCK*/
-#ifdef USE_AESNI
- else if (ctx->use_aesni)
- do_aesni (ctx, 1, outbuf, inbuf);
-#endif /*USE_AESNI*/
- else
- do_decrypt (ctx, outbuf, inbuf);
+ else
+ do_decrypt (ctx, outbuf, inbuf);
- for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
- outbuf[i] ^= *ivp++;
- memcpy (iv, savebuf, BLOCKSIZE);
- inbuf += BLOCKSIZE;
- outbuf += BLOCKSIZE;
- }
- aesni_cleanup ();
+ for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
+ outbuf[i] ^= *ivp++;
+ memcpy (iv, savebuf, BLOCKSIZE);
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
_gcry_burn_stack (48 + 2*sizeof(int) + BLOCKSIZE + 4*sizeof (char*));
}
More information about the Gcrypt-devel
mailing list