[PATCH 2/7] Tweak AES-NI bulk CTR mode slightly
Jussi Kivilinna
jussi.kivilinna at iki.fi
Tue Nov 5 16:57:41 CET 2013
* cipher/rijndael.c [USE_AESNI] (aesni_cleanup_2_5): Rename to...
(aesni_cleanup_2_6): ...this and also clear 'xmm6'.
[USE_AESNI && __i386__] (do_aesni_ctr, do_aesni_ctr_4): Prevent
inlining only on i386, allow on AMD64.
[USE_AESNI] (do_aesni_ctr, do_aesni_ctr_4): Use counter block from
'xmm5' and byte-swap mask from 'xmm6'.
(_gcry_aes_ctr_enc) [USE_AESNI]: Preload counter block to 'xmm5' and
byte-swap mask to 'xmm6'.
(_gcry_aes_ctr_enc, _gcry_aes_cfb_dec, _gcry_aes_cbc_dec): Use
'aesni_cleanup_2_6'.
--
Small tweak that yields ~5% more speed on Intel Core i5-4570.
After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CTR enc |     0.274 ns/B     3482.5 MiB/s      0.877 c/B
        CTR dec |     0.274 ns/B     3486.8 MiB/s      0.876 c/B

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CTR enc |     0.288 ns/B     3312.5 MiB/s      0.922 c/B
        CTR dec |     0.288 ns/B     3312.6 MiB/s      0.922 c/B
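
For readers skimming the assembly: the gain comes from keeping the byte-swap
mask and the current counter block resident in xmm6/xmm5 across the whole run
instead of reloading them from memory for every block. The counter-increment
trick itself is unchanged. A rough standalone equivalent in SSE/SSSE3
intrinsics follows; the helper name is mine, not libgcrypt code, and the rare
low-64-bit carry that the real code handles in its ".Ladd32bit" path is
ignored here:

  #include <tmmintrin.h>   /* SSSE3 for _mm_shuffle_epi8 (pshufb) */

  /* Increment a big-endian 128-bit counter block the same way the asm does:
     byte-swap to little endian, subtract -1 from the low quadword, swap back.
     Illustrative only, not a libgcrypt function.  */
  static __m128i ctr_inc_be (__m128i ctr_be, __m128i be_mask)
  {
    __m128i minus_one = _mm_srli_si128 (_mm_cmpeq_epi32 (be_mask, be_mask), 8);
    __m128i ctr_le = _mm_shuffle_epi8 (ctr_be, be_mask);  /* pshufb */
    ctr_le = _mm_sub_epi64 (ctr_le, minus_one);           /* psubq: low qword += 1 */
    return _mm_shuffle_epi8 (ctr_le, be_mask);
  }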
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael.c | 83 +++++++++++++++++++++++++++++------------------------
1 file changed, 45 insertions(+), 38 deletions(-)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index b8cc18b..645cf37 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -189,11 +189,12 @@ typedef struct
do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
"pxor %%xmm1, %%xmm1\n" :: ); \
} while (0)
-# define aesni_cleanup_2_5() \
+# define aesni_cleanup_2_6() \
do { asm volatile ("pxor %%xmm2, %%xmm2\n\t" \
"pxor %%xmm3, %%xmm3\n" \
"pxor %%xmm4, %%xmm4\n" \
- "pxor %%xmm5, %%xmm5\n":: ); \
+ "pxor %%xmm5, %%xmm5\n" \
+ "pxor %%xmm6, %%xmm6\n":: ); \
} while (0)
#else
# define aesni_prepare() do { } while (0)
@@ -207,8 +208,8 @@ typedef struct
/* Function prototypes. */
-#ifdef USE_AESNI
-/* We don't want to inline these functions to help gcc allocate enough
+#if defined(__i386__) && defined(USE_AESNI)
+/* We don't want to inline these functions on i386 to help gcc allocate enough
registers. */
static void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr,
unsigned char *b, const unsigned char *a)
@@ -1165,15 +1166,13 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
{
#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
- static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
- { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
- asm volatile ("movdqa (%[ctr]), %%xmm0\n\t" /* xmm0, xmm2 := CTR */
- "movaps %%xmm0, %%xmm2\n\t"
+ asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */
+ "movdqa %%xmm0, %%xmm2\n\t"
"pcmpeqd %%xmm1, %%xmm1\n\t"
"psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
- "pshufb %[mask], %%xmm2\n\t"
+ "pshufb %%xmm6, %%xmm2\n\t"
"psubq %%xmm1, %%xmm2\n\t" /* xmm2++ (big endian) */
/* detect if 64-bit carry handling is needed */
@@ -1187,11 +1186,10 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
".Lno_carry%=:\n\t"
- "pshufb %[mask], %%xmm2\n\t"
- "movdqa %%xmm2, (%[ctr])\n" /* Update CTR. */
+ "pshufb %%xmm6, %%xmm2\n\t"
+ "movdqa %%xmm2, (%[ctr])\n\t" /* Update CTR (mem). */
- "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
- "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "pxor (%[key]), %%xmm0\n\t" /* xmm1 ^= key[0] */
"movdqa 0x10(%[key]), %%xmm1\n\t"
aesenc_xmm1_xmm0
"movdqa 0x20(%[key]), %%xmm1\n\t"
@@ -1234,8 +1232,7 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
: [src] "m" (*a),
[ctr] "r" (ctr),
[key] "r" (ctx->keyschenc),
- [rounds] "g" (ctx->rounds),
- [mask] "m" (*be_mask)
+ [rounds] "g" (ctx->rounds)
: "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenclast_xmm1_xmm0
@@ -1256,9 +1253,6 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
#define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t"
#define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
- static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
- { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-
/* Register usage:
esi keyschedule
xmm0 CTR-0
@@ -1266,21 +1260,22 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
xmm2 CTR-1
xmm3 CTR-2
xmm4 CTR-3
- xmm5 temp
+ xmm5 copy of *ctr
+ xmm6 endian swapping mask
*/
- asm volatile ("movdqa (%[ctr]), %%xmm0\n\t" /* xmm0, xmm2 := CTR */
- "movaps %%xmm0, %%xmm2\n\t"
+ asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */
+ "movdqa %%xmm0, %%xmm2\n\t"
"pcmpeqd %%xmm1, %%xmm1\n\t"
"psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
- "pshufb %[mask], %%xmm2\n\t" /* xmm2 := le(xmm2) */
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */
"psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */
- "movaps %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
+ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
"psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */
- "movaps %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
+ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
"psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */
- "movaps %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */
+ "movdqa %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */
"psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */
/* detect if 64-bit carry handling is needed */
@@ -1307,13 +1302,15 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
"psubq %%xmm1, %%xmm5\n\t"
".Lno_carry%=:\n\t"
- "pshufb %[mask], %%xmm2\n\t" /* xmm2 := be(xmm2) */
- "pshufb %[mask], %%xmm3\n\t" /* xmm3 := be(xmm3) */
- "pshufb %[mask], %%xmm4\n\t" /* xmm4 := be(xmm4) */
- "pshufb %[mask], %%xmm5\n\t" /* xmm5 := be(xmm5) */
- "movdqa %%xmm5, (%[ctr])\n" /* Update CTR. */
-
"movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "movl %[rounds], %%esi\n\t"
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */
+ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */
+ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */
+ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */
+
"pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
"pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
"pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
@@ -1364,7 +1361,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
aesenc_xmm1_xmm3
aesenc_xmm1_xmm4
"movdqa 0xa0(%[key]), %%xmm1\n\t"
- "cmpl $10, %[rounds]\n\t"
+ "cmpl $10, %%esi\n\t"
"jz .Lenclast%=\n\t"
aesenc_xmm1_xmm0
aesenc_xmm1_xmm2
@@ -1376,7 +1373,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
aesenc_xmm1_xmm3
aesenc_xmm1_xmm4
"movdqa 0xc0(%[key]), %%xmm1\n\t"
- "cmpl $12, %[rounds]\n\t"
+ "cmpl $12, %%esi\n\t"
"jz .Lenclast%=\n\t"
aesenc_xmm1_xmm0
aesenc_xmm1_xmm2
@@ -1416,8 +1413,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
[src] "r" (a),
[dst] "r" (b),
[key] "r" (ctx->keyschenc),
- [rounds] "g" (ctx->rounds),
- [mask] "m" (*be_mask)
+ [rounds] "g" (ctx->rounds)
: "%esi", "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenc_xmm1_xmm2
@@ -1653,7 +1649,18 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
#ifdef USE_AESNI
else if (ctx->use_aesni)
{
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+
aesni_prepare ();
+
+ asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
+ "movdqa %[ctr], %%xmm5\n\t" /* Preload CTR */
+ : /* No output */
+ : [mask] "m" (*be_mask),
+ [ctr] "m" (*ctr)
+ : "memory");
+
for ( ;nblocks > 3 ; nblocks -= 4 )
{
do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
@@ -1667,7 +1674,7 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
inbuf += BLOCKSIZE;
}
aesni_cleanup ();
- aesni_cleanup_2_5 ();
+ aesni_cleanup_2_6 ();
}
#endif /*USE_AESNI*/
else
@@ -1953,7 +1960,7 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
inbuf += BLOCKSIZE;
}
aesni_cleanup ();
- aesni_cleanup_2_5 ();
+ aesni_cleanup_2_6 ();
}
#endif /*USE_AESNI*/
else
@@ -2074,7 +2081,7 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
: "memory");
aesni_cleanup ();
- aesni_cleanup_2_5 ();
+ aesni_cleanup_2_6 ();
}
#endif /*USE_AESNI*/
else
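
A closing note on the _gcry_aes_ctr_enc hunk: dropping the per-block
[mask] "m" (*be_mask) operand from both helpers is possible because the caller
now loads the mask and the counter block once, before the block loop. In
intrinsics form the loop takes roughly the shape below; encrypt_block() is a
stub standing in for the AES-NI round sequence, ctr_inc_be() repeats the
increment trick from the earlier sketch, and the real code additionally works
on four blocks per iteration and writes the counter back to memory on every
call rather than once at the end:

  #include <stddef.h>
  #include <tmmintrin.h>

  static __m128i encrypt_block (__m128i blk) { return blk; }  /* stub for the AES rounds */

  static __m128i ctr_inc_be (__m128i c, __m128i mask)         /* same trick as above */
  {
    __m128i m1 = _mm_srli_si128 (_mm_cmpeq_epi32 (mask, mask), 8);
    return _mm_shuffle_epi8 (_mm_sub_epi64 (_mm_shuffle_epi8 (c, mask), m1), mask);
  }

  static void ctr_enc_shape (unsigned char *ctr, unsigned char *outbuf,
                             const unsigned char *inbuf, size_t nblocks)
  {
    static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };

    /* Hoisted out of the loop: one load each, instead of one per block.  */
    const __m128i mask = _mm_load_si128 ((const __m128i *) be_mask);
    __m128i ctr_be = _mm_loadu_si128 ((const __m128i *) ctr);

    for (; nblocks; nblocks--, inbuf += 16, outbuf += 16)
      {
        __m128i ks = encrypt_block (ctr_be);                  /* keystream block */
        __m128i pt = _mm_loadu_si128 ((const __m128i *) inbuf);
        _mm_storeu_si128 ((__m128i *) outbuf, _mm_xor_si128 (pt, ks));
        ctr_be = ctr_inc_be (ctr_be, mask);
      }
    _mm_storeu_si128 ((__m128i *) ctr, ctr_be);               /* write the counter back */
  }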