[PATCH 1/4] rijndael: split AES-NI functions to separate file

Mon Dec 1 20:12:41 CET 2014

* cipher/Makefile.in: Add 'rijndael-aesni.c'.
* cipher/rijndael-aesni.c: New.
* cipher/rijndael-internal.h: New.
* cipher/rijndael.c (MAXKC, MAXROUNDS, BLOCKSIZE, ATTR_ALIGNED_16)
(USE_AMD64_ASM, USE_ARM_ASM, USE_PADLOCK, USE_AESNI, RIJNDAEL_context)
(keyschenc, keyschdec, padlockkey): Move to 'rijndael-internal.h'.
(u128_s, aesni_prepare, aesni_cleanup, aesni_cleanup_2_6)
(aesni_do_setkey, do_aesni_enc, do_aesni_dec, do_aesni_enc_vec4)
(do_aesni_dec_vec4, do_aesni_cfb, do_aesni_ctr, do_aesni_ctr_4): Move
to 'rijndael-aesni.c'.
(prepare_decryption, rijndael_encrypt, _gcry_aes_cfb_enc)
(_gcry_aes_cbc_enc, _gcry_aes_ctr_enc, rijndael_decrypt)
(_gcry_aes_cfb_dec, _gcry_aes_cbc_dec) [USE_AESNI]: Move to functions
in 'rijdael-aesni.c'.
* configure.ac [mpi_cpu_arch=x86]: Add 'rijndael-aesni.lo'.
--

Clean-up rijndael.c before new new hardware acceleration support gets added.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am         |    3 
 cipher/rijndael-aesni.c    | 1288 +++++++++++++++++++++++++++++++++++++++++
 cipher/rijndael-internal.h |  118 ++++
 cipher/rijndael.c          | 1393 ++------------------------------------------
 configure.ac               |    7 
 5 files changed, 1478 insertions(+), 1331 deletions(-)
 create mode 100644 cipher/rijndael-aesni.c
 create mode 100644 cipher/rijndael-internal.h

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 22018b3..19b3692 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -74,7 +74,8 @@ gostr3411-94.c \
 md4.c \
 md5.c \
 poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \
-rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \
+rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
+  rijndael-amd64.S rijndael-arm.S \
 rmd160.c \
 rsa.c \
 salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
new file mode 100644
index 0000000..15ed4ad
--- /dev/null
+++ b/cipher/rijndael-aesni.c
@@ -0,0 +1,1288 @@
+/* AES-NI accelerated AES for Libgcrypt
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ *               2008, 2011, 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h"  /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+
+
+#ifdef USE_AESNI
+
+
+typedef struct u128_s { u32 a, b, c, d; } u128_t;
+
+
+/* Two macros to be called prior and after the use of AESNI
+   instructions.  There should be no external function calls between
+   the use of these macros.  There purpose is to make sure that the
+   SSE regsiters are cleared and won't reveal any information about
+   the key or the data.  */
+#define aesni_prepare() do { } while (0)
+#define aesni_cleanup()                                                \
+  do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
+                     "pxor %%xmm1, %%xmm1\n" :: );                     \
+  } while (0)
+#define aesni_cleanup_2_6()                                            \
+  do { asm volatile ("pxor %%xmm2, %%xmm2\n\t"                         \
+                     "pxor %%xmm3, %%xmm3\n"                           \
+                     "pxor %%xmm4, %%xmm4\n"                           \
+                     "pxor %%xmm5, %%xmm5\n"                           \
+                     "pxor %%xmm6, %%xmm6\n":: );                      \
+  } while (0)
+
+
+void
+_gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+  aesni_prepare();
+
+  if (ctx->rounds < 12)
+    {
+      /* 128-bit key */
+#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
+	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
+#define AESKEY_EXPAND128 \
+	"pshufd $0xff, %%xmm2, %%xmm2\n\t" \
+	"movdqa %%xmm1, %%xmm3\n\t" \
+	"pslldq $4, %%xmm3\n\t" \
+	"pxor   %%xmm3, %%xmm1\n\t" \
+	"pslldq $4, %%xmm3\n\t" \
+	"pxor   %%xmm3, %%xmm1\n\t" \
+	"pslldq $4, %%xmm3\n\t" \
+	"pxor   %%xmm3, %%xmm2\n\t" \
+	"pxor   %%xmm2, %%xmm1\n\t"
+
+      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key   */
+                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x01)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x02)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x04)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x08)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x10)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x20)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x40)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x80)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x1b)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x36)
+                    AESKEY_EXPAND128
+                    "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1  */
+                    :
+                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+                    : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm1_xmm2
+#undef AESKEY_EXPAND128
+    }
+  else if (ctx->rounds == 12)
+    {
+      /* 192-bit key */
+#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
+	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
+#define AESKEY_EXPAND192 \
+	"pshufd $0x55, %%xmm2, %%xmm2\n\t" \
+	"movdqu %%xmm1, %%xmm4\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm1\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm1\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm1\n\t" \
+	"pxor %%xmm2, %%xmm1\n\t" \
+	"pshufd $0xff, %%xmm1, %%xmm2\n\t" \
+	"movdqu %%xmm3, %%xmm4\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm3\n\t" \
+	"pxor %%xmm2, %%xmm3\n\t"
+
+      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key[0..15]   */
+                    "movq 16(%[key]), %%xmm3\n\t"     /* xmm3 := key[16..23]  */
+                    "movdqa %%xmm1, (%[ksch])\n\t"    /* ksch[0] := xmm1  */
+                    "movdqa %%xmm3, %%xmm5\n\t"
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x01)
+                    AESKEY_EXPAND192
+                    "shufpd $0, %%xmm1, %%xmm5\n\t"
+                    "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5  */
+                    "movdqa %%xmm1, %%xmm6\n\t"
+                    "shufpd $1, %%xmm3, %%xmm6\n\t"
+                    "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6  */
+                    AESKEYGENASSIST_xmm3_xmm2(0x02)
+                    AESKEY_EXPAND192
+                    "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1  */
+                    "movdqa %%xmm3, %%xmm5\n\t"
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x04)
+                    AESKEY_EXPAND192
+                    "shufpd $0, %%xmm1, %%xmm5\n\t"
+                    "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5  */
+                    "movdqa %%xmm1, %%xmm6\n\t"
+                    "shufpd $1, %%xmm3, %%xmm6\n\t"
+                    "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6  */
+                    AESKEYGENASSIST_xmm3_xmm2(0x08)
+                    AESKEY_EXPAND192
+                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
+                    "movdqa %%xmm3, %%xmm5\n\t"
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x10)
+                    AESKEY_EXPAND192
+                    "shufpd $0, %%xmm1, %%xmm5\n\t"
+                    "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5  */
+                    "movdqa %%xmm1, %%xmm6\n\t"
+                    "shufpd $1, %%xmm3, %%xmm6\n\t"
+                    "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6  */
+                    AESKEYGENASSIST_xmm3_xmm2(0x20)
+                    AESKEY_EXPAND192
+                    "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1  */
+                    "movdqa %%xmm3, %%xmm5\n\t"
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x40)
+                    AESKEY_EXPAND192
+                    "shufpd $0, %%xmm1, %%xmm5\n\t"
+                    "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5  */
+                    "movdqa %%xmm1, %%xmm6\n\t"
+                    "shufpd $1, %%xmm3, %%xmm6\n\t"
+                    "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6  */
+                    AESKEYGENASSIST_xmm3_xmm2(0x80)
+                    AESKEY_EXPAND192
+                    "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1  */
+                    :
+                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+                    : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm3_xmm2
+#undef AESKEY_EXPAND192
+    }
+  else if (ctx->rounds > 12)
+    {
+      /* 256-bit key */
+#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
+	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
+#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
+	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
+#define AESKEY_EXPAND256_A \
+	"pshufd $0xff, %%xmm2, %%xmm2\n\t" \
+	"movdqa %%xmm1, %%xmm4\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm1\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm1\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm1\n\t" \
+	"pxor %%xmm2, %%xmm1\n\t"
+#define AESKEY_EXPAND256_B \
+	"pshufd $0xaa, %%xmm2, %%xmm2\n\t" \
+	"movdqa %%xmm3, %%xmm4\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm3\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm3\n\t" \
+	"pslldq $4, %%xmm4\n\t" \
+	"pxor %%xmm4, %%xmm3\n\t" \
+	"pxor %%xmm2, %%xmm3\n\t"
+
+      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key[0..15]   */
+                    "movdqu 16(%[key]), %%xmm3\n\t"   /* xmm3 := key[16..31]  */
+                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
+                    "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3  */
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x01)
+                    AESKEY_EXPAND256_A
+                    "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x00)
+                    AESKEY_EXPAND256_B
+                    "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3  */
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x02)
+                    AESKEY_EXPAND256_A
+                    "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x00)
+                    AESKEY_EXPAND256_B
+                    "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3  */
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x04)
+                    AESKEY_EXPAND256_A
+                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x00)
+                    AESKEY_EXPAND256_B
+                    "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3  */
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x08)
+                    AESKEY_EXPAND256_A
+                    "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x00)
+                    AESKEY_EXPAND256_B
+                    "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3  */
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x10)
+                    AESKEY_EXPAND256_A
+                    "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x00)
+                    AESKEY_EXPAND256_B
+                    "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3  */
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x20)
+                    AESKEY_EXPAND256_A
+                    "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1  */
+                    AESKEYGENASSIST_xmm1_xmm2(0x00)
+                    AESKEY_EXPAND256_B
+                    "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3  */
+
+                    AESKEYGENASSIST_xmm3_xmm2(0x40)
+                    AESKEY_EXPAND256_A
+                    "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1  */
+
+                    :
+                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+                    : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm1_xmm2
+#undef AESKEYGENASSIST_xmm3_xmm2
+#undef AESKEY_EXPAND256_A
+#undef AESKEY_EXPAND256_B
+    }
+
+  aesni_cleanup();
+  aesni_cleanup_2_6();
+}
+
+
+/* Make a decryption key from an encryption key. */
+void
+_gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx)
+{
+  /* The AES-NI decrypt instructions use the Equivalent Inverse
+     Cipher, thus we can't use the the standard decrypt key
+     preparation.  */
+  u128_t *ekey = (u128_t *)ctx->keyschenc;
+  u128_t *dkey = (u128_t *)ctx->keyschdec;
+  int rr;
+  int r;
+
+  aesni_prepare();
+
+#define DO_AESNI_AESIMC() \
+  asm volatile ("movdqa %[ekey], %%xmm1\n\t" \
+                /*"aesimc %%xmm1, %%xmm1\n\t"*/ \
+                ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \
+                "movdqa %%xmm1, %[dkey]" \
+                : [dkey] "=m" (dkey[r]) \
+                : [ekey] "m" (ekey[rr]) \
+                : "memory")
+
+  dkey[0] = ekey[ctx->rounds];
+  r=1;
+  rr=ctx->rounds-1;
+  DO_AESNI_AESIMC(); r++; rr--; /* round 1 */
+  DO_AESNI_AESIMC(); r++; rr--; /* round 2 */
+  DO_AESNI_AESIMC(); r++; rr--; /* round 3 */
+  DO_AESNI_AESIMC(); r++; rr--; /* round 4 */
+  DO_AESNI_AESIMC(); r++; rr--; /* round 5 */
+  DO_AESNI_AESIMC(); r++; rr--; /* round 6 */
+  DO_AESNI_AESIMC(); r++; rr--; /* round 7 */
+  DO_AESNI_AESIMC(); r++; rr--; /* round 8 */
+  DO_AESNI_AESIMC(); r++; rr--; /* round 9 */
+  if (ctx->rounds > 10)
+    {
+      DO_AESNI_AESIMC(); r++; rr--; /* round 10 */
+      DO_AESNI_AESIMC(); r++; rr--; /* round 11 */
+      if (ctx->rounds > 12)
+        {
+          DO_AESNI_AESIMC(); r++; rr--; /* round 12 */
+          DO_AESNI_AESIMC(); r++; rr--; /* round 13 */
+        }
+    }
+
+  dkey[r] = ekey[0];
+
+#undef DO_AESNI_AESIMC
+
+  aesni_cleanup();
+}
+
+
+/* Encrypt one block using the Intel AES-NI instructions.  A and B may
+   be the same.
+
+   Our problem here is that gcc does not allow the "x" constraint for
+   SSE registers in asm unless you compile with -msse.  The common
+   wisdom is to use a separate file for SSE instructions and build it
+   separately.  This would require a lot of extra build system stuff,
+   similar to what we do in mpi/ for the asm stuff.  What we do
+   instead is to use standard registers and a bit more of plain asm
+   which copies the data and key stuff to the SSE registers and later
+   back.  If we decide to implement some block modes with parallelized
+   AES instructions, it might indeed be better to use plain asm ala
+   mpi/.  */
+static inline void
+do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
+              const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+  /* Note: For now we relax the alignment requirement for A and B: It
+     does not make much difference because in many case we would need
+     to memcpy them to an extra buffer; using the movdqu is much faster
+     that memcpy and movdqa.  For CFB we know that the IV is properly
+     aligned but that is a special case.  We should better implement
+     CFB direct in asm.  */
+  asm volatile ("movdqu %[src], %%xmm0\n\t"     /* xmm0 := *a     */
+                "movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
+                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x70(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x80(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x90(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xa0(%[key]), %%xmm1\n\t"
+                "cmpl $10, %[rounds]\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xb0(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xc0(%[key]), %%xmm1\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xd0(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xe0(%[key]), %%xmm1\n"
+
+                ".Lenclast%=:\n\t"
+                aesenclast_xmm1_xmm0
+                "movdqu %%xmm0, %[dst]\n"
+                : [dst] "=m" (*b)
+                : [src] "m" (*a),
+                  [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+
+static inline void
+do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b,
+              const unsigned char *a)
+{
+#define aesdec_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
+#define aesdeclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
+  asm volatile ("movdqu %[src], %%xmm0\n\t"     /* xmm0 := *a     */
+                "movdqa (%[key]), %%xmm1\n\t"
+                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0x70(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0x80(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0x90(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0xa0(%[key]), %%xmm1\n\t"
+                "cmpl $10, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0xb0(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0xc0(%[key]), %%xmm1\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0xd0(%[key]), %%xmm1\n\t"
+                aesdec_xmm1_xmm0
+                "movdqa 0xe0(%[key]), %%xmm1\n"
+
+                ".Ldeclast%=:\n\t"
+                aesdeclast_xmm1_xmm0
+                "movdqu %%xmm0, %[dst]\n"
+                : [dst] "=m" (*b)
+                : [src] "m" (*a),
+                  [key] "r" (ctx->keyschdec),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+#undef aesdec_xmm1_xmm0
+#undef aesdeclast_xmm1_xmm0
+}
+
+
+/* Encrypt four blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4.  */
+static inline void
+do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm0_xmm1      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
+#define aesenc_xmm0_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
+#define aesenc_xmm0_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
+#define aesenc_xmm0_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
+#define aesenclast_xmm0_xmm1  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
+#define aesenclast_xmm0_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
+#define aesenclast_xmm0_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
+#define aesenclast_xmm0_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "cmpl $10, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                aesenclast_xmm0_xmm1
+                aesenclast_xmm0_xmm2
+                aesenclast_xmm0_xmm3
+                aesenclast_xmm0_xmm4
+                : /* no output */
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+#undef aesenc_xmm0_xmm1
+#undef aesenc_xmm0_xmm2
+#undef aesenc_xmm0_xmm3
+#undef aesenc_xmm0_xmm4
+#undef aesenclast_xmm0_xmm1
+#undef aesenclast_xmm0_xmm2
+#undef aesenclast_xmm0_xmm3
+#undef aesenclast_xmm0_xmm4
+}
+
+
+/* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4.  */
+static inline void
+do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
+#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
+#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
+#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
+#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
+#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
+#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
+#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "cmpl $10, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                aesdeclast_xmm0_xmm1
+                aesdeclast_xmm0_xmm2
+                aesdeclast_xmm0_xmm3
+                aesdeclast_xmm0_xmm4
+                : /* no output */
+                : [key] "r" (ctx->keyschdec),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+#undef aesdec_xmm0_xmm1
+#undef aesdec_xmm0_xmm2
+#undef aesdec_xmm0_xmm3
+#undef aesdec_xmm0_xmm4
+#undef aesdeclast_xmm0_xmm1
+#undef aesdeclast_xmm0_xmm2
+#undef aesdeclast_xmm0_xmm3
+#undef aesdeclast_xmm0_xmm4
+}
+
+
+/* Perform a CFB encryption or decryption round using the
+   initialization vector IV and the input block A.  Write the result
+   to the output block B and update IV.  IV needs to be 16 byte
+   aligned.  */
+static inline void
+do_aesni_cfb (const RIJNDAEL_context *ctx, int decrypt_flag,
+              unsigned char *iv, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+  asm volatile ("movdqa %[iv], %%xmm0\n\t"      /* xmm0 := IV     */
+                "movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
+                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x70(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x80(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x90(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xa0(%[key]), %%xmm1\n\t"
+                "cmpl $10, %[rounds]\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xb0(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xc0(%[key]), %%xmm1\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xd0(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xe0(%[key]), %%xmm1\n"
+
+                ".Lenclast%=:\n\t"
+                aesenclast_xmm1_xmm0
+                "movdqu %[src], %%xmm1\n\t"      /* Save input.  */
+                "pxor %%xmm1, %%xmm0\n\t"        /* xmm0 = input ^ IV  */
+
+                "cmpl $1, %[decrypt]\n\t"
+                "jz .Ldecrypt_%=\n\t"
+                "movdqa %%xmm0, %[iv]\n\t"       /* [encrypt] Store IV.  */
+                "jmp .Lleave_%=\n"
+                ".Ldecrypt_%=:\n\t"
+                "movdqa %%xmm1, %[iv]\n"         /* [decrypt] Store IV.  */
+                ".Lleave_%=:\n\t"
+                "movdqu %%xmm0, %[dst]\n"        /* Store output.   */
+                : [iv] "+m" (*iv), [dst] "=m" (*b)
+                : [src] "m" (*a),
+                  [key] "r" (ctx->keyschenc),
+                  [rounds] "g" (ctx->rounds),
+                  [decrypt] "m" (decrypt_flag)
+                : "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+/* Perform a CTR encryption round using the counter CTR and the input
+   block A.  Write the result to the output block B and update CTR.
+   CTR needs to be a 16 byte aligned little-endian value.  */
+static void
+do_aesni_ctr (const RIJNDAEL_context *ctx,
+              unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+
+  asm volatile ("movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5)  */
+                "pcmpeqd %%xmm1, %%xmm1\n\t"
+                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
+
+                "pshufb %%xmm6, %%xmm5\n\t"
+                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++ (big endian) */
+
+                /* detect if 64-bit carry handling is needed */
+                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
+                "jne    .Lno_carry%=\n\t"
+                "cmpl   $0xffffffff, 12(%[ctr])\n\t"
+                "jne    .Lno_carry%=\n\t"
+
+                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
+                "psubq   %%xmm1, %%xmm5\n\t"    /* add carry to upper 64bits */
+
+                ".Lno_carry%=:\n\t"
+
+                "pshufb %%xmm6, %%xmm5\n\t"
+                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).       */
+
+                "pxor (%[key]), %%xmm0\n\t"     /* xmm1 ^= key[0]    */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x70(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x80(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0x90(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xa0(%[key]), %%xmm1\n\t"
+                "cmpl $10, %[rounds]\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xb0(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xc0(%[key]), %%xmm1\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xd0(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                "movdqa 0xe0(%[key]), %%xmm1\n"
+
+                ".Lenclast%=:\n\t"
+                aesenclast_xmm1_xmm0
+                "movdqu %[src], %%xmm1\n\t"      /* xmm1 := input   */
+                "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR ^= input  */
+                "movdqu %%xmm0, %[dst]"          /* Store EncCTR.    */
+
+                : [dst] "=m" (*b)
+                : [src] "m" (*a),
+                  [ctr] "r" (ctr),
+                  [key] "r" (ctx->keyschenc),
+                  [rounds] "g" (ctx->rounds)
+                : "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
+
+
+/* Four blocks at a time variant of do_aesni_ctr.  */
+static void
+do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
+                unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenc_xmm1_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t"
+#define aesenc_xmm1_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t"
+#define aesenc_xmm1_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t"
+#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+#define aesenclast_xmm1_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t"
+#define aesenclast_xmm1_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t"
+#define aesenclast_xmm1_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
+
+  /* Register usage:
+      esi   keyschedule
+      xmm0  CTR-0
+      xmm1  temp / round key
+      xmm2  CTR-1
+      xmm3  CTR-2
+      xmm4  CTR-3
+      xmm5  copy of *ctr
+      xmm6  endian swapping mask
+   */
+
+  asm volatile ("movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
+                "movdqa %%xmm0, %%xmm2\n\t"
+                "pcmpeqd %%xmm1, %%xmm1\n\t"
+                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
+
+                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2) */
+                "psubq  %%xmm1, %%xmm2\n\t"     /* xmm2++           */
+                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
+                "psubq  %%xmm1, %%xmm3\n\t"     /* xmm3++           */
+                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3     */
+                "psubq  %%xmm1, %%xmm4\n\t"     /* xmm4++           */
+                "movdqa %%xmm4, %%xmm5\n\t"     /* xmm5 := xmm4     */
+                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++           */
+
+                /* detect if 64-bit carry handling is needed */
+                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
+                "jne    .Lno_carry%=\n\t"
+                "movl   12(%[ctr]), %%esi\n\t"
+                "bswapl %%esi\n\t"
+                "cmpl   $0xfffffffc, %%esi\n\t"
+                "jb     .Lno_carry%=\n\t"       /* no carry */
+
+                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
+                "je     .Lcarry_xmm5%=\n\t"     /* esi == 0xfffffffc */
+                "cmpl   $0xfffffffe, %%esi\n\t"
+                "jb     .Lcarry_xmm4%=\n\t"     /* esi == 0xfffffffd */
+                "je     .Lcarry_xmm3%=\n\t"     /* esi == 0xfffffffe */
+                /* esi == 0xffffffff */
+
+                "psubq   %%xmm1, %%xmm2\n\t"
+                ".Lcarry_xmm3%=:\n\t"
+                "psubq   %%xmm1, %%xmm3\n\t"
+                ".Lcarry_xmm4%=:\n\t"
+                "psubq   %%xmm1, %%xmm4\n\t"
+                ".Lcarry_xmm5%=:\n\t"
+                "psubq   %%xmm1, %%xmm5\n\t"
+
+                ".Lno_carry%=:\n\t"
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */
+                "movl %[rounds], %%esi\n\t"
+
+                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
+                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
+                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
+                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
+                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
+
+                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0x70(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0x80(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0x90(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0xa0(%[key]), %%xmm1\n\t"
+                "cmpl $10, %%esi\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0xb0(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0xc0(%[key]), %%xmm1\n\t"
+                "cmpl $12, %%esi\n\t"
+                "jz .Lenclast%=\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0xd0(%[key]), %%xmm1\n\t"
+                aesenc_xmm1_xmm0
+                aesenc_xmm1_xmm2
+                aesenc_xmm1_xmm3
+                aesenc_xmm1_xmm4
+                "movdqa 0xe0(%[key]), %%xmm1\n"
+
+                ".Lenclast%=:\n\t"
+                aesenclast_xmm1_xmm0
+                aesenclast_xmm1_xmm2
+                aesenclast_xmm1_xmm3
+                aesenclast_xmm1_xmm4
+
+                "movdqu (%[src]), %%xmm1\n\t"    /* Get block 1.      */
+                "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR-1 ^= input */
+                "movdqu %%xmm0, (%[dst])\n\t"    /* Store block 1     */
+
+                "movdqu 16(%[src]), %%xmm1\n\t"  /* Get block 2.      */
+                "pxor %%xmm1, %%xmm2\n\t"        /* EncCTR-2 ^= input */
+                "movdqu %%xmm2, 16(%[dst])\n\t"  /* Store block 2.    */
+
+                "movdqu 32(%[src]), %%xmm1\n\t"  /* Get block 3.      */
+                "pxor %%xmm1, %%xmm3\n\t"        /* EncCTR-3 ^= input */
+                "movdqu %%xmm3, 32(%[dst])\n\t"  /* Store block 3.    */
+
+                "movdqu 48(%[src]), %%xmm1\n\t"  /* Get block 4.      */
+                "pxor %%xmm1, %%xmm4\n\t"        /* EncCTR-4 ^= input */
+                "movdqu %%xmm4, 48(%[dst])"      /* Store block 4.   */
+
+                :
+                : [ctr] "r" (ctr),
+                  [src] "r" (a),
+                  [dst] "r" (b),
+                  [key] "r" (ctx->keyschenc),
+                  [rounds] "g" (ctx->rounds)
+                : "%esi", "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenc_xmm1_xmm2
+#undef aesenc_xmm1_xmm3
+#undef aesenc_xmm1_xmm4
+#undef aesenclast_xmm1_xmm0
+#undef aesenclast_xmm1_xmm2
+#undef aesenclast_xmm1_xmm3
+#undef aesenclast_xmm1_xmm4
+}
+
+
+void
+_gcry_aes_aesni_encrypt (RIJNDAEL_context *ctx, unsigned char *dst,
+                         const unsigned char *src)
+{
+  aesni_prepare ();
+  do_aesni_enc (ctx, dst, src);
+  aesni_cleanup ();
+}
+
+
+void
+_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+                         const unsigned char *inbuf, unsigned char *iv,
+                         size_t nblocks)
+{
+  aesni_prepare ();
+  for ( ;nblocks; nblocks-- )
+    {
+      do_aesni_cfb (ctx, 0, iv, outbuf, inbuf);
+      outbuf += BLOCKSIZE;
+      inbuf  += BLOCKSIZE;
+    }
+  aesni_cleanup ();
+}
+
+
+void
+_gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+                         const unsigned char *inbuf, unsigned char *iv,
+                         size_t nblocks, int cbc_mac)
+{
+  unsigned char *last_iv;
+
+  aesni_prepare ();
+
+  last_iv = iv;
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* ~35% speed up on Sandy-Bridge when doing xoring and copying with
+          SSE registers.  */
+      asm volatile ("movdqu %[iv], %%xmm0\n\t"
+                    "movdqu %[inbuf], %%xmm1\n\t"
+                    "pxor %%xmm0, %%xmm1\n\t"
+                    "movdqu %%xmm1, %[outbuf]\n\t"
+                    : /* No output */
+                    : [iv] "m" (*last_iv),
+                      [inbuf] "m" (*inbuf),
+                      [outbuf] "m" (*outbuf)
+                    : "memory" );
+
+      do_aesni_enc (ctx, outbuf, outbuf);
+
+      last_iv = outbuf;
+      inbuf += BLOCKSIZE;
+      if (!cbc_mac)
+        outbuf += BLOCKSIZE;
+    }
+
+  if (last_iv != iv)
+    {
+      asm volatile ("movdqu %[last], %%xmm0\n\t"
+                    "movdqu %%xmm0, %[iv]\n\t"
+                    : /* No output */
+                    : [last] "m" (*last_iv),
+                      [iv] "m" (*iv)
+                    : "memory" );
+    }
+
+  aesni_cleanup ();
+}
+
+
+void
+_gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+                         const unsigned char *inbuf, unsigned char *ctr,
+                         size_t nblocks)
+{
+  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+
+  aesni_prepare ();
+
+  asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
+                "movdqa %[ctr], %%xmm5\n\t"  /* Preload CTR */
+                : /* No output */
+                : [mask] "m" (*be_mask),
+                  [ctr] "m" (*ctr)
+                : "memory");
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
+    {
+      do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
+      outbuf += 4*BLOCKSIZE;
+      inbuf  += 4*BLOCKSIZE;
+    }
+  for ( ;nblocks; nblocks-- )
+    {
+      do_aesni_ctr (ctx, ctr, outbuf, inbuf);
+      outbuf += BLOCKSIZE;
+      inbuf  += BLOCKSIZE;
+    }
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+
+void
+_gcry_aes_aesni_decrypt (RIJNDAEL_context *ctx, unsigned char *dst,
+                         const unsigned char *src)
+{
+  aesni_prepare ();
+  do_aesni_dec (ctx, dst, src);
+  aesni_cleanup ();
+}
+
+
+void
+_gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
+                         const unsigned char *inbuf, unsigned char *iv,
+                         size_t nblocks)
+{
+  aesni_prepare ();
+
+  /* CFB decryption can be parallelized */
+  for ( ;nblocks >= 4; nblocks -= 4)
+    {
+      asm volatile
+        ("movdqu (%[iv]),        %%xmm1\n\t" /* load input blocks */
+         "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+         "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+         "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+
+         "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */
+         "movdqu %%xmm0,         (%[iv])\n\t"
+         : /* No output */
+         : [inbuf] "r" (inbuf), [iv] "r" (iv)
+         : "memory");
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile
+        ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
+         "pxor %%xmm5, %%xmm1\n\t"
+         "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+         "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
+         "pxor %%xmm5, %%xmm2\n\t"
+         "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+         "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
+         "pxor %%xmm5, %%xmm3\n\t"
+         "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+         "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
+         "pxor %%xmm5, %%xmm4\n\t"
+         "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+         : /* No output */
+         : [inbuf] "r" (inbuf),
+           [outbuf] "r" (outbuf)
+         : "memory");
+
+      outbuf += 4*BLOCKSIZE;
+      inbuf  += 4*BLOCKSIZE;
+    }
+
+  for ( ;nblocks; nblocks-- )
+    {
+      do_aesni_cfb (ctx, 1, iv, outbuf, inbuf);
+      outbuf += BLOCKSIZE;
+      inbuf  += BLOCKSIZE;
+    }
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+
+void
+_gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
+			 const unsigned char *inbuf, unsigned char *iv,
+			 size_t nblocks)
+{
+  aesni_prepare ();
+
+  asm volatile
+    ("movdqu %[iv], %%xmm5\n\t"	/* use xmm5 as fast IV storage */
+     : /* No output */
+     : [iv] "m" (*iv)
+     : "memory");
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
+    {
+      asm volatile
+        ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
+         "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+         "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+         "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+         : /* No output */
+         : [inbuf] "r" (inbuf)
+         : "memory");
+
+      do_aesni_dec_vec4 (ctx);
+
+      asm volatile
+        ("pxor %%xmm5, %%xmm1\n\t"		/* xor IV with output */
+         "movdqu 0*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
+         "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+         "pxor %%xmm5, %%xmm2\n\t"		/* xor IV with output */
+         "movdqu 1*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
+         "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+         "pxor %%xmm5, %%xmm3\n\t"		/* xor IV with output */
+         "movdqu 2*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
+         "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+         "pxor %%xmm5, %%xmm4\n\t"		/* xor IV with output */
+         "movdqu 3*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
+         "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+         : /* No output */
+         : [inbuf] "r" (inbuf),
+           [outbuf] "r" (outbuf)
+         : "memory");
+
+      outbuf += 4*BLOCKSIZE;
+      inbuf  += 4*BLOCKSIZE;
+    }
+
+  for ( ;nblocks; nblocks-- )
+    {
+      asm volatile
+        ("movdqu %[inbuf], %%xmm2\n\t"	/* use xmm2 as savebuf */
+         : /* No output */
+         : [inbuf] "m" (*inbuf)
+         : "memory");
+
+      /* uses only xmm0 and xmm1 */
+      do_aesni_dec (ctx, outbuf, inbuf);
+
+      asm volatile
+        ("movdqu %[outbuf], %%xmm0\n\t"
+         "pxor %%xmm5, %%xmm0\n\t"	/* xor IV with output */
+         "movdqu %%xmm0, %[outbuf]\n\t"
+         "movdqu %%xmm2, %%xmm5\n\t"	/* store savebuf as new IV */
+         : /* No output */
+         : [outbuf] "m" (*outbuf)
+         : "memory");
+
+      outbuf += BLOCKSIZE;
+      inbuf  += BLOCKSIZE;
+    }
+
+  asm volatile
+    ("movdqu %%xmm5, %[iv]\n\t"	/* store IV */
+     : /* No output */
+     : [iv] "m" (*iv)
+     : "memory");
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+#endif /* USE_AESNI */
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
new file mode 100644
index 0000000..762ea76
--- /dev/null
+++ b/cipher/rijndael-internal.h
@@ -0,0 +1,118 @@
+/* Rijndael (AES) for GnuPG
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ *               2008, 2011, 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef G10_RIJNDAEL_INTERNAL_H
+#define G10_RIJNDAEL_INTERNAL_H
+
+#include "types.h"  /* for byte and u32 typedefs */
+
+
+#define MAXKC                   (256/32)
+#define MAXROUNDS               14
+#define BLOCKSIZE               (128/8)
+
+
+/* Helper macro to force alignment to 16 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16  __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
+
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# define USE_AMD64_ASM 1
+#endif
+
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
+# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+#  define USE_ARM_ASM 1
+# endif
+#endif
+
+/* USE_PADLOCK indicates whether to compile the padlock specific
+   code.  */
+#undef USE_PADLOCK
+#ifdef ENABLE_PADLOCK_SUPPORT
+# ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+#  if (defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__)
+#   define USE_PADLOCK 1
+#  endif
+# endif
+#endif /*ENABLE_PADLOCK_SUPPORT*/
+
+/* USE_AESNI inidicates whether to compile with Intel AES-NI code.  We
+   need the vector-size attribute which seems to be available since
+   gcc 3.  However, to be on the safe side we require at least gcc 4.  */
+#undef USE_AESNI
+#ifdef ENABLE_AESNI_SUPPORT
+# if ((defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+#  if __GNUC__ >= 4
+#   define USE_AESNI 1
+#  endif
+# endif
+#endif /* ENABLE_AESNI_SUPPORT */
+
+
+/* Our context object.  */
+typedef struct
+{
+  /* The first fields are the keyschedule arrays.  This is so that
+     they are aligned on a 16 byte boundary if using gcc.  This
+     alignment is required for the AES-NI code and a good idea in any
+     case.  The alignment is guaranteed due to the way cipher.c
+     allocates the space for the context.  The PROPERLY_ALIGNED_TYPE
+     hack is used to force a minimal alignment if not using gcc of if
+     the alignment requirement is higher that 16 bytes.  */
+  union
+  {
+    PROPERLY_ALIGNED_TYPE dummy;
+    byte keyschedule[MAXROUNDS+1][4][4];
+#ifdef USE_PADLOCK
+    /* The key as passed to the padlock engine.  It is only used if
+       the padlock engine is used (USE_PADLOCK, below).  */
+    unsigned char padlock_key[16] __attribute__ ((aligned (16)));
+#endif /*USE_PADLOCK*/
+  } u1;
+  union
+  {
+    PROPERLY_ALIGNED_TYPE dummy;
+    byte keyschedule[MAXROUNDS+1][4][4];
+  } u2;
+  int rounds;                /* Key-length-dependent number of rounds.  */
+  unsigned int decryption_prepared:1; /* The decryption key schedule is available.  */
+#ifdef USE_PADLOCK
+  unsigned int use_padlock:1;         /* Padlock shall be used.  */
+#endif /*USE_PADLOCK*/
+#ifdef USE_AESNI
+  unsigned int use_aesni:1;           /* AES-NI shall be used.  */
+#endif /*USE_AESNI*/
+} RIJNDAEL_context ATTR_ALIGNED_16;
+
+/* Macros defining alias for the keyschedules.  */
+#define keyschenc  u1.keyschedule
+#define keyschdec  u2.keyschedule
+#define padlockkey u1.padlock_key
+
+#endif /* G10_RIJNDAEL_INTERNAL_H */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 8019f0a..4a10a6b 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -47,60 +47,8 @@
 #include "cipher.h"
 #include "bufhelp.h"
 #include "cipher-selftest.h"
+#include "rijndael-internal.h"
 
-#define MAXKC			(256/32)
-#define MAXROUNDS		14
-#define BLOCKSIZE               (128/8)
-
-
-/* Helper macro to force alignment to 16 bytes.  */
-#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
-# define ATTR_ALIGNED_16  __attribute__ ((aligned (16)))
-#else
-# define ATTR_ALIGNED_16
-#endif
-
-
-/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
-#undef USE_AMD64_ASM
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
-# define USE_AMD64_ASM 1
-#endif
-
-/* USE_ARM_ASM indicates whether to use ARM assembly code. */
-#undef USE_ARM_ASM
-#if defined(__ARMEL__)
-# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-#  define USE_ARM_ASM 1
-# endif
-#endif
-
-/* USE_PADLOCK indicates whether to compile the padlock specific
-   code.  */
-#undef USE_PADLOCK
-#ifdef ENABLE_PADLOCK_SUPPORT
-# ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
-#  if (defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__)
-#   define USE_PADLOCK 1
-#  endif
-# endif
-#endif /*ENABLE_PADLOCK_SUPPORT*/
-
-/* USE_AESNI inidicates whether to compile with Intel AES-NI code.  We
-   need the vector-size attribute which seems to be available since
-   gcc 3.  However, to be on the safe side we require at least gcc 4.  */
-#undef USE_AESNI
-#ifdef ENABLE_AESNI_SUPPORT
-# if ((defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
-#  if __GNUC__ >= 4
-#   define USE_AESNI 1
-#  endif
-# endif
-#endif /* ENABLE_AESNI_SUPPORT */
-
-#ifdef USE_AESNI
-  typedef struct u128_s { u32 a, b, c, d; } u128_t;
-#endif /*USE_AESNI*/
 
 /* Define an u32 variant for the sake of gcc 4.4's strict aliasing.  */
 #if __GNUC__ > 4 || ( __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )
@@ -123,6 +71,38 @@ extern void _gcry_aes_amd64_decrypt_block(const void *keysched_dec,
 					  int rounds);
 #endif /*USE_AMD64_ASM*/
 
+#ifdef USE_AESNI
+/* AES-NI (AMD64 & i386) accelerated implementations of AES */
+extern void _gcry_aes_aesni_do_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern void _gcry_aes_aesni_encrypt (RIJNDAEL_context *ctx, unsigned char *dst,
+                                     const unsigned char *src);
+extern void _gcry_aes_aesni_decrypt (RIJNDAEL_context *ctx, unsigned char *dst,
+                                     const unsigned char *src);
+extern void _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx,
+                                     unsigned char *outbuf,
+                                     const unsigned char *inbuf,
+                                     unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx,
+                                     unsigned char *outbuf,
+                                     const unsigned char *inbuf,
+                                     unsigned char *iv, size_t nblocks,
+                                     int cbc_mac);
+extern void _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx,
+                                     unsigned char *outbuf,
+                                     const unsigned char *inbuf,
+                                     unsigned char *ctr, size_t nblocks);
+extern void _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx,
+                                     unsigned char *outbuf,
+                                     const unsigned char *inbuf,
+                                     unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx,
+                                     unsigned char *outbuf,
+                                     const unsigned char *inbuf,
+                                     unsigned char *iv, size_t nblocks);
+#endif
+
 #ifdef USE_ARM_ASM
 /* ARM assembly implementations of AES */
 extern void _gcry_aes_arm_encrypt_block(const void *keysched_enc,
@@ -136,326 +116,19 @@ extern void _gcry_aes_arm_decrypt_block(const void *keysched_dec,
 					  int rounds);
 #endif /*USE_ARM_ASM*/
 
-
 

-/* Our context object.  */
-typedef struct
-{
-  /* The first fields are the keyschedule arrays.  This is so that
-     they are aligned on a 16 byte boundary if using gcc.  This
-     alignment is required for the AES-NI code and a good idea in any
-     case.  The alignment is guaranteed due to the way cipher.c
-     allocates the space for the context.  The PROPERLY_ALIGNED_TYPE
-     hack is used to force a minimal alignment if not using gcc of if
-     the alignment requirement is higher that 16 bytes.  */
-  union
-  {
-    PROPERLY_ALIGNED_TYPE dummy;
-    byte keyschedule[MAXROUNDS+1][4][4];
-#ifdef USE_PADLOCK
-    /* The key as passed to the padlock engine.  It is only used if
-       the padlock engine is used (USE_PADLOCK, below).  */
-    unsigned char padlock_key[16] __attribute__ ((aligned (16)));
-#endif /*USE_PADLOCK*/
-  } u1;
-  union
-  {
-    PROPERLY_ALIGNED_TYPE dummy;
-    byte keyschedule[MAXROUNDS+1][4][4];
-  } u2;
-  int rounds;                /* Key-length-dependent number of rounds.  */
-  unsigned int decryption_prepared:1; /* The decryption key schedule is available.  */
-#ifdef USE_PADLOCK
-  unsigned int use_padlock:1;         /* Padlock shall be used.  */
-#endif /*USE_PADLOCK*/
-#ifdef USE_AESNI
-  unsigned int use_aesni:1;           /* AES-NI shall be used.  */
-#endif /*USE_AESNI*/
-} RIJNDAEL_context ATTR_ALIGNED_16;
-
-/* Macros defining alias for the keyschedules.  */
-#define keyschenc  u1.keyschedule
-#define keyschdec  u2.keyschedule
-#define padlockkey u1.padlock_key
-
-/* Two macros to be called prior and after the use of AESNI
-   instructions.  There should be no external function calls between
-   the use of these macros.  There purpose is to make sure that the
-   SSE regsiters are cleared and won't reveal any information about
-   the key or the data.  */
-#ifdef USE_AESNI
-# define aesni_prepare() do { } while (0)
-# define aesni_cleanup()                                                \
-  do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                          \
-                     "pxor %%xmm1, %%xmm1\n" :: );                      \
-  } while (0)
-# define aesni_cleanup_2_6()                                            \
-  do { asm volatile ("pxor %%xmm2, %%xmm2\n\t"                          \
-                     "pxor %%xmm3, %%xmm3\n"                            \
-                     "pxor %%xmm4, %%xmm4\n"                            \
-                     "pxor %%xmm5, %%xmm5\n"                            \
-                     "pxor %%xmm6, %%xmm6\n":: );                       \
-  } while (0)
-#else
-# define aesni_prepare() do { } while (0)
-# define aesni_cleanup() do { } while (0)
-#endif
-
 
 /* All the numbers.  */
 #include "rijndael-tables.h"
 
 
 

-/* Function prototypes.  */
-#if defined(__i386__) && defined(USE_AESNI)
-/* We don't want to inline these functions on i386 to help gcc allocate enough
-   registers.  */
-static void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr,
-                          unsigned char *b, const unsigned char *a)
-  __attribute__ ((__noinline__));
-static void do_aesni_ctr_4 (const RIJNDAEL_context *ctx, unsigned char *ctr,
-                            unsigned char *b, const unsigned char *a)
-  __attribute__ ((__noinline__));
-#endif /*USE_AESNI*/
 
+/* Function prototypes.  */
 static const char *selftest(void);
 
 
 

-#ifdef USE_AESNI
-static void
-aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
-{
-  aesni_prepare();
-
-  if (ctx->rounds < 12)
-    {
-      /* 128-bit key */
-#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
-	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
-#define AESKEY_EXPAND128 \
-	"pshufd $0xff, %%xmm2, %%xmm2\n\t" \
-	"movdqa %%xmm1, %%xmm3\n\t" \
-	"pslldq $4, %%xmm3\n\t" \
-	"pxor   %%xmm3, %%xmm1\n\t" \
-	"pslldq $4, %%xmm3\n\t" \
-	"pxor   %%xmm3, %%xmm1\n\t" \
-	"pslldq $4, %%xmm3\n\t" \
-	"pxor   %%xmm3, %%xmm2\n\t" \
-	"pxor   %%xmm2, %%xmm1\n\t"
-
-      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key   */
-                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x01)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x02)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x04)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x08)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x10)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x20)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x40)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x80)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x1b)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x36)
-                    AESKEY_EXPAND128
-                    "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1  */
-                    :
-                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
-                    : "cc", "memory" );
-#undef AESKEYGENASSIST_xmm1_xmm2
-#undef AESKEY_EXPAND128
-    }
-  else if (ctx->rounds == 12)
-    {
-      /* 192-bit key */
-#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
-	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
-#define AESKEY_EXPAND192 \
-	"pshufd $0x55, %%xmm2, %%xmm2\n\t" \
-	"movdqu %%xmm1, %%xmm4\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm1\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm1\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm1\n\t" \
-	"pxor %%xmm2, %%xmm1\n\t" \
-	"pshufd $0xff, %%xmm1, %%xmm2\n\t" \
-	"movdqu %%xmm3, %%xmm4\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm3\n\t" \
-	"pxor %%xmm2, %%xmm3\n\t"
-
-      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key[0..15]   */
-                    "movq 16(%[key]), %%xmm3\n\t"     /* xmm3 := key[16..23]  */
-                    "movdqa %%xmm1, (%[ksch])\n\t"    /* ksch[0] := xmm1  */
-                    "movdqa %%xmm3, %%xmm5\n\t"
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x01)
-                    AESKEY_EXPAND192
-                    "shufpd $0, %%xmm1, %%xmm5\n\t"
-                    "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5  */
-                    "movdqa %%xmm1, %%xmm6\n\t"
-                    "shufpd $1, %%xmm3, %%xmm6\n\t"
-                    "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6  */
-                    AESKEYGENASSIST_xmm3_xmm2(0x02)
-                    AESKEY_EXPAND192
-                    "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1  */
-                    "movdqa %%xmm3, %%xmm5\n\t"
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x04)
-                    AESKEY_EXPAND192
-                    "shufpd $0, %%xmm1, %%xmm5\n\t"
-                    "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5  */
-                    "movdqa %%xmm1, %%xmm6\n\t"
-                    "shufpd $1, %%xmm3, %%xmm6\n\t"
-                    "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6  */
-                    AESKEYGENASSIST_xmm3_xmm2(0x08)
-                    AESKEY_EXPAND192
-                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
-                    "movdqa %%xmm3, %%xmm5\n\t"
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x10)
-                    AESKEY_EXPAND192
-                    "shufpd $0, %%xmm1, %%xmm5\n\t"
-                    "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5  */
-                    "movdqa %%xmm1, %%xmm6\n\t"
-                    "shufpd $1, %%xmm3, %%xmm6\n\t"
-                    "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6  */
-                    AESKEYGENASSIST_xmm3_xmm2(0x20)
-                    AESKEY_EXPAND192
-                    "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1  */
-                    "movdqa %%xmm3, %%xmm5\n\t"
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x40)
-                    AESKEY_EXPAND192
-                    "shufpd $0, %%xmm1, %%xmm5\n\t"
-                    "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5  */
-                    "movdqa %%xmm1, %%xmm6\n\t"
-                    "shufpd $1, %%xmm3, %%xmm6\n\t"
-                    "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6  */
-                    AESKEYGENASSIST_xmm3_xmm2(0x80)
-                    AESKEY_EXPAND192
-                    "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1  */
-                    :
-                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
-                    : "cc", "memory" );
-#undef AESKEYGENASSIST_xmm3_xmm2
-#undef AESKEY_EXPAND192
-    }
-  else if (ctx->rounds > 12)
-    {
-      /* 256-bit key */
-#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
-	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
-#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
-	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
-#define AESKEY_EXPAND256_A \
-	"pshufd $0xff, %%xmm2, %%xmm2\n\t" \
-	"movdqa %%xmm1, %%xmm4\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm1\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm1\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm1\n\t" \
-	"pxor %%xmm2, %%xmm1\n\t"
-#define AESKEY_EXPAND256_B \
-	"pshufd $0xaa, %%xmm2, %%xmm2\n\t" \
-	"movdqa %%xmm3, %%xmm4\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm3\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm3\n\t" \
-	"pslldq $4, %%xmm4\n\t" \
-	"pxor %%xmm4, %%xmm3\n\t" \
-	"pxor %%xmm2, %%xmm3\n\t"
-
-      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key[0..15]   */
-                    "movdqu 16(%[key]), %%xmm3\n\t"   /* xmm3 := key[16..31]  */
-                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
-                    "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3  */
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x01)
-                    AESKEY_EXPAND256_A
-                    "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x00)
-                    AESKEY_EXPAND256_B
-                    "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3  */
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x02)
-                    AESKEY_EXPAND256_A
-                    "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x00)
-                    AESKEY_EXPAND256_B
-                    "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3  */
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x04)
-                    AESKEY_EXPAND256_A
-                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x00)
-                    AESKEY_EXPAND256_B
-                    "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3  */
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x08)
-                    AESKEY_EXPAND256_A
-                    "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x00)
-                    AESKEY_EXPAND256_B
-                    "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3  */
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x10)
-                    AESKEY_EXPAND256_A
-                    "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x00)
-                    AESKEY_EXPAND256_B
-                    "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3  */
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x20)
-                    AESKEY_EXPAND256_A
-                    "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1  */
-                    AESKEYGENASSIST_xmm1_xmm2(0x00)
-                    AESKEY_EXPAND256_B
-                    "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3  */
-
-                    AESKEYGENASSIST_xmm3_xmm2(0x40)
-                    AESKEY_EXPAND256_A
-                    "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1  */
-
-                    :
-                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
-                    : "cc", "memory" );
-#undef AESKEYGENASSIST_xmm1_xmm2
-#undef AESKEYGENASSIST_xmm3_xmm2
-#undef AESKEY_EXPAND256_A
-#undef AESKEY_EXPAND256_B
-    }
-
-  aesni_cleanup();
-  aesni_cleanup_2_6();
-}
-#endif /*USE_AESNI*/
-
-
-

 /* Perform the key setup.  */
 static gcry_err_code_t
 do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
@@ -566,7 +239,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
     }
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
-    aesni_do_setkey(ctx, key);
+    _gcry_aes_aesni_do_setkey (ctx, key);
 #endif
   else
     {
@@ -678,52 +351,7 @@ prepare_decryption( RIJNDAEL_context *ctx )
 #ifdef USE_AESNI
   if (ctx->use_aesni)
     {
-      /* The AES-NI decrypt instructions use the Equivalent Inverse
-         Cipher, thus we can't use the the standard decrypt key
-         preparation.  */
-        u128_t *ekey = (u128_t *)ctx->keyschenc;
-        u128_t *dkey = (u128_t *)ctx->keyschdec;
-        int rr;
-
-	aesni_prepare();
-
-#define DO_AESNI_AESIMC() \
-	asm volatile ("movdqa %[ekey], %%xmm1\n\t" \
-                      /*"aesimc %%xmm1, %%xmm1\n\t"*/ \
-                      ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \
-                      "movdqa %%xmm1, %[dkey]" \
-                      : [dkey] "=m" (dkey[r]) \
-                      : [ekey] "m" (ekey[rr]) \
-                      : "memory")
-
-        dkey[0] = ekey[ctx->rounds];
-        r=1;
-	rr=ctx->rounds-1;
-	DO_AESNI_AESIMC(); r++; rr--; /* round 1 */
-	DO_AESNI_AESIMC(); r++; rr--; /* round 2 */
-	DO_AESNI_AESIMC(); r++; rr--; /* round 3 */
-	DO_AESNI_AESIMC(); r++; rr--; /* round 4 */
-	DO_AESNI_AESIMC(); r++; rr--; /* round 5 */
-	DO_AESNI_AESIMC(); r++; rr--; /* round 6 */
-	DO_AESNI_AESIMC(); r++; rr--; /* round 7 */
-	DO_AESNI_AESIMC(); r++; rr--; /* round 8 */
-	DO_AESNI_AESIMC(); r++; rr--; /* round 9 */
-	if (ctx->rounds > 10)
-	  {
-	    DO_AESNI_AESIMC(); r++; rr--; /* round 10 */
-	    DO_AESNI_AESIMC(); r++; rr--; /* round 11 */
-	    if (ctx->rounds > 12)
-	      {
-	        DO_AESNI_AESIMC(); r++; rr--; /* round 12 */
-	        DO_AESNI_AESIMC(); r++; rr--; /* round 13 */
-	      }
-	  }
-
-        dkey[r] = ekey[0];
-
-#undef DO_AESNI_AESIMC
-
-	aesni_cleanup();
+      _gcry_aes_aesni_prepare_decryption (ctx);
     }
   else
 #endif /*USE_AESNI*/
@@ -951,690 +579,6 @@ do_padlock (const RIJNDAEL_context *ctx, int decrypt_flag,
 #endif /*USE_PADLOCK*/
 
 
-#ifdef USE_AESNI
-/* Encrypt one block using the Intel AES-NI instructions.  A and B may
-   be the same.
-
-   Our problem here is that gcc does not allow the "x" constraint for
-   SSE registers in asm unless you compile with -msse.  The common
-   wisdom is to use a separate file for SSE instructions and build it
-   separately.  This would require a lot of extra build system stuff,
-   similar to what we do in mpi/ for the asm stuff.  What we do
-   instead is to use standard registers and a bit more of plain asm
-   which copies the data and key stuff to the SSE registers and later
-   back.  If we decide to implement some block modes with parallelized
-   AES instructions, it might indeed be better to use plain asm ala
-   mpi/.  */
-static inline void
-do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
-              const unsigned char *a)
-{
-#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
-#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
-  /* Note: For now we relax the alignment requirement for A and B: It
-     does not make much difference because in many case we would need
-     to memcpy them to an extra buffer; using the movdqu is much faster
-     that memcpy and movdqa.  For CFB we know that the IV is properly
-     aligned but that is a special case.  We should better implement
-     CFB direct in asm.  */
-  asm volatile ("movdqu %[src], %%xmm0\n\t"     /* xmm0 := *a     */
-                "movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
-                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
-                "movdqa 0x10(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x20(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x30(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x40(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x50(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x60(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x70(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x80(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x90(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xa0(%[key]), %%xmm1\n\t"
-                "cmpl $10, %[rounds]\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xb0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xc0(%[key]), %%xmm1\n\t"
-                "cmpl $12, %[rounds]\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xd0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xe0(%[key]), %%xmm1\n"
-
-                ".Lenclast%=:\n\t"
-                aesenclast_xmm1_xmm0
-                "movdqu %%xmm0, %[dst]\n"
-                : [dst] "=m" (*b)
-                : [src] "m" (*a),
-                  [key] "r" (ctx->keyschenc),
-                  [rounds] "r" (ctx->rounds)
-                : "cc", "memory");
-#undef aesenc_xmm1_xmm0
-#undef aesenclast_xmm1_xmm0
-}
-
-
-static inline void
-do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b,
-              const unsigned char *a)
-{
-#define aesdec_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
-#define aesdeclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
-  asm volatile ("movdqu %[src], %%xmm0\n\t"     /* xmm0 := *a     */
-                "movdqa (%[key]), %%xmm1\n\t"
-                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
-                "movdqa 0x10(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0x20(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0x30(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0x40(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0x50(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0x60(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0x70(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0x80(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0x90(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0xa0(%[key]), %%xmm1\n\t"
-                "cmpl $10, %[rounds]\n\t"
-                "jz .Ldeclast%=\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0xb0(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0xc0(%[key]), %%xmm1\n\t"
-                "cmpl $12, %[rounds]\n\t"
-                "jz .Ldeclast%=\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0xd0(%[key]), %%xmm1\n\t"
-                aesdec_xmm1_xmm0
-                "movdqa 0xe0(%[key]), %%xmm1\n"
-
-                ".Ldeclast%=:\n\t"
-                aesdeclast_xmm1_xmm0
-                "movdqu %%xmm0, %[dst]\n"
-                : [dst] "=m" (*b)
-                : [src] "m" (*a),
-                  [key] "r" (ctx->keyschdec),
-                  [rounds] "r" (ctx->rounds)
-                : "cc", "memory");
-#undef aesdec_xmm1_xmm0
-#undef aesdeclast_xmm1_xmm0
-}
-
-
-/* Encrypt four blocks using the Intel AES-NI instructions.  Blocks are input
- * and output through SSE registers xmm1 to xmm4.  */
-static void
-do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
-{
-#define aesenc_xmm0_xmm1      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
-#define aesenc_xmm0_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
-#define aesenc_xmm0_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
-#define aesenc_xmm0_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
-#define aesenclast_xmm0_xmm1  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
-#define aesenclast_xmm0_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
-#define aesenclast_xmm0_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
-#define aesenclast_xmm0_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
-  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
-                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
-                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
-                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
-                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
-                "movdqa 0x10(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0x20(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0x30(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0x40(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0x50(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0x60(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0x70(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0x80(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0x90(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0xa0(%[key]), %%xmm0\n\t"
-                "cmpl $10, %[rounds]\n\t"
-                "jz .Ldeclast%=\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0xb0(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0xc0(%[key]), %%xmm0\n\t"
-                "cmpl $12, %[rounds]\n\t"
-                "jz .Ldeclast%=\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0xd0(%[key]), %%xmm0\n\t"
-                aesenc_xmm0_xmm1
-                aesenc_xmm0_xmm2
-                aesenc_xmm0_xmm3
-                aesenc_xmm0_xmm4
-                "movdqa 0xe0(%[key]), %%xmm0\n"
-
-                ".Ldeclast%=:\n\t"
-                aesenclast_xmm0_xmm1
-                aesenclast_xmm0_xmm2
-                aesenclast_xmm0_xmm3
-                aesenclast_xmm0_xmm4
-                : /* no output */
-                : [key] "r" (ctx->keyschenc),
-                  [rounds] "r" (ctx->rounds)
-                : "cc", "memory");
-#undef aesenc_xmm0_xmm1
-#undef aesenc_xmm0_xmm2
-#undef aesenc_xmm0_xmm3
-#undef aesenc_xmm0_xmm4
-#undef aesenclast_xmm0_xmm1
-#undef aesenclast_xmm0_xmm2
-#undef aesenclast_xmm0_xmm3
-#undef aesenclast_xmm0_xmm4
-}
-
-
-/* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
- * and output through SSE registers xmm1 to xmm4.  */
-static void
-do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
-{
-#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
-#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
-#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
-#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
-#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
-#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
-#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
-#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
-  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
-                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
-                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
-                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
-                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
-                "movdqa 0x10(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0x20(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0x30(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0x40(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0x50(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0x60(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0x70(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0x80(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0x90(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0xa0(%[key]), %%xmm0\n\t"
-                "cmpl $10, %[rounds]\n\t"
-                "jz .Ldeclast%=\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0xb0(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0xc0(%[key]), %%xmm0\n\t"
-                "cmpl $12, %[rounds]\n\t"
-                "jz .Ldeclast%=\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0xd0(%[key]), %%xmm0\n\t"
-                aesdec_xmm0_xmm1
-                aesdec_xmm0_xmm2
-                aesdec_xmm0_xmm3
-                aesdec_xmm0_xmm4
-                "movdqa 0xe0(%[key]), %%xmm0\n"
-
-                ".Ldeclast%=:\n\t"
-                aesdeclast_xmm0_xmm1
-                aesdeclast_xmm0_xmm2
-                aesdeclast_xmm0_xmm3
-                aesdeclast_xmm0_xmm4
-                : /* no output */
-                : [key] "r" (ctx->keyschdec),
-                  [rounds] "r" (ctx->rounds)
-                : "cc", "memory");
-#undef aesdec_xmm0_xmm1
-#undef aesdec_xmm0_xmm2
-#undef aesdec_xmm0_xmm3
-#undef aesdec_xmm0_xmm4
-#undef aesdeclast_xmm0_xmm1
-#undef aesdeclast_xmm0_xmm2
-#undef aesdeclast_xmm0_xmm3
-#undef aesdeclast_xmm0_xmm4
-}
-
-
-/* Perform a CFB encryption or decryption round using the
-   initialization vector IV and the input block A.  Write the result
-   to the output block B and update IV.  IV needs to be 16 byte
-   aligned.  */
-static void
-do_aesni_cfb (const RIJNDAEL_context *ctx, int decrypt_flag,
-              unsigned char *iv, unsigned char *b, const unsigned char *a)
-{
-#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
-#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
-  asm volatile ("movdqa %[iv], %%xmm0\n\t"      /* xmm0 := IV     */
-                "movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
-                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
-                "movdqa 0x10(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x20(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x30(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x40(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x50(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x60(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x70(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x80(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x90(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xa0(%[key]), %%xmm1\n\t"
-                "cmpl $10, %[rounds]\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xb0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xc0(%[key]), %%xmm1\n\t"
-                "cmpl $12, %[rounds]\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xd0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xe0(%[key]), %%xmm1\n"
-
-                ".Lenclast%=:\n\t"
-                aesenclast_xmm1_xmm0
-                "movdqu %[src], %%xmm1\n\t"      /* Save input.  */
-                "pxor %%xmm1, %%xmm0\n\t"        /* xmm0 = input ^ IV  */
-
-                "cmpl $1, %[decrypt]\n\t"
-                "jz .Ldecrypt_%=\n\t"
-                "movdqa %%xmm0, %[iv]\n\t"       /* [encrypt] Store IV.  */
-                "jmp .Lleave_%=\n"
-                ".Ldecrypt_%=:\n\t"
-                "movdqa %%xmm1, %[iv]\n"         /* [decrypt] Store IV.  */
-                ".Lleave_%=:\n\t"
-                "movdqu %%xmm0, %[dst]\n"        /* Store output.   */
-                : [iv] "+m" (*iv), [dst] "=m" (*b)
-                : [src] "m" (*a),
-                  [key] "r" (ctx->keyschenc),
-                  [rounds] "g" (ctx->rounds),
-                  [decrypt] "m" (decrypt_flag)
-                : "cc", "memory");
-#undef aesenc_xmm1_xmm0
-#undef aesenclast_xmm1_xmm0
-}
-
-/* Perform a CTR encryption round using the counter CTR and the input
-   block A.  Write the result to the output block B and update CTR.
-   CTR needs to be a 16 byte aligned little-endian value.  */
-static void
-do_aesni_ctr (const RIJNDAEL_context *ctx,
-              unsigned char *ctr, unsigned char *b, const unsigned char *a)
-{
-#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
-#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
-
-  asm volatile ("movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5)  */
-                "pcmpeqd %%xmm1, %%xmm1\n\t"
-                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
-
-                "pshufb %%xmm6, %%xmm5\n\t"
-                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++ (big endian) */
-
-                /* detect if 64-bit carry handling is needed */
-                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
-                "jne    .Lno_carry%=\n\t"
-                "cmpl   $0xffffffff, 12(%[ctr])\n\t"
-                "jne    .Lno_carry%=\n\t"
-
-                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
-                "psubq   %%xmm1, %%xmm5\n\t"    /* add carry to upper 64bits */
-
-                ".Lno_carry%=:\n\t"
-
-                "pshufb %%xmm6, %%xmm5\n\t"
-                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).       */
-
-                "pxor (%[key]), %%xmm0\n\t"     /* xmm1 ^= key[0]    */
-                "movdqa 0x10(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x20(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x30(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x40(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x50(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x60(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x70(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x80(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x90(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xa0(%[key]), %%xmm1\n\t"
-                "cmpl $10, %[rounds]\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xb0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xc0(%[key]), %%xmm1\n\t"
-                "cmpl $12, %[rounds]\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xd0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xe0(%[key]), %%xmm1\n"
-
-                ".Lenclast%=:\n\t"
-                aesenclast_xmm1_xmm0
-                "movdqu %[src], %%xmm1\n\t"      /* xmm1 := input   */
-                "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR ^= input  */
-                "movdqu %%xmm0, %[dst]"          /* Store EncCTR.    */
-
-                : [dst] "=m" (*b)
-                : [src] "m" (*a),
-                  [ctr] "r" (ctr),
-                  [key] "r" (ctx->keyschenc),
-                  [rounds] "g" (ctx->rounds)
-                : "cc", "memory");
-#undef aesenc_xmm1_xmm0
-#undef aesenclast_xmm1_xmm0
-}
-
-
-/* Four blocks at a time variant of do_aesni_ctr.  */
-static void
-do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
-                unsigned char *ctr, unsigned char *b, const unsigned char *a)
-{
-#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
-#define aesenc_xmm1_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t"
-#define aesenc_xmm1_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t"
-#define aesenc_xmm1_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t"
-#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
-#define aesenclast_xmm1_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t"
-#define aesenclast_xmm1_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t"
-#define aesenclast_xmm1_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
-
-  /* Register usage:
-      esi   keyschedule
-      xmm0  CTR-0
-      xmm1  temp / round key
-      xmm2  CTR-1
-      xmm3  CTR-2
-      xmm4  CTR-3
-      xmm5  copy of *ctr
-      xmm6  endian swapping mask
-   */
-
-  asm volatile ("movdqa %%xmm5, %%xmm0\n\t"   /* xmm0, xmm2 := CTR (xmm5) */
-                "movdqa %%xmm0, %%xmm2\n\t"
-                "pcmpeqd %%xmm1, %%xmm1\n\t"
-                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
-
-                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2) */
-                "psubq  %%xmm1, %%xmm2\n\t"     /* xmm2++           */
-                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
-                "psubq  %%xmm1, %%xmm3\n\t"     /* xmm3++           */
-                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3     */
-                "psubq  %%xmm1, %%xmm4\n\t"     /* xmm4++           */
-                "movdqa %%xmm4, %%xmm5\n\t"     /* xmm5 := xmm4     */
-                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++           */
-
-                /* detect if 64-bit carry handling is needed */
-                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
-                "jne    .Lno_carry%=\n\t"
-                "movl   12(%[ctr]), %%esi\n\t"
-                "bswapl %%esi\n\t"
-                "cmpl   $0xfffffffc, %%esi\n\t"
-                "jb     .Lno_carry%=\n\t"       /* no carry */
-
-                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
-                "je     .Lcarry_xmm5%=\n\t"     /* esi == 0xfffffffc */
-                "cmpl   $0xfffffffe, %%esi\n\t"
-                "jb     .Lcarry_xmm4%=\n\t"     /* esi == 0xfffffffd */
-                "je     .Lcarry_xmm3%=\n\t"     /* esi == 0xfffffffe */
-                /* esi == 0xffffffff */
-
-                "psubq   %%xmm1, %%xmm2\n\t"
-                ".Lcarry_xmm3%=:\n\t"
-                "psubq   %%xmm1, %%xmm3\n\t"
-                ".Lcarry_xmm4%=:\n\t"
-                "psubq   %%xmm1, %%xmm4\n\t"
-                ".Lcarry_xmm5%=:\n\t"
-                "psubq   %%xmm1, %%xmm5\n\t"
-
-                ".Lno_carry%=:\n\t"
-                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */
-                "movl %[rounds], %%esi\n\t"
-
-                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
-                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
-                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
-                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
-                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
-
-                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
-                "movdqa 0x10(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0x20(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0x30(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0x40(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0x50(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0x60(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0x70(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0x80(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0x90(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0xa0(%[key]), %%xmm1\n\t"
-                "cmpl $10, %%esi\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0xb0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0xc0(%[key]), %%xmm1\n\t"
-                "cmpl $12, %%esi\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0xd0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                aesenc_xmm1_xmm2
-                aesenc_xmm1_xmm3
-                aesenc_xmm1_xmm4
-                "movdqa 0xe0(%[key]), %%xmm1\n"
-
-                ".Lenclast%=:\n\t"
-                aesenclast_xmm1_xmm0
-                aesenclast_xmm1_xmm2
-                aesenclast_xmm1_xmm3
-                aesenclast_xmm1_xmm4
-
-                "movdqu (%[src]), %%xmm1\n\t"    /* Get block 1.      */
-                "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR-1 ^= input */
-                "movdqu %%xmm0, (%[dst])\n\t"    /* Store block 1     */
-
-                "movdqu 16(%[src]), %%xmm1\n\t"  /* Get block 2.      */
-                "pxor %%xmm1, %%xmm2\n\t"        /* EncCTR-2 ^= input */
-                "movdqu %%xmm2, 16(%[dst])\n\t"  /* Store block 2.    */
-
-                "movdqu 32(%[src]), %%xmm1\n\t"  /* Get block 3.      */
-                "pxor %%xmm1, %%xmm3\n\t"        /* EncCTR-3 ^= input */
-                "movdqu %%xmm3, 32(%[dst])\n\t"  /* Store block 3.    */
-
-                "movdqu 48(%[src]), %%xmm1\n\t"  /* Get block 4.      */
-                "pxor %%xmm1, %%xmm4\n\t"        /* EncCTR-4 ^= input */
-                "movdqu %%xmm4, 48(%[dst])"      /* Store block 4.   */
-
-                :
-                : [ctr] "r" (ctr),
-                  [src] "r" (a),
-                  [dst] "r" (b),
-                  [key] "r" (ctx->keyschenc),
-                  [rounds] "g" (ctx->rounds)
-                : "%esi", "cc", "memory");
-#undef aesenc_xmm1_xmm0
-#undef aesenc_xmm1_xmm2
-#undef aesenc_xmm1_xmm3
-#undef aesenc_xmm1_xmm4
-#undef aesenclast_xmm1_xmm0
-#undef aesenclast_xmm1_xmm2
-#undef aesenclast_xmm1_xmm3
-#undef aesenclast_xmm1_xmm4
-}
-
-#endif /*USE_AESNI*/
-
-
 static unsigned int
 rijndael_encrypt (void *context, byte *b, const byte *a)
 {
@@ -1653,9 +597,7 @@ rijndael_encrypt (void *context, byte *b, const byte *a)
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
-      aesni_prepare ();
-      do_aesni_enc (ctx, b, a);
-      aesni_cleanup ();
+      _gcry_aes_aesni_encrypt (ctx, b, a);
       burn_stack = 0;
     }
 #endif /*USE_AESNI*/
@@ -1703,16 +645,8 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
-      aesni_prepare ();
-      for ( ;nblocks; nblocks-- )
-        {
-          do_aesni_cfb (ctx, 0, iv, outbuf, inbuf);
-          outbuf += BLOCKSIZE;
-          inbuf  += BLOCKSIZE;
-        }
-      aesni_cleanup ();
-
-      burn_depth = 0; /* No stack usage. */
+      _gcry_aes_aesni_cfb_enc (ctx, outbuf, inbuf, iv, nblocks);
+      burn_depth = 0;
     }
 #endif /*USE_AESNI*/
   else
@@ -1747,40 +681,21 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   unsigned char *last_iv;
   unsigned int burn_depth = 48 + 2*sizeof(int);
-#ifdef USE_AESNI
-  int use_aesni = ctx->use_aesni;
-#endif
 
+  if (0)
+    ;
 #ifdef USE_AESNI
-  if (use_aesni)
-    aesni_prepare ();
-#endif /*USE_AESNI*/
-
-  last_iv = iv;
-
-  for ( ;nblocks; nblocks-- )
+  else if (ctx->use_aesni)
     {
-      if (0)
-        ;
-#ifdef USE_AESNI
-      else if (use_aesni)
-        {
-          /* ~35% speed up on Sandy-Bridge when doing xoring and copying with
-             SSE registers.  */
-          asm volatile ("movdqu %[iv], %%xmm0\n\t"
-                        "movdqu %[inbuf], %%xmm1\n\t"
-                        "pxor %%xmm0, %%xmm1\n\t"
-                        "movdqu %%xmm1, %[outbuf]\n\t"
-                        : /* No output */
-                        : [iv] "m" (*last_iv),
-                          [inbuf] "m" (*inbuf),
-                          [outbuf] "m" (*outbuf)
-                        : "memory" );
-
-          do_aesni_enc (ctx, outbuf, outbuf);
-        }
+      _gcry_aes_aesni_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac);
+      burn_depth = 0;
+    }
 #endif /*USE_AESNI*/
-      else
+  else
+    {
+      last_iv = iv;
+
+      for ( ;nblocks; nblocks-- )
         {
           buf_xor(outbuf, inbuf, last_iv, BLOCKSIZE);
 
@@ -1792,39 +707,17 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
 #endif /*USE_PADLOCK*/
           else
             do_encrypt (ctx, outbuf, outbuf );
-        }
 
-      last_iv = outbuf;
-      inbuf += BLOCKSIZE;
-      if (!cbc_mac)
-        outbuf += BLOCKSIZE;
-    }
+          last_iv = outbuf;
+          inbuf += BLOCKSIZE;
+          if (!cbc_mac)
+            outbuf += BLOCKSIZE;
+        }
 
-  if (last_iv != iv)
-    {
-      if (0)
-        ;
-#ifdef USE_AESNI
-      else if (use_aesni)
-        asm volatile ("movdqu %[last], %%xmm0\n\t"
-                      "movdqu %%xmm0, %[iv]\n\t"
-                      : /* No output */
-                      : [last] "m" (*last_iv),
-                        [iv] "m" (*iv)
-                      : "memory" );
-#endif /*USE_AESNI*/
-      else
+      if (last_iv != iv)
         buf_cpy (iv, last_iv, BLOCKSIZE);
     }
 
-#ifdef USE_AESNI
-   if (use_aesni)
-     {
-       aesni_cleanup ();
-       burn_depth = 0; /* No stack usage. */
-     }
-#endif /*USE_AESNI*/
-
   if (burn_depth)
     _gcry_burn_stack (burn_depth);
 }
@@ -1851,34 +744,8 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
-      static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
-        { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-
-      aesni_prepare ();
-
-      asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
-                    "movdqa %[ctr], %%xmm5\n\t"  /* Preload CTR */
-                    : /* No output */
-                    : [mask] "m" (*be_mask),
-                      [ctr] "m" (*ctr)
-                    : "memory");
-
-      for ( ;nblocks > 3 ; nblocks -= 4 )
-        {
-          do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
-          outbuf += 4*BLOCKSIZE;
-          inbuf  += 4*BLOCKSIZE;
-        }
-      for ( ;nblocks; nblocks-- )
-        {
-          do_aesni_ctr (ctx, ctr, outbuf, inbuf);
-          outbuf += BLOCKSIZE;
-          inbuf  += BLOCKSIZE;
-        }
-      aesni_cleanup ();
-      aesni_cleanup_2_6 ();
-
-      burn_depth = 0; /* No stack usage. */
+      _gcry_aes_aesni_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks);
+      burn_depth = 0;
     }
 #endif /*USE_AESNI*/
   else
@@ -2076,9 +943,7 @@ rijndael_decrypt (void *context, byte *b, const byte *a)
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
-      aesni_prepare ();
-      do_aesni_dec (ctx, b, a);
-      aesni_cleanup ();
+      _gcry_aes_aesni_decrypt (ctx, b, a);
       burn_stack = 0;
     }
 #endif /*USE_AESNI*/
@@ -2124,61 +989,8 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
-      aesni_prepare ();
-
-      /* CFB decryption can be parallelized */
-      for ( ;nblocks >= 4; nblocks -= 4)
-        {
-          asm volatile
-            ("movdqu (%[iv]),        %%xmm1\n\t" /* load input blocks */
-             "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
-             "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
-             "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
-
-             "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */
-             "movdqu %%xmm0,         (%[iv])\n\t"
-             : /* No output */
-             : [inbuf] "r" (inbuf), [iv] "r" (iv)
-             : "memory");
-
-          do_aesni_enc_vec4 (ctx);
-
-          asm volatile
-            ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
-             "pxor %%xmm5, %%xmm1\n\t"
-             "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
-
-             "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
-             "pxor %%xmm5, %%xmm2\n\t"
-             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
-
-             "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
-             "pxor %%xmm5, %%xmm3\n\t"
-             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
-
-             "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
-             "pxor %%xmm5, %%xmm4\n\t"
-             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
-
-             : /* No output */
-             : [inbuf] "r" (inbuf),
-               [outbuf] "r" (outbuf)
-             : "memory");
-
-          outbuf += 4*BLOCKSIZE;
-          inbuf  += 4*BLOCKSIZE;
-        }
-
-      for ( ;nblocks; nblocks-- )
-        {
-          do_aesni_cfb (ctx, 1, iv, outbuf, inbuf);
-          outbuf += BLOCKSIZE;
-          inbuf  += BLOCKSIZE;
-        }
-      aesni_cleanup ();
-      aesni_cleanup_2_6 ();
-
-      burn_depth = 0; /* No stack usage. */
+      _gcry_aes_aesni_cfb_dec (ctx, outbuf, inbuf, iv, nblocks);
+      burn_depth = 0;
     }
 #endif /*USE_AESNI*/
   else
@@ -2218,87 +1030,8 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
-      aesni_prepare ();
-
-      asm volatile
-        ("movdqu %[iv], %%xmm5\n\t"	/* use xmm5 as fast IV storage */
-         : /* No output */
-         : [iv] "m" (*iv)
-         : "memory");
-
-      for ( ;nblocks > 3 ; nblocks -= 4 )
-        {
-          asm volatile
-            ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
-             "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
-             "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
-             "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
-             : /* No output */
-             : [inbuf] "r" (inbuf)
-             : "memory");
-
-          do_aesni_dec_vec4 (ctx);
-
-          asm volatile
-            ("pxor %%xmm5, %%xmm1\n\t"			/* xor IV with output */
-             "movdqu 0*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
-             "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
-
-             "pxor %%xmm5, %%xmm2\n\t"			/* xor IV with output */
-             "movdqu 1*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
-             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
-
-             "pxor %%xmm5, %%xmm3\n\t"			/* xor IV with output */
-             "movdqu 2*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
-             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
-
-             "pxor %%xmm5, %%xmm4\n\t"			/* xor IV with output */
-             "movdqu 3*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
-             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
-
-             : /* No output */
-             : [inbuf] "r" (inbuf),
-               [outbuf] "r" (outbuf)
-             : "memory");
-
-          outbuf += 4*BLOCKSIZE;
-          inbuf  += 4*BLOCKSIZE;
-        }
-
-      for ( ;nblocks; nblocks-- )
-        {
-          asm volatile
-            ("movdqu %[inbuf], %%xmm2\n\t"	/* use xmm2 as savebuf */
-             : /* No output */
-             : [inbuf] "m" (*inbuf)
-             : "memory");
-
-          /* uses only xmm0 and xmm1 */
-          do_aesni_dec (ctx, outbuf, inbuf);
-
-          asm volatile
-            ("movdqu %[outbuf], %%xmm0\n\t"
-             "pxor %%xmm5, %%xmm0\n\t"		/* xor IV with output */
-             "movdqu %%xmm0, %[outbuf]\n\t"
-             "movdqu %%xmm2, %%xmm5\n\t"	/* store savebuf as new IV */
-             : /* No output */
-             : [outbuf] "m" (*outbuf)
-             : "memory");
-
-          outbuf += BLOCKSIZE;
-          inbuf  += BLOCKSIZE;
-        }
-
-      asm volatile
-        ("movdqu %%xmm5, %[iv]\n\t"	/* store IV */
-         : /* No output */
-         : [iv] "m" (*iv)
-         : "memory");
-
-      aesni_cleanup ();
-      aesni_cleanup_2_6 ();
-
-      burn_depth = 0; /* No stack usage. */
+      _gcry_aes_aesni_cbc_dec (ctx, outbuf, inbuf, iv, nblocks);
+      burn_depth = 0;
     }
 #endif /*USE_AESNI*/
   else
diff --git a/configure.ac b/configure.ac
index a0d5fc9..369be25 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1698,6 +1698,13 @@ if test "$found" = "1" ; then
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-arm.lo"
       ;;
    esac
+
+   case "$mpi_cpu_arch" in
+     x86)
+         # Build with the AES-NI implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aesni.lo"
+      ;;
+   esac
 fi
 
 LIST_MEMBER(twofish, $enabled_ciphers)