From jussi.kivilinna at iki.fi Sat Jan 6 19:06:03 2018
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sat, 06 Jan 2018 20:06:03 +0200
Subject: [PATCH] mpi/ec: fix when 'unsigned long' is 32-bit but limb size is 64-bit
Message-ID: <151526196339.10194.13412606082380457756.stgit@localhost.localdomain>

* mpi/ec.c (ec_addm_25519, ec_subm_25519, ec_mulm_25519): Cast '1' to
  mpi_limb_t before left shift.
--

Patch fixes mpi/ec.c compiler warnings and failing test cases on Win64.

Signed-off-by: Jussi Kivilinna
---
 0 files changed

diff --git a/mpi/ec.c b/mpi/ec.c
index ca293ca46..2c396a741 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -391,7 +391,7 @@ ec_addm_25519 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
   borrow = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize);
   mpih_set_cond (n, ctx->p->d, wsize, (borrow != 0UL));
   _gcry_mpih_add_n (wp, wp, n, wsize);
-  wp[LIMB_SIZE_25519-1] &= ~(1UL << (255 % BITS_PER_MPI_LIMB));
+  wp[LIMB_SIZE_25519-1] &= ~((mpi_limb_t)1 << (255 % BITS_PER_MPI_LIMB));
 }
 
 static void
@@ -413,7 +413,7 @@ ec_subm_25519 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
   borrow = _gcry_mpih_sub_n (wp, up, vp, wsize);
   mpih_set_cond (n, ctx->p->d, wsize, (borrow != 0UL));
   _gcry_mpih_add_n (wp, wp, n, wsize);
-  wp[LIMB_SIZE_25519-1] &= ~(1UL << (255 % BITS_PER_MPI_LIMB));
+  wp[LIMB_SIZE_25519-1] &= ~((mpi_limb_t)1 << (255 % BITS_PER_MPI_LIMB));
 }
 
 static void
@@ -436,7 +436,7 @@ ec_mulm_25519 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
 
   _gcry_mpih_mul_n (n, up, vp, wsize);
   memcpy (wp, n, wsize * BYTES_PER_MPI_LIMB);
-  wp[LIMB_SIZE_25519-1] &= ~(1UL << (255 % BITS_PER_MPI_LIMB));
+  wp[LIMB_SIZE_25519-1] &= ~((mpi_limb_t)1 << (255 % BITS_PER_MPI_LIMB));
 
   memcpy (m, n+LIMB_SIZE_25519-1, (wsize+1) * BYTES_PER_MPI_LIMB);
   _gcry_mpih_rshift (m, m, LIMB_SIZE_25519+1, (255 % BITS_PER_MPI_LIMB));
@@ -457,7 +457,7 @@ ec_mulm_25519 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
   memset (m, 0, wsize * BYTES_PER_MPI_LIMB);
   msb = (wp[LIMB_SIZE_25519-1] >> (255 % BITS_PER_MPI_LIMB));
   m[0] = (m[LIMB_SIZE_25519] * 2 + msb) * 19;
-  wp[LIMB_SIZE_25519-1] &= ~(1UL << (255 % BITS_PER_MPI_LIMB));
+  wp[LIMB_SIZE_25519-1] &= ~((mpi_limb_t)1 << (255 % BITS_PER_MPI_LIMB));
   _gcry_mpih_add_n (wp, wp, m, wsize);
 
   m[0] = 0;

From jussi.kivilinna at iki.fi Sat Jan 6 19:03:06 2018
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sat, 06 Jan 2018 20:03:06 +0200
Subject: [PATCH 3/3] Add ARMv8/AArch64 implementation of chacha20
In-Reply-To: <151526177666.9411.1967680231927273017.stgit@localhost.localdomain>
References: <151526177666.9411.1967680231927273017.stgit@localhost.localdomain>
Message-ID: <151526178679.9411.11839462781925970666.stgit@localhost.localdomain>

* cipher/Makefile.am: Add 'chacha20-aarch64.S'.
* cipher/chacha20-aarch64.S: New.
* cipher/chacha20.c (USE_AARCH64_SIMD): New.
(_gcry_chacha20_aarch64_blocks4): New.
(chacha20_do_setkey): Add HWF selection for AArch64 implementation.
* configure.ac: Add 'chacha20-aarch64.lo'.
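
For orientation, the heart of the assembly below is the QUARTERROUND2 macro,
which evaluates two ChaCha20 quarter rounds at a time; each 4x32-bit vector
lane holds the corresponding state word of one of four blocks processed in
parallel, and the rotation counts are the usual 16, 12, 8 and 7. As a rough
sketch only (plain C, not code from this patch), a single scalar quarter
round is:

  #include <stdint.h>

  static inline uint32_t rotl32 (uint32_t x, int c)
  {
    return (x << c) | (x >> (32 - c));
  }

  /* One scalar ChaCha20 quarter round over four 32-bit state words. */
  void quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
  {
    *a += *b; *d ^= *a; *d = rotl32 (*d, 16);
    *c += *d; *b ^= *c; *b = rotl32 (*b, 12);
    *a += *b; *d ^= *a; *d = rotl32 (*d, 8);
    *c += *d; *b ^= *c; *b = rotl32 (*b, 7);
  }

In the vector code the rotation by 16 is done with a single rev32 on
halfwords (ROTATE2_16), the other rotation counts with a shl/sri pair
(ROTATE2), and transpose_4x4 de-interleaves the four keystream blocks before
they are XORed with the source data.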
-- Benchmark on Cortex-A53 (1152 Mhz): Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 7.91 ns/B 120.6 MiB/s 9.11 c/B STREAM dec | 7.91 ns/B 120.6 MiB/s 9.11 c/B After (1.66x faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 4.74 ns/B 201.2 MiB/s 5.46 c/B STREAM dec | 4.74 ns/B 201.3 MiB/s 5.46 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/Makefile.am b/cipher/Makefile.am index a24b117c2..3c4eae0b9 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -65,6 +65,7 @@ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S chacha20-armv7-neon.S \ + chacha20-aarch64.S \ crc.c \ crc-intel-pclmul.c \ des.c des-amd64.S \ diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S new file mode 100644 index 000000000..f54e059aa --- /dev/null +++ b/cipher/chacha20-aarch64.S @@ -0,0 +1,309 @@ +/* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function + * + * Copyright (C) 2017,2018 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. 
+ */ + +#include + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \ + defined(USE_CHACHA20) + +.cpu generic+simd + +.text + +#define GET_DATA_POINTER(reg, name) \ + adrp reg, :got:name ; \ + ldr reg, [reg, #:got_lo12:name] ; + +/* register macros */ +#define INPUT x0 +#define DST x1 +#define SRC x2 +#define NBLKS x3 +#define ROUND x4 +#define INPUT_CTR x5 +#define INPUT_POS x6 +#define CTR x7 + +/* vector registers */ +#define X0 v16 +#define X1 v17 +#define X2 v18 +#define X3 v19 +#define X4 v20 +#define X5 v21 +#define X6 v22 +#define X7 v23 +#define X8 v24 +#define X9 v25 +#define X10 v26 +#define X11 v27 +#define X12 v28 +#define X13 v29 +#define X14 v30 +#define X15 v31 + +#define VCTR v0 +#define VTMP0 v1 +#define VTMP1 v2 +#define VTMP2 v3 +#define VTMP3 v4 +#define X12_TMP v5 +#define X13_TMP v6 + +/********************************************************************** + helper macros + **********************************************************************/ + +#define vpunpckldq(s1, s2, dst) \ + zip1 dst.4s, s2.4s, s1.4s; + +#define vpunpckhdq(s1, s2, dst) \ + zip2 dst.4s, s2.4s, s1.4s; + +#define vpunpcklqdq(s1, s2, dst) \ + zip1 dst.2d, s2.2d, s1.2d; + +#define vpunpckhqdq(s1, s2, dst) \ + zip2 dst.2d, s2.2d, s1.2d; + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + vpunpckhdq(x1, x0, t2); \ + vpunpckldq(x1, x0, x0); \ + \ + vpunpckldq(x3, x2, t1); \ + vpunpckhdq(x3, x2, x2); \ + \ + vpunpckhqdq(t1, x0, x1); \ + vpunpcklqdq(t1, x0, x0); \ + \ + vpunpckhqdq(x2, t2, x3); \ + vpunpcklqdq(x2, t2, x2); + +#define clear(x) \ + eor x.16b, x.16b, x.16b; + +/********************************************************************** + 4-way chacha20 + **********************************************************************/ + +#define ROTATE2(dst1,dst2,c,src1,src2) \ + shl dst1.4s, src1.4s, #(c); \ + shl dst2.4s, src2.4s, #(c); \ + sri dst1.4s, src1.4s, #(32 - (c)); \ + sri dst2.4s, src2.4s, #(32 - (c)); + +#define ROTATE2_16(dst1,dst2,src1,src2) \ + rev32 dst1.8h, src1.8h; \ + rev32 dst2.8h, src2.8h; + +#define XOR(d,s1,s2) \ + eor d.16b, s2.16b, s1.16b; + +#define PLUS(ds,s) \ + add ds.4s, ds.4s, s.4s; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2_16(d1, d2, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 12, tmp1, tmp2); \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2(d1, d2, 8, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 7, tmp1, tmp2); + +chacha20_data: +.align 4 +.Linc_counter: + .long 0,1,2,3 + +.align 3 +.globl _gcry_chacha20_aarch64_blocks4 +.type _gcry_chacha20_aarch64_blocks4,%function; + +_gcry_chacha20_aarch64_blocks4: + /* input: + * x0: input + * x1: dst + * x2: src + * x3: nblks (multiple of 4) + */ + + GET_DATA_POINTER(CTR, .Linc_counter); + add INPUT_CTR, INPUT, #(12*4); + mov INPUT_POS, INPUT; + ld1 {VCTR.16b}, [CTR]; + +.Loop4: + /* Construct counter vectors X12 and X13 */ + + ld1 {X15.16b}, [INPUT_CTR]; + mov ROUND, #20; + ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS]; + + dup X12.4s, X15.4s[0]; + dup X13.4s, X15.4s[1]; + ldr CTR, [INPUT_CTR]; + add X12.4s, X12.4s, VCTR.4s; + dup X0.4s, VTMP1.4s[0]; + dup X1.4s, VTMP1.4s[1]; + dup X2.4s, VTMP1.4s[2]; + dup X3.4s, VTMP1.4s[3]; + dup X14.4s, X15.4s[2]; + cmhi VTMP0.4s, VCTR.4s, 
X12.4s; + dup X15.4s, X15.4s[3]; + add CTR, CTR, #4; /* Update counter */ + dup X4.4s, VTMP2.4s[0]; + dup X5.4s, VTMP2.4s[1]; + dup X6.4s, VTMP2.4s[2]; + dup X7.4s, VTMP2.4s[3]; + sub X13.4s, X13.4s, VTMP0.4s; + dup X8.4s, VTMP3.4s[0]; + dup X9.4s, VTMP3.4s[1]; + dup X10.4s, VTMP3.4s[2]; + dup X11.4s, VTMP3.4s[3]; + mov X12_TMP.16b, X12.16b; + mov X13_TMP.16b, X13.16b; + str CTR, [INPUT_CTR]; + +.Lround2: + subs ROUND, ROUND, #2 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1) + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1) + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1) + b.ne .Lround2; + + ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32; + + PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */ + PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */ + + dup VTMP2.4s, VTMP0.4s[0]; /* INPUT + 0 * 4 */ + dup VTMP3.4s, VTMP0.4s[1]; /* INPUT + 1 * 4 */ + dup X12_TMP.4s, VTMP0.4s[2]; /* INPUT + 2 * 4 */ + dup X13_TMP.4s, VTMP0.4s[3]; /* INPUT + 3 * 4 */ + PLUS(X0, VTMP2); + PLUS(X1, VTMP3); + PLUS(X2, X12_TMP); + PLUS(X3, X13_TMP); + + dup VTMP2.4s, VTMP1.4s[0]; /* INPUT + 4 * 4 */ + dup VTMP3.4s, VTMP1.4s[1]; /* INPUT + 5 * 4 */ + dup X12_TMP.4s, VTMP1.4s[2]; /* INPUT + 6 * 4 */ + dup X13_TMP.4s, VTMP1.4s[3]; /* INPUT + 7 * 4 */ + ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS]; + mov INPUT_POS, INPUT; + PLUS(X4, VTMP2); + PLUS(X5, VTMP3); + PLUS(X6, X12_TMP); + PLUS(X7, X13_TMP); + + dup VTMP2.4s, VTMP0.4s[0]; /* INPUT + 8 * 4 */ + dup VTMP3.4s, VTMP0.4s[1]; /* INPUT + 9 * 4 */ + dup X12_TMP.4s, VTMP0.4s[2]; /* INPUT + 10 * 4 */ + dup X13_TMP.4s, VTMP0.4s[3]; /* INPUT + 11 * 4 */ + dup VTMP0.4s, VTMP1.4s[2]; /* INPUT + 14 * 4 */ + dup VTMP1.4s, VTMP1.4s[3]; /* INPUT + 15 * 4 */ + PLUS(X8, VTMP2); + PLUS(X9, VTMP3); + PLUS(X10, X12_TMP); + PLUS(X11, X13_TMP); + PLUS(X14, VTMP0); + PLUS(X15, VTMP1); + + transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2); + transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2); + transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2); + transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2); + + subs NBLKS, NBLKS, #4; + + ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; + ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32; + eor VTMP0.16b, X0.16b, VTMP0.16b; + eor VTMP1.16b, X4.16b, VTMP1.16b; + eor VTMP2.16b, X8.16b, VTMP2.16b; + eor VTMP3.16b, X12.16b, VTMP3.16b; + eor X12_TMP.16b, X1.16b, X12_TMP.16b; + eor X13_TMP.16b, X5.16b, X13_TMP.16b; + st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; + ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; + st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32; + ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32; + eor VTMP0.16b, X9.16b, VTMP0.16b; + eor VTMP1.16b, X13.16b, VTMP1.16b; + eor VTMP2.16b, X2.16b, VTMP2.16b; + eor VTMP3.16b, X6.16b, VTMP3.16b; + eor X12_TMP.16b, X10.16b, X12_TMP.16b; + eor X13_TMP.16b, X14.16b, X13_TMP.16b; + st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; + ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; + st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32; + eor VTMP0.16b, X3.16b, VTMP0.16b; + eor VTMP1.16b, X7.16b, VTMP1.16b; + eor VTMP2.16b, X11.16b, VTMP2.16b; + eor VTMP3.16b, X15.16b, VTMP3.16b; + st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; + + b.ne .Loop4; + + /* clear the used vector registers and stack */ + clear(VTMP0); + clear(VTMP1); + clear(VTMP2); + clear(VTMP3); + clear(X12_TMP); + clear(X13_TMP); + clear(X0); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + 
clear(X12); + clear(X13); + clear(X14); + clear(X15); + + eor x0, x0, x0 + ret +.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4; + +#endif + diff --git a/cipher/chacha20.c b/cipher/chacha20.c index ac6cc29e8..e89ad2e47 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -73,6 +73,17 @@ # endif #endif +/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly + * code. */ +#undef USE_AARCH64_SIMD +#ifdef ENABLE_NEON_SUPPORT +# if defined(__AARCH64EL__) \ + && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ + && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) +# define USE_AARCH64_SIMD 1 +# endif +#endif + /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI @@ -119,6 +130,13 @@ unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, #endif /* USE_ARMV7_NEON */ +#ifdef USE_AARCH64_SIMD + +unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst, + const byte *src, size_t nblks); + +#endif /* USE_AARCH64_SIMD */ + static const char *selftest (void); @@ -338,6 +356,10 @@ chacha20_do_setkey (CHACHA20_context_t *ctx, #ifdef USE_ARMV7_NEON ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif +#ifdef USE_AARCH64_SIMD + ctx->use_neon = (features & HWF_ARM_NEON) != 0; +#endif + (void)features; chacha20_keysetup (ctx, key, keylen); @@ -434,6 +456,20 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, } #endif +#ifdef USE_AARCH64_SIMD + if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; diff --git a/configure.ac b/configure.ac index a5aba144c..42cd4c27b 100644 --- a/configure.ac +++ b/configure.ac @@ -2230,6 +2230,10 @@ if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-ssse3.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-avx2.lo" ;; + aarch64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-aarch64.lo" + ;; esac if test x"$neonsupport" = xyes ; then From jussi.kivilinna at iki.fi Sat Jan 6 18:59:39 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 6 Jan 2018 19:59:39 +0200 Subject: [PATCH] Add ARMv8/AArch64 implementation of chacha20 In-Reply-To: <2a8e1f8d-a04a-4770-24f1-665a2b9e21a2@iki.fi> References: <150202138983.15504.16870943748027047649.stgit@localhost.localdomain> <8760dzpgzs.fsf@wheatstone.g10code.de> <2a8e1f8d-a04a-4770-24f1-665a2b9e21a2@iki.fi> Message-ID: On 12.08.2017 10:11, Jussi Kivilinna wrote: > On 07.08.2017 17:52, Werner Koch wrote: >> On Sun, 6 Aug 2017 14:09, jussi.kivilinna at iki.fi said: >> >>> Patch adds ARMv8/AArch64 SIMD implementation based on public domain >>> ARMv7/NEON implementation by Andrew Moon at: >>> https://github.com/floodyberry/chacha-opt >> >> Can you please contact the author and ask to clarify the license? I >> only found this in the README: >> >> Public Domain. or MIT >> >> This is not sufficient. We need to know who has put this into the PD. >> There are several MIT licenses. We need to know which one. And also >> the copyright holder. 
>> > > I've sent author e-mail on this issue, and now waiting for reply. > I tried to contact author with email and through github but have not got any response so far. I've prepared new implementations of poly1305 and chacha20 to replace existing ones that are based on source on Andrew Moon's github repositories. I'll send those to mailing list next. -Jussi From jussi.kivilinna at iki.fi Sat Jan 6 19:02:56 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 06 Jan 2018 20:02:56 +0200 Subject: [PATCH 1/3] New Poly1305 implementation Message-ID: <151526177666.9411.1967680231927273017.stgit@localhost.localdomain> * cipher/Makefile.am: Include '../mpi' for 'longlong.h'; Remove 'poly1305-sse2-amd64.S', 'poly1305-avx2-amd64.S' and 'poly1305-armv7-neon.S'. * cipher/poly1305-armv7-neon.S: Remove. * cipher/poly1305-avx2-amd64.S: Remove. * cipher/poly1305-sse2-amd64.S: Remove. * cipher/poly1305-internal.h (POLY1305_BLOCKSIZE) (POLY1305_STATE): New. (POLY1305_SYSV_FUNC_ABI, POLY1305_REF_BLOCKSIZE) (POLY1305_REF_STATESIZE, POLY1305_REF_ALIGNMENT) (POLY1305_USE_SSE2, POLY1305_SSE2_BLOCKSIZE, POLY1305_SSE2_STATESIZE) (POLY1305_SSE2_ALIGNMENT, POLY1305_USE_AVX2, POLY1305_AVX2_BLOCKSIZE) (POLY1305_AVX2_STATESIZE, POLY1305_AVX2_ALIGNMENT) (POLY1305_USE_NEON, POLY1305_NEON_BLOCKSIZE, POLY1305_NEON_STATESIZE) (POLY1305_NEON_ALIGNMENT, POLY1305_LARGEST_BLOCKSIZE) (POLY1305_LARGEST_STATESIZE, POLY1305_LARGEST_ALIGNMENT) (POLY1305_STATE_BLOCKSIZE, POLY1305_STATE_STATESIZE) (POLY1305_STATE_ALIGNMENT, OPS_FUNC_ABI, poly1305_key_s) (poly1305_ops_s): Remove. (poly1305_context_s): Rewrite. * cipher/poly1305.c (_gcry_poly1305_amd64_sse2_init_ext) (_gcry_poly1305_amd64_sse2_finish_ext) (_gcry_poly1305_amd64_sse2_blocks, poly1305_amd64_sse2_ops) (poly1305_init_ext_ref32, poly1305_blocks_ref32) (poly1305_finish_ext_ref32, poly1305_default_ops) (_gcry_poly1305_amd64_avx2_init_ext) (_gcry_poly1305_amd64_avx2_finish_ext) (_gcry_poly1305_amd64_avx2_blocks) (poly1305_amd64_avx2_ops, poly1305_get_state): Remove. (poly1305_init): Rewrite. (USE_MPI_64BIT, USE_MPI_32BIT): New. [USE_MPI_64BIT] (ADD_1305_64, MUL_MOD_1305_64, poly1305_blocks) (poly1305_final): New implementation using 64-bit limbs. [USE_MPI_32BIT] (UMUL_ADD_32, ADD_1305_32, MUL_MOD_1305_32) (poly1305_blocks): New implementation using 32-bit limbs. (_gcry_poly1305_update, _gcry_poly1305_finish) (_gcry_poly1305_init): Adapt to new implementation. * configure.ac: Remove 'poly1305-sse2-amd64.lo', 'poly1305-avx2-amd64.lo' and 'poly1305-armv7-neon.lo'. 
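
Poly1305 absorbs the message in 16-byte blocks, updating
h = (h + m) * r mod 2^130 - 5 and finally adding s to form the tag; the
rewrite computes this directly on 64-bit or 32-bit limbs
(USE_MPI_64BIT / USE_MPI_32BIT) instead of dispatching to the removed
per-CPU assembly. The modular reduction needs no division: since 2^130 is
congruent to 5 modulo 2^130 - 5, bits above position 130 can be folded back
in after multiplying them by 5. A toy C demonstration of that folding idea
on a single machine word (hypothetical helper, not code from this patch):

  #include <stdint.h>
  #include <stdio.h>

  /* Toy illustration: reduce x mod 2^k - c by folding the bits above
   * position k back in, using 2^k == c (mod 2^k - c).  Poly1305 uses the
   * same trick with k = 130 and c = 5 on a multi-limb accumulator. */
  static uint64_t fold_mod (uint64_t x, unsigned k, uint64_t c)
  {
    uint64_t p = ((uint64_t)1 << k) - c;
    while (x >> k)
      x = (x >> k) * c + (x & (((uint64_t)1 << k) - 1));
    if (x >= p)  /* at most one final conditional subtraction */
      x -= p;
    return x;
  }

  int main (void)
  {
    uint64_t x = 1234567890123ULL;
    /* 2^13 - 5 = 8187; folding matches the plain modulo operator. */
    printf ("%llu %llu\n", (unsigned long long)fold_mod (x, 13, 5),
            (unsigned long long)(x % 8187));
    return 0;
  }

The limb implementations apply the same idea to the 130-bit accumulator
after each multiplication by the clamped r.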
-- Intel Core i7-4790K CPU @ 4.00GHz (x86_64): | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.284 ns/B 3358.6 MiB/s 1.14 c/B Intel Core i7-4790K CPU @ 4.00GHz (i386): | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.888 ns/B 1073.9 MiB/s 3.55 c/B Cortex-A53 @ 1152Mhz (armv7): | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 4.40 ns/B 216.7 MiB/s 5.07 c/B Cortex-A53 @ 1152Mhz (aarch64): | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 2.60 ns/B 367.0 MiB/s 2.99 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/Makefile.am b/cipher/Makefile.am index bbfab4c82..08baa7c44 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,7 +78,6 @@ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ -poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \ rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \ rijndael-padlock.c rijndael-amd64.S rijndael-arm.S \ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \ diff --git a/cipher/poly1305-armv7-neon.S b/cipher/poly1305-armv7-neon.S deleted file mode 100644 index 13cb4a5d8..000000000 --- a/cipher/poly1305-armv7-neon.S +++ /dev/null @@ -1,744 +0,0 @@ -/* poly1305-armv7-neon.S - ARMv7/NEON implementation of Poly1305 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . 
- */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/poly1305-opt - */ - -#include - -#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ - defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_NEON) - -.syntax unified -.fpu neon -.arm - -#ifdef __PIC__ -# define GET_DATA_POINTER(reg, name, rtmp) \ - ldr reg, 1f; \ - ldr rtmp, 2f; \ - b 3f; \ - 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ - 2: .word name(GOT); \ - 3: add reg, pc, reg; \ - ldr reg, [reg, rtmp]; -#else -# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name -#endif - -#define UNALIGNED_LDMIA2(ptr, l0, l1) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0}; \ - vld1.32 {d0}, [ptr]!; \ - vmov l0, s0; \ - vmov l1, s1; \ - vpop {d0}; \ - b 2f; \ - 1: ldmia ptr!, {l0-l1}; \ - 2: ; - -#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d1}; \ - vld1.32 {d0-d1}, [ptr]!; \ - vmov l0, s0; \ - vmov l1, s1; \ - vmov l2, s2; \ - vmov l3, s3; \ - vpop {d0-d1}; \ - b 2f; \ - 1: ldmia ptr!, {l0-l3}; \ - 2: ; - -.text - -.p2align 2 -.Lpoly1305_init_constants_neon: -.long 0x3ffff03 -.long 0x3ffc0ff -.long 0x3f03fff -.long 0x00fffff - -.globl _gcry_poly1305_armv7_neon_init_ext -.type _gcry_poly1305_armv7_neon_init_ext,%function; -_gcry_poly1305_armv7_neon_init_ext: -.Lpoly1305_init_ext_neon_local: - stmfd sp!, {r4-r11, lr} - sub sp, sp, #32 - mov r14, r2 - and r2, r2, r2 - moveq r14, #-1 - UNALIGNED_LDMIA4(r1, r2, r3, r4, r5) - GET_DATA_POINTER(r7,.Lpoly1305_init_constants_neon,r8) - mov r6, r2 - mov r8, r2, lsr #26 - mov r9, r3, lsr #20 - mov r10, r4, lsr #14 - mov r11, r5, lsr #8 - orr r8, r8, r3, lsl #6 - orr r9, r9, r4, lsl #12 - orr r10, r10, r5, lsl #18 - ldmia r7, {r2-r5} - and r2, r2, r8 - and r3, r3, r9 - and r4, r4, r10 - and r5, r5, r11 - and r6, r6, 0x3ffffff - stmia r0!, {r2-r6} - eor r8, r8, r8 - str r8, [sp, #24] -.Lpoly1305_init_ext_neon_squareloop: - ldr r8, [sp, #24] - mov r12, #16 - cmp r8, #2 - beq .Lpoly1305_init_ext_neon_donesquaring - cmp r8, #1 - moveq r12, #64 - cmp r14, r12 - bls .Lpoly1305_init_ext_neon_donesquaring - add r8, #1 - str r8, [sp, #24] - mov r6, r6, lsl #1 - mov r2, r2, lsl #1 - umull r7, r8, r3, r3 - umull r9, r10, r6, r4 - umlal r7, r8, r6, r5 - umlal r9, r10, r2, r3 - add r11, r5, r5, lsl #2 - umlal r7, r8, r2, r4 - umlal r9, r10, r5, r11 - str r7, [sp, #16] - str r8, [sp, #20] - mov r2, r2, lsr #1 - mov r5, r5, lsl #1 - str r9, [sp, #8] - str r10, [sp, #12] - umull r7, r8, r2, r2 - umull r9, r10, r6, r2 - add r11, r3, r3, lsl #2 - add r12, r4, r4, lsl #2 - umlal r7, r8, r6, r3 - umlal r9, r10, r5, r11 - umlal r7, r8, r5, r12 - umlal r9, r10, r4, r12 - mov r6, r6, lsr #1 - mov r3, r3, lsl #1 - add r11, r2, r2, lsl #2 - str r7, [sp, #0] - str r8, [sp, #4] - umull r7, r8, r6, r6 - umlal r7, r8, r3, r12 - umlal r7, r8, r5, r11 - and r6, r7, 0x3ffffff - mov r11, r7, lsr #26 - orr r11, r11, r8, lsl #6 - ldr r7, [sp, #0] - ldr r8, [sp, #4] - adds r9, r9, r11 - adc r10, r10, #0 - and r2, r9, 0x3ffffff - mov r11, r9, lsr #26 - orr r11, r11, r10, lsl #6 - ldr r9, [sp, #8] - ldr r10, [sp, #12] - adds r7, r7, r11 - adc r8, r8, #0 - and r3, r7, 0x3ffffff - mov r11, r7, lsr #26 - orr r11, r11, r8, lsl #6 - ldr r7, [sp, #16] - ldr r8, [sp, #20] - adds r9, r9, r11 - adc r10, r10, #0 - and r4, r9, 0x3ffffff - mov r11, r9, lsr #26 - orr r11, r11, r10, lsl #6 - adds r7, r7, r11 - adc r8, r8, #0 - and r5, r7, 0x3ffffff - mov r11, r7, lsr #26 - orr r11, r11, r8, lsl #6 - add r11, r11, r11, lsl #2 - add r6, 
r6, r11 - mov r11, r6, lsr #26 - and r6, r6, 0x3ffffff - add r2, r2, r11 - stmia r0!, {r2-r6} - b .Lpoly1305_init_ext_neon_squareloop -.Lpoly1305_init_ext_neon_donesquaring: - mov r2, #2 - ldr r14, [sp, #24] - sub r14, r2, r14 - mov r3, r14, lsl #4 - add r3, r3, r14, lsl #2 - add r0, r0, r3 - eor r2, r2, r2 - eor r3, r3, r3 - eor r4, r4, r4 - eor r5, r5, r5 - eor r6, r6, r6 - stmia r0!, {r2-r6} - stmia r0!, {r2-r6} - UNALIGNED_LDMIA4(r1, r2, r3, r4, r5) - stmia r0, {r2-r6} - add sp, sp, #32 - ldmfd sp!, {r4-r11, lr} - mov r0, #(9*4+32) - bx lr -.ltorg -.size _gcry_poly1305_armv7_neon_init_ext,.-_gcry_poly1305_armv7_neon_init_ext; - -.globl _gcry_poly1305_armv7_neon_blocks -.type _gcry_poly1305_armv7_neon_blocks,%function; -_gcry_poly1305_armv7_neon_blocks: -.Lpoly1305_blocks_neon_local: - vmov.i32 q0, #0xffffffff - vmov.i32 d4, #1 - vsubw.u32 q0, q0, d4 - vstmdb sp!, {q4,q5,q6,q7} - stmfd sp!, {r4-r11, lr} - mov r8, sp - and sp, sp, #~63 - sub sp, sp, #192 - str r0, [sp, #108] - str r1, [sp, #112] - str r2, [sp, #116] - str r8, [sp, #120] - mov r3, r0 - mov r0, r1 - mov r1, r2 - mov r2, r3 - ldr r8, [r2, #116] - veor d15, d15, d15 - vorr.i32 d15, #(1 << 24) - tst r8, #2 - beq .Lpoly1305_blocks_neon_skip_shift8 - vshr.u64 d15, #32 -.Lpoly1305_blocks_neon_skip_shift8: - tst r8, #4 - beq .Lpoly1305_blocks_neon_skip_shift16 - veor d15, d15, d15 -.Lpoly1305_blocks_neon_skip_shift16: - vst1.64 d15, [sp, :64] - tst r8, #1 - bne .Lpoly1305_blocks_neon_started - vld1.64 {q0-q1}, [r0]! - vswp d1, d2 - vmovn.i64 d21, q0 - vshrn.i64 d22, q0, #26 - vshrn.u64 d24, q1, #14 - vext.8 d0, d0, d2, #4 - vext.8 d1, d1, d3, #4 - vshr.u64 q1, q1, #32 - vshrn.i64 d23, q0, #20 - vshrn.u64 d25, q1, #8 - vand.i32 d21, #0x03ffffff - vand.i32 q11, #0x03ffffff - vand.i32 q12, #0x03ffffff - orr r8, r8, #1 - sub r1, r1, #32 - str r8, [r2, #116] - vorr d25, d25, d15 - b .Lpoly1305_blocks_neon_setupr20 -.Lpoly1305_blocks_neon_started: - add r9, r2, #60 - vldm r9, {d21-d25} -.Lpoly1305_blocks_neon_setupr20: - vmov.i32 d0, #5 - tst r8, #(8|16) - beq .Lpoly1305_blocks_neon_setupr20_simple - tst r8, #(8) - beq .Lpoly1305_blocks_neon_setupr20_r_1 - mov r9, r2 - add r10, r2, #20 - vld1.64 {q9}, [r9]! - vld1.64 {q8}, [r10]! - vld1.64 {d2}, [r9] - vld1.64 {d20}, [r10] - b .Lpoly1305_blocks_neon_setupr20_hard -.Lpoly1305_blocks_neon_setupr20_r_1: - mov r9, r2 - vmov.i32 d2, #1 - vld1.64 {q8}, [r9]! 
- veor q9, q9, q9 - vshr.u64 d2, d2, #32 - vld1.64 {d20}, [r9] -.Lpoly1305_blocks_neon_setupr20_hard: - vzip.i32 q8, q9 - vzip.i32 d20, d2 - b .Lpoly1305_blocks_neon_setups20 -.Lpoly1305_blocks_neon_setupr20_simple: - add r9, r2, #20 - vld1.64 {d2-d4}, [r9] - vdup.32 d16, d2[0] - vdup.32 d17, d2[1] - vdup.32 d18, d3[0] - vdup.32 d19, d3[1] - vdup.32 d20, d4[0] -.Lpoly1305_blocks_neon_setups20: - vmul.i32 q13, q8, d0[0] - vmov.i64 q15, 0x00000000ffffffff - vmul.i32 q14, q9, d0[0] - vshr.u64 q15, q15, #6 - cmp r1, #64 - blo .Lpoly1305_blocks_neon_try32 - add r9, sp, #16 - add r10, r2, #40 - add r11, sp, #64 - str r1, [sp, #116] - vld1.64 {d10-d12}, [r10] - vmov d14, d12 - vmul.i32 q6, q5, d0[0] -.Lpoly1305_blocks_neon_mainloop: - UNALIGNED_LDMIA4(r0, r2, r3, r4, r5) - vmull.u32 q0, d25, d12[0] - mov r7, r2, lsr #26 - vmlal.u32 q0, d24, d12[1] - mov r8, r3, lsr #20 - ldr r6, [sp, #0] - vmlal.u32 q0, d23, d13[0] - mov r9, r4, lsr #14 - vmlal.u32 q0, d22, d13[1] - orr r6, r6, r5, lsr #8 - vmlal.u32 q0, d21, d14[0] - orr r3, r7, r3, lsl #6 - vmull.u32 q1, d25, d12[1] - orr r4, r8, r4, lsl #12 - orr r5, r9, r5, lsl #18 - vmlal.u32 q1, d24, d13[0] - UNALIGNED_LDMIA4(r0, r7, r8, r9, r10) - vmlal.u32 q1, d23, d13[1] - mov r1, r7, lsr #26 - vmlal.u32 q1, d22, d14[0] - ldr r11, [sp, #4] - mov r12, r8, lsr #20 - vmlal.u32 q1, d21, d10[0] - mov r14, r9, lsr #14 - vmull.u32 q2, d25, d13[0] - orr r11, r11, r10, lsr #8 - orr r8, r1, r8, lsl #6 - vmlal.u32 q2, d24, d13[1] - orr r9, r12, r9, lsl #12 - vmlal.u32 q2, d23, d14[0] - orr r10, r14, r10, lsl #18 - vmlal.u32 q2, d22, d10[0] - mov r12, r3 - and r2, r2, #0x3ffffff - vmlal.u32 q2, d21, d10[1] - mov r14, r5 - vmull.u32 q3, d25, d13[1] - and r3, r7, #0x3ffffff - vmlal.u32 q3, d24, d14[0] - and r5, r8, #0x3ffffff - vmlal.u32 q3, d23, d10[0] - and r7, r9, #0x3ffffff - vmlal.u32 q3, d22, d10[1] - and r8, r14, #0x3ffffff - vmlal.u32 q3, d21, d11[0] - and r9, r10, #0x3ffffff - add r14, sp, #128 - vmull.u32 q4, d25, d14[0] - mov r10, r6 - vmlal.u32 q4, d24, d10[0] - and r6, r4, #0x3ffffff - vmlal.u32 q4, d23, d10[1] - and r4, r12, #0x3ffffff - vmlal.u32 q4, d22, d11[0] - stm r14, {r2-r11} - vmlal.u32 q4, d21, d11[1] - vld1.64 {d21-d24}, [r14, :256]! 
- vld1.64 {d25}, [r14, :64] - UNALIGNED_LDMIA4(r0, r2, r3, r4, r5) - vmlal.u32 q0, d25, d26 - mov r7, r2, lsr #26 - vmlal.u32 q0, d24, d27 - ldr r6, [sp, #0] - mov r8, r3, lsr #20 - vmlal.u32 q0, d23, d28 - mov r9, r4, lsr #14 - vmlal.u32 q0, d22, d29 - orr r6, r6, r5, lsr #8 - vmlal.u32 q0, d21, d20 - orr r3, r7, r3, lsl #6 - vmlal.u32 q1, d25, d27 - orr r4, r8, r4, lsl #12 - orr r5, r9, r5, lsl #18 - vmlal.u32 q1, d24, d28 - UNALIGNED_LDMIA4(r0, r7, r8, r9, r10) - vmlal.u32 q1, d23, d29 - mov r1, r7, lsr #26 - vmlal.u32 q1, d22, d20 - ldr r11, [sp, #4] - mov r12, r8, lsr #20 - vmlal.u32 q1, d21, d16 - mov r14, r9, lsr #14 - vmlal.u32 q2, d25, d28 - orr r11, r11, r10, lsr #8 - orr r8, r1, r8, lsl #6 - orr r9, r12, r9, lsl #12 - vmlal.u32 q2, d24, d29 - orr r10, r14, r10, lsl #18 - and r2, r2, #0x3ffffff - mov r12, r3 - vmlal.u32 q2, d23, d20 - mov r14, r5 - vmlal.u32 q2, d22, d16 - and r3, r7, #0x3ffffff - vmlal.u32 q2, d21, d17 - and r5, r8, #0x3ffffff - vmlal.u32 q3, d25, d29 - and r7, r9, #0x3ffffff - vmlal.u32 q3, d24, d20 - and r8, r14, #0x3ffffff - vmlal.u32 q3, d23, d16 - and r9, r10, #0x3ffffff - vmlal.u32 q3, d22, d17 - add r14, sp, #128 - vmlal.u32 q3, d21, d18 - mov r10, r6 - vmlal.u32 q4, d25, d20 - vmlal.u32 q4, d24, d16 - and r6, r4, #0x3ffffff - vmlal.u32 q4, d23, d17 - and r4, r12, #0x3ffffff - vmlal.u32 q4, d22, d18 - stm r14, {r2-r11} - vmlal.u32 q4, d21, d19 - vld1.64 {d21-d24}, [r14, :256]! - vld1.64 {d25}, [r14, :64] - vaddw.u32 q0, q0, d21 - vaddw.u32 q1, q1, d22 - vaddw.u32 q2, q2, d23 - vaddw.u32 q3, q3, d24 - vaddw.u32 q4, q4, d25 - vshr.u64 q11, q0, #26 - vand q0, q0, q15 - vadd.i64 q1, q1, q11 - vshr.u64 q12, q3, #26 - vand q3, q3, q15 - vadd.i64 q4, q4, q12 - vshr.u64 q11, q1, #26 - vand q1, q1, q15 - vadd.i64 q2, q2, q11 - vshr.u64 q12, q4, #26 - vand q4, q4, q15 - vadd.i64 q0, q0, q12 - vshl.i64 q12, q12, #2 - ldr r1, [sp, #116] - vadd.i64 q0, q0, q12 - vshr.u64 q11, q2, #26 - vand q2, q2, q15 - vadd.i64 q3, q3, q11 - sub r1, #64 - vshr.u64 q12, q0, #26 - vand q0, q0, q15 - vadd.i64 q1, q1, q12 - cmp r1, #64 - vshr.u64 q11, q3, #26 - vand q3, q3, q15 - vadd.i64 q4, q4, q11 - vmovn.i64 d21, q0 - str r1, [sp, #116] - vmovn.i64 d22, q1 - vmovn.i64 d23, q2 - vmovn.i64 d24, q3 - vmovn.i64 d25, q4 - bhs .Lpoly1305_blocks_neon_mainloop -.Lpoly1305_blocks_neon_try32: - cmp r1, #32 - blo .Lpoly1305_blocks_neon_done - tst r0, r0 - bne .Lpoly1305_blocks_loadm32 - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - veor q3, q3, q3 - veor q4, q4, q4 - b .Lpoly1305_blocks_continue32 -.Lpoly1305_blocks_loadm32: - vld1.64 {q0-q1}, [r0]! 
- veor q4, q4, q4 - vswp d1, d2 - veor q3, q3, q3 - vtrn.32 q0, q4 - vtrn.32 q1, q3 - vshl.i64 q2, q1, #12 - vshl.i64 q3, q3, #18 - vshl.i64 q1, q4, #6 - vmovl.u32 q4, d15 -.Lpoly1305_blocks_continue32: - vmlal.u32 q0, d25, d26 - vmlal.u32 q0, d24, d27 - vmlal.u32 q0, d23, d28 - vmlal.u32 q0, d22, d29 - vmlal.u32 q0, d21, d20 - vmlal.u32 q1, d25, d27 - vmlal.u32 q1, d24, d28 - vmlal.u32 q1, d23, d29 - vmlal.u32 q1, d22, d20 - vmlal.u32 q1, d21, d16 - vmlal.u32 q2, d25, d28 - vmlal.u32 q2, d24, d29 - vmlal.u32 q2, d23, d20 - vmlal.u32 q2, d22, d16 - vmlal.u32 q2, d21, d17 - vmlal.u32 q3, d25, d29 - vmlal.u32 q3, d24, d20 - vmlal.u32 q3, d23, d16 - vmlal.u32 q3, d22, d17 - vmlal.u32 q3, d21, d18 - vmlal.u32 q4, d25, d20 - vmlal.u32 q4, d24, d16 - vmlal.u32 q4, d23, d17 - vmlal.u32 q4, d22, d18 - vmlal.u32 q4, d21, d19 - vshr.u64 q11, q0, #26 - vand q0, q0, q15 - vadd.i64 q1, q1, q11 - vshr.u64 q12, q3, #26 - vand q3, q3, q15 - vadd.i64 q4, q4, q12 - vshr.u64 q11, q1, #26 - vand q1, q1, q15 - vadd.i64 q2, q2, q11 - vshr.u64 q12, q4, #26 - vand q4, q4, q15 - vadd.i64 q0, q0, q12 - vshl.i64 q12, q12, #2 - vadd.i64 q0, q0, q12 - vshr.u64 q11, q2, #26 - vand q2, q2, q15 - vadd.i64 q3, q3, q11 - vshr.u64 q12, q0, #26 - vand q0, q0, q15 - vadd.i64 q1, q1, q12 - vshr.u64 q11, q3, #26 - vand q3, q3, q15 - vadd.i64 q4, q4, q11 - vmovn.i64 d21, q0 - vmovn.i64 d22, q1 - vmovn.i64 d23, q2 - vmovn.i64 d24, q3 - vmovn.i64 d25, q4 -.Lpoly1305_blocks_neon_done: - tst r0, r0 - beq .Lpoly1305_blocks_neon_final - ldr r2, [sp, #108] - add r2, r2, #60 - vst1.64 {d21}, [r2]! - vst1.64 {d22-d25}, [r2] - b .Lpoly1305_blocks_neon_leave -.Lpoly1305_blocks_neon_final: - vadd.u32 d10, d0, d1 - vadd.u32 d13, d2, d3 - vadd.u32 d11, d4, d5 - ldr r5, [sp, #108] - vadd.u32 d14, d6, d7 - vadd.u32 d12, d8, d9 - vtrn.32 d10, d13 - vtrn.32 d11, d14 - vst1.64 {d10-d12}, [sp] - ldm sp, {r0-r4} - mov r12, r0, lsr #26 - and r0, r0, #0x3ffffff - add r1, r1, r12 - mov r12, r1, lsr #26 - and r1, r1, #0x3ffffff - add r2, r2, r12 - mov r12, r2, lsr #26 - and r2, r2, #0x3ffffff - add r3, r3, r12 - mov r12, r3, lsr #26 - and r3, r3, #0x3ffffff - add r4, r4, r12 - mov r12, r4, lsr #26 - and r4, r4, #0x3ffffff - add r12, r12, r12, lsl #2 - add r0, r0, r12 - mov r12, r0, lsr #26 - and r0, r0, #0x3ffffff - add r1, r1, r12 - mov r12, r1, lsr #26 - and r1, r1, #0x3ffffff - add r2, r2, r12 - mov r12, r2, lsr #26 - and r2, r2, #0x3ffffff - add r3, r3, r12 - mov r12, r3, lsr #26 - and r3, r3, #0x3ffffff - add r4, r4, r12 - mov r12, r4, lsr #26 - and r4, r4, #0x3ffffff - add r12, r12, r12, lsl #2 - add r0, r0, r12 - mov r12, r0, lsr #26 - and r0, r0, #0x3ffffff - add r1, r1, r12 - add r6, r0, #5 - mov r12, r6, lsr #26 - and r6, r6, #0x3ffffff - add r7, r1, r12 - mov r12, r7, lsr #26 - and r7, r7, #0x3ffffff - add r10, r2, r12 - mov r12, r10, lsr #26 - and r10, r10, #0x3ffffff - add r11, r3, r12 - mov r12, #-(1 << 26) - add r12, r12, r11, lsr #26 - and r11, r11, #0x3ffffff - add r14, r4, r12 - mov r12, r14, lsr #31 - sub r12, #1 - and r6, r6, r12 - and r7, r7, r12 - and r10, r10, r12 - and r11, r11, r12 - and r14, r14, r12 - mvn r12, r12 - and r0, r0, r12 - and r1, r1, r12 - and r2, r2, r12 - and r3, r3, r12 - and r4, r4, r12 - orr r0, r0, r6 - orr r1, r1, r7 - orr r2, r2, r10 - orr r3, r3, r11 - orr r4, r4, r14 - orr r0, r0, r1, lsl #26 - lsr r1, r1, #6 - orr r1, r1, r2, lsl #20 - lsr r2, r2, #12 - orr r2, r2, r3, lsl #14 - lsr r3, r3, #18 - orr r3, r3, r4, lsl #8 - add r5, r5, #60 - stm r5, {r0-r3} -.Lpoly1305_blocks_neon_leave: - sub r0, sp, #8 - 
ldr sp, [sp, #120] - ldmfd sp!, {r4-r11, lr} - vldm sp!, {q4-q7} - sub r0, sp, r0 - bx lr -.size _gcry_poly1305_armv7_neon_blocks,.-_gcry_poly1305_armv7_neon_blocks; - -.globl _gcry_poly1305_armv7_neon_finish_ext -.type _gcry_poly1305_armv7_neon_finish_ext,%function; -_gcry_poly1305_armv7_neon_finish_ext: -.Lpoly1305_finish_ext_neon_local: - stmfd sp!, {r4-r11, lr} - sub sp, sp, #32 - mov r5, r0 - mov r6, r1 - mov r7, r2 - mov r8, r3 - ands r7, r7, r7 - beq .Lpoly1305_finish_ext_neon_noremaining - mov r9, sp - veor q0, q0, q0 - veor q1, q1, q1 - vst1.64 {q0-q1}, [sp] - tst r7, #16 - beq .Lpoly1305_finish_ext_neon_skip16 - vld1.u64 {q0}, [r1]! - vst1.64 {q0}, [r9]! -.Lpoly1305_finish_ext_neon_skip16: - tst r7, #8 - beq .Lpoly1305_finish_ext_neon_skip8 - UNALIGNED_LDMIA2(r1, r10, r11) - stmia r9!, {r10-r11} -.Lpoly1305_finish_ext_neon_skip8: - tst r7, #4 - beq .Lpoly1305_finish_ext_neon_skip4 - ldr r10, [r1], #4 - str r10, [r9], #4 -.Lpoly1305_finish_ext_neon_skip4: - tst r7, #2 - beq .Lpoly1305_finish_ext_neon_skip2 - ldrh r10, [r1], #2 - strh r10, [r9], #2 -.Lpoly1305_finish_ext_neon_skip2: - tst r7, #1 - beq .Lpoly1305_finish_ext_neon_skip1 - ldrb r10, [r1], #1 - strb r10, [r9], #1 -.Lpoly1305_finish_ext_neon_skip1: - cmp r7, #16 - beq .Lpoly1305_finish_ext_neon_skipfinalbit - mov r10, #1 - strb r10, [r9] -.Lpoly1305_finish_ext_neon_skipfinalbit: - ldr r10, [r5, #116] - orrhs r10, #2 - orrlo r10, #4 - str r10, [r5, #116] - mov r0, r5 - mov r1, sp - mov r2, #32 - bl .Lpoly1305_blocks_neon_local -.Lpoly1305_finish_ext_neon_noremaining: - ldr r10, [r5, #116] - tst r10, #1 - beq .Lpoly1305_finish_ext_neon_notstarted - cmp r7, #0 - beq .Lpoly1305_finish_ext_neon_user2r - cmp r7, #16 - bls .Lpoly1305_finish_ext_neon_user1 -.Lpoly1305_finish_ext_neon_user2r: - orr r10, r10, #8 - b .Lpoly1305_finish_ext_neon_finalblock -.Lpoly1305_finish_ext_neon_user1: - orr r10, r10, #16 -.Lpoly1305_finish_ext_neon_finalblock: - str r10, [r5, #116] - mov r0, r5 - eor r1, r1, r1 - mov r2, #32 - bl .Lpoly1305_blocks_neon_local -.Lpoly1305_finish_ext_neon_notstarted: - add r0, r5, #60 - add r9, r5, #100 - ldm r0, {r0-r3} - ldm r9, {r9-r12} - adds r0, r0, r9 - adcs r1, r1, r10 - adcs r2, r2, r11 - adcs r3, r3, r12 - stm r8, {r0-r3} - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - veor q3, q3, q3 - vstmia r5!, {q0-q3} - vstm r5, {q0-q3} - add sp, sp, #32 - ldmfd sp!, {r4-r11, lr} - mov r0, #(9*4+32) - bx lr -.size _gcry_poly1305_armv7_neon_finish_ext,.-_gcry_poly1305_armv7_neon_finish_ext; - -#endif diff --git a/cipher/poly1305-avx2-amd64.S b/cipher/poly1305-avx2-amd64.S deleted file mode 100644 index 9362a5aee..000000000 --- a/cipher/poly1305-avx2-amd64.S +++ /dev/null @@ -1,962 +0,0 @@ -/* poly1305-avx2-amd64.S - AMD64/AVX2 implementation of Poly1305 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . 
- */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/poly1305-opt - */ - -#include - -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(ENABLE_AVX2_SUPPORT) - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif - - -.text - - -.align 8 -.globl _gcry_poly1305_amd64_avx2_init_ext -ELF(.type _gcry_poly1305_amd64_avx2_init_ext, at function;) -_gcry_poly1305_amd64_avx2_init_ext: -.Lpoly1305_init_ext_avx2_local: - xor %edx, %edx - vzeroupper - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - movq %rdx, %rcx - vpxor %ymm0, %ymm0, %ymm0 - movq $-1, %r8 - testq %rcx, %rcx - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm0, 32(%rdi) - vmovdqu %ymm0, 64(%rdi) - vmovdqu %ymm0, 96(%rdi) - vmovdqu %ymm0, 128(%rdi) - movq 8(%rsi), %r9 - cmove %r8, %rcx - movq $0xffc0fffffff, %r8 - movq %r9, %r13 - movq (%rsi), %r10 - andq %r10, %r8 - shrq $44, %r10 - movq %r8, %r14 - shlq $20, %r13 - orq %r13, %r10 - movq $0xfffffc0ffff, %r13 - shrq $24, %r9 - andq %r13, %r10 - movq $0xffffffc0f, %r13 - andq %r13, %r9 - movl %r8d, %r13d - andl $67108863, %r13d - movl %r13d, 164(%rdi) - movq %r10, %r13 - shrq $26, %r14 - shlq $18, %r13 - orq %r13, %r14 - movq %r10, %r13 - shrq $8, %r13 - andl $67108863, %r14d - andl $67108863, %r13d - movl %r14d, 172(%rdi) - movq %r10, %r14 - movl %r13d, 180(%rdi) - movq %r9, %r13 - shrq $34, %r14 - shlq $10, %r13 - orq %r13, %r14 - movq %r9, %r13 - shrq $16, %r13 - andl $67108863, %r14d - movl %r14d, 188(%rdi) - movl %r13d, 196(%rdi) - cmpq $16, %rcx - jbe .Lpoly1305_init_ext_avx2_continue - lea (%r9,%r9,4), %r11 - shlq $2, %r11 - lea (%r10,%r10), %rax - mulq %r11 - movq %rax, %r13 - movq %r8, %rax - movq %rdx, %r14 - mulq %r8 - addq %rax, %r13 - lea (%r8,%r8), %rax - movq %r13, %r12 - adcq %rdx, %r14 - mulq %r10 - shlq $20, %r14 - movq %rax, %r15 - shrq $44, %r12 - movq %r11, %rax - orq %r12, %r14 - movq %rdx, %r12 - mulq %r9 - addq %rax, %r15 - movq %r8, %rax - adcq %rdx, %r12 - addq %r15, %r14 - lea (%r9,%r9), %r15 - movq %r14, %rbx - adcq $0, %r12 - mulq %r15 - shlq $20, %r12 - movq %rdx, %r11 - shrq $44, %rbx - orq %rbx, %r12 - movq %rax, %rbx - movq %r10, %rax - mulq %r10 - addq %rax, %rbx - adcq %rdx, %r11 - addq %rbx, %r12 - movq $0xfffffffffff, %rbx - movq %r12, %r15 - adcq $0, %r11 - andq %rbx, %r13 - shlq $22, %r11 - andq %rbx, %r14 - shrq $42, %r15 - orq %r15, %r11 - lea (%r11,%r11,4), %r11 - addq %r11, %r13 - movq %rbx, %r11 - andq %r13, %r11 - shrq $44, %r13 - movq %r11, %r15 - addq %r13, %r14 - movq $0x3ffffffffff, %r13 - andq %r14, %rbx - andq %r13, %r12 - movq %rbx, %r13 - shrq $26, %r15 - shlq $18, %r13 - orq %r13, %r15 - movq %rbx, %r13 - shrq $44, %r14 - shrq $8, %r13 - addq %r14, %r12 - movl %r11d, %r14d - andl $67108863, %r15d - andl $67108863, %r14d - andl $67108863, %r13d - movl %r14d, 204(%rdi) - movq %rbx, %r14 - movl %r13d, 220(%rdi) - movq %r12, %r13 - shrq $34, %r14 - shlq $10, %r13 - orq %r13, %r14 - movq %r12, %r13 - shrq $16, %r13 - andl $67108863, %r14d - movl %r15d, 212(%rdi) - movl %r14d, 228(%rdi) - movl %r13d, 236(%rdi) - cmpq $32, %rcx - jbe .Lpoly1305_init_ext_avx2_continue - movq %r9, %rax - lea (%rbx,%rbx,4), %r14 - shlq $2, %r14 - mulq %r14 - movq %rdi, -32(%rsp) - lea (%r12,%r12,4), %rdi - shlq $2, %rdi - movq %rax, %r14 - movq %r10, %rax - movq %rdx, %r15 - mulq %rdi - movq %rax, %r13 - movq %r11, %rax - movq %rcx, -16(%rsp) - 
movq %rdx, %rcx - mulq %r8 - addq %rax, %r13 - movq %rdi, %rax - movq %rsi, -24(%rsp) - adcq %rdx, %rcx - addq %r13, %r14 - adcq %rcx, %r15 - movq %r14, %rcx - mulq %r9 - shlq $20, %r15 - movq %rax, %r13 - shrq $44, %rcx - movq %r11, %rax - orq %rcx, %r15 - movq %rdx, %rcx - mulq %r10 - movq %rax, %rsi - movq %rbx, %rax - movq %rdx, %rdi - mulq %r8 - addq %rax, %rsi - movq %r11, %rax - adcq %rdx, %rdi - addq %rsi, %r13 - adcq %rdi, %rcx - addq %r13, %r15 - movq %r15, %rdi - adcq $0, %rcx - mulq %r9 - shlq $20, %rcx - movq %rdx, %rsi - shrq $44, %rdi - orq %rdi, %rcx - movq %rax, %rdi - movq %rbx, %rax - mulq %r10 - movq %rax, %r9 - movq %r8, %rax - movq %rdx, %r10 - movq $0xfffffffffff, %r8 - mulq %r12 - addq %rax, %r9 - adcq %rdx, %r10 - andq %r8, %r14 - addq %r9, %rdi - adcq %r10, %rsi - andq %r8, %r15 - addq %rdi, %rcx - movq $0x3ffffffffff, %rdi - movq %rcx, %r10 - adcq $0, %rsi - andq %rdi, %rcx - shlq $22, %rsi - shrq $42, %r10 - orq %r10, %rsi - movq -32(%rsp), %rdi - lea (%rsi,%rsi,4), %r9 - movq %r8, %rsi - addq %r9, %r14 - andq %r14, %rsi - shrq $44, %r14 - addq %r14, %r15 - andq %r15, %r8 - shrq $44, %r15 - movq %r8, %r14 - addq %r15, %rcx - movl %esi, %r15d - movq %rcx, %r10 - movq %r8, %r9 - shrq $26, %rsi - andl $67108863, %r15d - shlq $18, %r14 - shrq $34, %r8 - orq %r14, %rsi - shlq $10, %r10 - shrq $8, %r9 - orq %r10, %r8 - shrq $16, %rcx - andl $67108863, %esi - movl %esi, 252(%rdi) - andl $67108863, %r9d - movl %ecx, 276(%rdi) - andl $67108863, %r8d - movl %r15d, 244(%rdi) - movl %r9d, 260(%rdi) - movl %r8d, 268(%rdi) - movq -16(%rsp), %rcx - movq -24(%rsp), %rsi -.Lpoly1305_init_ext_avx2_continue: - movl 16(%rsi), %r8d - movl %r8d, 284(%rdi) - movl 20(%rsi), %r9d - movl %r9d, 292(%rdi) - movl 24(%rsi), %r10d - movl %r10d, 300(%rdi) - movl 28(%rsi), %esi - movl %esi, 308(%rdi) - cmpq $48, %rcx - jbe .Lpoly1305_init_ext_avx2_done - lea (%r12,%r12,4), %r9 - shlq $2, %r9 - lea (%rbx,%rbx), %rax - mulq %r9 - movq %rax, %rsi - movq %r11, %rax - movq %rdx, %r8 - mulq %r11 - addq %rax, %rsi - lea (%r11,%r11), %rax - movq %rsi, %r10 - adcq %rdx, %r8 - mulq %rbx - movq %rax, %r13 - movq %r12, %rax - movq %rdx, %rcx - addq %r12, %r12 - mulq %r9 - addq %rax, %r13 - movq %r11, %rax - movq $0xfffffffffff, %r9 - adcq %rdx, %rcx - andq %r9, %rsi - mulq %r12 - shlq $20, %r8 - movq %rax, %r11 - shrq $44, %r10 - movq %rbx, %rax - orq %r10, %r8 - movq %rdx, %r12 - mulq %rbx - addq %r13, %r8 - movq %r8, %r14 - adcq $0, %rcx - andq %r9, %r8 - addq %rax, %r11 - adcq %rdx, %r12 - shlq $20, %rcx - shrq $44, %r14 - orq %r14, %rcx - addq %r11, %rcx - movq %rcx, %rbx - adcq $0, %r12 - shlq $22, %r12 - shrq $42, %rbx - orq %rbx, %r12 - movq %r9, %rbx - lea (%r12,%r12,4), %r15 - addq %r15, %rsi - andq %rsi, %rbx - shrq $44, %rsi - movl %ebx, %r11d - addq %rsi, %r8 - movq $0x3ffffffffff, %rsi - andq %r8, %r9 - andq %rsi, %rcx - shrq $44, %r8 - movq %r9, %rax - addq %r8, %rcx - movq %r9, %r8 - movq %rcx, %r10 - andl $67108863, %r11d - shrq $26, %rbx - shlq $18, %r8 - shrq $34, %r9 - orq %r8, %rbx - shlq $10, %r10 - shrq $8, %rax - orq %r10, %r9 - shrq $16, %rcx - andl $67108863, %ebx - andl $67108863, %eax - andl $67108863, %r9d - movl %r11d, 184(%rdi) - movl %r11d, 176(%rdi) - movl %r11d, 168(%rdi) - movl %r11d, 160(%rdi) - movl %ebx, 216(%rdi) - movl %ebx, 208(%rdi) - movl %ebx, 200(%rdi) - movl %ebx, 192(%rdi) - movl %eax, 248(%rdi) - movl %eax, 240(%rdi) - movl %eax, 232(%rdi) - movl %eax, 224(%rdi) - movl %r9d, 280(%rdi) - movl %r9d, 272(%rdi) - movl %r9d, 264(%rdi) - movl %r9d, 256(%rdi) - movl 
%ecx, 312(%rdi) - movl %ecx, 304(%rdi) - movl %ecx, 296(%rdi) - movl %ecx, 288(%rdi) -.Lpoly1305_init_ext_avx2_done: - movq $0, 320(%rdi) - vzeroall - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - ret -ELF(.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext;) - - -.align 8 -.globl _gcry_poly1305_amd64_avx2_blocks -ELF(.type _gcry_poly1305_amd64_avx2_blocks, at function;) -_gcry_poly1305_amd64_avx2_blocks: -.Lpoly1305_blocks_avx2_local: - vzeroupper - pushq %rbp - movq %rsp, %rbp - pushq %rbx - andq $-64, %rsp - subq $200, %rsp - movl $((1<<26)-1), %r8d - movl $(5), %r9d - movl $((1<<24)), %r10d - vmovd %r8d, %xmm0 - vmovd %r9d, %xmm8 - vmovd %r10d, %xmm7 - vpbroadcastq %xmm0, %ymm0 - vpbroadcastq %xmm8, %ymm8 - vpbroadcastq %xmm7, %ymm7 - vmovdqa %ymm7, 168(%rsp) - movq 320(%rdi), %rax - testb $60, %al - je .Lpoly1305_blocks_avx2_9 - vmovdqa 168(%rsp), %ymm7 - vpsrldq $8, %ymm7, %ymm1 - vmovdqa %ymm1, 168(%rsp) - testb $4, %al - je .Lpoly1305_blocks_avx2_10 - vpermq $192, %ymm1, %ymm7 - vmovdqa %ymm7, 168(%rsp) -.Lpoly1305_blocks_avx2_10: - testb $8, %al - je .Lpoly1305_blocks_avx2_11 - vpermq $240, 168(%rsp), %ymm7 - vmovdqa %ymm7, 168(%rsp) -.Lpoly1305_blocks_avx2_11: - testb $16, %al - je .Lpoly1305_blocks_avx2_12 - vpermq $252, 168(%rsp), %ymm6 - vmovdqa %ymm6, 168(%rsp) -.Lpoly1305_blocks_avx2_12: - testb $32, %al - je .Lpoly1305_blocks_avx2_9 - vpxor %xmm6, %xmm6, %xmm6 - vmovdqa %ymm6, 168(%rsp) -.Lpoly1305_blocks_avx2_9: - testb $1, %al - jne .Lpoly1305_blocks_avx2_13 - vmovdqu (%rsi), %ymm3 - vmovdqu 32(%rsi), %ymm1 - vpunpcklqdq %ymm1, %ymm3, %ymm2 - vpunpckhqdq %ymm1, %ymm3, %ymm1 - vpermq $216, %ymm2, %ymm2 - vpermq $216, %ymm1, %ymm1 - vpand %ymm2, %ymm0, %ymm5 - vpsrlq $26, %ymm2, %ymm4 - vpand %ymm4, %ymm0, %ymm4 - vpsllq $12, %ymm1, %ymm3 - vpsrlq $52, %ymm2, %ymm2 - vpor %ymm3, %ymm2, %ymm2 - vpand %ymm2, %ymm0, %ymm3 - vpsrlq $26, %ymm2, %ymm2 - vpand %ymm2, %ymm0, %ymm2 - vpsrlq $40, %ymm1, %ymm1 - vpor 168(%rsp), %ymm1, %ymm1 - addq $64, %rsi - subq $64, %rdx - orq $1, 320(%rdi) - jmp .Lpoly1305_blocks_avx2_14 -.Lpoly1305_blocks_avx2_13: - vmovdqa (%rdi), %ymm5 - vmovdqa 32(%rdi), %ymm4 - vmovdqa 64(%rdi), %ymm3 - vmovdqa 96(%rdi), %ymm2 - vmovdqa 128(%rdi), %ymm1 -.Lpoly1305_blocks_avx2_14: - cmpq $63, %rdx - jbe .Lpoly1305_blocks_avx2_15 - vmovdqa 160(%rdi), %ymm6 - vmovdqa %ymm8, 136(%rsp) - vmovdqa 192(%rdi), %ymm7 - vpmuludq %ymm8, %ymm7, %ymm11 - vmovdqa %ymm11, 104(%rsp) - vmovdqa 224(%rdi), %ymm11 - vmovdqa %ymm11, 72(%rsp) - vpmuludq %ymm11, %ymm8, %ymm11 - vmovdqa %ymm11, 40(%rsp) - vmovdqa 256(%rdi), %ymm11 - vmovdqa %ymm11, 8(%rsp) - vpmuludq %ymm11, %ymm8, %ymm11 - vmovdqa %ymm11, -24(%rsp) - vmovdqa 288(%rdi), %ymm13 - vmovdqa %ymm13, -56(%rsp) - vpmuludq %ymm13, %ymm8, %ymm13 - vmovdqa %ymm13, -88(%rsp) -.Lpoly1305_blocks_avx2_16: - vpmuludq 104(%rsp), %ymm1, %ymm14 - vmovdqa 40(%rsp), %ymm13 - vpmuludq %ymm13, %ymm2, %ymm8 - vpmuludq %ymm13, %ymm1, %ymm13 - vmovdqa -24(%rsp), %ymm9 - vpmuludq %ymm9, %ymm2, %ymm10 - vpmuludq %ymm9, %ymm1, %ymm11 - vpaddq %ymm8, %ymm14, %ymm14 - vpmuludq %ymm9, %ymm3, %ymm8 - vmovdqa -88(%rsp), %ymm12 - vpmuludq %ymm12, %ymm1, %ymm9 - vpaddq %ymm10, %ymm13, %ymm13 - vpmuludq %ymm12, %ymm4, %ymm15 - vmovdqa %ymm12, %ymm10 - vpmuludq %ymm12, %ymm3, %ymm12 - vpaddq %ymm8, %ymm14, %ymm14 - vpmuludq %ymm10, %ymm2, %ymm10 - vpmuludq %ymm6, %ymm2, %ymm8 - vpaddq %ymm15, %ymm14, %ymm14 - vpmuludq %ymm6, %ymm1, %ymm1 - vpaddq %ymm12, %ymm13, %ymm13 - vpmuludq %ymm6, %ymm5, %ymm15 - vpaddq 
%ymm10, %ymm11, %ymm11 - vpmuludq %ymm6, %ymm4, %ymm12 - vpaddq %ymm8, %ymm9, %ymm9 - vpmuludq %ymm6, %ymm3, %ymm10 - vpmuludq %ymm7, %ymm3, %ymm8 - vpaddq %ymm15, %ymm14, %ymm14 - vpmuludq %ymm7, %ymm2, %ymm2 - vpaddq %ymm12, %ymm13, %ymm12 - vpmuludq %ymm7, %ymm5, %ymm15 - vpaddq %ymm10, %ymm11, %ymm10 - vpmuludq %ymm7, %ymm4, %ymm13 - vpaddq %ymm8, %ymm9, %ymm8 - vmovdqa 72(%rsp), %ymm9 - vpmuludq %ymm9, %ymm4, %ymm11 - vpaddq %ymm2, %ymm1, %ymm1 - vpmuludq %ymm9, %ymm3, %ymm3 - vpaddq %ymm15, %ymm12, %ymm12 - vpmuludq %ymm9, %ymm5, %ymm15 - vpaddq %ymm13, %ymm10, %ymm10 - vmovdqa 8(%rsp), %ymm2 - vpmuludq %ymm2, %ymm5, %ymm9 - vpaddq %ymm11, %ymm8, %ymm8 - vpmuludq %ymm2, %ymm4, %ymm4 - vpaddq %ymm3, %ymm1, %ymm1 - vpmuludq -56(%rsp), %ymm5, %ymm5 - vpaddq %ymm15, %ymm10, %ymm10 - vpaddq %ymm9, %ymm8, %ymm8 - vpaddq %ymm4, %ymm1, %ymm1 - vpaddq %ymm5, %ymm1, %ymm5 - vmovdqu (%rsi), %ymm3 - vmovdqu 32(%rsi), %ymm2 - vperm2i128 $32, %ymm2, %ymm3, %ymm1 - vperm2i128 $49, %ymm2, %ymm3, %ymm2 - vpunpckldq %ymm2, %ymm1, %ymm15 - vpunpckhdq %ymm2, %ymm1, %ymm2 - vpxor %xmm4, %xmm4, %xmm4 - vpunpckldq %ymm4, %ymm15, %ymm1 - vpunpckhdq %ymm4, %ymm15, %ymm15 - vpunpckldq %ymm4, %ymm2, %ymm3 - vpunpckhdq %ymm4, %ymm2, %ymm2 - vpsllq $6, %ymm15, %ymm15 - vpsllq $12, %ymm3, %ymm3 - vpsllq $18, %ymm2, %ymm2 - vpaddq %ymm1, %ymm14, %ymm14 - vpaddq %ymm15, %ymm12, %ymm12 - vpaddq %ymm3, %ymm10, %ymm10 - vpaddq %ymm2, %ymm8, %ymm8 - vpaddq 168(%rsp), %ymm5, %ymm5 - addq $64, %rsi - vpsrlq $26, %ymm14, %ymm4 - vpsrlq $26, %ymm8, %ymm2 - vpand %ymm0, %ymm14, %ymm14 - vpand %ymm0, %ymm8, %ymm8 - vpaddq %ymm4, %ymm12, %ymm12 - vpaddq %ymm2, %ymm5, %ymm5 - vpsrlq $26, %ymm12, %ymm3 - vpsrlq $26, %ymm5, %ymm9 - vpand %ymm0, %ymm12, %ymm12 - vpand %ymm0, %ymm5, %ymm11 - vpaddq %ymm3, %ymm10, %ymm3 - vpmuludq 136(%rsp), %ymm9, %ymm9 - vpaddq %ymm9, %ymm14, %ymm14 - vpsrlq $26, %ymm3, %ymm2 - vpsrlq $26, %ymm14, %ymm4 - vpand %ymm0, %ymm3, %ymm3 - vpand %ymm0, %ymm14, %ymm5 - vpaddq %ymm2, %ymm8, %ymm2 - vpaddq %ymm4, %ymm12, %ymm4 - vpsrlq $26, %ymm2, %ymm1 - vpand %ymm0, %ymm2, %ymm2 - vpaddq %ymm1, %ymm11, %ymm1 - subq $64, %rdx - cmpq $63, %rdx - ja .Lpoly1305_blocks_avx2_16 -.Lpoly1305_blocks_avx2_15: - testb $64, 320(%rdi) - jne .Lpoly1305_blocks_avx2_17 - vmovdqa %ymm5, (%rdi) - vmovdqa %ymm4, 32(%rdi) - vmovdqa %ymm3, 64(%rdi) - vmovdqa %ymm2, 96(%rdi) - vmovdqa %ymm1, 128(%rdi) - jmp .Lpoly1305_blocks_avx2_8 -.Lpoly1305_blocks_avx2_17: - vpermq $245, %ymm5, %ymm0 - vpaddq %ymm0, %ymm5, %ymm5 - vpermq $245, %ymm4, %ymm0 - vpaddq %ymm0, %ymm4, %ymm4 - vpermq $245, %ymm3, %ymm0 - vpaddq %ymm0, %ymm3, %ymm3 - vpermq $245, %ymm2, %ymm0 - vpaddq %ymm0, %ymm2, %ymm2 - vpermq $245, %ymm1, %ymm0 - vpaddq %ymm0, %ymm1, %ymm1 - vpermq $170, %ymm5, %ymm0 - vpaddq %ymm0, %ymm5, %ymm5 - vpermq $170, %ymm4, %ymm0 - vpaddq %ymm0, %ymm4, %ymm4 - vpermq $170, %ymm3, %ymm0 - vpaddq %ymm0, %ymm3, %ymm3 - vpermq $170, %ymm2, %ymm0 - vpaddq %ymm0, %ymm2, %ymm2 - vpermq $170, %ymm1, %ymm0 - vpaddq %ymm0, %ymm1, %ymm1 - vmovd %xmm5, %eax - vmovd %xmm4, %edx - movl %eax, %ecx - shrl $26, %ecx - addl %edx, %ecx - movl %ecx, %edx - andl $67108863, %edx - vmovd %xmm3, %esi - shrl $26, %ecx - movl %ecx, %r11d - addl %esi, %r11d - vmovd %xmm2, %ecx - movl %r11d, %r10d - shrl $26, %r10d - addl %ecx, %r10d - movl %r10d, %r9d - andl $67108863, %r9d - vmovd %xmm1, %r8d - movl %edx, %esi - salq $26, %rsi - andl $67108863, %eax - orq %rax, %rsi - movabsq $17592186044415, %rax - andq %rax, %rsi - andl $67108863, %r11d - salq $8, %r11 - 
shrl $18, %edx - movl %edx, %edx - orq %r11, %rdx - movq %r9, %rcx - salq $34, %rcx - orq %rcx, %rdx - andq %rax, %rdx - shrl $26, %r10d - addl %r10d, %r8d - salq $16, %r8 - shrl $10, %r9d - movl %r9d, %r9d - orq %r9, %r8 - movabsq $4398046511103, %r10 - movq %r8, %r9 - andq %r10, %r9 - shrq $42, %r8 - leaq (%r8,%r8,4), %rcx - addq %rcx, %rsi - movq %rsi, %r8 - andq %rax, %r8 - movq %rsi, %rcx - shrq $44, %rcx - addq %rdx, %rcx - movq %rcx, %rsi - andq %rax, %rsi - shrq $44, %rcx - movq %rcx, %rdx - addq %r9, %rdx - andq %rdx, %r10 - shrq $42, %rdx - leaq (%r8,%rdx,4), %rcx - leaq (%rcx,%rdx), %rdx - movq %rdx, %rbx - andq %rax, %rbx - shrq $44, %rdx - movq %rdx, %r11 - addq %rsi, %r11 - leaq 5(%rbx), %r9 - movq %r9, %r8 - shrq $44, %r8 - addq %r11, %r8 - movabsq $-4398046511104, %rsi - addq %r10, %rsi - movq %r8, %rdx - shrq $44, %rdx - addq %rdx, %rsi - movq %rsi, %rdx - shrq $63, %rdx - subq $1, %rdx - movq %rdx, %rcx - notq %rcx - andq %rcx, %rbx - andq %rcx, %r11 - andq %r10, %rcx - andq %rax, %r9 - andq %rdx, %r9 - orq %r9, %rbx - movq %rbx, (%rdi) - andq %r8, %rax - andq %rdx, %rax - orq %rax, %r11 - movq %r11, 8(%rdi) - andq %rsi, %rdx - orq %rcx, %rdx - movq %rdx, 16(%rdi) -.Lpoly1305_blocks_avx2_8: - movq -8(%rbp), %rbx - vzeroall - movq %rbp, %rax - subq %rsp, %rax - leave - addq $8, %rax - ret -ELF(.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks;) - - -.align 8 -.globl _gcry_poly1305_amd64_avx2_finish_ext -ELF(.type _gcry_poly1305_amd64_avx2_finish_ext, at function;) -_gcry_poly1305_amd64_avx2_finish_ext: -.Lpoly1305_finish_ext_avx2_local: - vzeroupper - pushq %rbp - movq %rsp, %rbp - pushq %r13 - pushq %r12 - pushq %rbx - andq $-64, %rsp - subq $64, %rsp - movq %rdi, %rbx - movq %rdx, %r13 - movq %rcx, %r12 - testq %rdx, %rdx - je .Lpoly1305_finish_ext_avx2_22 - vpxor %xmm0, %xmm0, %xmm0 - vmovdqa %ymm0, (%rsp) - vmovdqa %ymm0, 32(%rsp) - movq %rsp, %rax - subq %rsp, %rsi - testb $32, %dl - je .Lpoly1305_finish_ext_avx2_23 - vmovdqu (%rsp,%rsi), %ymm0 - vmovdqa %ymm0, (%rsp) - leaq 32(%rsp), %rax -.Lpoly1305_finish_ext_avx2_23: - testb $16, %r13b - je .Lpoly1305_finish_ext_avx2_24 - vmovdqu (%rax,%rsi), %xmm0 - vmovdqa %xmm0, (%rax) - addq $16, %rax -.Lpoly1305_finish_ext_avx2_24: - testb $8, %r13b - je .Lpoly1305_finish_ext_avx2_25 - movq (%rax,%rsi), %rdx - movq %rdx, (%rax) - addq $8, %rax -.Lpoly1305_finish_ext_avx2_25: - testb $4, %r13b - je .Lpoly1305_finish_ext_avx2_26 - movl (%rax,%rsi), %edx - movl %edx, (%rax) - addq $4, %rax -.Lpoly1305_finish_ext_avx2_26: - testb $2, %r13b - je .Lpoly1305_finish_ext_avx2_27 - movzwl (%rax,%rsi), %edx - movw %dx, (%rax) - addq $2, %rax -.Lpoly1305_finish_ext_avx2_27: - testb $1, %r13b - je .Lpoly1305_finish_ext_avx2_28 - movzbl (%rax,%rsi), %edx - movb %dl, (%rax) -.Lpoly1305_finish_ext_avx2_28: - testb $15, %r13b - je .Lpoly1305_finish_ext_avx2_29 - movb $1, (%rsp,%r13) -.Lpoly1305_finish_ext_avx2_29: - cmpq $47, %r13 - jbe .Lpoly1305_finish_ext_avx2_30 - orq $4, 320(%rbx) - jmp .Lpoly1305_finish_ext_avx2_31 -.Lpoly1305_finish_ext_avx2_30: - cmpq $31, %r13 - jbe .Lpoly1305_finish_ext_avx2_32 - orq $8, 320(%rbx) - jmp .Lpoly1305_finish_ext_avx2_31 -.Lpoly1305_finish_ext_avx2_32: - cmpq $15, %r13 - jbe .Lpoly1305_finish_ext_avx2_33 - orq $16, 320(%rbx) - jmp .Lpoly1305_finish_ext_avx2_31 -.Lpoly1305_finish_ext_avx2_33: - orq $32, 320(%rbx) -.Lpoly1305_finish_ext_avx2_31: - testb $1, 320(%rbx) - je .Lpoly1305_finish_ext_avx2_34 - cmpq $32, %r13 - ja .Lpoly1305_finish_ext_avx2_34 - cmpq $17, %r13 - sbbq %rsi, 
%rsi - notq %rsi - addq $2, %rsi - cmpq $17, %r13 - sbbq %rax, %rax - movq %rbx, %rdx - addq $23, %rax - leaq (%rbx,%rax,8), %rax - movl $0, %ecx -.Lpoly1305_finish_ext_avx2_37: - movl 244(%rdx), %edi - movl %edi, (%rax) - movl 252(%rdx), %edi - movl %edi, 32(%rax) - movl 260(%rdx), %edi - movl %edi, 64(%rax) - movl 268(%rdx), %edi - movl %edi, 96(%rax) - movl 276(%rdx), %edi - movl %edi, 128(%rax) - addq $1, %rcx - subq $40, %rdx - addq $8, %rax - cmpq %rcx, %rsi - ja .Lpoly1305_finish_ext_avx2_37 -.Lpoly1305_finish_ext_avx2_34: - movl $64, %edx - movq %rsp, %rsi - movq %rbx, %rdi - call .Lpoly1305_blocks_avx2_local -.Lpoly1305_finish_ext_avx2_22: - movq 320(%rbx), %r8 - testb $1, %r8b - je .Lpoly1305_finish_ext_avx2_38 - leaq -1(%r13), %rax - cmpq $47, %rax - ja .Lpoly1305_finish_ext_avx2_46 - cmpq $32, %r13 - ja .Lpoly1305_finish_ext_avx2_47 - cmpq $17, %r13 - sbbq %r9, %r9 - addq $2, %r9 - movl $0, %edi - cmpq $17, %r13 - sbbq %rax, %rax - notq %rax - andl $5, %eax - jmp .Lpoly1305_finish_ext_avx2_39 -.Lpoly1305_finish_ext_avx2_41: - movl (%rdx), %esi - movl %esi, (%rax) - movl 8(%rdx), %esi - movl %esi, 32(%rax) - movl 16(%rdx), %esi - movl %esi, 64(%rax) - movl 24(%rdx), %esi - movl %esi, 96(%rax) - movl 32(%rdx), %esi - movl %esi, 128(%rax) - addq $1, %rcx - subq $40, %rdx - addq $8, %rax - movq %rcx, %rsi - subq %rdi, %rsi - cmpq %rsi, %r9 - ja .Lpoly1305_finish_ext_avx2_41 - cmpq $3, %rcx - ja .Lpoly1305_finish_ext_avx2_42 - leaq 160(%rbx,%rcx,8), %rax -.Lpoly1305_finish_ext_avx2_43: - movl $1, (%rax) - movl $0, 32(%rax) - movl $0, 64(%rax) - movl $0, 96(%rax) - movl $0, 128(%rax) - addq $1, %rcx - addq $8, %rax - cmpq $4, %rcx - jne .Lpoly1305_finish_ext_avx2_43 -.Lpoly1305_finish_ext_avx2_42: - orq $96, %r8 - movq %r8, 320(%rbx) - vpxor %ymm0, %ymm0, %ymm0 - vmovdqa %ymm0, (%rsp) - vmovdqa %ymm0, 32(%rsp) - movl $64, %edx - movq %rsp, %rsi - movq %rbx, %rdi - call .Lpoly1305_blocks_avx2_local -.Lpoly1305_finish_ext_avx2_38: - movq 8(%rbx), %rax - movq %rax, %rdx - salq $44, %rdx - orq (%rbx), %rdx - shrq $20, %rax - movl $24, %edi - shlx %rdi, 16(%rbx), %rcx - orq %rcx, %rax - movl 292(%rbx), %ecx - salq $32, %rcx - movl 284(%rbx), %esi - orq %rsi, %rcx - movl 308(%rbx), %esi - salq $32, %rsi - movl 300(%rbx), %edi - orq %rdi, %rsi - addq %rcx, %rdx - adcq %rsi, %rax - movq %rdx, (%r12) - movq %rax, 8(%r12) - vpxor %xmm0, %xmm0, %xmm0 - vmovdqu %ymm0, (%rbx) - vmovdqu %ymm0, 32(%rbx) - vmovdqu %ymm0, 64(%rbx) - vmovdqu %ymm0, 96(%rbx) - vmovdqu %ymm0, 128(%rbx) - vmovdqu %ymm0, 160(%rbx) - vmovdqu %ymm0, 192(%rbx) - vmovdqu %ymm0, 224(%rbx) - jmp .Lpoly1305_finish_ext_avx2_49 -.Lpoly1305_finish_ext_avx2_46: - movl $3, %r9d - movl $1, %edi - movl $10, %eax - jmp .Lpoly1305_finish_ext_avx2_39 -.Lpoly1305_finish_ext_avx2_47: - movl $3, %r9d - movl $0, %edi - movl $10, %eax -.Lpoly1305_finish_ext_avx2_39: - leaq 164(%rbx,%rax,8), %rdx - leaq 160(%rbx,%rdi,8), %rax - movq %rdi, %rcx - jmp .Lpoly1305_finish_ext_avx2_41 -.Lpoly1305_finish_ext_avx2_49: - movq %rbp, %rax - subq %rsp, %rax - leaq -24(%rbp), %rsp - vzeroall - popq %rbx - popq %r12 - popq %r13 - popq %rbp - addq $(8*5), %rax -ret -ELF(.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext;) - -#endif diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index bcbe5df70..2405a090f 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -29,139 +29,34 @@ #include "cipher.h" #include "bufhelp.h" - #define POLY1305_TAGLEN 16 #define POLY1305_KEYLEN 32 +#define 
POLY1305_BLOCKSIZE 16 -/* Block-size used in default implementation. */ -#define POLY1305_REF_BLOCKSIZE 16 - -/* State size of default implementation. */ -#define POLY1305_REF_STATESIZE 64 - -/* State alignment for default implementation. */ -#define POLY1305_REF_ALIGNMENT sizeof(void *) - - -#undef POLY1305_SYSV_FUNC_ABI - -/* POLY1305_USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */ -#undef POLY1305_USE_SSE2 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -# define POLY1305_USE_SSE2 1 -# define POLY1305_SSE2_BLOCKSIZE 32 -# define POLY1305_SSE2_STATESIZE 248 -# define POLY1305_SSE2_ALIGNMENT 16 -# define POLY1305_SYSV_FUNC_ABI 1 -#endif - - -/* POLY1305_USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */ -#undef POLY1305_USE_AVX2 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(ENABLE_AVX2_SUPPORT) -# define POLY1305_USE_AVX2 1 -# define POLY1305_AVX2_BLOCKSIZE 64 -# define POLY1305_AVX2_STATESIZE 328 -# define POLY1305_AVX2_ALIGNMENT 32 -# define POLY1305_SYSV_FUNC_ABI 1 -#endif - - -/* POLY1305_USE_NEON indicates whether to enable ARM NEON assembly code. */ -#undef POLY1305_USE_NEON -#if defined(ENABLE_NEON_SUPPORT) && defined(HAVE_ARM_ARCH_V6) && \ - defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_NEON) -# define POLY1305_USE_NEON 1 -# define POLY1305_NEON_BLOCKSIZE 32 -# define POLY1305_NEON_STATESIZE 128 -# define POLY1305_NEON_ALIGNMENT 16 -#endif - - -/* Largest block-size used in any implementation (optimized implementations - * might use block-size multiple of 16). */ -#ifdef POLY1305_USE_AVX2 -# define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE -#elif defined(POLY1305_USE_NEON) -# define POLY1305_LARGEST_BLOCKSIZE POLY1305_NEON_BLOCKSIZE -#elif defined(POLY1305_USE_SSE2) -# define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE -#else -# define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE -#endif - -/* Largest state-size used in any implementation. */ -#ifdef POLY1305_USE_AVX2 -# define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE -#elif defined(POLY1305_USE_NEON) -# define POLY1305_LARGEST_STATESIZE POLY1305_NEON_STATESIZE -#elif defined(POLY1305_USE_SSE2) -# define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE -#else -# define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE -#endif - -/* Minimum alignment for state pointer passed to implementations. */ -#ifdef POLY1305_USE_AVX2 -# define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT -#elif defined(POLY1305_USE_NEON) -# define POLY1305_STATE_ALIGNMENT POLY1305_NEON_ALIGNMENT -#elif defined(POLY1305_USE_SSE2) -# define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT -#else -# define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT -#endif - - -/* Assembly implementations use SystemV ABI, ABI conversion and additional - * stack to store XMM6-XMM15 needed on Win64. 
*/ -#undef OPS_FUNC_ABI -#if defined(POLY1305_SYSV_FUNC_ABI) && \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) -# define OPS_FUNC_ABI __attribute__((sysv_abi)) -#else -# define OPS_FUNC_ABI -#endif - - -typedef struct poly1305_key_s +typedef struct { - byte b[POLY1305_KEYLEN]; -} poly1305_key_t; - - -typedef struct poly1305_ops_s -{ - size_t block_size; - void (*init_ext) (void *ctx, const poly1305_key_t * key) OPS_FUNC_ABI; - unsigned int (*blocks) (void *ctx, const byte * m, size_t bytes) OPS_FUNC_ABI; - unsigned int (*finish_ext) (void *ctx, const byte * m, size_t remaining, - byte mac[POLY1305_TAGLEN]) OPS_FUNC_ABI; -} poly1305_ops_t; - + u32 k[4]; + u32 r[4]; + u32 h[5]; +} POLY1305_STATE; typedef struct poly1305_context_s { - byte state[POLY1305_LARGEST_STATESIZE + POLY1305_STATE_ALIGNMENT]; - byte buffer[POLY1305_LARGEST_BLOCKSIZE]; - const poly1305_ops_t *ops; + POLY1305_STATE state; + byte buffer[POLY1305_BLOCKSIZE]; unsigned int leftover; } poly1305_context_t; -gcry_err_code_t _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, +gcry_err_code_t _gcry_poly1305_init (poly1305_context_t *ctx, const byte *key, size_t keylen); -void _gcry_poly1305_finish (poly1305_context_t * ctx, - byte mac[POLY1305_TAGLEN]); +void _gcry_poly1305_finish (poly1305_context_t *ctx, + byte mac[POLY1305_TAGLEN]); -void _gcry_poly1305_update (poly1305_context_t * ctx, const byte * buf, - size_t buflen); +void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *buf, + size_t buflen); #endif /* G10_POLY1305_INTERNAL_H */ diff --git a/cipher/poly1305-sse2-amd64.S b/cipher/poly1305-sse2-amd64.S deleted file mode 100644 index 219eb077b..000000000 --- a/cipher/poly1305-sse2-amd64.S +++ /dev/null @@ -1,1043 +0,0 @@ -/* poly1305-sse2-amd64.S - AMD64/SSE2 implementation of Poly1305 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/poly1305-opt - */ - -#include - -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - - -.text - - -.align 8 -.globl _gcry_poly1305_amd64_sse2_init_ext -ELF(.type _gcry_poly1305_amd64_sse2_init_ext, at function;) -_gcry_poly1305_amd64_sse2_init_ext: -.Lpoly1305_init_ext_x86_local: - xor %edx, %edx - pushq %r12 - pushq %r13 - pushq %r14 - movq %rdx, %r10 - movq $-1, %rcx - testq %r10, %r10 - pxor %xmm0, %xmm0 - movq $0xfffffc0ffff, %r9 - movdqa %xmm0, (%rdi) - cmove %rcx, %r10 - movdqa %xmm0, 16(%rdi) - movq $0xffc0fffffff, %rcx - movdqa %xmm0, 32(%rdi) - movdqa %xmm0, 48(%rdi) - movdqa %xmm0, 64(%rdi) - movq 8(%rsi), %r11 - movq %r11, %r8 - movq (%rsi), %r12 - andq %r12, %rcx - shrq $44, %r12 - shlq $20, %r8 - shrq $24, %r11 - orq %r8, %r12 - movq $0xffffffc0f, %r8 - andq %r9, %r12 - andq %r8, %r11 - movl %ecx, %r8d - andl $67108863, %r8d - movq %rcx, %r9 - movl %r8d, 84(%rdi) - movq %r12, %r8 - shrq $26, %r9 - shlq $18, %r8 - orq %r8, %r9 - movq %r12, %r8 - shrq $8, %r8 - andl $67108863, %r9d - andl $67108863, %r8d - movl %r9d, 92(%rdi) - movq %r12, %r9 - movl %r8d, 100(%rdi) - movq %r11, %r8 - shrq $34, %r9 - shlq $10, %r8 - orq %r8, %r9 - movq %r11, %r8 - shrq $16, %r8 - andl $67108863, %r9d - movl %r9d, 108(%rdi) - cmpq $16, %r10 - movl %r8d, 116(%rdi) - movl 16(%rsi), %r8d - movl %r8d, 124(%rdi) - movl 20(%rsi), %r8d - movl %r8d, 132(%rdi) - movl 24(%rsi), %r8d - movl %r8d, 140(%rdi) - movl 28(%rsi), %esi - movl %esi, 148(%rdi) - jbe .Lpoly1305_init_ext_sse2_done - lea (%r11,%r11,4), %r14 - shlq $2, %r14 - lea (%r12,%r12), %rax - mulq %r14 - movq %rax, %r13 - movq %rcx, %rax - movq %rdx, %r8 - mulq %rcx - addq %rax, %r13 - lea (%rcx,%rcx), %rax - movq %r13, %r9 - adcq %rdx, %r8 - mulq %r12 - shlq $20, %r8 - movq %rax, %rsi - shrq $44, %r9 - movq %r11, %rax - orq %r9, %r8 - movq %rdx, %r9 - mulq %r14 - addq %rax, %rsi - movq %rcx, %rax - adcq %rdx, %r9 - addq %r11, %r11 - mulq %r11 - addq %rsi, %r8 - movq %rax, %r11 - movq %r12, %rax - movq %rdx, %rcx - adcq $0, %r9 - mulq %r12 - addq %rax, %r11 - movq %r8, %rsi - adcq %rdx, %rcx - shlq $20, %r9 - shrq $44, %rsi - orq %rsi, %r9 - movq $0xfffffffffff, %rsi - addq %r11, %r9 - movq %r9, %r12 - adcq $0, %rcx - andq %rsi, %r13 - shlq $22, %rcx - andq %rsi, %r8 - shrq $42, %r12 - orq %r12, %rcx - movq %rsi, %r12 - lea (%rcx,%rcx,4), %rcx - addq %rcx, %r13 - movq %rsi, %rcx - andq %r13, %rcx - shrq $44, %r13 - movq %rcx, %r14 - addq %r13, %r8 - movq $0x3ffffffffff, %r13 - andq %r8, %r12 - andq %r13, %r9 - shrq $44, %r8 - movq %r12, %r11 - addq %r8, %r9 - movq %r12, %rax - movq %r9, %r13 - movl %ecx, %r8d - shrq $26, %r14 - andl $67108863, %r8d - shlq $18, %r11 - shrq $34, %rax - orq %r11, %r14 - shlq $10, %r13 - movq %r12, %r11 - orq %r13, %rax - movq %r9, %r13 - shrq $8, %r11 - shrq $16, %r13 - andl $67108863, %r14d - andl $67108863, %r11d - andl $67108863, %eax - movl %r8d, 88(%rdi) - cmpq $64, %r10 - movl %r8d, 80(%rdi) - movl %r14d, 104(%rdi) - movl %r14d, 96(%rdi) - movl %r11d, 120(%rdi) - movl %r11d, 112(%rdi) - movl %eax, 136(%rdi) - movl %eax, 128(%rdi) - movl %r13d, 152(%rdi) - movl %r13d, 144(%rdi) - jbe .Lpoly1305_init_ext_sse2_done - lea (%r9,%r9,4), %r14 - shlq $2, %r14 - lea (%r12,%r12), %rax - mulq %r14 - movq %rax, %r8 - movq %rcx, %rax - movq %rdx, %r10 - mulq %rcx - addq %rax, %r8 - lea (%rcx,%rcx), %rax - movq %r8, %r11 - adcq %rdx, %r10 - andq %rsi, %r8 - mulq %r12 - shlq $20, %r10 - movq %rax, %r13 - shrq $44, %r11 - movq %r9, %rax - orq %r11, %r10 - movq %rdx, %r11 - mulq %r14 - addq %rax, %r13 - movq %rcx, %rax - adcq %rdx, %r11 - addq %r9, %r9 - mulq %r9 - addq %r13, %r10 - movq %rax, 
%r9 - movq %r12, %rax - movq %rdx, %rcx - adcq $0, %r11 - mulq %r12 - addq %rax, %r9 - movq %r10, %r13 - adcq %rdx, %rcx - andq %rsi, %r10 - shlq $20, %r11 - shrq $44, %r13 - orq %r13, %r11 - addq %r9, %r11 - movq %rsi, %r9 - movq %r11, %r12 - adcq $0, %rcx - shlq $22, %rcx - shrq $42, %r12 - orq %r12, %rcx - lea (%rcx,%rcx,4), %rcx - addq %rcx, %r8 - andq %r8, %r9 - shrq $44, %r8 - movl %r9d, %eax - addq %r8, %r10 - movq $0x3ffffffffff, %r8 - andq %r10, %rsi - andq %r8, %r11 - shrq $44, %r10 - movq %rsi, %r8 - addq %r10, %r11 - andl $67108863, %eax - shrq $26, %r9 - movq %r11, %r10 - shlq $18, %r8 - shlq $10, %r10 - orq %r8, %r9 - movq %rsi, %r8 - shrq $34, %rsi - andl $67108863, %r9d - shrq $8, %r8 - orq %r10, %rsi - shrq $16, %r11 - andl $67108863, %r8d - andl $67108863, %esi - movl %eax, 168(%rdi) - movl %eax, 160(%rdi) - movl %r9d, 184(%rdi) - movl %r9d, 176(%rdi) - movl %r8d, 200(%rdi) - movl %r8d, 192(%rdi) - movl %esi, 216(%rdi) - movl %esi, 208(%rdi) - movl %r11d, 232(%rdi) - movl %r11d, 224(%rdi) -.Lpoly1305_init_ext_sse2_done: - movq $0, 240(%rdi) - popq %r14 - popq %r13 - popq %r12 - ret -ELF(.size _gcry_poly1305_amd64_sse2_init_ext,.-_gcry_poly1305_amd64_sse2_init_ext;) - - -.align 8 -.globl _gcry_poly1305_amd64_sse2_finish_ext -ELF(.type _gcry_poly1305_amd64_sse2_finish_ext, at function;) -_gcry_poly1305_amd64_sse2_finish_ext: -.Lpoly1305_finish_ext_x86_local: - pushq %rbp - movq %rsp, %rbp - subq $64, %rsp - andq $~63, %rsp - movq %rdx, 32(%rsp) - movq %rcx, 40(%rsp) - andq %rdx, %rdx - jz .Lpoly1305_finish_x86_no_leftover - pxor %xmm0, %xmm0 - movdqa %xmm0, 0+0(%rsp) - movdqa %xmm0, 16+0(%rsp) - leaq 0(%rsp), %r8 - testq $16, %rdx - jz .Lpoly1305_finish_x86_skip16 - movdqu 0(%rsi), %xmm0 - movdqa %xmm0, 0(%r8) - addq $16, %rsi - addq $16, %r8 -.Lpoly1305_finish_x86_skip16: - testq $8, %rdx - jz .Lpoly1305_finish_x86_skip8 - movq 0(%rsi), %rax - movq %rax, 0(%r8) - addq $8, %rsi - addq $8, %r8 -.Lpoly1305_finish_x86_skip8: - testq $4, %rdx - jz .Lpoly1305_finish_x86_skip4 - movl 0(%rsi), %eax - movl %eax, 0(%r8) - addq $4, %rsi - addq $4, %r8 -.Lpoly1305_finish_x86_skip4: - testq $2, %rdx - jz .Lpoly1305_finish_x86_skip2 - movw 0(%rsi), %ax - movw %ax, 0(%r8) - addq $2, %rsi - addq $2, %r8 -.Lpoly1305_finish_x86_skip2: - testq $1, %rdx - jz .Lpoly1305_finish_x86_skip1 - movb 0(%rsi), %al - movb %al, 0(%r8) - addq $1, %r8 -.Lpoly1305_finish_x86_skip1: - cmpq $16, %rdx - je .Lpoly1305_finish_x86_is16 - movb $1, 0(%r8) -.Lpoly1305_finish_x86_is16: - movq $4, %rax - jae .Lpoly1305_finish_x86_16andover - movq $8, %rax -.Lpoly1305_finish_x86_16andover: - orq %rax, 240(%rdi) - leaq 0(%rsp), %rsi - movq $32, %rdx - callq .Lpoly1305_blocks_x86_local -.Lpoly1305_finish_x86_no_leftover: - testq $1, 240(%rdi) - jz .Lpoly1305_finish_x86_not_started - movq 32(%rsp), %rdx - andq %rdx, %rdx - jz .Lpoly1305_finish_x86_r2r - cmpq $16, %rdx - jg .Lpoly1305_finish_x86_r2r - xorl %r10d, %r10d - movl 84(%rdi), %eax - movl 92(%rdi), %ecx - movl 100(%rdi), %edx - movl 108(%rdi), %r8d - movl 116(%rdi), %r9d - movl %eax, 80(%rdi) - movl $1, 8+80(%rdi) - movl %ecx, 96(%rdi) - movl %r10d, 8+96(%rdi) - movl %edx, 112(%rdi) - movl %r10d, 8+112(%rdi) - movl %r8d, 128(%rdi) - movl %r10d, 8+128(%rdi) - movl %r9d, 144(%rdi) - movl %r10d, 8+144(%rdi) - jmp .Lpoly1305_finish_x86_combine -.Lpoly1305_finish_x86_r2r: - movl 84(%rdi), %eax - movl 92(%rdi), %ecx - movl 100(%rdi), %edx - movl 108(%rdi), %r8d - movl 116(%rdi), %r9d - movl %eax, 8+80(%rdi) - movl %ecx, 8+96(%rdi) - movl %edx, 8+112(%rdi) - movl %r8d, 
8+128(%rdi) - movl %r9d, 8+144(%rdi) -.Lpoly1305_finish_x86_combine: - xorq %rsi, %rsi - movq $32, %rdx - callq .Lpoly1305_blocks_x86_local -.Lpoly1305_finish_x86_not_started: - movq 0(%rdi), %r8 - movq 8(%rdi), %r9 - movq %r9, %r10 - movq 16(%rdi), %r11 - shlq $44, %r9 - shrq $20, %r10 - shlq $24, %r11 - orq %r9, %r8 - orq %r11, %r10 - pxor %xmm0, %xmm0 - movl 124(%rdi), %eax - movl 132(%rdi), %ecx - movl 140(%rdi), %edx - movl 148(%rdi), %esi - movq 40(%rsp), %r11 - shlq $32, %rcx - shlq $32, %rsi - orq %rcx, %rax - orq %rsi, %rdx - addq %r8, %rax - adcq %r10, %rdx - movq %rax, 0(%r11) - movq %rdx, 8(%r11) - movq %rbp, %rax - subq %rsp, %rax - movq %rbp, %rsp - movdqa %xmm0, 0(%rdi) - movdqa %xmm0, 16(%rdi) - movdqa %xmm0, 32(%rdi) - movdqa %xmm0, 48(%rdi) - movdqa %xmm0, 64(%rdi) - movdqa %xmm0, 80(%rdi) - movdqa %xmm0, 96(%rdi) - movdqa %xmm0, 112(%rdi) - movdqa %xmm0, 128(%rdi) - movdqa %xmm0, 144(%rdi) - movdqa %xmm0, 160(%rdi) - movdqa %xmm0, 176(%rdi) - movdqa %xmm0, 192(%rdi) - movdqa %xmm0, 208(%rdi) - movdqa %xmm0, 224(%rdi) - popq %rbp - addq $8, %rax - ret -ELF(.size _gcry_poly1305_amd64_sse2_finish_ext,.-_gcry_poly1305_amd64_sse2_finish_ext;) - - -.align 8 -.globl _gcry_poly1305_amd64_sse2_blocks -ELF(.type _gcry_poly1305_amd64_sse2_blocks, at function;) -_gcry_poly1305_amd64_sse2_blocks: -.Lpoly1305_blocks_x86_local: - pushq %rbp - movq %rsp, %rbp - pushq %rbx - andq $-64, %rsp - subq $328, %rsp - movq 240(%rdi), %rax - movl $(1<<24), %r8d - movl $((1<<26)-1), %r9d - movd %r8, %xmm0 - movd %r9, %xmm5 - pshufd $0x44, %xmm0, %xmm0 - pshufd $0x44, %xmm5, %xmm5 - testb $4, %al - je .Lpoly1305_blocks_x86_3 - psrldq $8, %xmm0 -.Lpoly1305_blocks_x86_3: - testb $8, %al - je .Lpoly1305_blocks_x86_4 - pxor %xmm0, %xmm0 -.Lpoly1305_blocks_x86_4: - movdqa %xmm0, 168(%rsp) - testb $1, %al - jne .Lpoly1305_blocks_x86_5 - movq 16(%rsi), %xmm0 - movdqa %xmm5, %xmm7 - movdqa %xmm5, %xmm10 - movq (%rsi), %xmm6 - orq $1, %rax - subq $32, %rdx - movq 8(%rsi), %xmm1 - punpcklqdq %xmm0, %xmm6 - movq 24(%rsi), %xmm0 - pand %xmm6, %xmm7 - movdqa %xmm6, %xmm9 - psrlq $52, %xmm6 - addq $32, %rsi - punpcklqdq %xmm0, %xmm1 - movdqa %xmm1, %xmm0 - psrlq $26, %xmm9 - psllq $12, %xmm0 - movq %rax, 240(%rdi) - pand %xmm5, %xmm9 - por %xmm0, %xmm6 - psrlq $40, %xmm1 - pand %xmm6, %xmm10 - por 168(%rsp), %xmm1 - psrlq $26, %xmm6 - pand %xmm5, %xmm6 -.Lpoly1305_blocks_x86_6: - movdqa 80(%rdi), %xmm13 - cmpq $63, %rdx - movl $(5), %r8d - movd %r8, %xmm14 - pshufd $0x44, %xmm14, %xmm14 - movdqa 96(%rdi), %xmm15 - movdqa %xmm13, -8(%rsp) - movdqa 112(%rdi), %xmm0 - movdqa %xmm14, 136(%rsp) - movdqa 128(%rdi), %xmm3 - movdqa %xmm15, 312(%rsp) - pmuludq %xmm14, %xmm15 - movdqa 144(%rdi), %xmm13 - movdqa %xmm0, 232(%rsp) - pmuludq %xmm14, %xmm0 - movdqa %xmm3, 152(%rsp) - pmuludq %xmm14, %xmm3 - movdqa %xmm13, 56(%rsp) - pmuludq %xmm14, %xmm13 - movdqa %xmm15, 40(%rsp) - movdqa %xmm0, -24(%rsp) - movdqa %xmm3, -40(%rsp) - movdqa %xmm13, -56(%rsp) - jbe .Lpoly1305_blocks_x86_7 - movdqa 192(%rdi), %xmm15 - leaq 32(%rsi), %rax - movq %rdx, %rcx - movdqa 176(%rdi), %xmm14 - movdqa %xmm15, %xmm2 - movdqa 208(%rdi), %xmm0 - movdqa %xmm15, 216(%rsp) - movdqa %xmm14, 296(%rsp) - movdqa 224(%rdi), %xmm3 - pmuludq 136(%rsp), %xmm14 - movdqa -24(%rsp), %xmm13 - movdqa %xmm14, 8(%rsp) - pmuludq 136(%rsp), %xmm2 - movdqa -40(%rsp), %xmm14 - movdqa %xmm0, 120(%rsp) - pmuludq 136(%rsp), %xmm0 - movdqa %xmm3, 24(%rsp) - movdqa 160(%rdi), %xmm12 - movdqa %xmm0, %xmm8 - movdqa -56(%rsp), %xmm15 - movdqa %xmm13, 88(%rsp) - pmuludq 
136(%rsp), %xmm3 - movdqa %xmm2, 104(%rsp) - movdqa %xmm0, %xmm13 - movdqa -8(%rsp), %xmm11 - movdqa %xmm3, 280(%rsp) - movdqa %xmm2, %xmm3 - movdqa %xmm0, 200(%rsp) - movdqa %xmm14, 184(%rsp) - movdqa %xmm15, 264(%rsp) - jmp .Lpoly1305_blocks_x86_8 -.p2align 6,,63 -.Lpoly1305_blocks_x86_13: - movdqa 200(%rsp), %xmm13 - movdqa %xmm3, %xmm6 - movdqa 200(%rsp), %xmm8 - movdqa 104(%rsp), %xmm3 -.Lpoly1305_blocks_x86_8: - movdqa 8(%rsp), %xmm4 - pmuludq %xmm6, %xmm3 - subq $64, %rcx - pmuludq %xmm10, %xmm8 - movdqa 104(%rsp), %xmm2 - movdqa 200(%rsp), %xmm0 - pmuludq %xmm1, %xmm4 - movdqa 280(%rsp), %xmm15 - pmuludq %xmm6, %xmm13 - movdqa 280(%rsp), %xmm14 - pmuludq %xmm1, %xmm0 - paddq %xmm3, %xmm4 - pmuludq %xmm1, %xmm2 - movdqa 280(%rsp), %xmm3 - paddq %xmm8, %xmm4 - pmuludq %xmm9, %xmm15 - movdqa 280(%rsp), %xmm8 - pmuludq %xmm10, %xmm14 - pmuludq %xmm6, %xmm8 - paddq %xmm13, %xmm2 - movdqa %xmm6, %xmm13 - pmuludq %xmm1, %xmm3 - paddq %xmm15, %xmm4 - movdqa 296(%rsp), %xmm15 - pmuludq %xmm12, %xmm13 - paddq %xmm14, %xmm2 - movdqa %xmm7, %xmm14 - paddq %xmm8, %xmm0 - pmuludq %xmm12, %xmm14 - movdqa %xmm9, %xmm8 - pmuludq 296(%rsp), %xmm6 - pmuludq %xmm12, %xmm8 - movdqa %xmm6, 248(%rsp) - pmuludq %xmm10, %xmm15 - movq -16(%rax), %xmm6 - paddq %xmm13, %xmm3 - movdqa %xmm10, %xmm13 - paddq %xmm14, %xmm4 - movq -8(%rax), %xmm14 - paddq %xmm8, %xmm2 - movq -32(%rax), %xmm8 - pmuludq %xmm12, %xmm13 - paddq %xmm15, %xmm3 - pmuludq %xmm12, %xmm1 - movdqa 216(%rsp), %xmm15 - pmuludq 216(%rsp), %xmm10 - punpcklqdq %xmm6, %xmm8 - movq -24(%rax), %xmm6 - pmuludq %xmm9, %xmm15 - paddq %xmm13, %xmm0 - movdqa 296(%rsp), %xmm13 - paddq 248(%rsp), %xmm1 - punpcklqdq %xmm14, %xmm6 - movdqa 296(%rsp), %xmm14 - pmuludq %xmm9, %xmm13 - pmuludq 120(%rsp), %xmm9 - movdqa %xmm15, 72(%rsp) - paddq %xmm10, %xmm1 - movdqa 216(%rsp), %xmm15 - pmuludq %xmm7, %xmm14 - movdqa %xmm6, %xmm10 - paddq %xmm9, %xmm1 - pmuludq %xmm7, %xmm15 - paddq %xmm13, %xmm0 - paddq 72(%rsp), %xmm3 - movdqa 120(%rsp), %xmm13 - psllq $12, %xmm10 - paddq %xmm14, %xmm2 - movdqa %xmm5, %xmm14 - pand %xmm8, %xmm14 - pmuludq %xmm7, %xmm13 - paddq %xmm15, %xmm0 - movdqa %xmm14, 248(%rsp) - movdqa %xmm8, %xmm14 - psrlq $52, %xmm8 - movdqu (%rax), %xmm9 - por %xmm10, %xmm8 - pmuludq 24(%rsp), %xmm7 - movdqu 16(%rax), %xmm10 - paddq %xmm13, %xmm3 - pxor %xmm13, %xmm13 - movdqa %xmm9, %xmm15 - paddq %xmm7, %xmm1 - movdqa %xmm6, %xmm7 - movdqa %xmm10, -72(%rsp) - punpckldq %xmm10, %xmm15 - movdqa %xmm15, %xmm10 - punpckldq %xmm13, %xmm10 - punpckhdq -72(%rsp), %xmm9 - psrlq $40, %xmm6 - movdqa %xmm10, 72(%rsp) - movdqa %xmm9, %xmm10 - punpckhdq %xmm13, %xmm9 - psllq $18, %xmm9 - paddq 72(%rsp), %xmm4 - addq $64, %rax - paddq %xmm9, %xmm3 - movdqa 40(%rsp), %xmm9 - cmpq $63, %rcx - punpckhdq %xmm13, %xmm15 - psllq $6, %xmm15 - punpckldq %xmm13, %xmm10 - paddq %xmm15, %xmm2 - psllq $12, %xmm10 - por 168(%rsp), %xmm6 - pmuludq %xmm6, %xmm9 - movdqa 88(%rsp), %xmm15 - paddq %xmm10, %xmm0 - movdqa 88(%rsp), %xmm13 - psrlq $14, %xmm7 - pand %xmm5, %xmm8 - movdqa 184(%rsp), %xmm10 - pand %xmm5, %xmm7 - pmuludq %xmm7, %xmm15 - paddq %xmm9, %xmm4 - pmuludq %xmm6, %xmm13 - movdqa 184(%rsp), %xmm9 - paddq 168(%rsp), %xmm1 - pmuludq %xmm7, %xmm10 - pmuludq %xmm6, %xmm9 - paddq %xmm15, %xmm4 - movdqa 184(%rsp), %xmm15 - paddq %xmm13, %xmm2 - psrlq $26, %xmm14 - movdqa 264(%rsp), %xmm13 - paddq %xmm10, %xmm2 - pmuludq %xmm8, %xmm15 - pand %xmm5, %xmm14 - paddq %xmm9, %xmm0 - pmuludq %xmm6, %xmm13 - movdqa 264(%rsp), %xmm9 - movdqa 264(%rsp), %xmm10 - pmuludq 
%xmm11, %xmm6 - pmuludq %xmm8, %xmm9 - paddq %xmm15, %xmm4 - movdqa 264(%rsp), %xmm15 - pmuludq %xmm14, %xmm10 - paddq %xmm13, %xmm3 - movdqa %xmm7, %xmm13 - pmuludq %xmm7, %xmm15 - paddq %xmm6, %xmm1 - movdqa 312(%rsp), %xmm6 - paddq %xmm9, %xmm2 - pmuludq %xmm11, %xmm13 - movdqa 248(%rsp), %xmm9 - paddq %xmm10, %xmm4 - pmuludq %xmm8, %xmm6 - pmuludq 312(%rsp), %xmm7 - paddq %xmm15, %xmm0 - movdqa %xmm9, %xmm10 - movdqa %xmm14, %xmm15 - pmuludq %xmm11, %xmm10 - paddq %xmm13, %xmm3 - movdqa %xmm8, %xmm13 - pmuludq %xmm11, %xmm13 - paddq %xmm6, %xmm3 - paddq %xmm7, %xmm1 - movdqa 232(%rsp), %xmm6 - pmuludq %xmm11, %xmm15 - pmuludq 232(%rsp), %xmm8 - paddq %xmm10, %xmm4 - paddq %xmm8, %xmm1 - movdqa 312(%rsp), %xmm10 - paddq %xmm13, %xmm0 - pmuludq %xmm14, %xmm6 - movdqa 312(%rsp), %xmm13 - pmuludq %xmm9, %xmm10 - paddq %xmm15, %xmm2 - movdqa 232(%rsp), %xmm7 - pmuludq %xmm14, %xmm13 - pmuludq 152(%rsp), %xmm14 - paddq %xmm14, %xmm1 - pmuludq %xmm9, %xmm7 - paddq %xmm6, %xmm3 - paddq %xmm10, %xmm2 - movdqa 152(%rsp), %xmm10 - paddq %xmm13, %xmm0 - pmuludq %xmm9, %xmm10 - paddq %xmm7, %xmm0 - movdqa %xmm4, %xmm7 - psrlq $26, %xmm7 - pmuludq 56(%rsp), %xmm9 - pand %xmm5, %xmm4 - paddq %xmm7, %xmm2 - paddq %xmm9, %xmm1 - paddq %xmm10, %xmm3 - movdqa %xmm2, %xmm7 - movdqa %xmm2, %xmm9 - movdqa %xmm3, %xmm6 - psrlq $26, %xmm7 - pand %xmm5, %xmm3 - psrlq $26, %xmm6 - paddq %xmm7, %xmm0 - pand %xmm5, %xmm9 - paddq %xmm6, %xmm1 - movdqa %xmm0, %xmm10 - movdqa %xmm1, %xmm6 - pand %xmm5, %xmm10 - pand %xmm5, %xmm1 - psrlq $26, %xmm6 - pmuludq 136(%rsp), %xmm6 - paddq %xmm6, %xmm4 - movdqa %xmm0, %xmm6 - psrlq $26, %xmm6 - movdqa %xmm4, %xmm2 - movdqa %xmm4, %xmm7 - paddq %xmm6, %xmm3 - psrlq $26, %xmm2 - pand %xmm5, %xmm7 - movdqa %xmm3, %xmm0 - paddq %xmm2, %xmm9 - pand %xmm5, %xmm3 - psrlq $26, %xmm0 - paddq %xmm0, %xmm1 - ja .Lpoly1305_blocks_x86_13 - leaq -64(%rdx), %rax - movdqa %xmm3, %xmm6 - andl $63, %edx - andq $-64, %rax - leaq 64(%rsi,%rax), %rsi -.Lpoly1305_blocks_x86_7: - cmpq $31, %rdx - jbe .Lpoly1305_blocks_x86_9 - movdqa -24(%rsp), %xmm13 - movdqa %xmm6, %xmm0 - movdqa %xmm6, %xmm3 - movdqa 40(%rsp), %xmm11 - movdqa %xmm1, %xmm12 - testq %rsi, %rsi - movdqa -40(%rsp), %xmm2 - pmuludq %xmm13, %xmm0 - movdqa %xmm1, %xmm8 - pmuludq %xmm1, %xmm11 - movdqa %xmm10, %xmm4 - movdqa %xmm1, %xmm14 - pmuludq %xmm2, %xmm3 - movdqa %xmm6, %xmm15 - pmuludq %xmm1, %xmm13 - movdqa %xmm7, %xmm1 - pmuludq %xmm2, %xmm12 - paddq %xmm0, %xmm11 - movdqa -56(%rsp), %xmm0 - pmuludq %xmm10, %xmm2 - paddq %xmm3, %xmm13 - pmuludq %xmm0, %xmm4 - movdqa %xmm9, %xmm3 - pmuludq %xmm0, %xmm3 - paddq %xmm2, %xmm11 - pmuludq %xmm0, %xmm8 - movdqa %xmm6, %xmm2 - pmuludq %xmm0, %xmm2 - movdqa -8(%rsp), %xmm0 - paddq %xmm4, %xmm13 - movdqa 312(%rsp), %xmm4 - paddq %xmm3, %xmm11 - pmuludq 312(%rsp), %xmm6 - movdqa 312(%rsp), %xmm3 - pmuludq %xmm0, %xmm1 - paddq %xmm2, %xmm12 - pmuludq %xmm0, %xmm15 - movdqa %xmm9, %xmm2 - pmuludq %xmm0, %xmm2 - pmuludq %xmm7, %xmm3 - paddq %xmm1, %xmm11 - movdqa 232(%rsp), %xmm1 - pmuludq %xmm0, %xmm14 - paddq %xmm15, %xmm8 - pmuludq %xmm10, %xmm0 - paddq %xmm2, %xmm13 - movdqa 312(%rsp), %xmm2 - pmuludq %xmm10, %xmm4 - paddq %xmm3, %xmm13 - movdqa 152(%rsp), %xmm3 - pmuludq %xmm9, %xmm2 - paddq %xmm6, %xmm14 - pmuludq 232(%rsp), %xmm10 - paddq %xmm0, %xmm12 - pmuludq %xmm9, %xmm1 - paddq %xmm10, %xmm14 - movdqa 232(%rsp), %xmm0 - pmuludq %xmm7, %xmm3 - paddq %xmm4, %xmm8 - pmuludq 152(%rsp), %xmm9 - paddq %xmm2, %xmm12 - paddq %xmm9, %xmm14 - pmuludq %xmm7, %xmm0 - paddq %xmm1, %xmm8 - 
pmuludq 56(%rsp), %xmm7 - paddq %xmm3, %xmm8 - paddq %xmm7, %xmm14 - paddq %xmm0, %xmm12 - je .Lpoly1305_blocks_x86_10 - movdqu (%rsi), %xmm1 - pxor %xmm0, %xmm0 - paddq 168(%rsp), %xmm14 - movdqu 16(%rsi), %xmm2 - movdqa %xmm1, %xmm3 - punpckldq %xmm2, %xmm3 - punpckhdq %xmm2, %xmm1 - movdqa %xmm3, %xmm4 - movdqa %xmm1, %xmm2 - punpckldq %xmm0, %xmm4 - punpckhdq %xmm0, %xmm3 - punpckhdq %xmm0, %xmm1 - punpckldq %xmm0, %xmm2 - movdqa %xmm2, %xmm0 - psllq $6, %xmm3 - paddq %xmm4, %xmm11 - psllq $12, %xmm0 - paddq %xmm3, %xmm13 - psllq $18, %xmm1 - paddq %xmm0, %xmm12 - paddq %xmm1, %xmm8 -.Lpoly1305_blocks_x86_10: - movdqa %xmm11, %xmm9 - movdqa %xmm8, %xmm1 - movdqa %xmm11, %xmm7 - psrlq $26, %xmm9 - movdqa %xmm8, %xmm6 - pand %xmm5, %xmm7 - paddq %xmm13, %xmm9 - psrlq $26, %xmm1 - pand %xmm5, %xmm6 - movdqa %xmm9, %xmm10 - paddq %xmm14, %xmm1 - pand %xmm5, %xmm9 - psrlq $26, %xmm10 - movdqa %xmm1, %xmm0 - pand %xmm5, %xmm1 - paddq %xmm12, %xmm10 - psrlq $26, %xmm0 - pmuludq 136(%rsp), %xmm0 - movdqa %xmm10, %xmm2 - paddq %xmm0, %xmm7 - psrlq $26, %xmm2 - movdqa %xmm7, %xmm0 - pand %xmm5, %xmm10 - paddq %xmm2, %xmm6 - psrlq $26, %xmm0 - pand %xmm5, %xmm7 - movdqa %xmm6, %xmm2 - paddq %xmm0, %xmm9 - pand %xmm5, %xmm6 - psrlq $26, %xmm2 - paddq %xmm2, %xmm1 -.Lpoly1305_blocks_x86_9: - testq %rsi, %rsi - je .Lpoly1305_blocks_x86_11 - movdqa %xmm7, 0(%rdi) - movdqa %xmm9, 16(%rdi) - movdqa %xmm10, 32(%rdi) - movdqa %xmm6, 48(%rdi) - movdqa %xmm1, 64(%rdi) - movq -8(%rbp), %rbx - leave - ret -.Lpoly1305_blocks_x86_5: - movdqa 0(%rdi), %xmm7 - movdqa 16(%rdi), %xmm9 - movdqa 32(%rdi), %xmm10 - movdqa 48(%rdi), %xmm6 - movdqa 64(%rdi), %xmm1 - jmp .Lpoly1305_blocks_x86_6 -.Lpoly1305_blocks_x86_11: - movdqa %xmm7, %xmm0 - movdqa %xmm9, %xmm2 - movdqa %xmm6, %xmm3 - psrldq $8, %xmm0 - movabsq $4398046511103, %rbx - paddq %xmm0, %xmm7 - psrldq $8, %xmm2 - movdqa %xmm10, %xmm0 - movd %xmm7, %edx - paddq %xmm2, %xmm9 - psrldq $8, %xmm0 - movl %edx, %ecx - movd %xmm9, %eax - paddq %xmm0, %xmm10 - shrl $26, %ecx - psrldq $8, %xmm3 - movdqa %xmm1, %xmm0 - addl %ecx, %eax - movd %xmm10, %ecx - paddq %xmm3, %xmm6 - movl %eax, %r9d - shrl $26, %eax - psrldq $8, %xmm0 - addl %ecx, %eax - movd %xmm6, %ecx - paddq %xmm0, %xmm1 - movl %eax, %esi - andl $67108863, %r9d - movd %xmm1, %r10d - shrl $26, %esi - andl $67108863, %eax - andl $67108863, %edx - addl %ecx, %esi - salq $8, %rax - movl %r9d, %ecx - shrl $18, %r9d - movl %esi, %r8d - shrl $26, %esi - andl $67108863, %r8d - addl %r10d, %esi - orq %r9, %rax - salq $16, %rsi - movq %r8, %r9 - shrl $10, %r8d - salq $26, %rcx - orq %r8, %rsi - salq $34, %r9 - orq %rdx, %rcx - movq %rsi, %r8 - shrq $42, %rsi - movabsq $17592186044415, %rdx - orq %r9, %rax - andq %rbx, %r8 - leaq (%rsi,%rsi,4), %rsi - andq %rdx, %rcx - andq %rdx, %rax - movabsq $-4398046511104, %r10 - addq %rsi, %rcx - movq %rcx, %rsi - shrq $44, %rcx - addq %rcx, %rax - andq %rdx, %rsi - movq %rax, %rcx - shrq $44, %rax - addq %r8, %rax - andq %rdx, %rcx - andq %rax, %rbx - shrq $42, %rax - leaq (%rsi,%rax,4), %rsi - addq %rbx, %r10 - addq %rax, %rsi - movq %rsi, %r8 - shrq $44, %rsi - andq %rdx, %r8 - addq %rcx, %rsi - leaq 5(%r8), %r9 - movq %r9, %r11 - andq %rdx, %r9 - shrq $44, %r11 - addq %rsi, %r11 - movq %r11, %rax - andq %r11, %rdx - shrq $44, %rax - addq %rax, %r10 - movq %r10, %rax - shrq $63, %rax - subq $1, %rax - movq %rax, %rcx - andq %rax, %r9 - andq %rax, %rdx - notq %rcx - andq %r10, %rax - andq %rcx, %r8 - andq %rcx, %rsi - andq %rbx, %rcx - orq %r9, %r8 - orq %rdx, %rsi - orq 
%rax, %rcx - movq %r8, 0(%rdi) - movq %rsi, 8(%rdi) - movq %rcx, 16(%rdi) - movq -8(%rbp), %rbx - movq %rbp, %rax - subq %rsp, %rax - pxor %xmm15, %xmm15 - pxor %xmm7, %xmm7 - pxor %xmm14, %xmm14 - pxor %xmm6, %xmm6 - pxor %xmm13, %xmm13 - pxor %xmm5, %xmm5 - pxor %xmm12, %xmm12 - pxor %xmm4, %xmm4 - leave - addq $8, %rax - pxor %xmm11, %xmm11 - pxor %xmm3, %xmm3 - pxor %xmm10, %xmm10 - pxor %xmm2, %xmm2 - pxor %xmm9, %xmm9 - pxor %xmm1, %xmm1 - pxor %xmm8, %xmm8 - pxor %xmm0, %xmm0 - ret -ELF(.size _gcry_poly1305_amd64_sse2_blocks,.-_gcry_poly1305_amd64_sse2_blocks;) - -#endif diff --git a/cipher/poly1305.c b/cipher/poly1305.c index 22255fb15..68d9b9015 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -1,5 +1,5 @@ /* poly1305.c - Poly1305 internals and generic implementation - * Copyright (C) 2014 Jussi Kivilinna + * Copyright (C) 2014,2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -17,11 +17,6 @@ * License along with this program; if not, see . */ -/* The code is based on public-domain Poly1305 implementation by - * Andrew Moon at - * https://github.com/floodyberry/poly1305-opt - */ - #include #include #include @@ -33,157 +28,325 @@ #include "bufhelp.h" #include "poly1305-internal.h" +#include "mpi-internal.h" +#include "longlong.h" + static const char *selftest (void); - - - -#ifdef POLY1305_USE_SSE2 - -void _gcry_poly1305_amd64_sse2_init_ext(void *state, const poly1305_key_t *key) - OPS_FUNC_ABI; -unsigned int _gcry_poly1305_amd64_sse2_finish_ext(void *state, const byte *m, - size_t remaining, - byte mac[16]) OPS_FUNC_ABI; -unsigned int _gcry_poly1305_amd64_sse2_blocks(void *ctx, const byte *m, - size_t bytes) OPS_FUNC_ABI; - -static const poly1305_ops_t poly1305_amd64_sse2_ops = { - POLY1305_SSE2_BLOCKSIZE, - _gcry_poly1305_amd64_sse2_init_ext, - _gcry_poly1305_amd64_sse2_blocks, - _gcry_poly1305_amd64_sse2_finish_ext -}; - -#else /* !POLY1305_USE_SSE2 */ - -static OPS_FUNC_ABI void poly1305_init_ext_ref32 -/**/ (void *state, const poly1305_key_t *key); -static OPS_FUNC_ABI unsigned int poly1305_blocks_ref32 -/**/ (void *state, const byte *m, size_t bytes); -static OPS_FUNC_ABI unsigned int poly1305_finish_ext_ref32 -/**/ (void *state, const byte * m, - size_t remaining, byte mac[POLY1305_TAGLEN]); - -static const poly1305_ops_t poly1305_default_ops = { - POLY1305_REF_BLOCKSIZE, - poly1305_init_ext_ref32, - poly1305_blocks_ref32, - poly1305_finish_ext_ref32 -}; - -#endif /* !POLY1305_USE_SSE2 */ - - -#ifdef POLY1305_USE_AVX2 - -void _gcry_poly1305_amd64_avx2_init_ext(void *state, const poly1305_key_t *key) - OPS_FUNC_ABI; -unsigned int _gcry_poly1305_amd64_avx2_finish_ext(void *state, const byte *m, - size_t remaining, - byte mac[16]) OPS_FUNC_ABI; -unsigned int _gcry_poly1305_amd64_avx2_blocks(void *ctx, const byte *m, - size_t bytes) OPS_FUNC_ABI; - -static const poly1305_ops_t poly1305_amd64_avx2_ops = { - POLY1305_AVX2_BLOCKSIZE, - _gcry_poly1305_amd64_avx2_init_ext, - _gcry_poly1305_amd64_avx2_blocks, - _gcry_poly1305_amd64_avx2_finish_ext -}; + +#undef USE_MPI_64BIT +#undef USE_MPI_32BIT +#if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_U64_TYPEDEF) +# define USE_MPI_64BIT 1 +#elif BYTES_PER_MPI_LIMB == 4 +# define USE_MPI_32BIT 1 +#else +# error please implement for this limb size. 
#endif -#ifdef POLY1305_USE_NEON +static void poly1305_init (poly1305_context_t *ctx, + const byte key[POLY1305_KEYLEN]) +{ + POLY1305_STATE *st = &ctx->state; -void _gcry_poly1305_armv7_neon_init_ext(void *state, const poly1305_key_t *key) - OPS_FUNC_ABI; -unsigned int _gcry_poly1305_armv7_neon_finish_ext(void *state, const byte *m, - size_t remaining, - byte mac[16]) OPS_FUNC_ABI; -unsigned int _gcry_poly1305_armv7_neon_blocks(void *ctx, const byte *m, - size_t bytes) OPS_FUNC_ABI; + ctx->leftover = 0; -static const poly1305_ops_t poly1305_armv7_neon_ops = { - POLY1305_NEON_BLOCKSIZE, - _gcry_poly1305_armv7_neon_init_ext, - _gcry_poly1305_armv7_neon_blocks, - _gcry_poly1305_armv7_neon_finish_ext -}; + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; -#endif + st->r[0] = buf_get_le32(key + 0) & 0x0fffffff; + st->r[1] = buf_get_le32(key + 4) & 0x0ffffffc; + st->r[2] = buf_get_le32(key + 8) & 0x0ffffffc; + st->r[3] = buf_get_le32(key + 12) & 0x0ffffffc; + st->k[0] = buf_get_le32(key + 16); + st->k[1] = buf_get_le32(key + 20); + st->k[2] = buf_get_le32(key + 24); + st->k[3] = buf_get_le32(key + 28); +} -/* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit - * multiplication and 64 bit addition. - */ -typedef struct poly1305_state_ref32_s +#ifdef USE_MPI_64BIT + +#if defined (__aarch64__) && __GNUC__ >= 4 + +/* A += B (armv8/aarch64) */ +#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ + __asm__ ("adds %0, %3, %0\n" \ + "adcs %1, %4, %1\n" \ + "adc %2, %5, %2\n" \ + : "+r" (A0), "+r" (A1), "+r" (A2) \ + : "r" (B0), "r" (B1), "r" (B2) \ + : "cc" ) + +#endif /* __aarch64__ */ + +#if defined (__x86_64__) && __GNUC__ >= 4 + +/* A += B (x86-64) */ +#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ + __asm__ ("addq %3, %0\n" \ + "adcq %4, %1\n" \ + "adcq %5, %2\n" \ + : "+r" (A0), "+r" (A1), "+r" (A2) \ + : "g" (B0), "g" (B1), "g" (B2) \ + : "cc" ) + +#endif /* __x86_64__ */ + +#ifndef ADD_1305_64 +/* A += B (generic, mpi) */ +# define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \ + u64 carry; \ + add_ssaaaa(carry, A0, 0, A0, 0, B0); \ + add_ssaaaa(A2, A1, A2, A1, B2, B1); \ + add_ssaaaa(A2, A1, A2, A1, 0, carry); \ + } while (0) +#endif + +/* H = H * R mod 2???-5 */ +#define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \ + u64 x0_lo, x0_hi, x1_lo, x1_hi; \ + u64 t0_lo, t0_hi, t1_lo, t1_hi; \ + \ + /* x = a * r (partial mod 2^130-5) */ \ + umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \ + umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \ + \ + umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \ + add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \ + umul_ppmm(t1_hi, t1_lo, H1, R0); /* h1 * r0 */ \ + add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \ + \ + t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \ + t1_hi = H2 * R0; /* h2 * r0 */ \ + add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \ + \ + /* carry propagation */ \ + H2 = H0 & 3; \ + H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \ + ADD_1305_64(H2, H1, H0, 0, x0_hi, x0_lo); \ + } while (0) + +unsigned int +poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, + byte high_pad) { - u32 r[5]; - u32 h[5]; - u32 pad[4]; - byte final; -} poly1305_state_ref32_t; + POLY1305_STATE *st = &ctx->state; + u64 r0, r1, r1_mult5; + u64 h0, h1, h2; + u64 m0, m1, m2; + + m2 = high_pad; + + h0 = st->h[0] + ((u64)st->h[1] << 32); + h1 = st->h[2] + ((u64)st->h[3] << 32); + h2 = st->h[4]; + + r0 = st->r[0] + ((u64)st->r[1] << 32); + r1 = st->r[2] + ((u64)st->r[3] << 32); + + 
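/* The key clamping in poly1305_init clears the low two bits of r[2],
+ * so r1 is a multiple of 4 and (r1 >> 2) + r1 == 5 * (r1 >> 2): the
+ * factor used by MUL_MOD_1305_64 to fold the 2^128-weight partial
+ * products back during the partial reduction modulo 2^130-5. */
+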
r1_mult5 = (r1 >> 2) + r1; + + m0 = buf_get_le64(buf + 0); + m1 = buf_get_le64(buf + 8); + buf += POLY1305_BLOCKSIZE; + len -= POLY1305_BLOCKSIZE; + + while (len >= POLY1305_BLOCKSIZE) + { + /* a = h + m */ + ADD_1305_64(h2, h1, h0, m2, m1, m0); + + m0 = buf_get_le64(buf + 0); + m1 = buf_get_le64(buf + 8); + + /* h = a * r (partial mod 2^130-5) */ + MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5); + + buf += POLY1305_BLOCKSIZE; + len -= POLY1305_BLOCKSIZE; + } + + /* a = h + m */ + ADD_1305_64(h2, h1, h0, m2, m1, m0); + + /* h = a * r (partial mod 2^130-5) */ + MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5); + st->h[0] = h0; + st->h[1] = h0 >> 32; + st->h[2] = h1; + st->h[3] = h1 >> 32; + st->h[4] = h2; + + return 6 * sizeof (void *) + 18 * sizeof (u64); +} -#ifndef POLY1305_USE_SSE2 -static OPS_FUNC_ABI void -poly1305_init_ext_ref32 (void *state, const poly1305_key_t * key) +static unsigned int poly1305_final (poly1305_context_t *ctx, + byte mac[POLY1305_TAGLEN]) { - poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; + POLY1305_STATE *st = &ctx->state; + unsigned int burn = 0; + u64 u, carry; + u64 k0, k1; + u64 h0, h1; + u64 h2; + + /* process the remaining block */ + if (ctx->leftover) + { + ctx->buffer[ctx->leftover++] = 1; + for (; ctx->leftover < POLY1305_BLOCKSIZE; ctx->leftover++) + ctx->buffer[ctx->leftover] = 0; + burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0); + } - gcry_assert (sizeof (*st) + POLY1305_STATE_ALIGNMENT <= - sizeof (((poly1305_context_t *) 0)->state)); + h0 = st->h[0] + ((u64)st->h[1] << 32); + h1 = st->h[2] + ((u64)st->h[3] << 32); + h2 = st->h[4]; - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - st->r[0] = (buf_get_le32 (&key->b[0])) & 0x3ffffff; - st->r[1] = (buf_get_le32 (&key->b[3]) >> 2) & 0x3ffff03; - st->r[2] = (buf_get_le32 (&key->b[6]) >> 4) & 0x3ffc0ff; - st->r[3] = (buf_get_le32 (&key->b[9]) >> 6) & 0x3f03fff; - st->r[4] = (buf_get_le32 (&key->b[12]) >> 8) & 0x00fffff; + k0 = st->k[0] + ((u64)st->k[1] << 32); + k1 = st->k[2] + ((u64)st->k[3] << 32); - /* h = 0 */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->h[3] = 0; - st->h[4] = 0; + /* check if h is more than 2^130-5, by adding 5. */ + add_ssaaaa(carry, u, 0, h0, 0, 5); + add_ssaaaa(carry, u, 0, carry, 0, h1); + u = (carry + h2) >> 2; /* u == 0 or 1 */ - /* save pad for later */ - st->pad[0] = buf_get_le32 (&key->b[16]); - st->pad[1] = buf_get_le32 (&key->b[20]); - st->pad[2] = buf_get_le32 (&key->b[24]); - st->pad[3] = buf_get_le32 (&key->b[28]); + /* minus 2^130-5 ... 
(+5) */ + u = (-u) & 5; + add_ssaaaa(h1, h0, h1, h0, 0, u); - st->final = 0; + /* add high part of key + h */ + add_ssaaaa(h1, h0, h1, h0, k1, k0); + buf_put_le64(mac + 0, h0); + buf_put_le64(mac + 8, h1); + + /* burn_stack */ + return 4 * sizeof (void *) + 7 * sizeof (u64) + burn; } -#endif /* !POLY1305_USE_SSE2 */ +#endif /* USE_MPI_64BIT */ + +#ifdef USE_MPI_32BIT + +#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS + +/* HI:LO += A * B (arm) */ +#define UMUL_ADD_32(HI, LO, A, B) \ + __asm__ ("umlal %1, %0, %4, %5" \ + : "=r" (HI), "=r" (LO) \ + : "0" (HI), "1" (LO), "r" (A), "r" (B) ) + +/* A += B (arm) */ +#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \ + __asm__ ("adds %0, %0, %5\n" \ + "adcs %1, %1, %6\n" \ + "adcs %2, %2, %7\n" \ + "adcs %3, %3, %8\n" \ + "adc %4, %4, %9\n" \ + : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \ + : "r" (B0), "r" (B1), "r" (B2), "r" (B3), "r" (B4) \ + : "cc" ) + +#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */ + +#if defined (__i386__) && __GNUC__ >= 4 + +/* A += B (i386) */ +#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \ + __asm__ ("addl %5, %0\n" \ + "adcl %6, %1\n" \ + "adcl %7, %2\n" \ + "adcl %8, %3\n" \ + "adcl %9, %4\n" \ + : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \ + : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \ + : "cc" ) + +#endif /* __i386__ */ + +#ifndef UMUL_ADD_32 +/* HI:LO += A * B (generic, mpi) */ +# define UMUL_ADD_32(HI, LO, A, B) do { \ + u32 t_lo, t_hi; \ + umul_ppmm(t_hi, t_lo, A, B); \ + add_ssaaaa(HI, LO, HI, LO, t_hi, t_lo); \ + } while (0) +#endif + +#ifndef ADD_1305_32 +/* A += B (generic, mpi) */ +# define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \ + u32 carry0, carry1, carry2; \ + add_ssaaaa(carry0, A0, 0, A0, 0, B0); \ + add_ssaaaa(carry1, A1, 0, A1, 0, B1); \ + add_ssaaaa(carry1, A1, carry1, A1, 0, carry0); \ + add_ssaaaa(carry2, A2, 0, A2, 0, B2); \ + add_ssaaaa(carry2, A2, carry2, A2, 0, carry1); \ + add_ssaaaa(A4, A3, A4, A3, B4, B3); \ + add_ssaaaa(A4, A3, A4, A3, 0, carry2); \ + } while (0) +#endif -#ifndef POLY1305_USE_SSE2 -static OPS_FUNC_ABI unsigned int -poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes) +/* H = H * R mod 2???-5 */ +#define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \ + R3_MULT5, R2_MULT5, R1_MULT5) do { \ + u32 x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi; \ + u32 t0_lo, t0_hi; \ + \ + /* x = a * r (partial mod 2^130-5) */ \ + umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \ + umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \ + umul_ppmm(x2_hi, x2_lo, H0, R2); /* h0 * r2 */ \ + umul_ppmm(x3_hi, x3_lo, H0, R3); /* h0 * r3 */ \ + \ + UMUL_ADD_32(x0_hi, x0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \ + UMUL_ADD_32(x1_hi, x1_lo, H1, R0); /* h1 * r0 */ \ + UMUL_ADD_32(x2_hi, x2_lo, H1, R1); /* h1 * r1 */ \ + UMUL_ADD_32(x3_hi, x3_lo, H1, R2); /* h1 * r2 */ \ + \ + UMUL_ADD_32(x0_hi, x0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \ + UMUL_ADD_32(x1_hi, x1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \ + UMUL_ADD_32(x2_hi, x2_lo, H2, R0); /* h2 * r0 */ \ + UMUL_ADD_32(x3_hi, x3_lo, H2, R1); /* h2 * r1 */ \ + \ + UMUL_ADD_32(x0_hi, x0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \ + H1 = x0_hi; \ + UMUL_ADD_32(x1_hi, x1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \ + UMUL_ADD_32(x2_hi, x2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \ + UMUL_ADD_32(x3_hi, x3_lo, H3, R0); /* h3 * r0 */ \ + \ + t0_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \ + t0_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \ + 
add_ssaaaa(H2, x1_lo, x1_hi, x1_lo, 0, t0_lo); \ + add_ssaaaa(H3, x2_lo, x2_hi, x2_lo, 0, t0_hi); \ + t0_lo = H4 * R3_MULT5; /* h4 * r3 mod 2^130-5 */ \ + t0_hi = H4 * R0; /* h4 * r0 */ \ + add_ssaaaa(H4, x3_lo, x3_hi, x3_lo, t0_hi, t0_lo); \ + \ + /* carry propagation */ \ + H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \ + H4 = H4 & 3; \ + ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \ + } while (0) + +unsigned int +poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, + byte high_pad) { - poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; - const u32 hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */ - u32 r0, r1, r2, r3, r4; - u32 s1, s2, s3, s4; + POLY1305_STATE *st = &ctx->state; + u32 r1_mult5, r2_mult5, r3_mult5; u32 h0, h1, h2, h3, h4; - u64 d0, d1, d2, d3, d4; - u32 c; - - r0 = st->r[0]; - r1 = st->r[1]; - r2 = st->r[2]; - r3 = st->r[3]; - r4 = st->r[4]; + u32 m0, m1, m2, m3, m4; - s1 = r1 * 5; - s2 = r2 * 5; - s3 = r3 * 5; - s4 = r4 * 5; + m4 = high_pad; h0 = st->h[0]; h1 = st->h[1]; @@ -191,54 +354,27 @@ poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes) h3 = st->h[3]; h4 = st->h[4]; - while (bytes >= POLY1305_REF_BLOCKSIZE) + r1_mult5 = (st->r[1] >> 2) + st->r[1]; + r2_mult5 = (st->r[2] >> 2) + st->r[2]; + r3_mult5 = (st->r[3] >> 2) + st->r[3]; + + while (len >= POLY1305_BLOCKSIZE) { - /* h += m[i] */ - h0 += (buf_get_le32 (m + 0)) & 0x3ffffff; - h1 += (buf_get_le32 (m + 3) >> 2) & 0x3ffffff; - h2 += (buf_get_le32 (m + 6) >> 4) & 0x3ffffff; - h3 += (buf_get_le32 (m + 9) >> 6) & 0x3ffffff; - h4 += (buf_get_le32 (m + 12) >> 8) | hibit; - - /* h *= r */ - d0 = - ((u64) h0 * r0) + ((u64) h1 * s4) + - ((u64) h2 * s3) + ((u64) h3 * s2) + ((u64) h4 * s1); - d1 = - ((u64) h0 * r1) + ((u64) h1 * r0) + - ((u64) h2 * s4) + ((u64) h3 * s3) + ((u64) h4 * s2); - d2 = - ((u64) h0 * r2) + ((u64) h1 * r1) + - ((u64) h2 * r0) + ((u64) h3 * s4) + ((u64) h4 * s3); - d3 = - ((u64) h0 * r3) + ((u64) h1 * r2) + - ((u64) h2 * r1) + ((u64) h3 * r0) + ((u64) h4 * s4); - d4 = - ((u64) h0 * r4) + ((u64) h1 * r3) + - ((u64) h2 * r2) + ((u64) h3 * r1) + ((u64) h4 * r0); - - /* (partial) h %= p */ - c = (u32) (d0 >> 26); - h0 = (u32) d0 & 0x3ffffff; - d1 += c; - c = (u32) (d1 >> 26); - h1 = (u32) d1 & 0x3ffffff; - d2 += c; - c = (u32) (d2 >> 26); - h2 = (u32) d2 & 0x3ffffff; - d3 += c; - c = (u32) (d3 >> 26); - h3 = (u32) d3 & 0x3ffffff; - d4 += c; - c = (u32) (d4 >> 26); - h4 = (u32) d4 & 0x3ffffff; - h0 += c * 5; - c = (h0 >> 26); - h0 = h0 & 0x3ffffff; - h1 += c; - - m += POLY1305_REF_BLOCKSIZE; - bytes -= POLY1305_REF_BLOCKSIZE; + m0 = buf_get_le32(buf + 0); + m1 = buf_get_le32(buf + 4); + m2 = buf_get_le32(buf + 8); + m3 = buf_get_le32(buf + 12); + + /* a = h + m */ + ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0); + + /* h = a * r (partial mod 2^130-5) */ + MUL_MOD_1305_32(h4, h3, h2, h1, h0, + st->r[3], st->r[2], st->r[1], st->r[0], + r3_mult5, r2_mult5, r1_mult5); + + buf += POLY1305_BLOCKSIZE; + len -= POLY1305_BLOCKSIZE; } st->h[0] = h0; @@ -247,185 +383,95 @@ poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes) st->h[3] = h3; st->h[4] = h4; - return (16 * sizeof (u32) + 5 * sizeof (u64) + 5 * sizeof (void *)); + return 6 * sizeof (void *) + 28 * sizeof (u32); } -#endif /* !POLY1305_USE_SSE2 */ - -#ifndef POLY1305_USE_SSE2 -static OPS_FUNC_ABI unsigned int -poly1305_finish_ext_ref32 (void *state, const byte * m, - size_t remaining, byte mac[POLY1305_TAGLEN]) +static unsigned int poly1305_final (poly1305_context_t *ctx, + 
byte mac[POLY1305_TAGLEN]) { - poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; - u32 h0, h1, h2, h3, h4, c; - u32 g0, g1, g2, g3, g4; - u64 f; - u32 mask; + POLY1305_STATE *st = &ctx->state; unsigned int burn = 0; + u32 carry, tmp0, tmp1, tmp2, u; + u32 h4, h3, h2, h1, h0; /* process the remaining block */ - if (remaining) + if (ctx->leftover) { - byte final[POLY1305_REF_BLOCKSIZE] = { 0 }; - size_t i; - for (i = 0; i < remaining; i++) - final[i] = m[i]; - final[remaining] = 1; - st->final = 1; - burn = poly1305_blocks_ref32 (st, final, POLY1305_REF_BLOCKSIZE); + ctx->buffer[ctx->leftover++] = 1; + for (; ctx->leftover < POLY1305_BLOCKSIZE; ctx->leftover++) + ctx->buffer[ctx->leftover] = 0; + burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0); } - /* fully carry h */ h0 = st->h[0]; h1 = st->h[1]; h2 = st->h[2]; h3 = st->h[3]; h4 = st->h[4]; - c = h1 >> 26; - h1 = h1 & 0x3ffffff; - h2 += c; - c = h2 >> 26; - h2 = h2 & 0x3ffffff; - h3 += c; - c = h3 >> 26; - h3 = h3 & 0x3ffffff; - h4 += c; - c = h4 >> 26; - h4 = h4 & 0x3ffffff; - h0 += c * 5; - c = h0 >> 26; - h0 = h0 & 0x3ffffff; - h1 += c; - - /* compute h + -p */ - g0 = h0 + 5; - c = g0 >> 26; - g0 &= 0x3ffffff; - g1 = h1 + c; - c = g1 >> 26; - g1 &= 0x3ffffff; - g2 = h2 + c; - c = g2 >> 26; - g2 &= 0x3ffffff; - g3 = h3 + c; - c = g3 >> 26; - g3 &= 0x3ffffff; - g4 = h4 + c - (1 << 26); - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof (u32) * 8) - 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - h0 = ((h0) | (h1 << 26)) & 0xffffffff; - h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; - h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; - h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; - - /* mac = (h + pad) % (2^128) */ - f = (u64) h0 + st->pad[0]; - h0 = (u32) f; - f = (u64) h1 + st->pad[1] + (f >> 32); - h1 = (u32) f; - f = (u64) h2 + st->pad[2] + (f >> 32); - h2 = (u32) f; - f = (u64) h3 + st->pad[3] + (f >> 32); - h3 = (u32) f; - - buf_put_le32 (mac + 0, h0); - buf_put_le32 (mac + 4, h1); - buf_put_le32 (mac + 8, h2); - buf_put_le32 (mac + 12, h3); - - /* zero out the state */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->h[3] = 0; - st->h[4] = 0; - st->r[0] = 0; - st->r[1] = 0; - st->r[2] = 0; - st->r[3] = 0; - st->r[4] = 0; - st->pad[0] = 0; - st->pad[1] = 0; - st->pad[2] = 0; - st->pad[3] = 0; + /* check if h is more than 2^130-5, by adding 5. */ + add_ssaaaa(carry, tmp0, 0, h0, 0, 5); + add_ssaaaa(carry, tmp0, 0, carry, 0, h1); + add_ssaaaa(carry, tmp0, 0, carry, 0, h2); + add_ssaaaa(carry, tmp0, 0, carry, 0, h3); + u = (carry + h4) >> 2; /* u == 0 or 1 */ + + /* minus 2^130-5 ... 
(+5) */ + u = (-u) & 5; + add_ssaaaa(carry, h0, 0, h0, 0, u); + add_ssaaaa(carry, h1, 0, h1, 0, carry); + add_ssaaaa(carry, h2, 0, h2, 0, carry); + add_ssaaaa(carry, h3, 0, h3, 0, carry); + + /* add high part of key + h */ + add_ssaaaa(tmp0, h0, 0, h0, 0, st->k[0]); + add_ssaaaa(tmp1, h1, 0, h1, 0, st->k[1]); + add_ssaaaa(tmp1, h1, tmp1, h1, 0, tmp0); + add_ssaaaa(tmp2, h2, 0, h2, 0, st->k[2]); + add_ssaaaa(tmp2, h2, tmp2, h2, 0, tmp1); + add_ssaaaa(carry, h3, 0, h3, 0, st->k[3]); + h3 += tmp2; + + buf_put_le32(mac + 0, h0); + buf_put_le32(mac + 4, h1); + buf_put_le32(mac + 8, h2); + buf_put_le32(mac + 12, h3); /* burn_stack */ - return (13 * sizeof (u32) + sizeof (u64) + - POLY1305_REF_BLOCKSIZE + 6 * sizeof (void *)) + burn; + return 4 * sizeof (void *) + 10 * sizeof (u32) + burn; } -#endif /* !POLY1305_USE_SSE2*/ - - - - -static inline void * -poly1305_get_state (poly1305_context_t * ctx) -{ - byte *c = ctx->state; - c += POLY1305_STATE_ALIGNMENT - 1; - c -= (uintptr_t) c & (POLY1305_STATE_ALIGNMENT - 1); - return c; -} - - -static void -poly1305_init (poly1305_context_t * ctx, const poly1305_key_t * key) -{ - void *state = poly1305_get_state (ctx); - - ctx->leftover = 0; - - ctx->ops->init_ext (state, key); -} +#endif /* USE_MPI_32BIT */ void -_gcry_poly1305_update (poly1305_context_t * ctx, const byte * m, size_t bytes) +_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) { - void *state = poly1305_get_state (ctx); unsigned int burn = 0; - size_t block_size = ctx->ops->block_size; /* handle leftover */ if (ctx->leftover) { - size_t want = (block_size - ctx->leftover); + size_t want = (POLY1305_BLOCKSIZE - ctx->leftover); if (want > bytes) want = bytes; buf_cpy (ctx->buffer + ctx->leftover, m, want); bytes -= want; m += want; ctx->leftover += want; - if (ctx->leftover < block_size) + if (ctx->leftover < POLY1305_BLOCKSIZE) return; - burn = ctx->ops->blocks (state, ctx->buffer, block_size); + burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1); ctx->leftover = 0; } /* process full blocks */ - if (bytes >= block_size) + if (bytes >= POLY1305_BLOCKSIZE) { - size_t want = (bytes & ~(block_size - 1)); - burn = ctx->ops->blocks (state, m, want); - m += want; - bytes -= want; + size_t nblks = bytes / POLY1305_BLOCKSIZE; + burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1); + m += nblks * POLY1305_BLOCKSIZE; + bytes -= nblks * POLY1305_BLOCKSIZE; } /* store leftover */ @@ -441,12 +487,11 @@ _gcry_poly1305_update (poly1305_context_t * ctx, const byte * m, size_t bytes) void -_gcry_poly1305_finish (poly1305_context_t * ctx, byte mac[POLY1305_TAGLEN]) +_gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) { - void *state = poly1305_get_state (ctx); unsigned int burn; - burn = ctx->ops->finish_ext (state, ctx->buffer, ctx->leftover, mac); + burn = poly1305_final (ctx, mac); _gcry_burn_stack (burn); } @@ -458,8 +503,6 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, { static int initialized; static const char *selftest_failed; - poly1305_key_t keytmp; - unsigned int features = _gcry_get_hw_features (); if (!initialized) { @@ -475,26 +518,7 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; -#ifdef POLY1305_USE_SSE2 - ctx->ops = &poly1305_amd64_sse2_ops; -#else - ctx->ops = &poly1305_default_ops; -#endif - -#ifdef POLY1305_USE_AVX2 - if (features & HWF_INTEL_AVX2) - ctx->ops = &poly1305_amd64_avx2_ops; -#endif -#ifdef POLY1305_USE_NEON - if 
(features & HWF_ARM_NEON) - ctx->ops = &poly1305_armv7_neon_ops; -#endif - (void)features; - - buf_cpy (keytmp.b, key, POLY1305_KEYLEN); - poly1305_init (ctx, &keytmp); - - wipememory (&keytmp, sizeof (keytmp)); + poly1305_init (ctx, key); return 0; } diff --git a/configure.ac b/configure.ac index 57b840e6e..c4b59f4dd 100644 --- a/configure.ac +++ b/configure.ac @@ -2239,19 +2239,6 @@ if test "$found" = "1" ; then fi fi -case "${host}" in - x86_64-*-*) - # Build with the assembly implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-sse2-amd64.lo" - GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-avx2-amd64.lo" - ;; -esac - -if test x"$neonsupport" = xyes ; then - # Build with the NEON implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-armv7-neon.lo" -fi - LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" From jussi.kivilinna at iki.fi Sat Jan 6 19:03:01 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 06 Jan 2018 20:03:01 +0200 Subject: [PATCH 2/3] New ChaCha implementations In-Reply-To: <151526177666.9411.1967680231927273017.stgit@localhost.localdomain> References: <151526177666.9411.1967680231927273017.stgit@localhost.localdomain> Message-ID: <151526178173.9411.5426154117445836854.stgit@localhost.localdomain> * cipher/Makefile.am: Remove 'chacha20-sse2-amd64.S', 'chacha20-ssse3-amd64.S', 'chacha20-avx2-amd64.S'; Add 'chacha20-amd64-ssse3.S', 'chacha20-amd64-avx2.S'. * cipher/chacha20-amd64-avx2.S: New. * cipher/chacha20-amd64-ssse3.S: New. * cipher/chacha20-armv7-neon.S: Rewrite. * cipher/chacha20-avx2-amd64.S: Remove. * cipher/chacha20-sse2-amd64.S: Remove. * cipher/chacha20-ssse3-amd64.S: Remove. * cipher/chacha20.c (CHACHA20_INPUT_LENGTH, USE_SSE2, USE_NEON) (ASM_EXTRA_STACK, chacha20_blocks_t, _gcry_chacha20_amd64_sse2_blocks) (_gcry_chacha20_amd64_ssse3_blocks, _gcry_chacha20_amd64_avx2_blocks) (_gcry_chacha20_armv7_neon_blocks, QROUND, QOUT, chacha20_core) (chacha20_do_encrypt_stream): Remove. (_gcry_chacha20_amd64_ssse3_blocks4, _gcry_chacha20_amd64_avx2_blocks8) (_gcry_chacha20_armv7_neon_blocks4, ROTATE, XOR, PLUS, PLUSONE) (QUARTERROUND, BUF_XOR_LE32): New. (CHACHA20_context_s, chacha20_blocks, chacha20_keysetup) (chacha20_encrypt_stream): Rewrite. (chacha20_do_setkey): Adjust for new CHACHA20_context_s. * configure.ac: Remove 'chacha20-sse2-amd64.lo', 'chacha20-ssse3-amd64.lo', 'chacha20-avx2-amd64.lo'; Add 'chacha20-amd64-ssse3.lo', 'chacha20-amd64-avx2.lo'. 
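
For reference, all of the new SSSE3/AVX2/NEON variants vectorize the same
scalar kernel: the standard ChaCha20 quarter round (add, xor, rotate by
16/12/8/7), applied first to the columns and then to the diagonals of the
4x4 state. A minimal scalar C sketch of one such double round follows; the
names (ROTL32, QR, chacha20_double_round) are illustrative only and not
taken from the patch, which runs this kernel on 4 or 8 blocks in parallel.

#include <stdint.h>

#define ROTL32(v, c) (((v) << (c)) | ((v) >> (32 - (c))))

/* One ChaCha20 quarter round on four 32-bit state words. */
#define QR(a, b, c, d) \
  do { \
    a += b; d ^= a; d = ROTL32(d, 16); \
    c += d; b ^= c; b = ROTL32(b, 12); \
    a += b; d ^= a; d = ROTL32(d, 8);  \
    c += d; b ^= c; b = ROTL32(b, 7);  \
  } while (0)

/* One double round: column round followed by diagonal round. */
static void chacha20_double_round (uint32_t x[16])
{
  /* Columns. */
  QR(x[0], x[4], x[ 8], x[12]);
  QR(x[1], x[5], x[ 9], x[13]);
  QR(x[2], x[6], x[10], x[14]);
  QR(x[3], x[7], x[11], x[15]);
  /* Diagonals. */
  QR(x[0], x[5], x[10], x[15]);
  QR(x[1], x[6], x[11], x[12]);
  QR(x[2], x[7], x[ 8], x[13]);
  QR(x[3], x[4], x[ 9], x[14]);
}

ChaCha20 runs 10 of these double rounds (20 rounds total) and then adds the
original input state to the result to produce one 64-byte keystream block.
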
-- Intel Core i7-4790K CPU @ 4.00GHz (x86_64/AVX2): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.319 ns/B 2988.5 MiB/s 1.28 c/B STREAM dec | 0.318 ns/B 2995.4 MiB/s 1.27 c/B Intel Core i7-4790K CPU @ 4.00GHz (x86_64/SSSE3): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.633 ns/B 1507.4 MiB/s 2.53 c/B STREAM dec | 0.633 ns/B 1506.6 MiB/s 2.53 c/B Intel Core i7-4790K CPU @ 4.00GHz (i386): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 2.05 ns/B 465.2 MiB/s 8.20 c/B STREAM dec | 2.04 ns/B 467.5 MiB/s 8.16 c/B Cortex-A53 @ 1152Mhz (armv7/neon): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 5.29 ns/B 180.3 MiB/s 6.09 c/B STREAM dec | 5.29 ns/B 180.1 MiB/s 6.10 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 08baa7c44..a24b117c2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -64,8 +64,7 @@ EXTRA_libcipher_la_SOURCES = \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ -chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ - chacha20-armv7-neon.S \ +chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S chacha20-armv7-neon.S \ crc.c \ crc-intel-pclmul.c \ des.c des-amd64.S \ diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S new file mode 100644 index 000000000..f4f290dbb --- /dev/null +++ b/cipher/chacha20-amd64-avx2.S @@ -0,0 +1,322 @@ +/* chacha20-amd64-avx2.S - AVX2 implementation of ChaCha20 cipher + * + * Copyright (C) 2017,2018 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_GCC_INLINE_ASM_AVX2) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) + +.text + +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) 
/*_*/ +#endif + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +/* register macros */ +#define INPUT %rdi +#define DST %rsi +#define SRC %rdx +#define NBLKS %rcx +#define ROUND %eax + +/* stack structure */ +#define STACK_VEC_X12 (32) +#define STACK_VEC_X13 (32 + STACK_VEC_X12) +#define STACK_TMP (32 + STACK_VEC_X13) +#define STACK_TMP1 (32 + STACK_TMP) +#define STACK_TMP2 (32 + STACK_TMP1) + +#define STACK_MAX (32 + STACK_TMP2) + +/* vector registers */ +#define X0 %ymm0 +#define X1 %ymm1 +#define X2 %ymm2 +#define X3 %ymm3 +#define X4 %ymm4 +#define X5 %ymm5 +#define X6 %ymm6 +#define X7 %ymm7 +#define X8 %ymm8 +#define X9 %ymm9 +#define X10 %ymm10 +#define X11 %ymm11 +#define X12 %ymm12 +#define X13 %ymm13 +#define X14 %ymm14 +#define X15 %ymm15 + +#define X0h %xmm0 +#define X1h %xmm1 +#define X2h %xmm2 +#define X3h %xmm3 +#define X4h %xmm4 +#define X5h %xmm5 +#define X6h %xmm6 +#define X7h %xmm7 +#define X8h %xmm8 +#define X9h %xmm9 +#define X10h %xmm10 +#define X11h %xmm11 +#define X12h %xmm12 +#define X13h %xmm13 +#define X14h %xmm14 +#define X15h %xmm15 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0,x1,x2,x3,t1,t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/********************************************************************** + 8-way chacha20 + **********************************************************************/ + +#define ROTATE2(v1,v2,c,tmp) \ + vpsrld $(32 - (c)), v1, tmp; \ + vpslld $(c), v1, v1; \ + vpaddb tmp, v1, v1; \ + vpsrld $(32 - (c)), v2, tmp; \ + vpslld $(c), v2, v2; \ + vpaddb tmp, v2, v2; + +#define ROTATE_SHUF_2(v1,v2,shuf) \ + vpshufb shuf, v1, v1; \ + vpshufb shuf, v2, v2; + +#define XOR(ds,s) \ + vpxor s, ds, ds; + +#define PLUS(ds,s) \ + vpaddd s, ds, ds; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1) \ + vbroadcasti128 .Lshuf_rol16 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 12, tmp1); \ + vbroadcasti128 .Lshuf_rol8 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 7, tmp1); + +#define BUF_XOR_256_TO_128(dst, src, offset_lo, offset_hi, yreg, tmp1) \ + vextracti128 $1, yreg, tmp1##h; \ + vpxor offset_lo(src), yreg##h, yreg##h; \ + vpxor offset_hi(src), tmp1##h, tmp1##h; \ + vmovdqu yreg##h, offset_lo(dst); \ + vmovdqu tmp1##h, offset_hi(dst); + +.align 32 +chacha20_data: +.Lshuf_rol16: + .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.Lshuf_rol8: + .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.Linc_counter: + .byte 0,1,2,3,4,5,6,7 +.Lunsigned_cmp: + .long 0x80000000 + +.align 8 +.globl _gcry_chacha20_amd64_avx2_blocks8 +ELF(.type _gcry_chacha20_amd64_avx2_blocks8, at function;) + +_gcry_chacha20_amd64_avx2_blocks8: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 8) + */ + + vzeroupper; + + pushq %rbp; + movq %rsp, %rbp; + + subq $STACK_MAX, %rsp; + andq $~31, %rsp; + +.Loop4: + mov $20, ROUND; + + /* Construct counter vectors X12 and X13 */ + vpmovzxbd .Linc_counter RIP, 
X0; + vpbroadcastd .Lunsigned_cmp RIP, X2; + vpbroadcastd (12 * 4)(INPUT), X12; + vpbroadcastd (13 * 4)(INPUT), X13; + vpaddd X0, X12, X12; + vpxor X2, X0, X0; + vpxor X2, X12, X1; + vpcmpgtd X1, X0, X0; + vpsubd X0, X13, X13; + vmovdqa X12, (STACK_VEC_X12)(%rsp); + vmovdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + vpbroadcastd (0 * 4)(INPUT), X0; + vpbroadcastd (1 * 4)(INPUT), X1; + vpbroadcastd (2 * 4)(INPUT), X2; + vpbroadcastd (3 * 4)(INPUT), X3; + vpbroadcastd (4 * 4)(INPUT), X4; + vpbroadcastd (5 * 4)(INPUT), X5; + vpbroadcastd (6 * 4)(INPUT), X6; + vpbroadcastd (7 * 4)(INPUT), X7; + vpbroadcastd (8 * 4)(INPUT), X8; + vpbroadcastd (9 * 4)(INPUT), X9; + vpbroadcastd (10 * 4)(INPUT), X10; + vpbroadcastd (11 * 4)(INPUT), X11; + vpbroadcastd (14 * 4)(INPUT), X14; + vpbroadcastd (15 * 4)(INPUT), X15; + vmovdqa X15, (STACK_TMP)(%rsp); + +.Lround2: + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15) + sub $2, ROUND; + jnz .Lround2; + + /* tmp := X15 */ + vpbroadcastd (0 * 4)(INPUT), X15; + PLUS(X0, X15); + vpbroadcastd (1 * 4)(INPUT), X15; + PLUS(X1, X15); + vpbroadcastd (2 * 4)(INPUT), X15; + PLUS(X2, X15); + vpbroadcastd (3 * 4)(INPUT), X15; + PLUS(X3, X15); + vpbroadcastd (4 * 4)(INPUT), X15; + PLUS(X4, X15); + vpbroadcastd (5 * 4)(INPUT), X15; + PLUS(X5, X15); + vpbroadcastd (6 * 4)(INPUT), X15; + PLUS(X6, X15); + vpbroadcastd (7 * 4)(INPUT), X15; + PLUS(X7, X15); + vpbroadcastd (8 * 4)(INPUT), X15; + PLUS(X8, X15); + vpbroadcastd (9 * 4)(INPUT), X15; + PLUS(X9, X15); + vpbroadcastd (10 * 4)(INPUT), X15; + PLUS(X10, X15); + vpbroadcastd (11 * 4)(INPUT), X15; + PLUS(X11, X15); + vmovdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + vmovdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X13, (STACK_TMP)(%rsp); + vpbroadcastd (14 * 4)(INPUT), X13; + PLUS(X14, X13); + vmovdqa X14, (STACK_TMP1)(%rsp); + vpbroadcastd (15 * 4)(INPUT), X13; + PLUS(X15, X13); + vmovdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $8, (12 * 4)(INPUT); + + transpose_4x4(X0, X1, X2, X3, X13, X14); + transpose_4x4(X4, X5, X6, X7, X13, X14); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); + vmovdqa (STACK_TMP)(%rsp), X13; + vmovdqa (STACK_TMP1)(%rsp), X14; + vmovdqa (STACK_TMP2)(%rsp), X15; + transpose_4x4(X8, X9, X10, X11, X0, X1); + transpose_4x4(X12, X13, X14, X15, X0, X1); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 
* 7 + 16 * 2), X11, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0); + + sub $8, NBLKS; + lea (8 * 64)(DST), DST; + lea (8 * 64)(SRC), SRC; + jnz .Loop4; + + /* clear the used vector registers and stack */ + vpxor X0, X0, X0; + vmovdqa X0, (STACK_VEC_X12)(%rsp); + vmovdqa X0, (STACK_VEC_X13)(%rsp); + vmovdqa X0, (STACK_TMP)(%rsp); + vmovdqa X0, (STACK_TMP1)(%rsp); + vmovdqa X0, (STACK_TMP2)(%rsp); + vzeroall; + + /* eax zeroed by round loop. */ + leave; + ret; +ELF(.size _gcry_chacha20_amd64_avx2_blocks8, + .-_gcry_chacha20_amd64_avx2_blocks8;) + +#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ +#endif /*__x86_64*/ diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S new file mode 100644 index 000000000..7ad1c0ae3 --- /dev/null +++ b/cipher/chacha20-amd64-ssse3.S @@ -0,0 +1,341 @@ +/* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher + * + * Copyright (C) 2017,2018 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) + +.text + +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) 
/*_*/ +#endif + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +/* register macros */ +#define INPUT %rdi +#define DST %rsi +#define SRC %rdx +#define NBLKS %rcx +#define ROUND %eax + +/* stack structure */ +#define STACK_VEC_X12 (16) +#define STACK_VEC_X13 (16 + STACK_VEC_X12) +#define STACK_TMP (16 + STACK_VEC_X13) +#define STACK_TMP1 (16 + STACK_TMP) +#define STACK_TMP2 (16 + STACK_TMP1) + +#define STACK_MAX (16 + STACK_TMP2) + +/* vector registers */ +#define X0 %xmm0 +#define X1 %xmm1 +#define X2 %xmm2 +#define X3 %xmm3 +#define X4 %xmm4 +#define X5 %xmm5 +#define X6 %xmm6 +#define X7 %xmm7 +#define X8 %xmm8 +#define X9 %xmm9 +#define X10 %xmm10 +#define X11 %xmm11 +#define X12 %xmm12 +#define X13 %xmm13 +#define X14 %xmm14 +#define X15 %xmm15 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + movdqa x0, t2; \ + punpckhdq x1, t2; \ + punpckldq x1, x0; \ + \ + movdqa x2, t1; \ + punpckldq x3, t1; \ + punpckhdq x3, x2; \ + \ + movdqa x0, x1; \ + punpckhqdq t1, x1; \ + punpcklqdq t1, x0; \ + \ + movdqa t2, x3; \ + punpckhqdq x2, x3; \ + punpcklqdq x2, t2; \ + movdqa t2, x2; + +/* fill xmm register with 32-bit value from memory */ +#define pbroadcastd(mem32, xreg) \ + movd mem32, xreg; \ + pshufd $0, xreg, xreg; + +/* xor with unaligned memory operand */ +#define pxor_u(umem128, xreg, t) \ + movdqu umem128, t; \ + pxor t, xreg; + +/* xor register with unaligned src and save to unaligned dst */ +#define xor_src_dst(dst, src, offset, xreg, t) \ + pxor_u(offset(src), xreg, t); \ + movdqu xreg, offset(dst); + +#define clear(x) pxor x,x; + +/********************************************************************** + 4-way chacha20 + **********************************************************************/ + +#define ROTATE2(v1,v2,c,tmp1,tmp2) \ + movdqa v1, tmp1; \ + movdqa v2, tmp2; \ + psrld $(32 - (c)), v1; \ + pslld $(c), tmp1; \ + paddb tmp1, v1; \ + psrld $(32 - (c)), v2; \ + pslld $(c), tmp2; \ + paddb tmp2, v2; + +#define ROTATE_SHUF_2(v1,v2,shuf) \ + pshufb shuf, v1; \ + pshufb shuf, v2; + +#define XOR(ds,s) \ + pxor s, ds; + +#define PLUS(ds,s) \ + paddd s, ds; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ + movdqa .Lshuf_rol16 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 12, tmp1, tmp2); \ + movdqa .Lshuf_rol8 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 7, tmp1, tmp2); + +chacha20_data: +.align 16 +.Lshuf_rol16: + .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.Lshuf_rol8: + .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.Linc_counter: + .long 0,1,2,3 +.Lunsigned_cmp: + .long 0x80000000,0x80000000,0x80000000,0x80000000 + +.align 8 +.globl _gcry_chacha20_amd64_ssse3_blocks4 +ELF(.type _gcry_chacha20_amd64_ssse3_blocks4, at function;) + +_gcry_chacha20_amd64_ssse3_blocks4: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 4) + */ + + pushq %rbp; + movq %rsp, %rbp; + + subq $STACK_MAX, %rsp; + andq $~15, %rsp; + +.Loop4: + mov $20, ROUND; + + /* Construct counter vectors X12 and X13 */ + vmovdqa .Linc_counter RIP, X0; + vmovdqa .Lunsigned_cmp RIP, X2; + 
pbroadcastd((12 * 4)(INPUT), X12); + pbroadcastd((13 * 4)(INPUT), X13); + paddd X0, X12; + movdqa X12, X1; + pxor X2, X0; + pxor X2, X1; + pcmpgtd X1, X0; + psubd X0, X13; + movdqa X12, (STACK_VEC_X12)(%rsp); + movdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + pbroadcastd((0 * 4)(INPUT), X0); + pbroadcastd((1 * 4)(INPUT), X1); + pbroadcastd((2 * 4)(INPUT), X2); + pbroadcastd((3 * 4)(INPUT), X3); + pbroadcastd((4 * 4)(INPUT), X4); + pbroadcastd((5 * 4)(INPUT), X5); + pbroadcastd((6 * 4)(INPUT), X6); + pbroadcastd((7 * 4)(INPUT), X7); + pbroadcastd((8 * 4)(INPUT), X8); + pbroadcastd((9 * 4)(INPUT), X9); + pbroadcastd((10 * 4)(INPUT), X10); + pbroadcastd((11 * 4)(INPUT), X11); + pbroadcastd((14 * 4)(INPUT), X14); + pbroadcastd((15 * 4)(INPUT), X15); + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + +.Lround2: + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + sub $2, ROUND; + jnz .Lround2; + + /* tmp := X15 */ + movdqa (STACK_TMP)(%rsp), X11; + pbroadcastd((0 * 4)(INPUT), X15); + PLUS(X0, X15); + pbroadcastd((1 * 4)(INPUT), X15); + PLUS(X1, X15); + pbroadcastd((2 * 4)(INPUT), X15); + PLUS(X2, X15); + pbroadcastd((3 * 4)(INPUT), X15); + PLUS(X3, X15); + pbroadcastd((4 * 4)(INPUT), X15); + PLUS(X4, X15); + pbroadcastd((5 * 4)(INPUT), X15); + PLUS(X5, X15); + pbroadcastd((6 * 4)(INPUT), X15); + PLUS(X6, X15); + pbroadcastd((7 * 4)(INPUT), X15); + PLUS(X7, X15); + pbroadcastd((8 * 4)(INPUT), X15); + PLUS(X8, X15); + pbroadcastd((9 * 4)(INPUT), X15); + PLUS(X9, X15); + pbroadcastd((10 * 4)(INPUT), X15); + PLUS(X10, X15); + pbroadcastd((11 * 4)(INPUT), X15); + PLUS(X11, X15); + movdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + movdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + movdqa X13, (STACK_TMP)(%rsp); + pbroadcastd((14 * 4)(INPUT), X15); + PLUS(X14, X15); + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X14, (STACK_TMP1)(%rsp); + pbroadcastd((15 * 4)(INPUT), X13); + PLUS(X15, X13); + movdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $4, (12 * 4)(INPUT); + + transpose_4x4(X0, X1, X2, X3, X13, X14, X15); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); + transpose_4x4(X4, X5, X6, X7, X0, X1, X2); + movdqa (STACK_TMP)(%rsp), X13; + movdqa (STACK_TMP1)(%rsp), X14; + movdqa (STACK_TMP2)(%rsp), X15; + xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); + transpose_4x4(X8, X9, X10, X11, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0); + transpose_4x4(X12, X13, X14, X15, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); + xor_src_dst(DST, 
SRC, (64 * 2 + 16 * 3), X14, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); + + sub $4, NBLKS; + lea (4 * 64)(DST), DST; + lea (4 * 64)(SRC), SRC; + jnz .Loop4; + + /* clear the used vector registers and stack */ + clear(X0); + movdqa X0, (STACK_VEC_X12)(%rsp); + movdqa X0, (STACK_VEC_X13)(%rsp); + movdqa X0, (STACK_TMP)(%rsp); + movdqa X0, (STACK_TMP1)(%rsp); + movdqa X0, (STACK_TMP2)(%rsp); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + /* eax zeroed by round loop. */ + leave; + ret; +ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, + .-_gcry_chacha20_amd64_ssse3_blocks4;) + +#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ +#endif /*__x86_64*/ diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S index c1971fc7f..33a43df1f 100644 --- a/cipher/chacha20-armv7-neon.S +++ b/cipher/chacha20-armv7-neon.S @@ -1,6 +1,6 @@ -/* chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function +/* chacha20-armv7-neon.S - ARMv7 NEON implementation of ChaCha20 cipher * - * Copyright (C) 2014 Jussi Kivilinna + * Copyright (C) 2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -19,732 +19,375 @@ */ /* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. */ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20) + defined(HAVE_GCC_INLINE_ASM_NEON) .syntax unified .fpu neon .arm -#define UNALIGNED_STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d3}; \ - vmov s0, l0; \ - vmov s1, l1; \ - vmov s2, l2; \ - vmov s3, l3; \ - vmov s4, l4; \ - vmov s5, l5; \ - vmov s6, l6; \ - vmov s7, l7; \ - vst1.32 {d0-d3}, [ptr]; \ - add ptr, #32; \ - vpop {d0-d3}; \ - b 2f; \ - 1: stmia ptr!, {l0-l7}; \ - 2: ; - -#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d1}; \ - vld1.32 {d0-d1}, [ptr]; \ - add ptr, #16; \ - vmov l0, s0; \ - vmov l1, s1; \ - vmov l2, s2; \ - vmov l3, s3; \ - vpop {d0-d1}; \ - b 2f; \ - 1: ldmia ptr!, {l0-l3}; \ - 2: ; - .text -.globl _gcry_chacha20_armv7_neon_blocks -.type _gcry_chacha20_armv7_neon_blocks,%function; -_gcry_chacha20_armv7_neon_blocks: -.Lchacha_blocks_neon_local: - tst r3, r3 - beq .Lchacha_blocks_neon_nobytes - vstmdb sp!, {q4,q5,q6,q7} - stmfd sp!, {r4-r12, r14} - mov r8, sp - sub sp, sp, #196 - and sp, sp, #0xffffffe0 - str r0, [sp, #60] - str r1, [sp, #48] - str r2, [sp, #40] - str r3, [sp, #52] - str r8, [sp, #192] - add r1, sp, #64 - ldmia r0!, {r4-r11} - stmia r1!, {r4-r11} - ldmia r0!, {r4-r11} - stmia r1!, {r4-r11} - mov r4, #20 - str r4, [sp, #44] - cmp r3, #256 - blo .Lchacha_blocks_neon_mainloop2 -.Lchacha_blocks_neon_mainloop1: - ldr r0, [sp, #44] - str r0, [sp, #0] - add r1, sp, #(64) - mov r2, #1 - veor q12, q12 - vld1.32 {q0,q1}, [r1,:128]! 
- vld1.32 {q2,q3}, [r1,:128] - vmov.32 d24[0], r2 - vadd.u64 q3, q3, q12 - vmov q4, q0 - vmov q5, q1 - vmov q6, q2 - vadd.u64 q7, q3, q12 - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vadd.u64 q11, q7, q12 - add r0, sp, #64 - ldm r0, {r0-r12} - ldr r14, [sp, #(64 +60)] - str r6, [sp, #8] - str r11, [sp, #12] - str r14, [sp, #28] - ldr r11, [sp, #(64 +52)] - ldr r14, [sp, #(64 +56)] -.Lchacha_blocks_neon_rounds1: - ldr r6, [sp, #0] - vadd.i32 q0, q0, q1 - add r0, r0, r4 - vadd.i32 q4, q4, q5 - add r1, r1, r5 - vadd.i32 q8, q8, q9 - eor r12, r12, r0 - veor q12, q3, q0 - eor r11, r11, r1 - veor q13, q7, q4 - ror r12, r12, #16 - veor q14, q11, q8 - ror r11, r11, #16 - vrev32.16 q3, q12 - subs r6, r6, #2 - vrev32.16 q7, q13 - add r8, r8, r12 - vrev32.16 q11, q14 - add r9, r9, r11 - vadd.i32 q2, q2, q3 - eor r4, r4, r8 - vadd.i32 q6, q6, q7 - eor r5, r5, r9 - vadd.i32 q10, q10, q11 - str r6, [sp, #0] - veor q12, q1, q2 - ror r4, r4, #20 - veor q13, q5, q6 - ror r5, r5, #20 - veor q14, q9, q10 - add r0, r0, r4 - vshl.i32 q1, q12, #12 - add r1, r1, r5 - vshl.i32 q5, q13, #12 - ldr r6, [sp, #8] - vshl.i32 q9, q14, #12 - eor r12, r12, r0 - vsri.u32 q1, q12, #20 - eor r11, r11, r1 - vsri.u32 q5, q13, #20 - ror r12, r12, #24 - vsri.u32 q9, q14, #20 - ror r11, r11, #24 - vadd.i32 q0, q0, q1 - add r8, r8, r12 - vadd.i32 q4, q4, q5 - add r9, r9, r11 - vadd.i32 q8, q8, q9 - eor r4, r4, r8 - veor q12, q3, q0 - eor r5, r5, r9 - veor q13, q7, q4 - str r11, [sp, #20] - veor q14, q11, q8 - ror r4, r4, #25 - vshl.i32 q3, q12, #8 - ror r5, r5, #25 - vshl.i32 q7, q13, #8 - str r4, [sp, #4] - vshl.i32 q11, q14, #8 - ldr r4, [sp, #28] - vsri.u32 q3, q12, #24 - add r2, r2, r6 - vsri.u32 q7, q13, #24 - add r3, r3, r7 - vsri.u32 q11, q14, #24 - ldr r11, [sp, #12] - vadd.i32 q2, q2, q3 - eor r14, r14, r2 - vadd.i32 q6, q6, q7 - eor r4, r4, r3 - vadd.i32 q10, q10, q11 - ror r14, r14, #16 - veor q12, q1, q2 - ror r4, r4, #16 - veor q13, q5, q6 - add r10, r10, r14 - veor q14, q9, q10 - add r11, r11, r4 - vshl.i32 q1, q12, #7 - eor r6, r6, r10 - vshl.i32 q5, q13, #7 - eor r7, r7, r11 - vshl.i32 q9, q14, #7 - ror r6, r6, #20 - vsri.u32 q1, q12, #25 - ror r7, r7, #20 - vsri.u32 q5, q13, #25 - add r2, r2, r6 - vsri.u32 q9, q14, #25 - add r3, r3, r7 - vext.32 q3, q3, q3, #3 - eor r14, r14, r2 - vext.32 q7, q7, q7, #3 - eor r4, r4, r3 - vext.32 q11, q11, q11, #3 - ror r14, r14, #24 - vext.32 q1, q1, q1, #1 - ror r4, r4, #24 - vext.32 q5, q5, q5, #1 - add r10, r10, r14 - vext.32 q9, q9, q9, #1 - add r11, r11, r4 - vext.32 q2, q2, q2, #2 - eor r6, r6, r10 - vext.32 q6, q6, q6, #2 - eor r7, r7, r11 - vext.32 q10, q10, q10, #2 - ror r6, r6, #25 - vadd.i32 q0, q0, q1 - ror r7, r7, #25 - vadd.i32 q4, q4, q5 - add r0, r0, r5 - vadd.i32 q8, q8, q9 - add r1, r1, r6 - veor q12, q3, q0 - eor r4, r4, r0 - veor q13, q7, q4 - eor r12, r12, r1 - veor q14, q11, q8 - ror r4, r4, #16 - vrev32.16 q3, q12 - ror r12, r12, #16 - vrev32.16 q7, q13 - add r10, r10, r4 - vrev32.16 q11, q14 - add r11, r11, r12 - vadd.i32 q2, q2, q3 - eor r5, r5, r10 - vadd.i32 q6, q6, q7 - eor r6, r6, r11 - vadd.i32 q10, q10, q11 - ror r5, r5, #20 - veor q12, q1, q2 - ror r6, r6, #20 - veor q13, q5, q6 - add r0, r0, r5 - veor q14, q9, q10 - add r1, r1, r6 - vshl.i32 q1, q12, #12 - eor r4, r4, r0 - vshl.i32 q5, q13, #12 - eor r12, r12, r1 - vshl.i32 q9, q14, #12 - ror r4, r4, #24 - vsri.u32 q1, q12, #20 - ror r12, r12, #24 - vsri.u32 q5, q13, #20 - add r10, r10, r4 - vsri.u32 q9, q14, #20 - add r11, r11, r12 - vadd.i32 q0, q0, q1 - eor r5, r5, r10 - vadd.i32 q4, q4, q5 - 
eor r6, r6, r11 - vadd.i32 q8, q8, q9 - str r11, [sp, #12] - veor q12, q3, q0 - ror r5, r5, #25 - veor q13, q7, q4 - ror r6, r6, #25 - veor q14, q11, q8 - str r4, [sp, #28] - vshl.i32 q3, q12, #8 - ldr r4, [sp, #4] - vshl.i32 q7, q13, #8 - add r2, r2, r7 - vshl.i32 q11, q14, #8 - add r3, r3, r4 - vsri.u32 q3, q12, #24 - ldr r11, [sp, #20] - vsri.u32 q7, q13, #24 - eor r11, r11, r2 - vsri.u32 q11, q14, #24 - eor r14, r14, r3 - vadd.i32 q2, q2, q3 - ror r11, r11, #16 - vadd.i32 q6, q6, q7 - ror r14, r14, #16 - vadd.i32 q10, q10, q11 - add r8, r8, r11 - veor q12, q1, q2 - add r9, r9, r14 - veor q13, q5, q6 - eor r7, r7, r8 - veor q14, q9, q10 - eor r4, r4, r9 - vshl.i32 q1, q12, #7 - ror r7, r7, #20 - vshl.i32 q5, q13, #7 - ror r4, r4, #20 - vshl.i32 q9, q14, #7 - str r6, [sp, #8] - vsri.u32 q1, q12, #25 - add r2, r2, r7 - vsri.u32 q5, q13, #25 - add r3, r3, r4 - vsri.u32 q9, q14, #25 - eor r11, r11, r2 - vext.32 q3, q3, q3, #1 - eor r14, r14, r3 - vext.32 q7, q7, q7, #1 - ror r11, r11, #24 - vext.32 q11, q11, q11, #1 - ror r14, r14, #24 - vext.32 q1, q1, q1, #3 - add r8, r8, r11 - vext.32 q5, q5, q5, #3 - add r9, r9, r14 - vext.32 q9, q9, q9, #3 - eor r7, r7, r8 - vext.32 q2, q2, q2, #2 - eor r4, r4, r9 - vext.32 q6, q6, q6, #2 - ror r7, r7, #25 - vext.32 q10, q10, q10, #2 - ror r4, r4, #25 - bne .Lchacha_blocks_neon_rounds1 - str r8, [sp, #0] - str r9, [sp, #4] - str r10, [sp, #8] - str r12, [sp, #16] - str r11, [sp, #20] - str r14, [sp, #24] - add r9, sp, #64 - vld1.32 {q12,q13}, [r9,:128]! - ldr r12, [sp, #48] - vld1.32 {q14,q15}, [r9,:128] - ldr r14, [sp, #40] - vadd.i32 q0, q0, q12 - ldr r8, [sp, #(64 +0)] - vadd.i32 q4, q4, q12 - ldr r9, [sp, #(64 +4)] - vadd.i32 q8, q8, q12 - ldr r10, [sp, #(64 +8)] - vadd.i32 q1, q1, q13 - ldr r11, [sp, #(64 +12)] - vadd.i32 q5, q5, q13 - add r0, r0, r8 - vadd.i32 q9, q9, q13 - add r1, r1, r9 - vadd.i32 q2, q2, q14 - add r2, r2, r10 - vadd.i32 q6, q6, q14 - ldr r8, [sp, #(64 +16)] - vadd.i32 q10, q10, q14 - add r3, r3, r11 - veor q14, q14, q14 - ldr r9, [sp, #(64 +20)] - mov r11, #1 - add r4, r4, r8 - vmov.32 d28[0], r11 - ldr r10, [sp, #(64 +24)] - vadd.u64 q12, q14, q15 - add r5, r5, r9 - vadd.u64 q13, q14, q12 - ldr r11, [sp, #(64 +28)] - vadd.u64 q14, q14, q13 - add r6, r6, r10 - vadd.i32 q3, q3, q12 - tst r12, r12 - vadd.i32 q7, q7, q13 - add r7, r7, r11 - vadd.i32 q11, q11, q14 - beq .Lchacha_blocks_neon_nomessage11 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage11: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - tst r12, r12 - ldm sp, {r0-r7} - ldr r8, [sp, #(64 +32)] - ldr r9, [sp, #(64 +36)] - ldr r10, [sp, #(64 +40)] - ldr r11, [sp, #(64 +44)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +48)] - add r3, r3, r11 - ldr r9, [sp, #(64 +52)] - add r4, r4, r8 - ldr r10, [sp, #(64 +56)] - add r5, r5, r9 - ldr r11, [sp, #(64 +60)] - add r6, r6, r10 - adds r8, r8, #4 - add r7, r7, r11 - adc r9, r9, #0 - str r8, [sp, #(64 +48)] - tst r12, r12 - str r9, [sp, #(64 +52)] - beq .Lchacha_blocks_neon_nomessage12 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, 
[r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage12: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - tst r12, r12 - beq .Lchacha_blocks_neon_nomessage13 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q0, q0, q12 - veor q1, q1, q13 - veor q2, q2, q14 - veor q3, q3, q15 -.Lchacha_blocks_neon_nomessage13: - vst1.32 {q0,q1}, [r14]! - vst1.32 {q2,q3}, [r14]! - beq .Lchacha_blocks_neon_nomessage14 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q4, q4, q12 - veor q5, q5, q13 - veor q6, q6, q14 - veor q7, q7, q15 -.Lchacha_blocks_neon_nomessage14: - vst1.32 {q4,q5}, [r14]! - vst1.32 {q6,q7}, [r14]! - beq .Lchacha_blocks_neon_nomessage15 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q8, q8, q12 - veor q9, q9, q13 - veor q10, q10, q14 - veor q11, q11, q15 -.Lchacha_blocks_neon_nomessage15: - vst1.32 {q8,q9}, [r14]! - vst1.32 {q10,q11}, [r14]! - str r12, [sp, #48] - str r14, [sp, #40] - ldr r3, [sp, #52] - sub r3, r3, #256 - cmp r3, #256 - str r3, [sp, #52] - bhs .Lchacha_blocks_neon_mainloop1 - tst r3, r3 - beq .Lchacha_blocks_neon_done -.Lchacha_blocks_neon_mainloop2: - ldr r3, [sp, #52] - ldr r1, [sp, #48] - cmp r3, #64 - bhs .Lchacha_blocks_neon_noswap1 - add r4, sp, #128 - mov r5, r4 - tst r1, r1 - beq .Lchacha_blocks_neon_nocopy1 -.Lchacha_blocks_neon_copyinput1: - subs r3, r3, #1 - ldrb r0, [r1], #1 - strb r0, [r4], #1 - bne .Lchacha_blocks_neon_copyinput1 - str r5, [sp, #48] -.Lchacha_blocks_neon_nocopy1: - ldr r4, [sp, #40] - str r5, [sp, #40] - str r4, [sp, #56] -.Lchacha_blocks_neon_noswap1: - ldr r0, [sp, #44] - str r0, [sp, #0] - add r0, sp, #64 - ldm r0, {r0-r12} - ldr r14, [sp, #(64 +60)] - str r6, [sp, #8] - str r11, [sp, #12] - str r14, [sp, #28] - ldr r11, [sp, #(64 +52)] - ldr r14, [sp, #(64 +56)] -.Lchacha_blocks_neon_rounds2: - ldr r6, [sp, #0] - add r0, r0, r4 - add r1, r1, r5 - eor r12, r12, r0 - eor r11, r11, r1 - ror r12, r12, #16 - ror r11, r11, #16 - subs r6, r6, #2 - add r8, r8, r12 - add r9, r9, r11 - eor r4, r4, r8 - eor r5, r5, r9 - str r6, [sp, #0] - ror r4, r4, #20 - ror r5, r5, #20 - add r0, r0, r4 - add r1, r1, r5 - ldr r6, [sp, #8] - eor r12, r12, r0 - eor r11, r11, r1 - ror r12, r12, #24 - ror r11, r11, #24 - add r8, r8, r12 - add r9, r9, r11 - eor r4, r4, r8 - eor r5, r5, r9 - str r11, [sp, #20] - ror r4, r4, #25 - ror r5, r5, #25 - str r4, [sp, #4] - ldr r4, [sp, #28] - add r2, r2, r6 - add r3, r3, r7 - ldr r11, [sp, #12] - eor r14, r14, r2 - eor r4, r4, r3 - ror r14, r14, #16 - ror r4, r4, #16 - add r10, r10, r14 - add r11, r11, r4 - eor r6, r6, r10 - eor r7, r7, r11 - ror r6, r6, #20 - ror r7, r7, #20 - add r2, r2, r6 - add r3, r3, r7 - eor r14, r14, r2 - eor r4, r4, r3 - ror r14, r14, #24 - ror r4, r4, #24 - add r10, r10, r14 - add r11, r11, r4 - eor r6, r6, r10 - eor r7, r7, r11 - ror r6, r6, #25 - ror r7, r7, #25 - add r0, r0, r5 - add r1, r1, r6 - eor r4, r4, r0 - eor r12, r12, r1 - ror r4, r4, #16 - ror r12, r12, #16 - add r10, r10, r4 - add r11, r11, r12 - eor r5, r5, r10 - eor r6, r6, r11 - ror r5, r5, #20 - ror r6, r6, #20 - add r0, r0, r5 - add r1, r1, r6 - eor r4, r4, r0 - eor r12, r12, r1 - ror r4, r4, #24 - ror r12, r12, #24 - add r10, r10, r4 - add r11, r11, r12 - eor r5, r5, r10 - eor r6, r6, r11 - str r11, [sp, #12] - ror r5, r5, #25 - ror r6, r6, #25 - str r4, [sp, #28] - ldr r4, [sp, #4] - add r2, r2, r7 - add r3, r3, r4 - ldr r11, [sp, #20] - eor r11, r11, r2 - eor r14, r14, r3 - ror r11, r11, #16 - ror r14, r14, #16 - add r8, r8, r11 - 
add r9, r9, r14 - eor r7, r7, r8 - eor r4, r4, r9 - ror r7, r7, #20 - ror r4, r4, #20 - str r6, [sp, #8] - add r2, r2, r7 - add r3, r3, r4 - eor r11, r11, r2 - eor r14, r14, r3 - ror r11, r11, #24 - ror r14, r14, #24 - add r8, r8, r11 - add r9, r9, r14 - eor r7, r7, r8 - eor r4, r4, r9 - ror r7, r7, #25 - ror r4, r4, #25 - bne .Lchacha_blocks_neon_rounds2 - str r8, [sp, #0] - str r9, [sp, #4] - str r10, [sp, #8] - str r12, [sp, #16] - str r11, [sp, #20] - str r14, [sp, #24] - ldr r12, [sp, #48] - ldr r14, [sp, #40] - ldr r8, [sp, #(64 +0)] - ldr r9, [sp, #(64 +4)] - ldr r10, [sp, #(64 +8)] - ldr r11, [sp, #(64 +12)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +16)] - add r3, r3, r11 - ldr r9, [sp, #(64 +20)] - add r4, r4, r8 - ldr r10, [sp, #(64 +24)] - add r5, r5, r9 - ldr r11, [sp, #(64 +28)] - add r6, r6, r10 - tst r12, r12 - add r7, r7, r11 - beq .Lchacha_blocks_neon_nomessage21 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage21: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - ldm sp, {r0-r7} - ldr r8, [sp, #(64 +32)] - ldr r9, [sp, #(64 +36)] - ldr r10, [sp, #(64 +40)] - ldr r11, [sp, #(64 +44)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +48)] - add r3, r3, r11 - ldr r9, [sp, #(64 +52)] - add r4, r4, r8 - ldr r10, [sp, #(64 +56)] - add r5, r5, r9 - ldr r11, [sp, #(64 +60)] - add r6, r6, r10 - adds r8, r8, #1 - add r7, r7, r11 - adc r9, r9, #0 - str r8, [sp, #(64 +48)] - tst r12, r12 - str r9, [sp, #(64 +52)] - beq .Lchacha_blocks_neon_nomessage22 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage22: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - str r12, [sp, #48] - str r14, [sp, #40] - ldr r3, [sp, #52] - cmp r3, #64 - sub r4, r3, #64 - str r4, [sp, #52] - bhi .Lchacha_blocks_neon_mainloop2 - cmp r3, #64 - beq .Lchacha_blocks_neon_nocopy2 - ldr r1, [sp, #56] - sub r14, r14, #64 -.Lchacha_blocks_neon_copyinput2: - subs r3, r3, #1 - ldrb r0, [r14], #1 - strb r0, [r1], #1 - bne .Lchacha_blocks_neon_copyinput2 -.Lchacha_blocks_neon_nocopy2: -.Lchacha_blocks_neon_done: - ldr r7, [sp, #60] - ldr r8, [sp, #(64 +48)] - ldr r9, [sp, #(64 +52)] - str r8, [r7, #(48 + 0)] - str r9, [r7, #(48 + 4)] +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + +/* register macros */ +#define INPUT r0 +#define DST r1 +#define SRC r2 +#define NBLKS r3 +#define ROUND r4 + +/* stack structure */ +#define STACK_VEC_X12 (16) +#define STACK_VEC_X13 (STACK_VEC_X12 + 16) +#define STACK_TMP (STACK_VEC_X13 + 16) +#define STACK_TMP1 (16 + STACK_TMP) +#define STACK_TMP2 (16 + STACK_TMP1) + +#define STACK_MAX (16 + STACK_TMP2) + +/* vector registers */ +#define X0 q0 +#define X1 q1 +#define X2 q2 +#define X3 q3 +#define X4 q4 +#define X5 q5 +#define 
X6 q6 +#define X7 q7 +#define X8 q8 +#define X9 q9 +#define X10 q10 +#define X11 q11 +#define X12 q12 +#define X13 q13 +#define X14 q14 +#define X15 q15 + +#define X0l d0 +#define X1l d2 +#define X2l d4 +#define X3l d6 +#define X4l d8 +#define X5l d10 +#define X6l d12 +#define X7l d14 +#define X8l d16 +#define X9l d18 +#define X10l d20 +#define X11l d22 +#define X12l d24 +#define X13l d26 +#define X14l d28 +#define X15l d30 + +#define X0h d1 +#define X1h d3 +#define X2h d5 +#define X3h d7 +#define X4h d9 +#define X5h d11 +#define X6h d13 +#define X7h d15 +#define X8h d17 +#define X9h d19 +#define X10h d21 +#define X11h d23 +#define X12h d25 +#define X13h d27 +#define X14h d29 +#define X15h d31 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4_part1(_q0, _q1, _q2, _q3) \ + vtrn.32 _q0, _q1; \ + vtrn.32 _q2, _q3; +#define transpose_4x4_part2(_q0, _q1, _q2, _q3) \ + vswp _q0##h, _q2##l; \ + vswp _q1##h, _q3##l; + +#define clear(x) veor x,x,x; + +/********************************************************************** + 4-way chacha20 + **********************************************************************/ + +#define ROTATE2(dst1,dst2,c,src1,src2) \ + vshl.u32 dst1, src1, #(c); \ + vshl.u32 dst2, src2, #(c); \ + vsri.u32 dst1, src1, #(32 - (c)); \ + vsri.u32 dst2, src2, #(32 - (c)); + +#define ROTATE2_16(dst1,dst2,src1,src2) \ + vrev32.16 dst1, src1; \ + vrev32.16 dst2, src2; + +#define XOR(d,s1,s2) \ + veor d, s2, s1; + +#define PLUS(ds,s) \ + vadd.u32 ds, ds, s; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2_16(d1, d2, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 12, tmp1, tmp2); \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2(d1, d2, 8, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 7, tmp1, tmp2); + +chacha20_data: +.align 4 +.Linc_counter: + .long 0,1,2,3 + +.align 3 +.globl _gcry_chacha20_armv7_neon_blocks4 +.type _gcry_chacha20_armv7_neon_blocks4,%function; + +_gcry_chacha20_armv7_neon_blocks4: + /* input: + * r0: input + * r1: dst + * r2: src + * r3: nblks (multiple of 4) + */ + + vpush {q4-q7}; + push {r4-r12,lr}; + mov r12, sp - stmia r12!, {r0-r7} - add r12, r12, #48 - stmia r12!, {r0-r7} - sub r0, sp, #8 - ldr sp, [sp, #192] - ldmfd sp!, {r4-r12, r14} - vldm sp!, {q4-q7} - sub r0, sp, r0 - bx lr -.Lchacha_blocks_neon_nobytes: - mov r0, #0; + + mov r6, sp; + sub r6, r6, #(STACK_MAX); + and r6, r6, #(~15); + mov sp, r6; + GET_DATA_POINTER(r9, .Linc_counter, lr); + add lr, INPUT, #(12*4); + add r8, sp, #STACK_VEC_X12; + +.Loop4: + mov ROUND, #20; + + /* Construct counter vectors X12 and X13 */ + + vld1.8 {X15}, [lr]; + mov lr, INPUT; + vld1.8 {X8}, [r9]; + vdup.32 X12, X15l[0]; + vdup.32 X13, X15l[1]; + vld1.8 {X3}, [lr]!; + vadd.u32 X12, X12, X8; + vdup.32 X0, X3l[0]; + vdup.32 X1, X3l[1]; + vdup.32 X2, X3h[0]; + vcgt.u32 X8, X8, X12; + vdup.32 X3, X3h[1]; + vdup.32 X14, X15h[0]; + vdup.32 X15, X15h[1]; + vsub.u32 X13, X13, X8; + vld1.8 {X7}, [lr]!; + vld1.8 {X11}, [lr]; + vst1.8 {X12, X13}, [r8]; + vdup.32 X4, X7l[0]; + vdup.32 X5, X7l[1]; + vdup.32 X6, X7h[0]; + vdup.32 X7, X7h[1]; + vdup.32 X8, X11l[0]; + vdup.32 X9, X11l[1]; + vdup.32 X10, X11h[0]; + vdup.32 X11, X11h[1]; + + add 
r7, sp, #STACK_TMP2; + add r6, sp, #STACK_TMP1; + add r5, sp, #STACK_TMP; + vst1.8 {X15}, [r6]; + vst1.8 {X11}, [r5]; + + mov lr, INPUT; +.Lround2: + subs ROUND, ROUND, #2 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + vld1.8 {X11}, [r5]; + vld1.8 {X15}, [r6]; + vst1.8 {X8}, [r5]; + vst1.8 {X9}, [r6]; + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + vld1.8 {X8}, [r5]; + vld1.8 {X9}, [r6]; + vst1.8 {X11}, [r5]; + vst1.8 {X15}, [r6]; + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + bne .Lround2; + + vld1.8 {X11}, [lr]!; + vst1.8 {X14}, [r7]; + + vdup.32 X14, X11l[0]; /* INPUT + 0 * 4 */ + vdup.32 X15, X11l[1]; /* INPUT + 1 * 4 */ + PLUS(X0, X14); + PLUS(X1, X15); + vdup.32 X14, X11h[0]; /* INPUT + 2 * 4 */ + vdup.32 X15, X11h[1]; /* INPUT + 3 * 4 */ + PLUS(X2, X14); + PLUS(X3, X15); + + vld1.8 {X11}, [r5]; + vld1.8 {X15}, [r6]; + vst1.8 {X0}, [r5]; + vld1.8 {X0}, [lr]!; + vst1.8 {X1}, [r6]; + + vdup.32 X14, X0l[0]; /* INPUT + 4 * 4 */ + vdup.32 X1, X0l[1]; /* INPUT + 5 * 4 */ + PLUS(X4, X14); + PLUS(X5, X1); + vdup.32 X14, X0h[0]; /* INPUT + 6 * 4 */ + vdup.32 X1, X0h[1]; /* INPUT + 7 * 4 */ + PLUS(X6, X14); + PLUS(X7, X1); + + vld1.8 {X0}, [lr]!; + + vdup.32 X14, X0l[0]; /* INPUT + 8 * 4 */ + vdup.32 X1, X0l[1]; /* INPUT + 9 * 4 */ + PLUS(X8, X14); + PLUS(X9, X1); + vdup.32 X14, X0h[0]; /* INPUT + 10 * 4 */ + vdup.32 X1, X0h[1]; /* INPUT + 11 * 4 */ + PLUS(X10, X14); + PLUS(X11, X1); + + vld1.8 {X0}, [lr]; + add lr, INPUT, #(12*4) + vld1.8 {X14}, [r7]; + + vdup.32 X1, X0h[0]; /* INPUT + 10 * 4 */ + ldm lr, {r10, r11}; /* Update counter */ + vdup.32 X0, X0h[1]; /* INPUT + 11 * 4 */ + PLUS(X14, X1); + PLUS(X15, X0); + adds r10, r10, #4; /* Update counter */ + vld1.8 {X0, X1}, [r8]; + + PLUS(X12, X0); + vld1.8 {X0}, [r5]; + PLUS(X13, X1); + adc r11, r11, #0; /* Update counter */ + + vld1.8 {X1}, [r6]; + stm lr, {r10, r11}; /* Update counter */ + transpose_4x4_part1(X0, X1, X2, X3); + transpose_4x4_part1(X4, X5, X6, X7); + transpose_4x4_part1(X8, X9, X10, X11); + transpose_4x4_part1(X12, X13, X14, X15); + transpose_4x4_part2(X0, X1, X2, X3); + transpose_4x4_part2(X4, X5, X6, X7); + transpose_4x4_part2(X8, X9, X10, X11); + transpose_4x4_part2(X12, X13, X14, X15); + + subs NBLKS, NBLKS, #4; + + vst1.8 {X10}, [r5]; + add lr, INPUT, #(12*4) + vst1.8 {X11}, [r6]; + vld1.8 {X10, X11}, [SRC]!; + veor X10, X0, X10; + vld1.8 {X0}, [SRC]!; + veor X11, X4, X11; + vld1.8 {X4}, [SRC]!; + vst1.8 {X10, X11}, [DST]!; + vld1.8 {X10, X11}, [SRC]!; + veor X0, X8, X0; + veor X4, X12, X4; + veor X10, X1, X10; + veor X11, X5, X11; + vst1.8 {X0}, [DST]!; + vld1.8 {X0, X1}, [SRC]!; + vst1.8 {X4}, [DST]!; + vld1.8 {X4, X5}, [SRC]!; + vst1.8 {X10, X11}, [DST]!; + vld1.8 {X10}, [r5]; + vld1.8 {X11}, [r6]; + veor X0, X9, X0; + vld1.8 {X8, X9}, [SRC]!; + veor X1, X13, X1; + vld1.8 {X12, X13}, [SRC]!; + veor X4, X2, X4; + veor X5, X6, X5; + vst1.8 {X0, X1}, [DST]!; + vld1.8 {X0, X1}, [SRC]!; + vst1.8 {X4, X5}, [DST]!; + veor X8, X10, X8; + veor X9, X14, X9; + veor X12, X3, X12; + veor X13, X7, X13; + veor X0, X11, X0; + veor X1, X15, X1; + vst1.8 {X8, X9}, [DST]!; + vst1.8 {X12, X13}, [DST]!; + vst1.8 {X0, X1}, [DST]!; + + bne .Loop4; + + /* clear the used vector registers and stack */ + clear(X0); + vst1.8 {X0}, [r5]; + vst1.8 {X0}, [r6]; + vst1.8 {X0}, [r7]; + vst1.8 {X0}, [r8]!; + vst1.8 {X0}, [r8]; + + mov sp, r12 + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + 
clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + pop {r4-r12,lr} + vpop {q4-q7} + eor r0, r0, r0 bx lr -.ltorg -.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks; +.size _gcry_chacha20_armv7_neon_blocks4, .-_gcry_chacha20_armv7_neon_blocks4; #endif diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S deleted file mode 100644 index 8c085bad6..000000000 --- a/cipher/chacha20-avx2-amd64.S +++ /dev/null @@ -1,956 +0,0 @@ -/* chacha20-avx2-amd64.S - AMD64/AVX2 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(ENABLE_AVX2_SUPPORT) && USE_CHACHA20 - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_avx2_blocks -ELF(.type _gcry_chacha20_amd64_avx2_blocks, at function;) -_gcry_chacha20_amd64_avx2_blocks: -.Lchacha_blocks_avx2_local: - vzeroupper - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - leaq .LC RIP, %rax - vmovdqu 0(%rax), %xmm6 - vmovdqu 16(%rax), %xmm7 - vmovdqu 0(%rdi), %xmm8 - vmovdqu 16(%rdi), %xmm9 - vmovdqu 32(%rdi), %xmm10 - vmovdqu 48(%rdi), %xmm11 - movl $20, %eax - movq $1, %r9 - vmovdqa %xmm8, 0(%rsp) - vmovdqa %xmm9, 16(%rsp) - vmovdqa %xmm10, 32(%rsp) - vmovdqa %xmm11, 48(%rsp) - movq %rax, 64(%rsp) - vmovdqa %xmm6, 448(%rsp) - vmovdqa %xmm6, 464(%rsp) - vmovdqa %xmm7, 480(%rsp) - vmovdqa %xmm7, 496(%rsp) - cmpq $512, %rcx - jae .Lchacha_blocks_avx2_atleast512 - cmp $256, %rcx - jae .Lchacha_blocks_avx2_atleast256 - jmp .Lchacha_blocks_avx2_below256 - .p2align 6,,63 -.Lchacha_blocks_avx2_atleast512: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - leaq 5(%rax), %r11 - leaq 6(%rax), %r12 - leaq 7(%rax), %r13 - leaq 8(%rax), %r14 - movl %eax, 128(%rsp) - movl %r8d, 4+128(%rsp) - movl %r9d, 8+128(%rsp) - movl %r10d, 12+128(%rsp) - movl %ebx, 16+128(%rsp) - movl %r11d, 20+128(%rsp) - movl %r12d, 24+128(%rsp) - movl %r13d, 28+128(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - shrq $32, %rbx - shrq $32, %r11 - shrq $32, %r12 - shrq $32, %r13 - movl %eax, 160(%rsp) - movl %r8d, 4+160(%rsp) - movl %r9d, 8+160(%rsp) - movl %r10d, 12+160(%rsp) - movl %ebx, 16+160(%rsp) - movl %r11d, 20+160(%rsp) - movl %r12d, 24+160(%rsp) - movl %r13d, 28+160(%rsp) - movq %r14, 48(%rsp) - movq 64(%rsp), %rax - vpbroadcastd 0(%rsp), %ymm0 - vpbroadcastd 4+0(%rsp), %ymm1 - vpbroadcastd 8+0(%rsp), %ymm2 - vpbroadcastd 12+0(%rsp), %ymm3 - vpbroadcastd 16(%rsp), %ymm4 - vpbroadcastd 4+16(%rsp), %ymm5 - vpbroadcastd 8+16(%rsp), %ymm6 - vpbroadcastd 12+16(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 -.Lchacha_blocks_avx2_mainloop1: - vpaddd %ymm0, %ymm4, %ymm0 - vpaddd %ymm1, %ymm5, %ymm1 - vpxor %ymm12, %ymm0, %ymm12 - vpxor %ymm13, %ymm1, %ymm13 - vpaddd %ymm2, %ymm6, %ymm2 - vpaddd %ymm3, %ymm7, %ymm3 - vpxor %ymm14, %ymm2, %ymm14 - vpxor %ymm15, %ymm3, %ymm15 - vpshufb 448(%rsp), %ymm12, %ymm12 - vpshufb 448(%rsp), %ymm13, %ymm13 - vpaddd %ymm8, %ymm12, %ymm8 - vpaddd %ymm9, %ymm13, %ymm9 - vpshufb 448(%rsp), %ymm14, %ymm14 - vpshufb 448(%rsp), %ymm15, %ymm15 - vpaddd %ymm10, %ymm14, %ymm10 - vpaddd %ymm11, %ymm15, %ymm11 - vmovdqa %ymm12, 96(%rsp) - vpxor %ymm4, %ymm8, %ymm4 - vpxor %ymm5, %ymm9, %ymm5 - vpslld $ 12, %ymm4, %ymm12 - vpsrld $20, %ymm4, %ymm4 - vpxor %ymm4, %ymm12, %ymm4 - vpslld $ 12, %ymm5, %ymm12 - vpsrld $20, %ymm5, %ymm5 - vpxor %ymm5, %ymm12, %ymm5 - vpxor %ymm6, %ymm10, %ymm6 - vpxor %ymm7, %ymm11, %ymm7 - vpslld $ 12, %ymm6, %ymm12 - vpsrld $20, %ymm6, %ymm6 - vpxor %ymm6, %ymm12, %ymm6 - vpslld $ 12, %ymm7, %ymm12 - vpsrld $20, %ymm7, %ymm7 - vpxor %ymm7, %ymm12, %ymm7 - vpaddd %ymm0, %ymm4, %ymm0 - vpaddd %ymm1, %ymm5, %ymm1 - vpxor 96(%rsp), %ymm0, %ymm12 - vpxor %ymm13, %ymm1, %ymm13 - vpaddd %ymm2, %ymm6, %ymm2 - vpaddd %ymm3, %ymm7, %ymm3 - vpxor %ymm14, %ymm2, %ymm14 - vpxor %ymm15, %ymm3, %ymm15 - vpshufb 480(%rsp), %ymm12, 
%ymm12 - vpshufb 480(%rsp), %ymm13, %ymm13 - vpaddd %ymm8, %ymm12, %ymm8 - vpaddd %ymm9, %ymm13, %ymm9 - vpshufb 480(%rsp), %ymm14, %ymm14 - vpshufb 480(%rsp), %ymm15, %ymm15 - vpaddd %ymm10, %ymm14, %ymm10 - vpaddd %ymm11, %ymm15, %ymm11 - vmovdqa %ymm12, 96(%rsp) - vpxor %ymm4, %ymm8, %ymm4 - vpxor %ymm5, %ymm9, %ymm5 - vpslld $ 7, %ymm4, %ymm12 - vpsrld $25, %ymm4, %ymm4 - vpxor %ymm4, %ymm12, %ymm4 - vpslld $ 7, %ymm5, %ymm12 - vpsrld $25, %ymm5, %ymm5 - vpxor %ymm5, %ymm12, %ymm5 - vpxor %ymm6, %ymm10, %ymm6 - vpxor %ymm7, %ymm11, %ymm7 - vpslld $ 7, %ymm6, %ymm12 - vpsrld $25, %ymm6, %ymm6 - vpxor %ymm6, %ymm12, %ymm6 - vpslld $ 7, %ymm7, %ymm12 - vpsrld $25, %ymm7, %ymm7 - vpxor %ymm7, %ymm12, %ymm7 - vpaddd %ymm0, %ymm5, %ymm0 - vpaddd %ymm1, %ymm6, %ymm1 - vpxor %ymm15, %ymm0, %ymm15 - vpxor 96(%rsp), %ymm1, %ymm12 - vpaddd %ymm2, %ymm7, %ymm2 - vpaddd %ymm3, %ymm4, %ymm3 - vpxor %ymm13, %ymm2, %ymm13 - vpxor %ymm14, %ymm3, %ymm14 - vpshufb 448(%rsp), %ymm15, %ymm15 - vpshufb 448(%rsp), %ymm12, %ymm12 - vpaddd %ymm10, %ymm15, %ymm10 - vpaddd %ymm11, %ymm12, %ymm11 - vpshufb 448(%rsp), %ymm13, %ymm13 - vpshufb 448(%rsp), %ymm14, %ymm14 - vpaddd %ymm8, %ymm13, %ymm8 - vpaddd %ymm9, %ymm14, %ymm9 - vmovdqa %ymm15, 96(%rsp) - vpxor %ymm5, %ymm10, %ymm5 - vpxor %ymm6, %ymm11, %ymm6 - vpslld $ 12, %ymm5, %ymm15 - vpsrld $20, %ymm5, %ymm5 - vpxor %ymm5, %ymm15, %ymm5 - vpslld $ 12, %ymm6, %ymm15 - vpsrld $20, %ymm6, %ymm6 - vpxor %ymm6, %ymm15, %ymm6 - vpxor %ymm7, %ymm8, %ymm7 - vpxor %ymm4, %ymm9, %ymm4 - vpslld $ 12, %ymm7, %ymm15 - vpsrld $20, %ymm7, %ymm7 - vpxor %ymm7, %ymm15, %ymm7 - vpslld $ 12, %ymm4, %ymm15 - vpsrld $20, %ymm4, %ymm4 - vpxor %ymm4, %ymm15, %ymm4 - vpaddd %ymm0, %ymm5, %ymm0 - vpaddd %ymm1, %ymm6, %ymm1 - vpxor 96(%rsp), %ymm0, %ymm15 - vpxor %ymm12, %ymm1, %ymm12 - vpaddd %ymm2, %ymm7, %ymm2 - vpaddd %ymm3, %ymm4, %ymm3 - vpxor %ymm13, %ymm2, %ymm13 - vpxor %ymm14, %ymm3, %ymm14 - vpshufb 480(%rsp), %ymm15, %ymm15 - vpshufb 480(%rsp), %ymm12, %ymm12 - vpaddd %ymm10, %ymm15, %ymm10 - vpaddd %ymm11, %ymm12, %ymm11 - vpshufb 480(%rsp), %ymm13, %ymm13 - vpshufb 480(%rsp), %ymm14, %ymm14 - vpaddd %ymm8, %ymm13, %ymm8 - vpaddd %ymm9, %ymm14, %ymm9 - vmovdqa %ymm15, 96(%rsp) - vpxor %ymm5, %ymm10, %ymm5 - vpxor %ymm6, %ymm11, %ymm6 - vpslld $ 7, %ymm5, %ymm15 - vpsrld $25, %ymm5, %ymm5 - vpxor %ymm5, %ymm15, %ymm5 - vpslld $ 7, %ymm6, %ymm15 - vpsrld $25, %ymm6, %ymm6 - vpxor %ymm6, %ymm15, %ymm6 - vpxor %ymm7, %ymm8, %ymm7 - vpxor %ymm4, %ymm9, %ymm4 - vpslld $ 7, %ymm7, %ymm15 - vpsrld $25, %ymm7, %ymm7 - vpxor %ymm7, %ymm15, %ymm7 - vpslld $ 7, %ymm4, %ymm15 - vpsrld $25, %ymm4, %ymm4 - vpxor %ymm4, %ymm15, %ymm4 - vmovdqa 96(%rsp), %ymm15 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop1 - vmovdqa %ymm8, 192(%rsp) - vmovdqa %ymm9, 224(%rsp) - vmovdqa %ymm10, 256(%rsp) - vmovdqa %ymm11, 288(%rsp) - vmovdqa %ymm12, 320(%rsp) - vmovdqa %ymm13, 352(%rsp) - vmovdqa %ymm14, 384(%rsp) - vmovdqa %ymm15, 416(%rsp) - vpbroadcastd 0(%rsp), %ymm8 - vpbroadcastd 4+0(%rsp), %ymm9 - vpbroadcastd 8+0(%rsp), %ymm10 - vpbroadcastd 12+0(%rsp), %ymm11 - vpbroadcastd 16(%rsp), %ymm12 - vpbroadcastd 4+16(%rsp), %ymm13 - vpbroadcastd 8+16(%rsp), %ymm14 - vpbroadcastd 12+16(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, 
%ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput1 - vpxor 0(%rsi), %ymm8, %ymm8 - vpxor 64(%rsi), %ymm9, %ymm9 - vpxor 128(%rsi), %ymm10, %ymm10 - vpxor 192(%rsi), %ymm11, %ymm11 - vpxor 256(%rsi), %ymm12, %ymm12 - vpxor 320(%rsi), %ymm13, %ymm13 - vpxor 384(%rsi), %ymm14, %ymm14 - vpxor 448(%rsi), %ymm15, %ymm15 - vmovdqu %ymm8, 0(%rdx) - vmovdqu %ymm9, 64(%rdx) - vmovdqu %ymm10, 128(%rdx) - vmovdqu %ymm11, 192(%rdx) - vmovdqu %ymm12, 256(%rdx) - vmovdqu %ymm13, 320(%rdx) - vmovdqu %ymm14, 384(%rdx) - vmovdqu %ymm15, 448(%rdx) - vmovdqa 192(%rsp), %ymm0 - vmovdqa 224(%rsp), %ymm1 - vmovdqa 256(%rsp), %ymm2 - vmovdqa 288(%rsp), %ymm3 - vmovdqa 320(%rsp), %ymm4 - vmovdqa 352(%rsp), %ymm5 - vmovdqa 384(%rsp), %ymm6 - vmovdqa 416(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, %ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - vpxor 32(%rsi), %ymm8, %ymm8 - vpxor 96(%rsi), %ymm9, %ymm9 - vpxor 160(%rsi), %ymm10, %ymm10 - vpxor 224(%rsi), %ymm11, %ymm11 - vpxor 288(%rsi), %ymm12, %ymm12 - vpxor 352(%rsi), %ymm13, %ymm13 - vpxor 416(%rsi), %ymm14, %ymm14 - vpxor 480(%rsi), %ymm15, %ymm15 - vmovdqu %ymm8, 32(%rdx) - vmovdqu %ymm9, 96(%rdx) - vmovdqu %ymm10, 160(%rdx) - vmovdqu %ymm11, 224(%rdx) - vmovdqu %ymm12, 288(%rdx) - vmovdqu %ymm13, 352(%rdx) - vmovdqu %ymm14, 416(%rdx) - vmovdqu %ymm15, 480(%rdx) - addq $512, %rsi - jmp .Lchacha_blocks_avx2_mainloop1_cont -.Lchacha_blocks_avx2_noinput1: - vmovdqu %ymm8, 0(%rdx) - 
vmovdqu %ymm9, 64(%rdx) - vmovdqu %ymm10, 128(%rdx) - vmovdqu %ymm11, 192(%rdx) - vmovdqu %ymm12, 256(%rdx) - vmovdqu %ymm13, 320(%rdx) - vmovdqu %ymm14, 384(%rdx) - vmovdqu %ymm15, 448(%rdx) - vmovdqa 192(%rsp), %ymm0 - vmovdqa 224(%rsp), %ymm1 - vmovdqa 256(%rsp), %ymm2 - vmovdqa 288(%rsp), %ymm3 - vmovdqa 320(%rsp), %ymm4 - vmovdqa 352(%rsp), %ymm5 - vmovdqa 384(%rsp), %ymm6 - vmovdqa 416(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, %ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - vmovdqu %ymm8, 32(%rdx) - vmovdqu %ymm9, 96(%rdx) - vmovdqu %ymm10, 160(%rdx) - vmovdqu %ymm11, 224(%rdx) - vmovdqu %ymm12, 288(%rdx) - vmovdqu %ymm13, 352(%rdx) - vmovdqu %ymm14, 416(%rdx) - vmovdqu %ymm15, 480(%rdx) -.Lchacha_blocks_avx2_mainloop1_cont: - addq $512, %rdx - subq $512, %rcx - cmp $512, %rcx - jae .Lchacha_blocks_avx2_atleast512 - cmp $256, %rcx - jb .Lchacha_blocks_avx2_below256_fixup -.Lchacha_blocks_avx2_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 128(%rsp) - movl %r8d, 4+128(%rsp) - movl %r9d, 8+128(%rsp) - movl %r10d, 12+128(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 160(%rsp) - movl %r8d, 4+160(%rsp) - movl %r9d, 8+160(%rsp) - movl %r10d, 12+160(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - vpbroadcastd 0(%rsp), %xmm0 - vpbroadcastd 4+0(%rsp), %xmm1 - vpbroadcastd 8+0(%rsp), %xmm2 - vpbroadcastd 12+0(%rsp), %xmm3 - vpbroadcastd 16(%rsp), %xmm4 - vpbroadcastd 4+16(%rsp), %xmm5 - vpbroadcastd 8+16(%rsp), %xmm6 - vpbroadcastd 12+16(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 -.Lchacha_blocks_avx2_mainloop2: - vpaddd %xmm0, %xmm4, %xmm0 - vpaddd %xmm1, %xmm5, %xmm1 - vpxor %xmm12, %xmm0, %xmm12 - vpxor %xmm13, %xmm1, %xmm13 - vpaddd %xmm2, %xmm6, %xmm2 - vpaddd %xmm3, %xmm7, %xmm3 - vpxor %xmm14, %xmm2, %xmm14 - vpxor %xmm15, %xmm3, %xmm15 - vpshufb 448(%rsp), %xmm12, %xmm12 - vpshufb 448(%rsp), %xmm13, %xmm13 - vpaddd %xmm8, %xmm12, %xmm8 - vpaddd %xmm9, %xmm13, 
%xmm9 - vpshufb 448(%rsp), %xmm14, %xmm14 - vpshufb 448(%rsp), %xmm15, %xmm15 - vpaddd %xmm10, %xmm14, %xmm10 - vpaddd %xmm11, %xmm15, %xmm11 - vmovdqa %xmm12, 96(%rsp) - vpxor %xmm4, %xmm8, %xmm4 - vpxor %xmm5, %xmm9, %xmm5 - vpslld $ 12, %xmm4, %xmm12 - vpsrld $20, %xmm4, %xmm4 - vpxor %xmm4, %xmm12, %xmm4 - vpslld $ 12, %xmm5, %xmm12 - vpsrld $20, %xmm5, %xmm5 - vpxor %xmm5, %xmm12, %xmm5 - vpxor %xmm6, %xmm10, %xmm6 - vpxor %xmm7, %xmm11, %xmm7 - vpslld $ 12, %xmm6, %xmm12 - vpsrld $20, %xmm6, %xmm6 - vpxor %xmm6, %xmm12, %xmm6 - vpslld $ 12, %xmm7, %xmm12 - vpsrld $20, %xmm7, %xmm7 - vpxor %xmm7, %xmm12, %xmm7 - vpaddd %xmm0, %xmm4, %xmm0 - vpaddd %xmm1, %xmm5, %xmm1 - vpxor 96(%rsp), %xmm0, %xmm12 - vpxor %xmm13, %xmm1, %xmm13 - vpaddd %xmm2, %xmm6, %xmm2 - vpaddd %xmm3, %xmm7, %xmm3 - vpxor %xmm14, %xmm2, %xmm14 - vpxor %xmm15, %xmm3, %xmm15 - vpshufb 480(%rsp), %xmm12, %xmm12 - vpshufb 480(%rsp), %xmm13, %xmm13 - vpaddd %xmm8, %xmm12, %xmm8 - vpaddd %xmm9, %xmm13, %xmm9 - vpshufb 480(%rsp), %xmm14, %xmm14 - vpshufb 480(%rsp), %xmm15, %xmm15 - vpaddd %xmm10, %xmm14, %xmm10 - vpaddd %xmm11, %xmm15, %xmm11 - vmovdqa %xmm12, 96(%rsp) - vpxor %xmm4, %xmm8, %xmm4 - vpxor %xmm5, %xmm9, %xmm5 - vpslld $ 7, %xmm4, %xmm12 - vpsrld $25, %xmm4, %xmm4 - vpxor %xmm4, %xmm12, %xmm4 - vpslld $ 7, %xmm5, %xmm12 - vpsrld $25, %xmm5, %xmm5 - vpxor %xmm5, %xmm12, %xmm5 - vpxor %xmm6, %xmm10, %xmm6 - vpxor %xmm7, %xmm11, %xmm7 - vpslld $ 7, %xmm6, %xmm12 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm6, %xmm12, %xmm6 - vpslld $ 7, %xmm7, %xmm12 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm7, %xmm12, %xmm7 - vpaddd %xmm0, %xmm5, %xmm0 - vpaddd %xmm1, %xmm6, %xmm1 - vpxor %xmm15, %xmm0, %xmm15 - vpxor 96(%rsp), %xmm1, %xmm12 - vpaddd %xmm2, %xmm7, %xmm2 - vpaddd %xmm3, %xmm4, %xmm3 - vpxor %xmm13, %xmm2, %xmm13 - vpxor %xmm14, %xmm3, %xmm14 - vpshufb 448(%rsp), %xmm15, %xmm15 - vpshufb 448(%rsp), %xmm12, %xmm12 - vpaddd %xmm10, %xmm15, %xmm10 - vpaddd %xmm11, %xmm12, %xmm11 - vpshufb 448(%rsp), %xmm13, %xmm13 - vpshufb 448(%rsp), %xmm14, %xmm14 - vpaddd %xmm8, %xmm13, %xmm8 - vpaddd %xmm9, %xmm14, %xmm9 - vmovdqa %xmm15, 96(%rsp) - vpxor %xmm5, %xmm10, %xmm5 - vpxor %xmm6, %xmm11, %xmm6 - vpslld $ 12, %xmm5, %xmm15 - vpsrld $20, %xmm5, %xmm5 - vpxor %xmm5, %xmm15, %xmm5 - vpslld $ 12, %xmm6, %xmm15 - vpsrld $20, %xmm6, %xmm6 - vpxor %xmm6, %xmm15, %xmm6 - vpxor %xmm7, %xmm8, %xmm7 - vpxor %xmm4, %xmm9, %xmm4 - vpslld $ 12, %xmm7, %xmm15 - vpsrld $20, %xmm7, %xmm7 - vpxor %xmm7, %xmm15, %xmm7 - vpslld $ 12, %xmm4, %xmm15 - vpsrld $20, %xmm4, %xmm4 - vpxor %xmm4, %xmm15, %xmm4 - vpaddd %xmm0, %xmm5, %xmm0 - vpaddd %xmm1, %xmm6, %xmm1 - vpxor 96(%rsp), %xmm0, %xmm15 - vpxor %xmm12, %xmm1, %xmm12 - vpaddd %xmm2, %xmm7, %xmm2 - vpaddd %xmm3, %xmm4, %xmm3 - vpxor %xmm13, %xmm2, %xmm13 - vpxor %xmm14, %xmm3, %xmm14 - vpshufb 480(%rsp), %xmm15, %xmm15 - vpshufb 480(%rsp), %xmm12, %xmm12 - vpaddd %xmm10, %xmm15, %xmm10 - vpaddd %xmm11, %xmm12, %xmm11 - vpshufb 480(%rsp), %xmm13, %xmm13 - vpshufb 480(%rsp), %xmm14, %xmm14 - vpaddd %xmm8, %xmm13, %xmm8 - vpaddd %xmm9, %xmm14, %xmm9 - vmovdqa %xmm15, 96(%rsp) - vpxor %xmm5, %xmm10, %xmm5 - vpxor %xmm6, %xmm11, %xmm6 - vpslld $ 7, %xmm5, %xmm15 - vpsrld $25, %xmm5, %xmm5 - vpxor %xmm5, %xmm15, %xmm5 - vpslld $ 7, %xmm6, %xmm15 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm6, %xmm15, %xmm6 - vpxor %xmm7, %xmm8, %xmm7 - vpxor %xmm4, %xmm9, %xmm4 - vpslld $ 7, %xmm7, %xmm15 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm7, %xmm15, %xmm7 - vpslld $ 7, %xmm4, %xmm15 - vpsrld $25, %xmm4, %xmm4 - vpxor 
%xmm4, %xmm15, %xmm4 - vmovdqa 96(%rsp), %xmm15 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop2 - vmovdqa %xmm8, 192(%rsp) - vmovdqa %xmm9, 208(%rsp) - vmovdqa %xmm10, 224(%rsp) - vmovdqa %xmm11, 240(%rsp) - vmovdqa %xmm12, 256(%rsp) - vmovdqa %xmm13, 272(%rsp) - vmovdqa %xmm14, 288(%rsp) - vmovdqa %xmm15, 304(%rsp) - vpbroadcastd 0(%rsp), %xmm8 - vpbroadcastd 4+0(%rsp), %xmm9 - vpbroadcastd 8+0(%rsp), %xmm10 - vpbroadcastd 12+0(%rsp), %xmm11 - vpbroadcastd 16(%rsp), %xmm12 - vpbroadcastd 4+16(%rsp), %xmm13 - vpbroadcastd 8+16(%rsp), %xmm14 - vpbroadcastd 12+16(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq %xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput2 - vpxor 0(%rsi), %xmm0, %xmm0 - vpxor 16(%rsi), %xmm1, %xmm1 - vpxor 64(%rsi), %xmm2, %xmm2 - vpxor 80(%rsi), %xmm3, %xmm3 - vpxor 128(%rsi), %xmm4, %xmm4 - vpxor 144(%rsi), %xmm5, %xmm5 - vpxor 192(%rsi), %xmm6, %xmm6 - vpxor 208(%rsi), %xmm7, %xmm7 - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 64(%rdx) - vmovdqu %xmm3, 80(%rdx) - vmovdqu %xmm4, 128(%rdx) - vmovdqu %xmm5, 144(%rdx) - vmovdqu %xmm6, 192(%rdx) - vmovdqu %xmm7, 208(%rdx) - vmovdqa 192(%rsp), %xmm0 - vmovdqa 208(%rsp), %xmm1 - vmovdqa 224(%rsp), %xmm2 - vmovdqa 240(%rsp), %xmm3 - vmovdqa 256(%rsp), %xmm4 - vmovdqa 272(%rsp), %xmm5 - vmovdqa 288(%rsp), %xmm6 - vmovdqa 304(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq %xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - vpxor 32(%rsi), %xmm0, %xmm0 - vpxor 48(%rsi), %xmm1, %xmm1 - vpxor 96(%rsi), %xmm2, %xmm2 - vpxor 112(%rsi), %xmm3, %xmm3 - vpxor 160(%rsi), %xmm4, %xmm4 - vpxor 176(%rsi), %xmm5, %xmm5 - vpxor 224(%rsi), %xmm6, %xmm6 - vpxor 240(%rsi), %xmm7, %xmm7 - vmovdqu %xmm0, 32(%rdx) - vmovdqu %xmm1, 48(%rdx) - vmovdqu %xmm2, 96(%rdx) - vmovdqu %xmm3, 112(%rdx) - vmovdqu %xmm4, 160(%rdx) - vmovdqu 
%xmm5, 176(%rdx) - vmovdqu %xmm6, 224(%rdx) - vmovdqu %xmm7, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_avx2_mainloop2_cont -.Lchacha_blocks_avx2_noinput2: - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 64(%rdx) - vmovdqu %xmm3, 80(%rdx) - vmovdqu %xmm4, 128(%rdx) - vmovdqu %xmm5, 144(%rdx) - vmovdqu %xmm6, 192(%rdx) - vmovdqu %xmm7, 208(%rdx) - vmovdqa 192(%rsp), %xmm0 - vmovdqa 208(%rsp), %xmm1 - vmovdqa 224(%rsp), %xmm2 - vmovdqa 240(%rsp), %xmm3 - vmovdqa 256(%rsp), %xmm4 - vmovdqa 272(%rsp), %xmm5 - vmovdqa 288(%rsp), %xmm6 - vmovdqa 304(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq %xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - vmovdqu %xmm0, 32(%rdx) - vmovdqu %xmm1, 48(%rdx) - vmovdqu %xmm2, 96(%rdx) - vmovdqu %xmm3, 112(%rdx) - vmovdqu %xmm4, 160(%rdx) - vmovdqu %xmm5, 176(%rdx) - vmovdqu %xmm6, 224(%rdx) - vmovdqu %xmm7, 240(%rdx) -.Lchacha_blocks_avx2_mainloop2_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_avx2_atleast256 -.Lchacha_blocks_avx2_below256_fixup: - vmovdqa 448(%rsp), %xmm6 - vmovdqa 480(%rsp), %xmm7 - vmovdqa 0(%rsp), %xmm8 - vmovdqa 16(%rsp), %xmm9 - vmovdqa 32(%rsp), %xmm10 - vmovdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_avx2_below256: - vmovq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_avx2_done - cmpq $64, %rcx - jae .Lchacha_blocks_avx2_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput3 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_avx2_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_avx2_copyinput - movq %rsp, %rsi -.Lchacha_blocks_avx2_noinput3: - movq %rsp, %rdx -.Lchacha_blocks_avx2_above63: - vmovdqa %xmm8, %xmm0 - vmovdqa %xmm9, %xmm1 - vmovdqa %xmm10, %xmm2 - vmovdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_avx2_mainloop3: - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm6, %xmm3, %xmm3 - vpaddd %xmm2, %xmm3, %xmm2 - vpxor %xmm1, %xmm2, %xmm1 - vpslld $12, %xmm1, %xmm4 - vpsrld $20, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm7, %xmm3, %xmm3 - vpshufd $0x93, %xmm0, %xmm0 - vpaddd %xmm2, %xmm3, %xmm2 - vpshufd $0x4e, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm1 - vpshufd $0x39, %xmm2, %xmm2 - vpslld $7, %xmm1, %xmm4 - vpsrld $25, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm6, %xmm3, %xmm3 - vpaddd %xmm2, %xmm3, %xmm2 - vpxor %xmm1, 
%xmm2, %xmm1 - vpslld $12, %xmm1, %xmm4 - vpsrld $20, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm7, %xmm3, %xmm3 - vpshufd $0x39, %xmm0, %xmm0 - vpaddd %xmm2, %xmm3, %xmm2 - vpshufd $0x4e, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm1 - vpshufd $0x93, %xmm2, %xmm2 - vpslld $7, %xmm1, %xmm4 - vpsrld $25, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop3 - vpaddd %xmm0, %xmm8, %xmm0 - vpaddd %xmm1, %xmm9, %xmm1 - vpaddd %xmm2, %xmm10, %xmm2 - vpaddd %xmm3, %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput4 - vpxor 0(%rsi), %xmm0, %xmm0 - vpxor 16(%rsi), %xmm1, %xmm1 - vpxor 32(%rsi), %xmm2, %xmm2 - vpxor 48(%rsi), %xmm3, %xmm3 - addq $64, %rsi -.Lchacha_blocks_avx2_noinput4: - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 32(%rdx) - vmovdqu %xmm3, 48(%rdx) - vpaddq %xmm11, %xmm5, %xmm11 - cmpq $64, %rcx - jbe .Lchacha_blocks_avx2_mainloop3_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_avx2_below256 -.Lchacha_blocks_avx2_mainloop3_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_avx2_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_avx2_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_avx2_copyoutput -.Lchacha_blocks_avx2_done: - vmovdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - vzeroall - movl $(63 + 512), %eax - ret -ELF(.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;) - -.align 16 -.LC: -.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ -.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20-sse2-amd64.S b/cipher/chacha20-sse2-amd64.S deleted file mode 100644 index 2b9842c13..000000000 --- a/cipher/chacha20-sse2-amd64.S +++ /dev/null @@ -1,659 +0,0 @@ -/* chacha20-sse2-amd64.S - AMD64/SSE2 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && USE_CHACHA20 - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_sse2_blocks -ELF(.type _gcry_chacha20_amd64_sse2_blocks, at function;) -_gcry_chacha20_amd64_sse2_blocks: -.Lchacha_blocks_sse2_local: - pushq %rbx - pushq %rbp - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - movdqu (%rdi), %xmm8 - movdqu 16(%rdi), %xmm9 - movdqu 32(%rdi), %xmm10 - movdqu 48(%rdi), %xmm11 - movq $20, %rax - movq $1, %r9 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rax, 64(%rsp) - cmpq $256, %rcx - jb .Lchacha_blocks_sse2_below256 - pshufd $0x00, %xmm8, %xmm0 - pshufd $0x55, %xmm8, %xmm1 - pshufd $0xaa, %xmm8, %xmm2 - pshufd $0xff, %xmm8, %xmm3 - movdqa %xmm0, 128(%rsp) - movdqa %xmm1, 144(%rsp) - movdqa %xmm2, 160(%rsp) - movdqa %xmm3, 176(%rsp) - pshufd $0x00, %xmm9, %xmm0 - pshufd $0x55, %xmm9, %xmm1 - pshufd $0xaa, %xmm9, %xmm2 - pshufd $0xff, %xmm9, %xmm3 - movdqa %xmm0, 192(%rsp) - movdqa %xmm1, 208(%rsp) - movdqa %xmm2, 224(%rsp) - movdqa %xmm3, 240(%rsp) - pshufd $0x00, %xmm10, %xmm0 - pshufd $0x55, %xmm10, %xmm1 - pshufd $0xaa, %xmm10, %xmm2 - pshufd $0xff, %xmm10, %xmm3 - movdqa %xmm0, 256(%rsp) - movdqa %xmm1, 272(%rsp) - movdqa %xmm2, 288(%rsp) - movdqa %xmm3, 304(%rsp) - pshufd $0xaa, %xmm11, %xmm0 - pshufd $0xff, %xmm11, %xmm1 - movdqa %xmm0, 352(%rsp) - movdqa %xmm1, 368(%rsp) - jmp .Lchacha_blocks_sse2_atleast256 -.p2align 6,,63 -.Lchacha_blocks_sse2_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 320(%rsp) - movl %r8d, 4+320(%rsp) - movl %r9d, 8+320(%rsp) - movl %r10d, 12+320(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 336(%rsp) - movl %r8d, 4+336(%rsp) - movl %r9d, 8+336(%rsp) - movl %r10d, 12+336(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - movdqa 128(%rsp), %xmm0 - movdqa 144(%rsp), %xmm1 - movdqa 160(%rsp), %xmm2 - movdqa 176(%rsp), %xmm3 - movdqa 192(%rsp), %xmm4 - movdqa 208(%rsp), %xmm5 - movdqa 224(%rsp), %xmm6 - movdqa 240(%rsp), %xmm7 - movdqa 256(%rsp), %xmm8 - movdqa 272(%rsp), %xmm9 - movdqa 288(%rsp), %xmm10 - movdqa 304(%rsp), %xmm11 - movdqa 320(%rsp), %xmm12 - movdqa 336(%rsp), %xmm13 - movdqa 352(%rsp), %xmm14 - movdqa 368(%rsp), %xmm15 -.Lchacha_blocks_sse2_mainloop1: - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - movdqa %xmm6, 96(%rsp) - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshuflw $0xb1,%xmm12,%xmm12 - pshufhw $0xb1,%xmm12,%xmm12 - pshuflw $0xb1,%xmm13,%xmm13 - pshufhw $0xb1,%xmm13,%xmm13 - pshuflw $0xb1,%xmm14,%xmm14 - pshufhw $0xb1,%xmm14,%xmm14 - pshuflw $0xb1,%xmm15,%xmm15 - pshufhw $0xb1,%xmm15,%xmm15 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa 96(%rsp), %xmm6 - movdqa %xmm4, %xmm12 - pslld $ 12, %xmm4 - psrld $20, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 12, %xmm5 - psrld $20, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 12, %xmm6 - psrld $20, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 12, %xmm7 - psrld $20, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - movdqa %xmm6, 96(%rsp) - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - movdqa %xmm12, 
%xmm6 - pslld $ 8, %xmm12 - psrld $24, %xmm6 - pxor %xmm6, %xmm12 - movdqa %xmm13, %xmm6 - pslld $ 8, %xmm13 - psrld $24, %xmm6 - pxor %xmm6, %xmm13 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - movdqa %xmm14, %xmm6 - pslld $ 8, %xmm14 - psrld $24, %xmm6 - pxor %xmm6, %xmm14 - movdqa %xmm15, %xmm6 - pslld $ 8, %xmm15 - psrld $24, %xmm6 - pxor %xmm6, %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa 96(%rsp), %xmm6 - movdqa %xmm4, %xmm12 - pslld $ 7, %xmm4 - psrld $25, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 7, %xmm5 - psrld $25, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 7, %xmm6 - psrld $25, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 7, %xmm7 - psrld $25, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - movdqa %xmm7, 96(%rsp) - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshuflw $0xb1,%xmm15,%xmm15 - pshufhw $0xb1,%xmm15,%xmm15 - pshuflw $0xb1,%xmm12,%xmm12 - pshufhw $0xb1,%xmm12,%xmm12 - pshuflw $0xb1,%xmm13,%xmm13 - pshufhw $0xb1,%xmm13,%xmm13 - pshuflw $0xb1,%xmm14,%xmm14 - pshufhw $0xb1,%xmm14,%xmm14 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa %xmm5, %xmm15 - pslld $ 12, %xmm5 - psrld $20, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 12, %xmm6 - psrld $20, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 12, %xmm7 - psrld $20, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 12, %xmm4 - psrld $20, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - movdqa %xmm7, 96(%rsp) - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - movdqa %xmm15, %xmm7 - pslld $ 8, %xmm15 - psrld $24, %xmm7 - pxor %xmm7, %xmm15 - movdqa %xmm12, %xmm7 - pslld $ 8, %xmm12 - psrld $24, %xmm7 - pxor %xmm7, %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - movdqa %xmm13, %xmm7 - pslld $ 8, %xmm13 - psrld $24, %xmm7 - pxor %xmm7, %xmm13 - movdqa %xmm14, %xmm7 - pslld $ 8, %xmm14 - psrld $24, %xmm7 - pxor %xmm7, %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa %xmm5, %xmm15 - pslld $ 7, %xmm5 - psrld $25, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 7, %xmm6 - psrld $25, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 7, %xmm7 - psrld $25, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 7, %xmm4 - psrld $25, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - subq $2, %rax - jnz .Lchacha_blocks_sse2_mainloop1 - paddd 128(%rsp), %xmm0 - paddd 144(%rsp), %xmm1 - paddd 160(%rsp), %xmm2 - paddd 176(%rsp), %xmm3 - paddd 192(%rsp), %xmm4 - paddd 208(%rsp), %xmm5 - paddd 224(%rsp), %xmm6 - paddd 240(%rsp), %xmm7 - paddd 256(%rsp), %xmm8 - paddd 272(%rsp), %xmm9 - paddd 288(%rsp), %xmm10 - paddd 304(%rsp), %xmm11 - paddd 320(%rsp), %xmm12 - paddd 336(%rsp), %xmm13 - paddd 352(%rsp), %xmm14 - paddd 368(%rsp), %xmm15 - movdqa %xmm8, 384(%rsp) - movdqa %xmm9, 400(%rsp) - movdqa %xmm10, 
416(%rsp) - movdqa %xmm11, 432(%rsp) - movdqa %xmm12, 448(%rsp) - movdqa %xmm13, 464(%rsp) - movdqa %xmm14, 480(%rsp) - movdqa %xmm15, 496(%rsp) - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm8, %xmm5 - movdqa %xmm10, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm1 - punpcklqdq %xmm6, %xmm3 - punpcklqdq %xmm9, %xmm5 - punpcklqdq %xmm11, %xmm7 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm6 - movdqu 64(%rsi), %xmm9 - movdqu 80(%rsi), %xmm11 - movdqu 128(%rsi), %xmm12 - movdqu 144(%rsi), %xmm13 - movdqu 192(%rsi), %xmm14 - movdqu 208(%rsi), %xmm15 - pxor %xmm2, %xmm5 - pxor %xmm6, %xmm7 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm1 - pxor %xmm13, %xmm3 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm6 - movdqu 96(%rsi), %xmm9 - movdqu 112(%rsi), %xmm11 - movdqu 160(%rsi), %xmm12 - movdqu 176(%rsi), %xmm13 - movdqu 224(%rsi), %xmm14 - movdqu 240(%rsi), %xmm15 - pxor %xmm2, %xmm1 - pxor %xmm6, %xmm5 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm3 - pxor %xmm13, %xmm7 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_sse2_mainloop_cont -.Lchacha_blocks_sse2_noinput1: - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa 
%xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) -.Lchacha_blocks_sse2_mainloop_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_sse2_atleast256 - movdqa 0(%rsp), %xmm8 - movdqa 16(%rsp), %xmm9 - movdqa 32(%rsp), %xmm10 - movdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_sse2_below256: - movq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_sse2_done - cmpq $64, %rcx - jae .Lchacha_blocks_sse2_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput2 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_sse2_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_sse2_copyinput - movq %rsp, %rsi -.Lchacha_blocks_sse2_noinput2: - movq %rsp, %rdx -.Lchacha_blocks_sse2_above63: - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_sse2_mainloop2: - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshuflw $0xb1,%xmm3,%xmm3 - pshufhw $0xb1,%xmm3,%xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1,%xmm4 - pslld $12, %xmm1 - psrld $20, %xmm4 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - movdqa %xmm3,%xmm4 - pslld $8, %xmm3 - psrld $24, %xmm4 - pshufd $0x93,%xmm0,%xmm0 - pxor %xmm4, %xmm3 - paddd %xmm3, %xmm2 - pshufd $0x4e,%xmm3,%xmm3 - pxor %xmm2, %xmm1 - pshufd $0x39,%xmm2,%xmm2 - movdqa %xmm1,%xmm4 - pslld $7, %xmm1 - psrld $25, %xmm4 - pxor %xmm4, %xmm1 - subq $2, %rax - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshuflw $0xb1,%xmm3,%xmm3 - pshufhw $0xb1,%xmm3,%xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1,%xmm4 - pslld $12, %xmm1 - psrld $20, %xmm4 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - movdqa %xmm3,%xmm4 - pslld $8, %xmm3 - psrld $24, %xmm4 - pshufd $0x39,%xmm0,%xmm0 - pxor %xmm4, %xmm3 - paddd %xmm3, %xmm2 - pshufd $0x4e,%xmm3,%xmm3 - pxor %xmm2, %xmm1 - pshufd $0x93,%xmm2,%xmm2 - movdqa %xmm1,%xmm4 - pslld $7, %xmm1 - psrld $25, %xmm4 - pxor %xmm4, %xmm1 - jnz .Lchacha_blocks_sse2_mainloop2 - paddd %xmm8, %xmm0 - paddd %xmm9, %xmm1 - paddd %xmm10, %xmm2 - paddd %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput3 - movdqu 0(%rsi), %xmm12 - movdqu 16(%rsi), %xmm13 - movdqu 32(%rsi), %xmm14 - movdqu 48(%rsi), %xmm15 - pxor %xmm12, %xmm0 - pxor %xmm13, %xmm1 - pxor %xmm14, %xmm2 - pxor %xmm15, %xmm3 - addq $64, %rsi -.Lchacha_blocks_sse2_noinput3: - movdqu %xmm0, 0(%rdx) - movdqu %xmm1, 16(%rdx) - movdqu %xmm2, 32(%rdx) - movdqu %xmm3, 48(%rdx) - paddq %xmm5, %xmm11 - cmpq $64, %rcx - jbe .Lchacha_blocks_sse2_mainloop2_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_sse2_below256 -.Lchacha_blocks_sse2_mainloop2_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_sse2_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_sse2_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_sse2_copyoutput -.Lchacha_blocks_sse2_done: - movdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - pxor %xmm15, %xmm15 - pxor %xmm7, 
%xmm7 - pxor %xmm14, %xmm14 - pxor %xmm6, %xmm6 - pxor %xmm13, %xmm13 - pxor %xmm5, %xmm5 - pxor %xmm12, %xmm12 - pxor %xmm4, %xmm4 - popq %rbp - popq %rbx - movl $(63 + 512 + 16), %eax - pxor %xmm11, %xmm11 - pxor %xmm3, %xmm3 - pxor %xmm10, %xmm10 - pxor %xmm2, %xmm2 - pxor %xmm9, %xmm9 - pxor %xmm1, %xmm1 - pxor %xmm8, %xmm8 - pxor %xmm0, %xmm0 - ret -ELF(.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks;) - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S deleted file mode 100644 index c04010e7b..000000000 --- a/cipher/chacha20-ssse3-amd64.S +++ /dev/null @@ -1,632 +0,0 @@ -/* chacha20-ssse3-amd64.S - AMD64/SSSE3 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20 - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_ssse3_blocks -ELF(.type _gcry_chacha20_amd64_ssse3_blocks, at function;) -_gcry_chacha20_amd64_ssse3_blocks: -.Lchacha_blocks_ssse3_local: - pushq %rbx - pushq %rbp - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - leaq .LC RIP, %rax - movdqa 0(%rax), %xmm6 - movdqa 16(%rax), %xmm7 - movdqu 0(%rdi), %xmm8 - movdqu 16(%rdi), %xmm9 - movdqu 32(%rdi), %xmm10 - movdqu 48(%rdi), %xmm11 - movl $20, %eax - movq $1, %r9 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movdqa %xmm6, 80(%rsp) - movdqa %xmm7, 96(%rsp) - movq %rax, 64(%rsp) - cmpq $256, %rcx - jb .Lchacha_blocks_ssse3_below256 - pshufd $0x00, %xmm8, %xmm0 - pshufd $0x55, %xmm8, %xmm1 - pshufd $0xaa, %xmm8, %xmm2 - pshufd $0xff, %xmm8, %xmm3 - movdqa %xmm0, 128(%rsp) - movdqa %xmm1, 144(%rsp) - movdqa %xmm2, 160(%rsp) - movdqa %xmm3, 176(%rsp) - pshufd $0x00, %xmm9, %xmm0 - pshufd $0x55, %xmm9, %xmm1 - pshufd $0xaa, %xmm9, %xmm2 - pshufd $0xff, %xmm9, %xmm3 - movdqa %xmm0, 192(%rsp) - movdqa %xmm1, 208(%rsp) - movdqa %xmm2, 224(%rsp) - movdqa %xmm3, 240(%rsp) - pshufd $0x00, %xmm10, %xmm0 - pshufd $0x55, %xmm10, %xmm1 - pshufd $0xaa, %xmm10, %xmm2 - pshufd $0xff, %xmm10, %xmm3 - movdqa %xmm0, 256(%rsp) - movdqa %xmm1, 272(%rsp) - movdqa %xmm2, 288(%rsp) - movdqa %xmm3, 304(%rsp) - pshufd $0xaa, %xmm11, %xmm0 - pshufd $0xff, %xmm11, %xmm1 - movdqa %xmm0, 352(%rsp) - movdqa %xmm1, 368(%rsp) - jmp .Lchacha_blocks_ssse3_atleast256 -.p2align 6,,63 - # align to 4 mod 64 - nop;nop;nop;nop; -.Lchacha_blocks_ssse3_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 320(%rsp) - movl %r8d, 4+320(%rsp) - movl %r9d, 8+320(%rsp) - movl %r10d, 12+320(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 336(%rsp) - movl %r8d, 4+336(%rsp) - movl %r9d, 8+336(%rsp) - movl %r10d, 12+336(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - movdqa 128(%rsp), %xmm0 - movdqa 144(%rsp), %xmm1 - movdqa 160(%rsp), %xmm2 - movdqa 176(%rsp), %xmm3 - movdqa 192(%rsp), %xmm4 - movdqa 208(%rsp), %xmm5 - movdqa 224(%rsp), %xmm6 - movdqa 240(%rsp), %xmm7 - movdqa 256(%rsp), %xmm8 - movdqa 272(%rsp), %xmm9 - movdqa 288(%rsp), %xmm10 - movdqa 304(%rsp), %xmm11 - movdqa 320(%rsp), %xmm12 - movdqa 336(%rsp), %xmm13 - movdqa 352(%rsp), %xmm14 - movdqa 368(%rsp), %xmm15 -.Lchacha_blocks_ssse3_mainloop1: - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshufb 80(%rsp), %xmm12 - pshufb 80(%rsp), %xmm13 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - pshufb 80(%rsp), %xmm14 - pshufb 80(%rsp), %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa %xmm4, %xmm12 - pslld $ 12, %xmm4 - psrld $20, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 12, %xmm5 - psrld $20, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 12, %xmm6 - psrld $20, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 12, %xmm7 - psrld $20, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshufb 96(%rsp), %xmm12 - pshufb 96(%rsp), %xmm13 - paddd 
%xmm12, %xmm8 - paddd %xmm13, %xmm9 - pshufb 96(%rsp), %xmm14 - pshufb 96(%rsp), %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa %xmm4, %xmm12 - pslld $ 7, %xmm4 - psrld $25, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 7, %xmm5 - psrld $25, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 7, %xmm6 - psrld $25, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 7, %xmm7 - psrld $25, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshufb 80(%rsp), %xmm15 - pshufb 80(%rsp), %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - pshufb 80(%rsp), %xmm13 - pshufb 80(%rsp), %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa %xmm5, %xmm15 - pslld $ 12, %xmm5 - psrld $20, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 12, %xmm6 - psrld $20, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 12, %xmm7 - psrld $20, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 12, %xmm4 - psrld $20, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshufb 96(%rsp), %xmm15 - pshufb 96(%rsp), %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - pshufb 96(%rsp), %xmm13 - pshufb 96(%rsp), %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa %xmm5, %xmm15 - pslld $ 7, %xmm5 - psrld $25, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 7, %xmm6 - psrld $25, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 7, %xmm7 - psrld $25, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 7, %xmm4 - psrld $25, %xmm15 - pxor %xmm15, %xmm4 - subq $2, %rax - movdqa 112(%rsp), %xmm15 - jnz .Lchacha_blocks_ssse3_mainloop1 - paddd 128(%rsp), %xmm0 - paddd 144(%rsp), %xmm1 - paddd 160(%rsp), %xmm2 - paddd 176(%rsp), %xmm3 - paddd 192(%rsp), %xmm4 - paddd 208(%rsp), %xmm5 - paddd 224(%rsp), %xmm6 - paddd 240(%rsp), %xmm7 - paddd 256(%rsp), %xmm8 - paddd 272(%rsp), %xmm9 - paddd 288(%rsp), %xmm10 - paddd 304(%rsp), %xmm11 - paddd 320(%rsp), %xmm12 - paddd 336(%rsp), %xmm13 - paddd 352(%rsp), %xmm14 - paddd 368(%rsp), %xmm15 - movdqa %xmm8, 384(%rsp) - movdqa %xmm9, 400(%rsp) - movdqa %xmm10, 416(%rsp) - movdqa %xmm11, 432(%rsp) - movdqa %xmm12, 448(%rsp) - movdqa %xmm13, 464(%rsp) - movdqa %xmm14, 480(%rsp) - movdqa %xmm15, 496(%rsp) - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm8, %xmm5 - movdqa %xmm10, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm1 - punpcklqdq %xmm6, %xmm3 - punpcklqdq %xmm9, %xmm5 - punpcklqdq %xmm11, %xmm7 - andq %rsi, %rsi - jz 
.Lchacha_blocks_ssse3_noinput1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm6 - movdqu 64(%rsi), %xmm9 - movdqu 80(%rsi), %xmm11 - movdqu 128(%rsi), %xmm12 - movdqu 144(%rsi), %xmm13 - movdqu 192(%rsi), %xmm14 - movdqu 208(%rsi), %xmm15 - pxor %xmm2, %xmm5 - pxor %xmm6, %xmm7 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm1 - pxor %xmm13, %xmm3 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm6 - movdqu 96(%rsi), %xmm9 - movdqu 112(%rsi), %xmm11 - movdqu 160(%rsi), %xmm12 - movdqu 176(%rsi), %xmm13 - movdqu 224(%rsi), %xmm14 - movdqu 240(%rsi), %xmm15 - pxor %xmm2, %xmm1 - pxor %xmm6, %xmm5 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm3 - pxor %xmm13, %xmm7 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_ssse3_mainloop_cont -.Lchacha_blocks_ssse3_noinput1: - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) -.Lchacha_blocks_ssse3_mainloop_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_ssse3_atleast256 - movdqa 80(%rsp), %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa 0(%rsp), %xmm8 - movdqa 16(%rsp), %xmm9 - movdqa 32(%rsp), %xmm10 - 
movdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_ssse3_below256: - movq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_ssse3_done - cmpq $64, %rcx - jae .Lchacha_blocks_ssse3_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_ssse3_noinput2 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_ssse3_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_ssse3_copyinput - movq %rsp, %rsi -.Lchacha_blocks_ssse3_noinput2: - movq %rsp, %rdx -.Lchacha_blocks_ssse3_above63: - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_ssse3_mainloop2: - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm6, %xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1, %xmm4 - pslld $12, %xmm4 - psrld $20, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm7, %xmm3 - pshufd $0x93, %xmm0, %xmm0 - paddd %xmm3, %xmm2 - pshufd $0x4e, %xmm3, %xmm3 - pxor %xmm2, %xmm1 - pshufd $0x39, %xmm2, %xmm2 - movdqa %xmm1, %xmm4 - pslld $7, %xmm4 - psrld $25, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm6, %xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1, %xmm4 - pslld $12, %xmm4 - psrld $20, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm7, %xmm3 - pshufd $0x39, %xmm0, %xmm0 - paddd %xmm3, %xmm2 - pshufd $0x4e, %xmm3, %xmm3 - pxor %xmm2, %xmm1 - pshufd $0x93, %xmm2, %xmm2 - movdqa %xmm1, %xmm4 - pslld $7, %xmm4 - psrld $25, %xmm1 - pxor %xmm4, %xmm1 - subq $2, %rax - jnz .Lchacha_blocks_ssse3_mainloop2 - paddd %xmm8, %xmm0 - paddd %xmm9, %xmm1 - paddd %xmm10, %xmm2 - paddd %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_ssse3_noinput3 - movdqu 0(%rsi), %xmm12 - movdqu 16(%rsi), %xmm13 - movdqu 32(%rsi), %xmm14 - movdqu 48(%rsi), %xmm15 - pxor %xmm12, %xmm0 - pxor %xmm13, %xmm1 - pxor %xmm14, %xmm2 - pxor %xmm15, %xmm3 - addq $64, %rsi -.Lchacha_blocks_ssse3_noinput3: - movdqu %xmm0, 0(%rdx) - movdqu %xmm1, 16(%rdx) - movdqu %xmm2, 32(%rdx) - movdqu %xmm3, 48(%rdx) - paddq %xmm5, %xmm11 - cmpq $64, %rcx - jbe .Lchacha_blocks_ssse3_mainloop2_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_ssse3_below256 -.Lchacha_blocks_ssse3_mainloop2_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_ssse3_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_ssse3_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_ssse3_copyoutput -.Lchacha_blocks_ssse3_done: - movdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - pxor %xmm15, %xmm15 - pxor %xmm7, %xmm7 - pxor %xmm14, %xmm14 - pxor %xmm6, %xmm6 - pxor %xmm13, %xmm13 - pxor %xmm5, %xmm5 - pxor %xmm12, %xmm12 - pxor %xmm4, %xmm4 - popq %rbp - popq %rbx - movl $(63 + 512 + 16), %eax - pxor %xmm11, %xmm11 - pxor %xmm3, %xmm3 - pxor %xmm10, %xmm10 - pxor %xmm2, %xmm2 - pxor %xmm9, %xmm9 - pxor %xmm1, %xmm1 - pxor %xmm8, %xmm8 - pxor %xmm0, %xmm0 - ret -ELF(.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;) - -.align 16; -.LC: -.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ -.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 613fa82a9..ac6cc29e8 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -1,5 +1,5 @@ /* chacha20.c - Bernstein's ChaCha20 
cipher - * Copyright (C) 2014 Jussi Kivilinna + * Copyright (C) 2014,2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -20,16 +20,15 @@ * http://cr.yp.to/chacha.html */ -/* The code is based on salsa20.c and public-domain ChaCha implementations: - * chacha-ref.c version 20080118 - * D. J. Bernstein - * Public domain. - * and - * Andrew Moon - * https://github.com/floodyberry/chacha-opt +/* + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. */ - #include #include #include @@ -46,295 +45,216 @@ #define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */ #define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ #define CHACHA20_CTR_SIZE 16 /* Bytes. */ -#define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4) -/* USE_SSE2 indicates whether to compile with Intel SSE2 code. */ -#undef USE_SSE2 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -# define USE_SSE2 1 -#endif /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) +#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ #undef USE_AVX2 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(ENABLE_AVX2_SUPPORT) +#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif -/* USE_NEON indicates whether to enable ARM NEON assembly code. */ -#undef USE_NEON +/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */ +#undef USE_ARMV7_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) -# define USE_NEON 1 +# define USE_ARMV7_NEON 1 # endif -#endif /*ENABLE_NEON_SUPPORT*/ - - -struct CHACHA20_context_s; - +#endif /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK -#if (defined(USE_SSE2) || defined(USE_SSSE3) || defined(USE_AVX2)) && \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) +#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) #else # define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 #endif -typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src, - byte *dst, - size_t bytes) ASM_FUNC_ABI; - typedef struct CHACHA20_context_s { - u32 input[CHACHA20_INPUT_LENGTH]; - u32 pad[CHACHA20_INPUT_LENGTH]; - chacha20_blocks_t blocks; + u32 input[16]; + unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. 
*/ + int use_ssse3:1; + int use_avx2:1; + int use_neon:1; } CHACHA20_context_t; -#ifdef USE_SSE2 - -unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; - -#endif /* USE_SSE2 */ - #ifdef USE_SSSE3 -unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_SSSE3 */ #ifdef USE_AVX2 -unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_AVX2 */ -#ifdef USE_NEON +#ifdef USE_ARMV7_NEON -unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks); -#endif /* USE_NEON */ +#endif /* USE_ARMV7_NEON */ -static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); +#define ROTATE(v,c) (rol(v,c)) +#define XOR(v,w) ((v) ^ (w)) +#define PLUS(v,w) ((u32)((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) -#define QROUND(a,b,c,d) \ - do { \ - a += b; d = rol(d ^ a, 16); \ - c += d; b = rol(b ^ c, 12); \ - a += b; d = rol(d ^ a, 8); \ - c += d; b = rol(b ^ c, 7); \ - } while (0) +#define QUARTERROUND(a,b,c,d) \ + a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \ + c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \ + a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \ + c = PLUS(c,d); b = ROTATE(XOR(b,c), 7); -#define QOUT(ai, bi, ci, di) \ - DO_OUT(ai); DO_OUT(bi); DO_OUT(ci); DO_OUT(di) +#define BUF_XOR_LE32(dst, src, offset, x) \ + buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x)) - -#ifndef USE_SSE2 -ASM_FUNC_ABI static unsigned int -chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes) +static unsigned int +chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { - u32 pad[CHACHA20_INPUT_LENGTH]; - u32 inp[CHACHA20_INPUT_LENGTH]; + u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; - /* Note: 'bytes' must be multiple of 64 and not zero. */ - - inp[0] = state[0]; - inp[1] = state[1]; - inp[2] = state[2]; - inp[3] = state[3]; - inp[4] = state[4]; - inp[5] = state[5]; - inp[6] = state[6]; - inp[7] = state[7]; - inp[8] = state[8]; - inp[9] = state[9]; - inp[10] = state[10]; - inp[11] = state[11]; - inp[12] = state[12]; - inp[13] = state[13]; - inp[14] = state[14]; - inp[15] = state[15]; - - do + while (nblks) { - /* First round. 
*/ - pad[0] = inp[0]; - pad[4] = inp[4]; - pad[8] = inp[8]; - pad[12] = inp[12]; - QROUND (pad[0], pad[4], pad[8], pad[12]); - pad[1] = inp[1]; - pad[5] = inp[5]; - pad[9] = inp[9]; - pad[13] = inp[13]; - QROUND (pad[1], pad[5], pad[9], pad[13]); - pad[2] = inp[2]; - pad[6] = inp[6]; - pad[10] = inp[10]; - pad[14] = inp[14]; - QROUND (pad[2], pad[6], pad[10], pad[14]); - pad[3] = inp[3]; - pad[7] = inp[7]; - pad[11] = inp[11]; - pad[15] = inp[15]; - QROUND (pad[3], pad[7], pad[11], pad[15]); - - QROUND (pad[0], pad[5], pad[10], pad[15]); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QROUND (pad[3], pad[4], pad[9], pad[14]); - - for (i = 2; i < 20 - 2; i += 2) - { - QROUND (pad[0], pad[4], pad[8], pad[12]); - QROUND (pad[1], pad[5], pad[9], pad[13]); - QROUND (pad[2], pad[6], pad[10], pad[14]); - QROUND (pad[3], pad[7], pad[11], pad[15]); - - QROUND (pad[0], pad[5], pad[10], pad[15]); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QROUND (pad[3], pad[4], pad[9], pad[14]); - } - - QROUND (pad[0], pad[4], pad[8], pad[12]); - QROUND (pad[1], pad[5], pad[9], pad[13]); - QROUND (pad[2], pad[6], pad[10], pad[14]); - QROUND (pad[3], pad[7], pad[11], pad[15]); - - if (src) - { -#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, \ - (pad[idx] + inp[idx]) ^ \ - buf_get_le32(src + (idx) * 4)) - /* Last round. */ - QROUND (pad[0], pad[5], pad[10], pad[15]); - QOUT(0, 5, 10, 15); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QOUT(1, 6, 11, 12); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QOUT(2, 7, 8, 13); - QROUND (pad[3], pad[4], pad[9], pad[14]); - QOUT(3, 4, 9, 14); -#undef DO_OUT - } - else - { -#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, pad[idx] + inp[idx]) - /* Last round. */ - QROUND (pad[0], pad[5], pad[10], pad[15]); - QOUT(0, 5, 10, 15); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QOUT(1, 6, 11, 12); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QOUT(2, 7, 8, 13); - QROUND (pad[3], pad[4], pad[9], pad[14]); - QOUT(3, 4, 9, 14); -#undef DO_OUT - } - - /* Update counter. 
*/ - inp[13] += (!++inp[12]); - - bytes -= CHACHA20_BLOCK_SIZE; + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + x4 = input[4]; + x5 = input[5]; + x6 = input[6]; + x7 = input[7]; + x8 = input[8]; + x9 = input[9]; + x10 = input[10]; + x11 = input[11]; + x12 = input[12]; + x13 = input[13]; + x14 = input[14]; + x15 = input[15]; + + for (i = 20; i > 0; i -= 2) + { + QUARTERROUND(x0, x4, x8, x12) + QUARTERROUND(x1, x5, x9, x13) + QUARTERROUND(x2, x6, x10, x14) + QUARTERROUND(x3, x7, x11, x15) + QUARTERROUND(x0, x5, x10, x15) + QUARTERROUND(x1, x6, x11, x12) + QUARTERROUND(x2, x7, x8, x13) + QUARTERROUND(x3, x4, x9, x14) + } + + x0 = PLUS(x0, input[0]); + x1 = PLUS(x1, input[1]); + x2 = PLUS(x2, input[2]); + x3 = PLUS(x3, input[3]); + x4 = PLUS(x4, input[4]); + x5 = PLUS(x5, input[5]); + x6 = PLUS(x6, input[6]); + x7 = PLUS(x7, input[7]); + x8 = PLUS(x8, input[8]); + x9 = PLUS(x9, input[9]); + x10 = PLUS(x10, input[10]); + x11 = PLUS(x11, input[11]); + x12 = PLUS(x12, input[12]); + x13 = PLUS(x13, input[13]); + x14 = PLUS(x14, input[14]); + x15 = PLUS(x15, input[15]); + + input[12] = PLUSONE(input[12]); + input[13] = PLUS(input[13], !input[12]); + + BUF_XOR_LE32(dst, src, 0, x0); + BUF_XOR_LE32(dst, src, 4, x1); + BUF_XOR_LE32(dst, src, 8, x2); + BUF_XOR_LE32(dst, src, 12, x3); + BUF_XOR_LE32(dst, src, 16, x4); + BUF_XOR_LE32(dst, src, 20, x5); + BUF_XOR_LE32(dst, src, 24, x6); + BUF_XOR_LE32(dst, src, 28, x7); + BUF_XOR_LE32(dst, src, 32, x8); + BUF_XOR_LE32(dst, src, 36, x9); + BUF_XOR_LE32(dst, src, 40, x10); + BUF_XOR_LE32(dst, src, 44, x11); + BUF_XOR_LE32(dst, src, 48, x12); + BUF_XOR_LE32(dst, src, 52, x13); + BUF_XOR_LE32(dst, src, 56, x14); + BUF_XOR_LE32(dst, src, 60, x15); + + src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; - src += (src) ? CHACHA20_BLOCK_SIZE : 0; + nblks--; } - while (bytes >= CHACHA20_BLOCK_SIZE); - - state[12] = inp[12]; - state[13] = inp[13]; /* burn_stack */ - return (2 * CHACHA20_INPUT_LENGTH * sizeof(u32) + 6 * sizeof(void *)); -} -#endif /*!USE_SSE2*/ - -#undef QROUND -#undef QOUT - - -static unsigned int -chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx) -{ - return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE) - + ASM_EXTRA_STACK; + return (17 * sizeof(u32) + 6 * sizeof(void *)); } static void -chacha20_keysetup (CHACHA20_context_t * ctx, const byte * key, +chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { - /* These constants are the little endian encoding of the string - "expand 32-byte k". For the 128 bit variant, the "32" in that - string will be fixed up to "16". 
*/ - ctx->input[0] = 0x61707865; /* "apxe" */ - ctx->input[1] = 0x3320646e; /* "3 dn" */ - ctx->input[2] = 0x79622d32; /* "yb-2" */ - ctx->input[3] = 0x6b206574; /* "k et" */ - - ctx->input[4] = buf_get_le32 (key + 0); - ctx->input[5] = buf_get_le32 (key + 4); - ctx->input[6] = buf_get_le32 (key + 8); - ctx->input[7] = buf_get_le32 (key + 12); - + static const char sigma[16] = "expand 32-byte k"; + static const char tau[16] = "expand 16-byte k"; + const char *constants; + + ctx->input[4] = buf_get_le32(key + 0); + ctx->input[5] = buf_get_le32(key + 4); + ctx->input[6] = buf_get_le32(key + 8); + ctx->input[7] = buf_get_le32(key + 12); if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ { - ctx->input[8] = buf_get_le32 (key + 16); - ctx->input[9] = buf_get_le32 (key + 20); - ctx->input[10] = buf_get_le32 (key + 24); - ctx->input[11] = buf_get_le32 (key + 28); + key += 16; + constants = sigma; } else /* 128 bits */ { - ctx->input[8] = ctx->input[4]; - ctx->input[9] = ctx->input[5]; - ctx->input[10] = ctx->input[6]; - ctx->input[11] = ctx->input[7]; - - ctx->input[1] -= 0x02000000; /* Change to "1 dn". */ - ctx->input[2] += 0x00000004; /* Change to "yb-6". */ + constants = tau; } + ctx->input[8] = buf_get_le32(key + 0); + ctx->input[9] = buf_get_le32(key + 4); + ctx->input[10] = buf_get_le32(key + 8); + ctx->input[11] = buf_get_le32(key + 12); + ctx->input[0] = buf_get_le32(constants + 0); + ctx->input[1] = buf_get_le32(constants + 4); + ctx->input[2] = buf_get_le32(constants + 8); + ctx->input[3] = buf_get_le32(constants + 12); } static void -chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen) +chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen) { if (ivlen == CHACHA20_CTR_SIZE) { @@ -367,9 +287,30 @@ chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen) } +static void +chacha20_setiv (void *context, const byte *iv, size_t ivlen) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + + /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ + if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE + && ivlen != CHACHA20_CTR_SIZE) + log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); + + if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE + || ivlen == CHACHA20_CTR_SIZE)) + chacha20_ivsetup (ctx, iv, ivlen); + else + chacha20_ivsetup (ctx, NULL, 0); + + /* Reset the unused pad bytes counter. 
*/ + ctx->unused = 0; +} + + static gcry_err_code_t -chacha20_do_setkey (CHACHA20_context_t * ctx, - const byte * key, unsigned int keylen) +chacha20_do_setkey (CHACHA20_context_t *ctx, + const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; @@ -388,25 +329,15 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; -#ifdef USE_SSE2 - ctx->blocks = _gcry_chacha20_amd64_sse2_blocks; -#else - ctx->blocks = chacha20_blocks; -#endif - #ifdef USE_SSSE3 - if (features & HWF_INTEL_SSSE3) - ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks; + ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX2 - if (features & HWF_INTEL_AVX2) - ctx->blocks = _gcry_chacha20_amd64_avx2_blocks; + ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0; #endif -#ifdef USE_NEON - if (features & HWF_ARM_NEON) - ctx->blocks = _gcry_chacha20_armv7_neon_blocks; +#ifdef USE_ARMV7_NEON + ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif - (void)features; chacha20_keysetup (ctx, key, keylen); @@ -419,7 +350,7 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, static gcry_err_code_t -chacha20_setkey (void *context, const byte * key, unsigned int keylen) +chacha20_setkey (void *context, const byte *key, unsigned int keylen) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); @@ -429,37 +360,19 @@ chacha20_setkey (void *context, const byte * key, unsigned int keylen) static void -chacha20_setiv (void *context, const byte * iv, size_t ivlen) +chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, + size_t length) { + static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; - - /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ - if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE - && ivlen != CHACHA20_CTR_SIZE) - log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); - - if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE - || ivlen == CHACHA20_CTR_SIZE)) - chacha20_ivsetup (ctx, iv, ivlen); - else - chacha20_ivsetup (ctx, NULL, 0); - - /* Reset the unused pad bytes counter. */ - ctx->unused = 0; -} - - - -/* Note: This function requires LENGTH > 0. */ -static void -chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, - byte * outbuf, const byte * inbuf, size_t length) -{ unsigned int nburn, burn = 0; + if (!length) + return; + if (ctx->unused) { - unsigned char *p = (void *) ctx->pad; + unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); @@ -467,29 +380,73 @@ chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, n = ctx->unused; if (n > length) n = length; + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; + if (!length) return; gcry_assert (!ctx->unused); } +#ifdef USE_AVX2 + if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? 
nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_SSSE3 + if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_ARMV7_NEON + if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; - size_t bytes = nblocks * CHACHA20_BLOCK_SIZE; - burn = ctx->blocks(ctx->input, inbuf, outbuf, bytes); - length -= bytes; - outbuf += bytes; - inbuf += bytes; + nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length > 0) { - nburn = chacha20_core (ctx->pad, ctx); + nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); @@ -500,17 +457,6 @@ chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, } -static void -chacha20_encrypt_stream (void *context, byte * outbuf, const byte * inbuf, - size_t length) -{ - CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; - - if (length) - chacha20_do_encrypt_stream (ctx, outbuf, inbuf, length); -} - - static const char * selftest (void) { diff --git a/configure.ac b/configure.ac index c4b59f4dd..a5aba144c 100644 --- a/configure.ac +++ b/configure.ac @@ -2227,9 +2227,8 @@ if test "$found" = "1" ; then case "${host}" in x86_64-*-*) # Build with the assembly implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-sse2-amd64.lo" - GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo" - GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-ssse3.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-avx2.lo" ;; esac From wk at gnupg.org Mon Jan 8 09:44:00 2018 From: wk at gnupg.org (Werner Koch) Date: Mon, 08 Jan 2018 09:44:00 +0100 Subject: [PATCH] Add ARMv8/AArch64 implementation of chacha20 In-Reply-To: (Jussi Kivilinna's message of "Sat, 6 Jan 2018 19:59:39 +0200") References: <150202138983.15504.16870943748027047649.stgit@localhost.localdomain> <8760dzpgzs.fsf@wheatstone.g10code.de> <2a8e1f8d-a04a-4770-24f1-665a2b9e21a2@iki.fi> Message-ID: <871sj0pwof.fsf@wheatstone.g10code.de> On Sat, 6 Jan 2018 18:59, jussi.kivilinna at iki.fi said: > I tried to contact author with email and through github but have not > got any response so far. :-( > I've prepared new implementations of poly1305 and chacha20 to replace > existing ones that are based on source on Andrew Moon's github > repositories. I'll send those to mailing list next. Thank you. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. 
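(Illustration, not part of any patch above: the reworked chacha20_encrypt_stream drains the input with the widest available kernel first -- 8 blocks for AVX2, then 4 blocks for SSSE3/NEON -- before falling back to the generic one-block path, with a trailing partial block served from the buffered keystream pad. A minimal C sketch of that dispatch pattern follows; the drain_blocks name and the function-pointer parameters are hypothetical stand-ins for the HWF-selected kernels, and stack-burn tracking is omitted.)

#include <stddef.h>

#define CHACHA20_BLOCK_SIZE 64

typedef void (*chacha_blocks_fn) (unsigned int state[16], unsigned char *dst,
                                  const unsigned char *src, size_t nblocks);

/* Drain whole blocks: widest SIMD kernel first, then the generic path.
 * simd8/simd4 may be NULL when the corresponding HW feature is absent. */
static void
drain_blocks (unsigned int state[16], unsigned char *dst,
              const unsigned char *src, size_t len,
              chacha_blocks_fn simd8, chacha_blocks_fn simd4,
              chacha_blocks_fn generic1)
{
  size_t n;

  if (simd8 && len >= 8 * CHACHA20_BLOCK_SIZE)
    {
      n = (len / CHACHA20_BLOCK_SIZE) & ~(size_t) 7;   /* multiple of 8 blocks */
      simd8 (state, dst, src, n);
      dst += n * CHACHA20_BLOCK_SIZE;
      src += n * CHACHA20_BLOCK_SIZE;
      len -= n * CHACHA20_BLOCK_SIZE;
    }
  if (simd4 && len >= 4 * CHACHA20_BLOCK_SIZE)
    {
      n = (len / CHACHA20_BLOCK_SIZE) & ~(size_t) 3;   /* multiple of 4 blocks */
      simd4 (state, dst, src, n);
      dst += n * CHACHA20_BLOCK_SIZE;
      src += n * CHACHA20_BLOCK_SIZE;
      len -= n * CHACHA20_BLOCK_SIZE;
    }
  if (len >= CHACHA20_BLOCK_SIZE)
    {
      /* Remaining whole blocks go through the generic implementation;
       * a final partial block would be handled via the keystream pad. */
      generic1 (state, dst, src, len / CHACHA20_BLOCK_SIZE);
    }
}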
From wk at gnupg.org Mon Jan 8 09:44:50 2018 From: wk at gnupg.org (Werner Koch) Date: Mon, 08 Jan 2018 09:44:50 +0100 Subject: [PATCH] mpi/ec: fix when 'unsigned long' is 32-bit but limb size is 64-bit In-Reply-To: <151526196339.10194.13412606082380457756.stgit@localhost.localdomain> (Jussi Kivilinna's message of "Sat, 06 Jan 2018 20:06:03 +0200") References: <151526196339.10194.13412606082380457756.stgit@localhost.localdomain> Message-ID: <87wp0soi2l.fsf@wheatstone.g10code.de> On Sat, 6 Jan 2018 19:06, jussi.kivilinna at iki.fi said: > Patch fixes mpi/ec.c compiler warnings and failing tests cases on > Win64. Do you have a bug report or an actual use case for this? Shall we apply this to 1.8? Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz.
From jussi.kivilinna at iki.fi Tue Jan 9 08:27:18 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 9 Jan 2018 09:27:18 +0200 Subject: [PATCH] mpi/ec: fix when 'unsigned long' is 32-bit but limb size is 64-bit In-Reply-To: <87wp0soi2l.fsf@wheatstone.g10code.de> References: <151526196339.10194.13412606082380457756.stgit@localhost.localdomain> <87wp0soi2l.fsf@wheatstone.g10code.de> Message-ID: <6727e32c-9639-fe77-ee39-c82680c581b5@iki.fi> On 08.01.2018 10:44, Werner Koch wrote: > On Sat, 6 Jan 2018 19:06, jussi.kivilinna at iki.fi said: > >> Patch fixes mpi/ec.c compiler warnings and failing tests cases on >> Win64. > > Do you have a bug report or an actual use case for this? > Shall we apply this to 1.8? > No bug report, I ran into this problem when testing the development branch. Bug was introduced in commit 9ed0fb37bd637d1a2e9498c24097cfeadec682e which is not in the 1.8 branch. -Jussi
From wk at gnupg.org Tue Jan 9 09:53:53 2018 From: wk at gnupg.org (Werner Koch) Date: Tue, 09 Jan 2018 09:53:53 +0100 Subject: [PATCH] mpi/ec: fix when 'unsigned long' is 32-bit but limb size is 64-bit In-Reply-To: <6727e32c-9639-fe77-ee39-c82680c581b5@iki.fi> (Jussi Kivilinna's message of "Tue, 9 Jan 2018 09:27:18 +0200") References: <151526196339.10194.13412606082380457756.stgit@localhost.localdomain> <87wp0soi2l.fsf@wheatstone.g10code.de> <6727e32c-9639-fe77-ee39-c82680c581b5@iki.fi> Message-ID: <87373fl8f2.fsf@wheatstone.g10code.de> On Tue, 9 Jan 2018 08:27, jussi.kivilinna at iki.fi said: > Bug was introduced in commit 9ed0fb37bd637d1a2e9498c24097cfeadec682e > which is not in the 1.8 branch. Okay. Thanks. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz.
From jussi.kivilinna at iki.fi Tue Jan 9 18:23:35 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 09 Jan 2018 19:23:35 +0200 Subject: [PATCH] Make BMI2 inline assembly check more robust Message-ID: <151551861550.5642.12750471651801313528.stgit@localhost.localdomain> * configure.ac (gcry_cv_gcc_inline_asm_bmi2): New assembly test. -- Use actual assembly snippets from keccak.c to check that the compiler has proper support for the BMI2 instructions used.
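(For reference, and not part of the patch: the two instructions exercised by the new configure test shown in the diff just below compute, in plain C terms, roughly the following. The rorx32/andn32 names are hypothetical; rorx is a rotate-right that leaves the flags untouched, andn is an AND with the first source complemented.)

#include <stdint.h>

/* C-level equivalents of the instructions probed by the configure test. */
static inline uint32_t
rorx32 (uint32_t x, unsigned int r)
{
  r &= 31;
  return (x >> r) | (x << ((32 - r) & 31));   /* rotate right without touching flags */
}

static inline uint32_t
andn32 (uint32_t x, uint32_t y)
{
  return ~x & y;                              /* complement first source, AND second */
}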
GnuPG-bug-id: 3408 Signed-off-by: Jussi Kivilinna --- configure.ac | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 42cd4c27b..aaf3c82a9 100644 --- a/configure.ac +++ b/configure.ac @@ -1403,8 +1403,15 @@ AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], else gcry_cv_gcc_inline_asm_bmi2=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( - [[void a(void) { - __asm__("rorxl \$23, %%eax, %%edx\\n\\t":::"memory"); + [[unsigned int a(unsigned int x, unsigned int y) { + unsigned int tmp1, tmp2; + asm ("rorxl %2, %1, %0" + : "=r" (tmp1) + : "rm0" (x), "J" (32 - ((23) & 31))); + asm ("andnl %2, %1, %0" + : "=r" (tmp2) + : "r0" (x), "rm" (y)); + return tmp1 + tmp2; }]])], [gcry_cv_gcc_inline_asm_bmi2=yes]) fi]) From jussi.kivilinna at iki.fi Tue Jan 9 18:25:12 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 09 Jan 2018 19:25:12 +0200 Subject: [PATCH 1/2] Move AMD64 MS to SysV calling convention conversion to assembly side Message-ID: <151551871260.19627.9961362268607461169.stgit@localhost.localdomain> * cipher/Makefile.am: Add 'asm-common-amd64.h'. * cipher/asm-common-amd64.h: New. * cipher/blowfish-amd64.S: Add ENTER_SYSV_FUNC_* and EXIT_SYSV_FUNC for each global function from 'asm-common-amd64.h'. * cipher/cast5-amd64.S: Ditto. * cipher/des-amd64.S: Ditto. * cipher/rijndael-amd64.S: Ditto. * cipher/twofish-amd64.S: Ditto. * cipher/arcfour-amd64.S: Ditto. * cipher/blowfish.c [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): Remove. * cipher/cast5.c [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): Remove. * cipher/twofish.c [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn, call_sysv_fn5, call_sysv_fn6): Remove. * cipher/rijndael.c (do_encrypt, do_decrypt) [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]: Remove assembly block for calling SysV ABI function. * cipher/arcfour.c [USE_AMD64_ASM] (encrypt_stream): Ditto. -- Old approach was to convert MS ABI to SysV ABI calling convention for AMD64 assembly functions at caller side. This patch moves calling convention conversion to assembly/callee side. 
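(Background sketch, not part of the patch: the ENTER_SYSV_FUNC_PARAMS_* / EXIT_SYSV_FUNC macros in the diff below bridge the two AMD64 conventions. The first four integer/pointer arguments arrive in different registers, and rdi/rsi are callee-saved on Win64, which is why the entry macro pushes them before shuffling. The asm_encrypt_block / encrypt_one_block names below are hypothetical; the real assembly entry points such as _gcry_blowfish_amd64_encrypt_block follow the same shape.)

/* Illustration only: argument registers bridged by the entry macros.
 *
 *   argument      Microsoft x64      System V AMD64
 *   --------      -------------      --------------
 *      1st            rcx                 rdi
 *      2nd            rdx                 rsi
 *      3rd            r8                  rdx
 *      4th            r9                  rcx
 *      5th            stack               r8
 *      6th            stack               r9
 *
 * With the shuffle done at the top of each assembly function, C callers can
 * invoke it directly on either platform: */
extern void asm_encrypt_block (void *ctx, unsigned char *out,
                               const unsigned char *in);   /* hypothetical */

static void
encrypt_one_block (void *ctx, unsigned char *out, const unsigned char *in)
{
  asm_encrypt_block (ctx, out, in);   /* no call_sysv_fn trampoline needed */
}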
Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 + cipher/arcfour-amd64.S | 8 ++-- cipher/arcfour.c | 14 ------- cipher/asm-common-amd64.h | 90 +++++++++++++++++++++++++++++++++++++++++++++ cipher/blowfish-amd64.S | 24 ++++++++++-- cipher/blowfish.c | 44 ---------------------- cipher/cast5-amd64.S | 42 +++++++++------------ cipher/cast5.c | 38 ------------------- cipher/des-amd64.S | 27 +++++++------- cipher/des.c | 33 ----------------- cipher/rijndael-amd64.S | 20 +++++----- cipher/rijndael.c | 38 ------------------- cipher/twofish-amd64.S | 36 +++++++++++++----- cipher/twofish.c | 87 -------------------------------------------- 14 files changed, 179 insertions(+), 323 deletions(-) create mode 100644 cipher/asm-common-amd64.h diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 3c4eae0b9..bba815bbe 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -61,6 +61,7 @@ dsa-common.c rsa-common.c \ sha1.h EXTRA_libcipher_la_SOURCES = \ +asm-common-amd64.h \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S index 2e52ea00d..c08f3453b 100644 --- a/cipher/arcfour-amd64.S +++ b/cipher/arcfour-amd64.S @@ -18,17 +18,14 @@ #if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text .align 16 .globl _gcry_arcfour_amd64 ELF(.type _gcry_arcfour_amd64, at function) _gcry_arcfour_amd64: + ENTER_SYSV_FUNC_PARAMS_0_4 push %rbp push %rbx mov %rdi, %rbp # key = ARG(key) @@ -96,6 +93,7 @@ _gcry_arcfour_amd64: movb %dl, (4*256+4)(%rbp) # key->x = x pop %rbx pop %rbp + EXIT_SYSV_FUNC ret .L__gcry_arcfour_amd64_end: ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64) diff --git a/cipher/arcfour.c b/cipher/arcfour.c index 44e8ef46c..085df9bbd 100644 --- a/cipher/arcfour.c +++ b/cipher/arcfour.c @@ -54,21 +54,7 @@ static void encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - const void *fn = _gcry_arcfour_amd64; - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (context), - "+S" (length), - "+d" (inbuf), - "+c" (outbuf) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -#else _gcry_arcfour_amd64 (context, length, inbuf, outbuf ); -#endif } #else /*!USE_AMD64_ASM*/ diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h new file mode 100644 index 000000000..7eb426495 --- /dev/null +++ b/cipher/asm-common-amd64.h @@ -0,0 +1,90 @@ +/* asm-common-amd64.h - Common macros for AMD64 assembly + * + * Copyright (C) 2018 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef GCRY_ASM_COMMON_AMD64_H +#define GCRY_ASM_COMMON_AMD64_H + +#include + +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) /*_*/ +#endif + +#ifdef __PIC__ +# define rRIP (%rip) +#else +# define rRIP +#endif + +#ifdef __PIC__ +# define RIP %rip +#else +# define RIP +#endif + +#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__) +# define GET_EXTERN_POINTER(name, reg) movabsq $name, reg +#else +# ifdef __code_model_large__ +# define GET_EXTERN_POINTER(name, reg) \ + pushq %r15; \ + pushq %r14; \ + 1: leaq 1b(%rip), reg; \ + movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \ + movabsq $name at GOT, %r15; \ + addq %r14, reg; \ + popq %r14; \ + movq (reg, %r15), reg; \ + popq %r15; +# else +# define GET_EXTERN_POINTER(name, reg) movq name at GOTPCREL(%rip), reg +# endif +#endif + +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ENTER_SYSV_FUNC_PARAMS_0_4 \ + pushq %rdi; \ + pushq %rsi; \ + movq %rcx, %rdi; \ + movq %rdx, %rsi; \ + movq %r8, %rdx; \ + movq %r9, %rcx; \ + +# define ENTER_SYSV_FUNC_PARAMS_5 \ + ENTER_SYSV_FUNC_PARAMS_0_4; \ + movq 0x38(%rsp), %r8; + +# define ENTER_SYSV_FUNC_PARAMS_6 \ + ENTER_SYSV_FUNC_PARAMS_5; \ + movq 0x40(%rsp), %r9; + +# define EXIT_SYSV_FUNC \ + popq %rsi; \ + popq %rdi; +#else +# define ENTER_SYSV_FUNC_PARAMS_0_4 +# define ENTER_SYSV_FUNC_PARAMS_5 +# define ENTER_SYSV_FUNC_PARAMS_6 +# define EXIT_SYSV_FUNC +#endif + +#endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index 21b63fc1c..02d3b7102 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -24,11 +24,7 @@ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -165,6 +161,8 @@ _gcry_blowfish_amd64_do_encrypt: * %rsi: u32 *ret_xl * %rdx: u32 *ret_xr */ + ENTER_SYSV_FUNC_PARAMS_0_4 + movl (%rdx), RX0d; shlq $32, RX0; movl (%rsi), RT3d; @@ -178,6 +176,7 @@ _gcry_blowfish_amd64_do_encrypt: shrq $32, RX0; movl RX0d, (RX2); + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) @@ -191,6 +190,7 @@ _gcry_blowfish_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 movq %rsi, %r10; @@ -202,6 +202,7 @@ _gcry_blowfish_amd64_encrypt_block: movq %r10, RIO; write_block(); + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) @@ -215,6 +216,8 @@ _gcry_blowfish_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + movq %rbp, %r11; movq %rsi, %r10; @@ -238,6 +241,7 @@ _gcry_blowfish_amd64_decrypt_block: movq %r11, %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;) @@ -392,6 +396,8 @@ _gcry_blowfish_amd64_ctr_enc: * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -436,6 +442,7 @@ _gcry_blowfish_amd64_ctr_enc: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) @@ -449,6 +456,8 @@ _gcry_blowfish_amd64_cbc_dec: * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -484,6 +493,7 @@ _gcry_blowfish_amd64_cbc_dec: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) @@ -497,6 +507,8 @@ _gcry_blowfish_amd64_cfb_dec: * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -534,6 +546,8 @@ _gcry_blowfish_amd64_cfb_dec: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;) diff --git a/cipher/blowfish.c b/cipher/blowfish.c index a3fc26ce2..724d64e98 100644 --- a/cipher/blowfish.c +++ b/cipher/blowfish.c @@ -281,87 +281,43 @@ extern void _gcry_blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out, extern void _gcry_blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out, const byte *in, byte *iv); -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -static inline void -call_sysv_fn (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. 
*/ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -} -#endif - static void do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr ) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_do_encrypt, bc, ret_xl, ret_xr, NULL); -#else _gcry_blowfish_amd64_do_encrypt (bc, ret_xl, ret_xr); -#endif } static void do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_encrypt_block, context, outbuf, inbuf, - NULL); -#else _gcry_blowfish_amd64_encrypt_block (context, outbuf, inbuf); -#endif } static void do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_decrypt_block, context, outbuf, inbuf, - NULL); -#else _gcry_blowfish_amd64_decrypt_block (context, outbuf, inbuf); -#endif } static inline void blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out, const byte *in, byte *ctr) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_ctr_enc, ctx, out, in, ctr); -#else _gcry_blowfish_amd64_ctr_enc(ctx, out, in, ctr); -#endif } static inline void blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_cbc_dec, ctx, out, in, iv); -#else _gcry_blowfish_amd64_cbc_dec(ctx, out, in, iv); -#endif } static inline void blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_cfb_dec, ctx, out, in, iv); -#else _gcry_blowfish_amd64_cfb_dec(ctx, out, in, iv); -#endif } static unsigned int diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index c04015a29..1a1d43fd5 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -23,30 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5) -#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__) -# define GET_EXTERN_POINTER(name, reg) movabsq $name, reg -#else -# ifdef __code_model_large__ -# define GET_EXTERN_POINTER(name, reg) \ - pushq %r15; \ - pushq %r14; \ - 1: leaq 1b(%rip), reg; \ - movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \ - movabsq $name at GOT, %r15; \ - addq %r14, reg; \ - popq %r14; \ - movq (reg, %r15), reg; \ - popq %r15; -# else -# define GET_EXTERN_POINTER(name, reg) movq name at GOTPCREL(%rip), reg -# endif -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -206,6 +183,8 @@ _gcry_cast5_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; @@ -233,6 +212,8 @@ _gcry_cast5_amd64_encrypt_block: popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) @@ -246,6 +227,8 @@ _gcry_cast5_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; @@ -273,6 +256,8 @@ _gcry_cast5_amd64_decrypt_block: popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) @@ -444,6 +429,7 @@ _gcry_cast5_amd64_ctr_enc: * %rdx: src (8 blocks) * %rcx: iv (big endian, 64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -489,6 +475,8 @@ _gcry_cast5_amd64_ctr_enc: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) @@ -502,6 +490,7 @@ _gcry_cast5_amd64_cbc_dec: * %rdx: src (8 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -542,6 +531,8 @@ _gcry_cast5_amd64_cbc_dec: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) @@ -556,6 +547,7 @@ _gcry_cast5_amd64_cfb_dec: * %rdx: src (8 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -597,6 +589,8 @@ _gcry_cast5_amd64_cfb_dec: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) diff --git a/cipher/cast5.c b/cipher/cast5.c index 94dcee76a..d23882b9a 100644 --- a/cipher/cast5.c +++ b/cipher/cast5.c @@ -373,72 +373,34 @@ extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv); -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -static inline void -call_sysv_fn (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. 
*/ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -} -#endif - static void do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_encrypt_block, context, outbuf, inbuf, NULL); -#else _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf); -#endif } static void do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_decrypt_block, context, outbuf, inbuf, NULL); -#else _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf); -#endif } static void cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out, const byte *in, byte *ctr) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_ctr_enc, ctx, out, in, ctr); -#else _gcry_cast5_amd64_ctr_enc (ctx, out, in, ctr); -#endif } static void cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_cbc_dec, ctx, out, in, iv); -#else _gcry_cast5_amd64_cbc_dec (ctx, out, in, iv); -#endif } static void cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_cfb_dec, ctx, out, in, iv); -#else _gcry_cast5_amd64_cfb_dec (ctx, out, in, iv); -#endif } static unsigned int diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index 1b7cfba85..f25573d99 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -23,17 +23,7 @@ #if defined(USE_DES) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -200,6 +190,8 @@ _gcry_3des_amd64_crypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -208,7 +200,7 @@ _gcry_3des_amd64_crypt_block: pushq %r15; pushq %rsi; /*dst*/ - leaq .L_s1 RIP, SBOXES; + leaq .L_s1 rRIP, SBOXES; read_block(%rdx, RL0, RR0); initial_permutation(RL0, RR0); @@ -277,6 +269,7 @@ _gcry_3des_amd64_crypt_block: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) @@ -473,7 +466,7 @@ _gcry_3des_amd64_crypt_blk3: * RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks */ - leaq .L_s1 RIP, SBOXES; + leaq .L_s1 rRIP, SBOXES; initial_permutation3(RL, RR); @@ -547,6 +540,7 @@ _gcry_3des_amd64_cbc_dec: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -610,6 +604,7 @@ _gcry_3des_amd64_cbc_dec: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) @@ -623,6 +618,7 @@ _gcry_3des_amd64_ctr_enc: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -688,6 +684,7 @@ _gcry_3des_amd64_ctr_enc: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) @@ -701,6 +698,8 @@ _gcry_3des_amd64_cfb_dec: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -763,6 +762,8 @@ _gcry_3des_amd64_cfb_dec: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) diff --git a/cipher/des.c b/cipher/des.c index 5c99f50d3..7801b08fc 100644 --- a/cipher/des.c +++ b/cipher/des.c @@ -772,23 +772,6 @@ extern void _gcry_3des_amd64_cfb_dec(const void *keys, byte *out, #define TRIPLEDES_ECB_BURN_STACK (8 * sizeof(void *)) -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -static inline void -call_sysv_fn (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -} -#endif /* * Electronic Codebook Mode Triple-DES encryption/decryption of data @@ -803,11 +786,7 @@ tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from, keys = mode ? 
ctx->decrypt_subkeys : ctx->encrypt_subkeys; -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_3des_amd64_crypt_block, keys, to, from, NULL); -#else _gcry_3des_amd64_crypt_block(keys, to, from); -#endif return 0; } @@ -815,31 +794,19 @@ tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from, static inline void tripledes_amd64_ctr_enc(const void *keys, byte *out, const byte *in, byte *ctr) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_3des_amd64_ctr_enc, keys, out, in, ctr); -#else _gcry_3des_amd64_ctr_enc(keys, out, in, ctr); -#endif } static inline void tripledes_amd64_cbc_dec(const void *keys, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_3des_amd64_cbc_dec, keys, out, in, iv); -#else _gcry_3des_amd64_cbc_dec(keys, out, in, iv); -#endif } static inline void tripledes_amd64_cfb_dec(const void *keys, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_3des_amd64_cfb_dec, keys, out, in, iv); -#else _gcry_3des_amd64_cfb_dec(keys, out, in, iv); -#endif } #else /*USE_AMD64_ASM*/ diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index b149e9485..798ff51af 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -23,17 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_AES) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -222,6 +212,8 @@ _gcry_aes_amd64_encrypt_block: * %ecx: number of rounds.. 10, 12 or 14 * %r8: encryption tables */ + ENTER_SYSV_FUNC_PARAMS_5 + subq $(5 * 8), %rsp; movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); @@ -265,6 +257,8 @@ _gcry_aes_amd64_encrypt_block: addq $(5 * 8), %rsp; movl $(6 * 8), %eax; + + EXIT_SYSV_FUNC ret; .align 4 @@ -382,6 +376,8 @@ _gcry_aes_amd64_decrypt_block: * %ecx: number of rounds.. 10, 12 or 14 * %r8: decryption tables */ + ENTER_SYSV_FUNC_PARAMS_5 + subq $(5 * 8), %rsp; movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); @@ -426,6 +422,8 @@ _gcry_aes_amd64_decrypt_block: addq $(5 * 8), %rsp; movl $(6 * 8), %eax; + + EXIT_SYSV_FUNC ret; .align 4 diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 548bfa099..df1363f28 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -740,27 +740,8 @@ do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM -# ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT); -# else - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. 
*/ - const void *key = ctx->keyschenc; - uintptr_t rounds = ctx->rounds; - uintptr_t ret; - asm volatile ("movq %[encT], %%r8\n\t" - "callq *%[ret]\n\t" - : [ret] "=a" (ret), - "+D" (key), - "+S" (bx), - "+d" (ax), - "+c" (rounds) - : "0" (_gcry_aes_amd64_encrypt_block), - [encT] "r" (encT) - : "cc", "memory", "r8", "r9", "r10", "r11"); - return ret; -# endif /* HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS */ #elif defined(USE_ARM_ASM) return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT); #else @@ -1123,27 +1104,8 @@ do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM -# ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, &dec_tables); -# else - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - const void *key = ctx->keyschdec; - uintptr_t rounds = ctx->rounds; - uintptr_t ret; - asm volatile ("movq %[dectabs], %%r8\n\t" - "callq *%[ret]\n\t" - : [ret] "=a" (ret), - "+D" (key), - "+S" (bx), - "+d" (ax), - "+c" (rounds) - : "0" (_gcry_aes_amd64_decrypt_block), - [dectabs] "r" (&dec_tables) - : "cc", "memory", "r8", "r9", "r10", "r11"); - return ret; -# endif /* HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS */ #elif defined(USE_ARM_ASM) return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, &dec_tables); diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index aa964e037..7a836463c 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -23,17 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP %rip -#else -# define RIP -#endif +#include "asm-common-amd64.h" .text @@ -181,6 +171,8 @@ _gcry_twofish_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(3 * 8), %rsp; movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); @@ -211,6 +203,7 @@ _gcry_twofish_amd64_encrypt_block: movq (1 * 8)(%rsp), %rbp; addq $(3 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) @@ -224,6 +217,8 @@ _gcry_twofish_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(3 * 8), %rsp; movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); @@ -254,6 +249,7 @@ _gcry_twofish_amd64_decrypt_block: movq (1 * 8)(%rsp), %rbp; addq $(3 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) @@ -530,6 +526,8 @@ _gcry_twofish_amd64_ctr_enc: * %rdx: src (3 blocks) * %rcx: iv (big endian, 128bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -599,6 +597,7 @@ _gcry_twofish_amd64_ctr_enc: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) @@ -612,6 +611,8 @@ _gcry_twofish_amd64_cbc_dec: * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(9 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -665,6 +666,7 @@ _gcry_twofish_amd64_cbc_dec: movq (5 * 8)(%rsp), %r15; addq $(9 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) @@ -678,6 +680,8 @@ _gcry_twofish_amd64_cfb_dec: * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -731,6 +735,7 @@ _gcry_twofish_amd64_cfb_dec: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) @@ -746,6 +751,8 @@ _gcry_twofish_amd64_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + ENTER_SYSV_FUNC_PARAMS_6 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -838,6 +845,7 @@ _gcry_twofish_amd64_ocb_enc: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) @@ -853,6 +861,8 @@ _gcry_twofish_amd64_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + ENTER_SYSV_FUNC_PARAMS_6 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -953,6 +963,7 @@ _gcry_twofish_amd64_ocb_dec: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) @@ -967,6 +978,8 @@ _gcry_twofish_amd64_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[3]) */ + ENTER_SYSV_FUNC_PARAMS_5 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -1039,6 +1052,7 @@ _gcry_twofish_amd64_ocb_auth: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;) diff --git a/cipher/twofish.c b/cipher/twofish.c index 942e8d429..48feaae9f 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -829,145 +829,58 @@ extern void _gcry_twofish_amd64_ocb_auth(const TWOFISH_context *ctx, const byte *abuf, byte *offset, byte *checksum, const u64 Ls[3]); -#ifdef 
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -static inline void -call_sysv_fn (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -} - -static inline void -call_sysv_fn5 (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4, const void *arg5) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("movq %[arg5], %%r8\n\t" - "callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : [arg5] "g" (arg5) - : "cc", "memory", "r8", "r9", "r10", "r11"); -} - -static inline void -call_sysv_fn6 (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4, const void *arg5, - const void *arg6) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("movq %[arg5], %%r8\n\t" - "movq %[arg6], %%r9\n\t" - "callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : [arg5] "g" (arg5), - [arg6] "g" (arg6) - : "cc", "memory", "r8", "r9", "r10", "r11"); -} -#endif - static inline void twofish_amd64_encrypt_block(const TWOFISH_context *c, byte *out, const byte *in) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_encrypt_block, c, out, in, NULL); -#else _gcry_twofish_amd64_encrypt_block(c, out, in); -#endif } static inline void twofish_amd64_decrypt_block(const TWOFISH_context *c, byte *out, const byte *in) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_decrypt_block, c, out, in, NULL); -#else _gcry_twofish_amd64_decrypt_block(c, out, in); -#endif } static inline void twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, const byte *in, byte *ctr) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_ctr_enc, c, out, in, ctr); -#else _gcry_twofish_amd64_ctr_enc(c, out, in, ctr); -#endif } static inline void twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_cbc_dec, c, out, in, iv); -#else _gcry_twofish_amd64_cbc_dec(c, out, in, iv); -#endif } static inline void twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_cfb_dec, c, out, in, iv); -#else _gcry_twofish_amd64_cfb_dec(c, out, in, iv); -#endif } static inline void twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out, const byte *in, byte *offset, byte *checksum, const u64 Ls[3]) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn6(_gcry_twofish_amd64_ocb_enc, ctx, out, in, offset, checksum, Ls); -#else _gcry_twofish_amd64_ocb_enc(ctx, out, in, offset, checksum, Ls); -#endif } static inline void twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out, const byte *in, byte *offset, byte *checksum, const u64 Ls[3]) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn6(_gcry_twofish_amd64_ocb_dec, ctx, out, in, offset, checksum, Ls); -#else 
_gcry_twofish_amd64_ocb_dec(ctx, out, in, offset, checksum, Ls); -#endif } static inline void twofish_amd64_ocb_auth(const TWOFISH_context *ctx, const byte *abuf, byte *offset, byte *checksum, const u64 Ls[3]) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn5(_gcry_twofish_amd64_ocb_auth, ctx, abuf, offset, checksum, Ls); -#else _gcry_twofish_amd64_ocb_auth(ctx, abuf, offset, checksum, Ls); -#endif } #elif defined(USE_ARM_ASM) From jussi.kivilinna at iki.fi Tue Jan 9 18:25:17 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 09 Jan 2018 19:25:17 +0200 Subject: [PATCH 2/2] rijndael-ssse3: call assembly functions directly In-Reply-To: <151551871260.19627.9961362268607461169.stgit@localhost.localdomain> References: <151551871260.19627.9961362268607461169.stgit@localhost.localdomain> Message-ID: <151551871763.19627.12236803845513890422.stgit@localhost.localdomain> * cipher/rijndael-ssse3-amd64-asm.S (_gcry_aes_ssse3_enc_preload) (_gcry_aes_ssse3_dec_preload, _gcry_aes_ssse3_encrypt_core) (_gcry_aes_ssse3_decrypt_core, _gcry_aes_schedule_core): Add ENTER_SYSV_FUNC_PARAMS_* at function entry and EXIT_SYSV_FUNC at exit. (_gcry_aes_ssse3_encrypt_core, _gcry_aes_ssse3_decrypt_core): Change to input parameters to RDI and RSI registers. * cipher/rijndael-ssse3-amd64.c (_gcry_aes_ssse3_encrypt_core) (_gcry_aes_ssse3_decrypt_core, _gcry_aes_schedule_core): Add parameters for function prototypes. (PUSH_STACK_PTR, POP_STACK_PTR): Remove. (vpaes_ssse3_prepare_enc, vpaes_ssse3_prepare_dec) (_gcry_aes_ssse3_do_setkey, _gcry_aes_ssse3_prepare_decryption) (do_vpaes_ssse3_enc, do_vpaes_ssse3_dec): Remove inline assembly to call functions, and call directly instead. -- Instead of using inline assembly to call assembly functions in AES SSSE3 implementation, change assembly functions so that they can be called directly instead. Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ssse3-amd64-asm.S | 31 +++++++++---- cipher/rijndael-ssse3-amd64.c | 91 ++++++------------------------------- 2 files changed, 35 insertions(+), 87 deletions(-) diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index 3ae55e8b6..ffce5df2f 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S @@ -40,11 +40,7 @@ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -# define ELF(...) -#else -# define ELF(...) 
__VA_ARGS__ -#endif +#include "asm-common-amd64.h" .text @@ -54,6 +50,7 @@ ELF(.type _gcry_aes_ssse3_enc_preload, at function) .globl _gcry_aes_ssse3_enc_preload _gcry_aes_ssse3_enc_preload: + ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F movdqa .Lk_inv (%rax), %xmm10 # inv @@ -62,6 +59,7 @@ _gcry_aes_ssse3_enc_preload: movdqa .Lk_sb1+16(%rax), %xmm12 # sb1t movdqa .Lk_sb2 (%rax), %xmm15 # sb2u movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t + EXIT_SYSV_FUNC ret ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) @@ -71,6 +69,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ELF(.type _gcry_aes_ssse3_dec_preload, at function) .globl _gcry_aes_ssse3_dec_preload _gcry_aes_ssse3_dec_preload: + ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F movdqa .Lk_inv (%rax), %xmm10 # inv @@ -80,6 +79,7 @@ _gcry_aes_ssse3_dec_preload: movdqa .Lk_dsbd (%rax), %xmm15 # sbdu movdqa .Lk_dsbb (%rax), %xmm14 # sbbu movdqa .Lk_dsbe (%rax), %xmm8 # sbeu + EXIT_SYSV_FUNC ret ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) @@ -98,11 +98,11 @@ ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in .Laes_preheat -## (%rdx) = scheduled keys -## %rax = nrounds - 1 +## (%rdi) = scheduled keys +## %rsi = nrounds ## ## Output in %xmm0 -## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx +## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx, %rdx ## Preserves %xmm6 - %xmm7 so you get some local vectors ## ## @@ -111,6 +111,9 @@ ELF(.type _gcry_aes_ssse3_encrypt_core, at function) .globl _gcry_aes_ssse3_encrypt_core _gcry_aes_ssse3_encrypt_core: _aes_encrypt_core: + ENTER_SYSV_FUNC_PARAMS_0_4 + mov %rdi, %rdx + leaq -1(%rsi), %rax lea .Laes_consts(%rip), %rcx leaq .Lk_mc_backward(%rcx), %rdi mov $16, %rsi @@ -185,6 +188,7 @@ _aes_encrypt_core: pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 + EXIT_SYSV_FUNC ret ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) @@ -198,8 +202,11 @@ ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) ELF(.type _gcry_aes_ssse3_decrypt_core, at function) _gcry_aes_ssse3_decrypt_core: _aes_decrypt_core: + ENTER_SYSV_FUNC_PARAMS_0_4 + mov %rdi, %rdx lea .Laes_consts(%rip), %rcx - movl %eax, %esi + subl $1, %esi + movl %esi, %eax shll $4, %esi xorl $48, %esi andl $48, %esi @@ -288,6 +295,7 @@ _aes_decrypt_core: pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 + EXIT_SYSV_FUNC ret ELF(.size _aes_decrypt_core,.-_aes_decrypt_core) @@ -306,6 +314,8 @@ _aes_schedule_core: # rsi = size in bits # rdx = buffer # rcx = direction. 0=encrypt, 1=decrypt + # r8 = rotoffs + ENTER_SYSV_FUNC_PARAMS_5 # load the tables lea .Laes_consts(%rip), %r10 @@ -659,8 +669,9 @@ _aes_schedule_core: pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 pxor %xmm8, %xmm8 + EXIT_SYSV_FUNC ret -ELF(.size _aes_schedule_core,.-_aes_schedule_core) +ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) ######################################################## ## ## diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c index da5339e36..98660ecc8 100644 --- a/cipher/rijndael-ssse3-amd64.c +++ b/cipher/rijndael-ssse3-amd64.c @@ -58,13 +58,14 @@ /* Assembly functions in rijndael-ssse3-amd64-asm.S. Note that these - have custom calling convention and need to be called from assembly - blocks, not directly. */ + have custom calling convention (additional XMM parameters). 
*/ extern void _gcry_aes_ssse3_enc_preload(void); extern void _gcry_aes_ssse3_dec_preload(void); -extern void _gcry_aes_ssse3_schedule_core(void); -extern void _gcry_aes_ssse3_encrypt_core(void); -extern void _gcry_aes_ssse3_decrypt_core(void); +extern void _gcry_aes_ssse3_schedule_core(const void *key, u64 keybits, + void *buffer, u64 decrypt, + u64 rotoffs); +extern void _gcry_aes_ssse3_encrypt_core(const void *key, u64 nrounds); +extern void _gcry_aes_ssse3_decrypt_core(const void *key, u64 nrounds); @@ -110,8 +111,6 @@ extern void _gcry_aes_ssse3_decrypt_core(void); : \ : "r" (ssse3_state) \ : "memory" ) -# define PUSH_STACK_PTR -# define POP_STACK_PTR #else # define SSSE3_STATE_SIZE 1 # define vpaes_ssse3_prepare() (void)ssse3_state @@ -126,31 +125,15 @@ extern void _gcry_aes_ssse3_decrypt_core(void); "pxor %%xmm7, %%xmm7 \n\t" \ "pxor %%xmm8, %%xmm8 \n\t" \ ::: "memory" ) -/* Old GCC versions use red-zone of AMD64 SYSV ABI and stack pointer is - * not properly adjusted for assembly block. Therefore stack pointer - * needs to be manually corrected. */ -# define PUSH_STACK_PTR "subq $128, %%rsp;\n\t" -# define POP_STACK_PTR "addq $128, %%rsp;\n\t" #endif #define vpaes_ssse3_prepare_enc() \ vpaes_ssse3_prepare(); \ - asm volatile (PUSH_STACK_PTR \ - "callq *%q[core] \n\t" \ - POP_STACK_PTR \ - : \ - : [core] "r" (_gcry_aes_ssse3_enc_preload) \ - : "rax", "cc", "memory" ) + _gcry_aes_ssse3_enc_preload(); #define vpaes_ssse3_prepare_dec() \ vpaes_ssse3_prepare(); \ - asm volatile (PUSH_STACK_PTR \ - "callq *%q[core] \n\t" \ - POP_STACK_PTR \ - : \ - : [core] "r" (_gcry_aes_ssse3_dec_preload) \ - : "rax", "cc", "memory" ) - + _gcry_aes_ssse3_dec_preload(); void @@ -161,23 +144,7 @@ _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key) vpaes_ssse3_prepare(); - asm volatile ("leaq %q[key], %%rdi" "\n\t" - "movl %[bits], %%esi" "\n\t" - "leaq %[buf], %%rdx" "\n\t" - "movl %[dir], %%ecx" "\n\t" - "movl %[rotoffs], %%r8d" "\n\t" - PUSH_STACK_PTR - "callq *%q[core]" "\n\t" - POP_STACK_PTR - : - : [core] "r" (&_gcry_aes_ssse3_schedule_core), - [key] "m" (*key), - [bits] "g" (keybits), - [buf] "m" (ctx->keyschenc32[0][0]), - [dir] "g" (0), - [rotoffs] "g" (48) - : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi", - "cc", "memory"); + _gcry_aes_ssse3_schedule_core(key, keybits, &ctx->keyschenc32[0][0], 0, 48); /* Save key for setting up decryption. */ if (keybits > 192) @@ -216,23 +183,9 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx) vpaes_ssse3_prepare(); - asm volatile ("leaq %q[key], %%rdi" "\n\t" - "movl %[bits], %%esi" "\n\t" - "leaq %[buf], %%rdx" "\n\t" - "movl %[dir], %%ecx" "\n\t" - "movl %[rotoffs], %%r8d" "\n\t" - PUSH_STACK_PTR - "callq *%q[core]" "\n\t" - POP_STACK_PTR - : - : [core] "r" (_gcry_aes_ssse3_schedule_core), - [key] "m" (ctx->keyschdec32[0][0]), - [bits] "g" (keybits), - [buf] "m" (ctx->keyschdec32[ctx->rounds][0]), - [dir] "g" (1), - [rotoffs] "g" ((keybits == 192) ? 0 : 32) - : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi", - "cc", "memory"); + _gcry_aes_ssse3_schedule_core(&ctx->keyschdec32[0][0], keybits, + &ctx->keyschdec32[ctx->rounds][0], 1, + (keybits == 192) ? 
0 : 32); vpaes_ssse3_cleanup(); } @@ -243,15 +196,7 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx) static inline void do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds) { - unsigned int middle_rounds = nrounds - 1; - const void *keysched = ctx->keyschenc32; - - asm volatile (PUSH_STACK_PTR - "callq *%q[core]" "\n\t" - POP_STACK_PTR - : "+a" (middle_rounds), "+d" (keysched) - : [core] "r" (_gcry_aes_ssse3_encrypt_core) - : "rcx", "rsi", "rdi", "cc", "memory"); + _gcry_aes_ssse3_encrypt_core(ctx->keyschenc32, nrounds); } @@ -260,15 +205,7 @@ do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds) static inline void do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds) { - unsigned int middle_rounds = nrounds - 1; - const void *keysched = ctx->keyschdec32; - - asm volatile (PUSH_STACK_PTR - "callq *%q[core]" "\n\t" - POP_STACK_PTR - : "+a" (middle_rounds), "+d" (keysched) - : [core] "r" (_gcry_aes_ssse3_decrypt_core) - : "rcx", "rsi", "cc", "memory"); + _gcry_aes_ssse3_decrypt_core(ctx->keyschdec32, nrounds); } From cvs at cvs.gnupg.org Tue Jan 9 18:17:26 2018 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Tue, 09 Jan 2018 18:17:26 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-32-ga00c5b2 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via a00c5b2988cea256c7823a76ce601febf02c790f (commit) via c9e9cb2eb6a1c659d3825ca627228b732f2f2152 (commit) from b3ec0f752c925cde36f560f0f9309ab6450bbfd9 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit a00c5b2988cea256c7823a76ce601febf02c790f Author: Jussi Kivilinna Date: Sat Jan 6 18:53:20 2018 +0200 Add AES-NI acceleration for AES-XTS * cipher/cipher-internal.h (gcry_cipher_handle): Change bulk XTS function to take cipher context. * cipher/cipher-xts.c (_gcry_cipher_xts_crypt): Ditto. * cipher/cipher.c (_gcry_cipher_open_internal): Setup AES-NI XTS bulk function. * cipher/rijndael-aesni.c (xts_gfmul_const, _gcry_aes_aesni_xts_enc) (_gcry_aes_aesni_xts_enc, _gcry_aes_aesni_xts_crypt): New. * cipher/rijndael.c (_gcry_aes_aesni_xts_crypt) (_gcry_aes_xts_crypt): New. * src/cipher.h (_gcry_aes_xts_crypt): New. 
-- Benchmarks on Intel Core i7-4790K, 4.0Ghz (no turbo): Before: XTS enc | 1.66 ns/B 575.7 MiB/s 6.63 c/B XTS dec | 1.66 ns/B 575.5 MiB/s 6.63 c/B After (~6x faster): XTS enc | 0.270 ns/B 3528.5 MiB/s 1.08 c/B XTS dec | 0.272 ns/B 3511.5 MiB/s 1.09 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index b748125..8c897d7 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -146,7 +146,7 @@ struct gcry_cipher_handle const void *inbuf_arg, size_t nblocks, int encrypt); size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); - void (*xts_crypt)(gcry_cipher_hd_t c, unsigned char *tweak, + void (*xts_crypt)(void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); } bulk; diff --git a/cipher/cipher-xts.c b/cipher/cipher-xts.c index 4da89e5..06cefbe 100644 --- a/cipher/cipher-xts.c +++ b/cipher/cipher-xts.c @@ -93,7 +93,8 @@ _gcry_cipher_xts_crypt (gcry_cipher_hd_t c, /* Use a bulk method if available. */ if (nblocks && c->bulk.xts_crypt) { - c->bulk.xts_crypt (c, c->u_ctr.ctr, outbuf, inbuf, nblocks, encrypt); + c->bulk.xts_crypt (&c->context.c, c->u_ctr.ctr, outbuf, inbuf, nblocks, + encrypt); inbuf += nblocks * GCRY_XTS_BLOCK_LEN; outbuf += nblocks * GCRY_XTS_BLOCK_LEN; inbuflen -= nblocks * GCRY_XTS_BLOCK_LEN; diff --git a/cipher/cipher.c b/cipher/cipher.c index 9812738..063c13d 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -532,6 +532,7 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, h->bulk.ctr_enc = _gcry_aes_ctr_enc; h->bulk.ocb_crypt = _gcry_aes_ocb_crypt; h->bulk.ocb_auth = _gcry_aes_ocb_auth; + h->bulk.xts_crypt = _gcry_aes_xts_crypt; break; #endif /*USE_AES*/ #ifdef USE_BLOWFISH diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 3d323cf..50a0745 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -3007,4 +3007,295 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, } +static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) = + { 0x87, 0x01 }; + + +static void +_gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks) +{ + aesni_prepare_2_6_variable; + + aesni_prepare (); + aesni_prepare_2_6 (); + + /* Preload Tweak */ + asm volatile ("movdqu %[tweak], %%xmm5\n\t" + "movdqa %[gfmul], %%xmm6\n\t" + : + : [tweak] "m" (*tweak), + [gfmul] "m" (*xts_gfmul_const) + : "memory" ); + + for ( ;nblocks >= 4; nblocks -= 4 ) + { + asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm5, %[outbuf0]\n\t" + + "movdqa %%xmm4, %%xmm0\n\t" + "paddd %%xmm4, %%xmm4\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * 16)) + : [inbuf0] "m" (*(inbuf + 0 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm5, %[outbuf1]\n\t" + + "movdqa %%xmm4, %%xmm0\n\t" + "paddd %%xmm4, %%xmm4\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf1] "=m" (*(outbuf + 1 * 16)) + : [inbuf1] "m" (*(inbuf + 1 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm5, %[outbuf2]\n\t" + + "movdqa %%xmm4, %%xmm0\n\t" + "paddd %%xmm4, %%xmm4\n\t" + "psrad $31, %%xmm0\n\t" + 
"paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf2] "=m" (*(outbuf + 2 * 16)) + : [inbuf2] "m" (*(inbuf + 2 * 16)) + : "memory" ); + + asm volatile ("movdqa %%xmm4, %%xmm0\n\t" + "movdqu %[inbuf3], %%xmm4\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm5, %[outbuf3]\n\t" + + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf3] "=m" (*(outbuf + 3 * 16)) + : [inbuf3] "m" (*(inbuf + 3 * 16)) + : "memory" ); + + do_aesni_enc_vec4 (ctx); + + asm volatile ("movdqu %[outbuf0], %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "movdqu %[outbuf1], %%xmm0\n\t" + "movdqu %%xmm1, %[outbuf0]\n\t" + "movdqu %[outbuf2], %%xmm1\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "movdqu %[outbuf3], %%xmm0\n\t" + "pxor %%xmm1, %%xmm3\n\t" + "pxor %%xmm0, %%xmm4\n\t" + "movdqu %%xmm2, %[outbuf1]\n\t" + "movdqu %%xmm3, %[outbuf2]\n\t" + "movdqu %%xmm4, %[outbuf3]\n\t" + : [outbuf0] "+m" (*(outbuf + 0 * 16)), + [outbuf1] "+m" (*(outbuf + 1 * 16)), + [outbuf2] "+m" (*(outbuf + 2 * 16)), + [outbuf3] "+m" (*(outbuf + 3 * 16)) + : + : "memory" ); + + outbuf += BLOCKSIZE * 4; + inbuf += BLOCKSIZE * 4; + } + + for ( ;nblocks; nblocks-- ) + { + asm volatile ("movdqu %[inbuf], %%xmm0\n\t" + "pxor %%xmm5, %%xmm0\n\t" + "movdqa %%xmm5, %%xmm4\n\t" + + "pshufd $0x13, %%xmm5, %%xmm1\n\t" + "psrad $31, %%xmm1\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm1\n\t" + "pxor %%xmm1, %%xmm5\n\t" + : + : [inbuf] "m" (*inbuf) + : "memory" ); + + do_aesni_enc (ctx); + + asm volatile ("pxor %%xmm4, %%xmm0\n\t" + "movdqu %%xmm0, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : + : "memory" ); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + asm volatile ("movdqu %%xmm5, %[tweak]\n\t" + : [tweak] "=m" (*tweak) + : + : "memory" ); + + aesni_cleanup (); + aesni_cleanup_2_6 (); +} + + +static void +_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks) +{ + aesni_prepare_2_6_variable; + + aesni_prepare (); + aesni_prepare_2_6 (); + + /* Preload Tweak */ + asm volatile ("movdqu %[tweak], %%xmm5\n\t" + "movdqa %[gfmul], %%xmm6\n\t" + : + : [tweak] "m" (*tweak), + [gfmul] "m" (*xts_gfmul_const) + : "memory" ); + + for ( ;nblocks >= 4; nblocks -= 4 ) + { + asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm5, %[outbuf0]\n\t" + + "movdqa %%xmm4, %%xmm0\n\t" + "paddd %%xmm4, %%xmm4\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * 16)) + : [inbuf0] "m" (*(inbuf + 0 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm5, %[outbuf1]\n\t" + + "movdqa %%xmm4, %%xmm0\n\t" + "paddd %%xmm4, %%xmm4\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf1] "=m" (*(outbuf + 1 * 16)) + : [inbuf1] "m" (*(inbuf + 1 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm5, %[outbuf2]\n\t" + + "movdqa %%xmm4, %%xmm0\n\t" + "paddd %%xmm4, %%xmm4\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf2] "=m" (*(outbuf + 2 * 16)) + : [inbuf2] "m" (*(inbuf + 2 * 16)) + : "memory" ); + + asm volatile ("movdqa %%xmm4, %%xmm0\n\t" + "movdqu 
%[inbuf3], %%xmm4\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm5, %[outbuf3]\n\t" + + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf3] "=m" (*(outbuf + 3 * 16)) + : [inbuf3] "m" (*(inbuf + 3 * 16)) + : "memory" ); + + do_aesni_dec_vec4 (ctx); + + asm volatile ("movdqu %[outbuf0], %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "movdqu %[outbuf1], %%xmm0\n\t" + "movdqu %%xmm1, %[outbuf0]\n\t" + "movdqu %[outbuf2], %%xmm1\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "movdqu %[outbuf3], %%xmm0\n\t" + "pxor %%xmm1, %%xmm3\n\t" + "pxor %%xmm0, %%xmm4\n\t" + "movdqu %%xmm2, %[outbuf1]\n\t" + "movdqu %%xmm3, %[outbuf2]\n\t" + "movdqu %%xmm4, %[outbuf3]\n\t" + : [outbuf0] "+m" (*(outbuf + 0 * 16)), + [outbuf1] "+m" (*(outbuf + 1 * 16)), + [outbuf2] "+m" (*(outbuf + 2 * 16)), + [outbuf3] "+m" (*(outbuf + 3 * 16)) + : + : "memory" ); + + outbuf += BLOCKSIZE * 4; + inbuf += BLOCKSIZE * 4; + } + + for ( ;nblocks; nblocks-- ) + { + asm volatile ("movdqu %[inbuf], %%xmm0\n\t" + "pxor %%xmm5, %%xmm0\n\t" + "movdqa %%xmm5, %%xmm4\n\t" + + "pshufd $0x13, %%xmm5, %%xmm1\n\t" + "psrad $31, %%xmm1\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm1\n\t" + "pxor %%xmm1, %%xmm5\n\t" + : + : [inbuf] "m" (*inbuf) + : "memory" ); + + do_aesni_dec (ctx); + + asm volatile ("pxor %%xmm4, %%xmm0\n\t" + "movdqu %%xmm0, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : + : "memory" ); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + asm volatile ("movdqu %%xmm5, %[tweak]\n\t" + : [tweak] "=m" (*tweak) + : + : "memory" ); + + aesni_cleanup (); + aesni_cleanup_2_6 (); +} + + +void +_gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks, int encrypt) +{ + if (encrypt) + _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks); + else + _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks); +} + #endif /* USE_AESNI */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 8637195..548bfa0 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -103,6 +103,11 @@ extern void _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, int encrypt); extern void _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +extern void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, + unsigned char *tweak, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, int encrypt); #endif #ifdef USE_SSSE3 @@ -1467,6 +1472,85 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) } +/* Bulk encryption/decryption of complete blocks in XTS mode. 
*/ +void +_gcry_aes_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned int burn_depth = 0; + rijndael_cryptfn_t crypt_fn; + u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry; + + if (encrypt) + { + if (ctx->prefetch_enc_fn) + ctx->prefetch_enc_fn(); + + crypt_fn = ctx->encrypt_fn; + } + else + { + check_decryption_preparation (ctx); + + if (ctx->prefetch_dec_fn) + ctx->prefetch_dec_fn(); + + crypt_fn = ctx->decrypt_fn; + } + + if (0) + ; +#ifdef USE_AESNI + else if (ctx->use_aesni) + { + _gcry_aes_aesni_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt); + burn_depth = 0; + } +#endif /*USE_AESNI*/ + else + { + tweak_next_lo = buf_get_le64 (tweak + 0); + tweak_next_hi = buf_get_le64 (tweak + 8); + + while (nblocks) + { + tweak_lo = tweak_next_lo; + tweak_hi = tweak_next_hi; + + /* Xor-Encrypt/Decrypt-Xor block. */ + tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo; + tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi; + + buf_put_le64 (outbuf + 0, tmp_lo); + buf_put_le64 (outbuf + 8, tmp_hi); + + /* Generate next tweak. */ + carry = -(tweak_next_hi >> 63) & 0x87; + tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63); + tweak_next_lo = (tweak_next_lo << 1) ^ carry; + + burn_depth = crypt_fn (ctx, outbuf, outbuf); + + buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo); + buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi); + + outbuf += GCRY_XTS_BLOCK_LEN; + inbuf += GCRY_XTS_BLOCK_LEN; + nblocks--; + } + + buf_put_le64 (tweak + 0, tweak_next_lo); + buf_put_le64 (tweak + 8, tweak_next_hi); + } + + if (burn_depth) + _gcry_burn_stack (burn_depth + 5 * sizeof(void *)); +} + /* Run the self-tests for AES 128. Returns NULL on success. */ static const char* diff --git a/src/cipher.h b/src/cipher.h index a6f257d..7c2e5d9 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -164,6 +164,9 @@ size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +void _gcry_aes_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); /*-- blowfish.c --*/ void _gcry_blowfish_cfb_dec (void *context, unsigned char *iv, commit c9e9cb2eb6a1c659d3825ca627228b732f2f2152 Author: Jussi Kivilinna Date: Sat Jan 6 18:53:20 2018 +0200 AES-NI improvements for AMD64 * cipher/rijndael-aesni.c [__x86_64__] (aesni_prepare_7_15_variable) (aesni_prepare_7_15, aesni_cleanup_7_15, do_aesni_enc_vec8) (do_aesni_dec_vec8, do_aesni_ctr_8): New. (_gcry_aes_aesni_ctr_enc, _gcry_aes_aesni_cfb_dec) (_gcry_aes_aesni_cbc_dec, aesni_ocb_enc, aesni_ocb_dec) (_gcry_aes_aesni_ocb_auth) [__x86_64__]: Add 8 parallel blocks processing. 
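The eight-block counter setup in do_aesni_ctr_8 (added below) relies on a small shortcut: when the last byte of the big-endian counter is at most 0xf7, adding 1..8 cannot carry out of that byte, so the eight counter blocks can be produced with plain byte-wise adds (paddb against the be(1)..be(8) constants); only otherwise is a carry-propagating increment needed. A minimal plain-C sketch of that idea follows; the helper names be128_add and make_ctr_blocks are illustrative only and not part of libgcrypt.

#include <stdint.h>
#include <string.h>

/* Slow path: add a small value to a 128-bit big-endian counter,
 * propagating the carry through all 16 bytes. */
static void
be128_add (unsigned char r[16], const unsigned char ctr[16], unsigned int add)
{
  unsigned int carry = add;
  int i;

  memcpy (r, ctr, 16);
  for (i = 15; i >= 0 && carry; i--)
    {
      carry += r[i];
      r[i] = carry & 0xff;
      carry >>= 8;
    }
}

/* Produce CTR+1 .. CTR+8 for eight parallel blocks. */
static void
make_ctr_blocks (unsigned char blocks[8][16], const unsigned char ctr[16])
{
  unsigned int i;

  if (ctr[15] <= 0xf7)
    {
      /* Fast path: the additions only ever touch the last byte. */
      for (i = 0; i < 8; i++)
        {
          memcpy (blocks[i], ctr, 16);
          blocks[i][15] = (unsigned char)(ctr[15] + i + 1);
        }
    }
  else
    {
      /* Full carry propagation. */
      for (i = 0; i < 8; i++)
        be128_add (blocks[i], ctr, i + 1);
    }
}

The assembly below makes the same check once per call ("detect if 8-bit carry handling is needed") and, on the slow path, handles possible carries into the upper 64 bits separately so that the common case never leaves the SSE registers.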
-- Benchmarks on Intel Core i7-4790K, 4.0Ghz (no turbo, no HT): Before: AES | nanosecs/byte mebibytes/sec cycles/byte CBC dec | 0.175 ns/B 5448.7 MiB/s 0.700 c/B CFB dec | 0.174 ns/B 5466.2 MiB/s 0.698 c/B CTR enc | 0.182 ns/B 5226.0 MiB/s 0.730 c/B OCB enc | 0.194 ns/B 4913.9 MiB/s 0.776 c/B OCB dec | 0.200 ns/B 4769.2 MiB/s 0.800 c/B OCB auth | 0.172 ns/B 5545.0 MiB/s 0.688 c/B After (1.08x to 1.14x faster): AES | nanosecs/byte mebibytes/sec cycles/byte CBC dec | 0.157 ns/B 6075.6 MiB/s 0.628 c/B CFB dec | 0.158 ns/B 6034.1 MiB/s 0.632 c/B CTR enc | 0.159 ns/B 5979.4 MiB/s 0.638 c/B OCB enc | 0.175 ns/B 5447.1 MiB/s 0.700 c/B OCB dec | 0.183 ns/B 5203.9 MiB/s 0.733 c/B OCB auth | 0.156 ns/B 6101.3 MiB/s 0.625 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 735e5cd..3d323cf 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -55,6 +55,7 @@ typedef struct u128_s #ifdef __WIN64__ /* XMM6-XMM15 are callee-saved registers on WIN64. */ # define aesni_prepare_2_6_variable char win64tmp[16] +# define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9] # define aesni_prepare() do { } while (0) # define aesni_prepare_2_6() \ do { asm volatile ("movdqu %%xmm6, %0\n\t" \ @@ -62,6 +63,20 @@ typedef struct u128_s : \ : "memory"); \ } while (0) +# define aesni_prepare_7_15() \ + do { asm volatile ("movdqu %%xmm7, 0*16(%0)\n\t" \ + "movdqu %%xmm8, 1*16(%0)\n\t" \ + "movdqu %%xmm9, 2*16(%0)\n\t" \ + "movdqu %%xmm10, 3*16(%0)\n\t" \ + "movdqu %%xmm11, 4*16(%0)\n\t" \ + "movdqu %%xmm12, 5*16(%0)\n\t" \ + "movdqu %%xmm13, 6*16(%0)\n\t" \ + "movdqu %%xmm14, 7*16(%0)\n\t" \ + "movdqu %%xmm15, 8*16(%0)\n\t" \ + : \ + : "r" (win64tmp7_15) \ + : "memory"); \ + } while (0) # define aesni_cleanup() \ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ "pxor %%xmm1, %%xmm1\n" :: ); \ @@ -76,6 +91,20 @@ typedef struct u128_s : "m" (*win64tmp) \ : "memory"); \ } while (0) +# define aesni_cleanup_7_15() \ + do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t" \ + "movdqu 1*16(%0), %%xmm8\n\t" \ + "movdqu 2*16(%0), %%xmm9\n\t" \ + "movdqu 3*16(%0), %%xmm10\n\t" \ + "movdqu 4*16(%0), %%xmm11\n\t" \ + "movdqu 5*16(%0), %%xmm12\n\t" \ + "movdqu 6*16(%0), %%xmm13\n\t" \ + "movdqu 7*16(%0), %%xmm14\n\t" \ + "movdqu 8*16(%0), %%xmm15\n\t" \ + : \ + : "r" (win64tmp7_15) \ + : "memory"); \ + } while (0) #else # define aesni_prepare_2_6_variable # define aesni_prepare() do { } while (0) @@ -91,6 +120,21 @@ typedef struct u128_s "pxor %%xmm5, %%xmm5\n" \ "pxor %%xmm6, %%xmm6\n":: ); \ } while (0) +# ifdef __x86_64__ +# define aesni_prepare_7_15_variable +# define aesni_prepare_7_15() do { } while (0) +# define aesni_cleanup_7_15() \ + do { asm volatile ("pxor %%xmm7, %%xmm7\n\t" \ + "pxor %%xmm8, %%xmm8\n" \ + "pxor %%xmm9, %%xmm9\n" \ + "pxor %%xmm10, %%xmm10\n" \ + "pxor %%xmm11, %%xmm11\n" \ + "pxor %%xmm12, %%xmm12\n" \ + "pxor %%xmm13, %%xmm13\n" \ + "pxor %%xmm14, %%xmm14\n" \ + "pxor %%xmm15, %%xmm15\n":: ); \ + } while (0) +# endif #endif void @@ -704,6 +748,314 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx) } +#ifdef __x86_64__ + +/* Encrypt eight blocks using the Intel AES-NI instructions. Blocks are input + * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. 
*/ +static inline void +do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) +{ + asm volatile ("movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ + "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ + "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm0\n\t" + "cmpl $12, %[rounds]\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x20(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x30(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x40(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x50(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x60(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x70(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x80(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x90(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "jb .Ldeclast%=\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xb0(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, 
%%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "je .Ldeclast%=\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xd0(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + "aesenclast %%xmm0, %%xmm1\n\t" + "aesenclast %%xmm0, %%xmm2\n\t" + "aesenclast %%xmm0, %%xmm3\n\t" + "aesenclast %%xmm0, %%xmm4\n\t" + "aesenclast %%xmm0, %%xmm8\n\t" + "aesenclast %%xmm0, %%xmm9\n\t" + "aesenclast %%xmm0, %%xmm10\n\t" + "aesenclast %%xmm0, %%xmm11\n\t" + : /* no output */ + : [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +} + + +/* Decrypt eight blocks using the Intel AES-NI instructions. Blocks are input + * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */ +static inline void +do_aesni_dec_vec8 (const RIJNDAEL_context *ctx) +{ + asm volatile ("movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ + "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ + "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm0\n\t" + "cmpl $12, %[rounds]\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x20(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x30(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x40(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x50(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x60(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x70(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + 
"aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x80(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x90(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "jb .Ldeclast%=\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xb0(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "je .Ldeclast%=\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xd0(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + "aesdeclast %%xmm0, %%xmm1\n\t" + "aesdeclast %%xmm0, %%xmm2\n\t" + "aesdeclast %%xmm0, %%xmm3\n\t" + "aesdeclast %%xmm0, %%xmm4\n\t" + "aesdeclast %%xmm0, %%xmm8\n\t" + "aesdeclast %%xmm0, %%xmm9\n\t" + "aesdeclast %%xmm0, %%xmm10\n\t" + "aesdeclast %%xmm0, %%xmm11\n\t" + : /* no output */ + : [key] "r" (ctx->keyschdec), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +} + +#endif /* __x86_64__ */ + + /* Perform a CTR encryption round using the counter CTR and the input block A. Write the result to the output block B and update CTR. CTR needs to be a 16 byte aligned little-endian value. */ @@ -808,7 +1160,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx, #define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t" /* Register usage: - esi keyschedule + [key] keyschedule xmm0 CTR-0 xmm1 temp / round key xmm2 CTR-1 @@ -1003,6 +1355,327 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx, } +#ifdef __x86_64__ + +/* Eight blocks at a time variant of do_aesni_ctr. 
*/ +static void +do_aesni_ctr_8 (const RIJNDAEL_context *ctx, + unsigned char *ctr, unsigned char *b, const unsigned char *a) +{ + static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) = + { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 } + }; + const void *bige_addb = bige_addb_const; + + /* Register usage: + [key] keyschedule + xmm0 CTR-0 + xmm1 temp / round key + xmm2 CTR-1 + xmm3 CTR-2 + xmm4 CTR-3 + xmm5 copy of *ctr + xmm6 endian swapping mask + xmm8 CTR-4 + xmm9 CTR-5 + xmm10 CTR-6 + xmm11 CTR-7 + xmm12 temp + xmm13 temp + xmm14 temp + xmm15 temp + */ + + asm volatile (/* detect if 8-bit carry handling is needed */ + "cmpb $0xf7, 15(%[ctr])\n\t" + "ja .Ladd32bit%=\n\t" + + "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ + "movdqa 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) */ + "movdqa 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) */ + "movdqa 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) */ + "movdqa 3*16(%[addb]), %%xmm8\n\t" /* xmm8 := be(4) */ + "movdqa 4*16(%[addb]), %%xmm9\n\t" /* xmm9 := be(5) */ + "movdqa 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) */ + "movdqa 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) */ + "movdqa 7*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(8) */ + "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ + "paddb %%xmm0, %%xmm2\n\t" /* xmm2 := be(1) + CTR (xmm0) */ + "paddb %%xmm0, %%xmm3\n\t" /* xmm3 := be(2) + CTR (xmm0) */ + "paddb %%xmm0, %%xmm4\n\t" /* xmm4 := be(3) + CTR (xmm0) */ + "paddb %%xmm0, %%xmm8\n\t" /* xmm8 := be(4) + CTR (xmm0) */ + "paddb %%xmm0, %%xmm9\n\t" /* xmm9 := be(5) + CTR (xmm0) */ + "paddb %%xmm0, %%xmm10\n\t" /* xmm10 := be(6) + CTR (xmm0) */ + "paddb %%xmm0, %%xmm11\n\t" /* xmm11 := be(7) + CTR (xmm0) */ + "paddb %%xmm0, %%xmm5\n\t" /* xmm5 := be(8) + CTR (xmm0) */ + "jmp .Lstore_ctr%=\n\t" + + ".Ladd32bit%=:\n\t" + "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ + "movdqa %%xmm0, %%xmm2\n\t" + "pcmpeqd %%xmm1, %%xmm1\n\t" + "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ + + "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ + "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ + "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ + "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ + "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ + "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ + "movdqa %%xmm4, %%xmm8\n\t" /* xmm8 := xmm4 */ + "psubq %%xmm1, %%xmm8\n\t" /* xmm8++ */ + "movdqa %%xmm8, %%xmm9\n\t" /* xmm9 := xmm8 */ + "psubq %%xmm1, %%xmm9\n\t" /* xmm9++ */ + "movdqa %%xmm9, %%xmm10\n\t" /* xmm10 := xmm9 */ + "psubq %%xmm1, %%xmm10\n\t" /* xmm10++ */ + "movdqa %%xmm10, %%xmm11\n\t" /* xmm11 := xmm10 */ + "psubq %%xmm1, %%xmm11\n\t" /* xmm11++ */ + "movdqa %%xmm11, %%xmm5\n\t" /* xmm5 := xmm11 */ + "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ + + /* detect if 64-bit carry handling is needed */ + "cmpl $0xffffffff, 8(%[ctr])\n\t" + "jne .Lno_carry%=\n\t" + "movl 12(%[ctr]), %%esi\n\t" + "bswapl %%esi\n\t" + "cmpl $0xfffffff8, %%esi\n\t" + "jb .Lno_carry%=\n\t" /* no carry */ + + "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ + "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffff8 */ + "cmpl $0xfffffffa, %%esi\n\t" + "jb .Lcarry_xmm11%=\n\t" /* esi == 0xfffffff9 */ + "je 
.Lcarry_xmm10%=\n\t" /* esi == 0xfffffffa */ + "cmpl $0xfffffffc, %%esi\n\t" + "jb .Lcarry_xmm9%=\n\t" /* esi == 0xfffffffb */ + "je .Lcarry_xmm8%=\n\t" /* esi == 0xfffffffc */ + "cmpl $0xfffffffe, %%esi\n\t" + "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ + "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ + /* esi == 0xffffffff */ + + "psubq %%xmm1, %%xmm2\n\t" + ".Lcarry_xmm3%=:\n\t" + "psubq %%xmm1, %%xmm3\n\t" + ".Lcarry_xmm4%=:\n\t" + "psubq %%xmm1, %%xmm4\n\t" + ".Lcarry_xmm8%=:\n\t" + "psubq %%xmm1, %%xmm8\n\t" + ".Lcarry_xmm9%=:\n\t" + "psubq %%xmm1, %%xmm9\n\t" + ".Lcarry_xmm10%=:\n\t" + "psubq %%xmm1, %%xmm10\n\t" + ".Lcarry_xmm11%=:\n\t" + "psubq %%xmm1, %%xmm11\n\t" + ".Lcarry_xmm5%=:\n\t" + "psubq %%xmm1, %%xmm5\n\t" + + ".Lno_carry%=:\n\t" + "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ + + "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ + "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ + "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ + "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ + "pshufb %%xmm6, %%xmm8\n\t" /* xmm8 := be(xmm8) */ + "pshufb %%xmm6, %%xmm9\n\t" /* xmm9 := be(xmm9) */ + "pshufb %%xmm6, %%xmm10\n\t" /* xmm10 := be(xmm10) */ + "pshufb %%xmm6, %%xmm11\n\t" /* xmm11 := be(xmm11) */ + + ".Lstore_ctr%=:\n\t" + "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ + : + : [ctr] "r" (ctr), + [key] "r" (ctx->keyschenc), + [addb] "r" (bige_addb) + : "%esi", "cc", "memory"); + + asm volatile ("pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ + "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "pxor %%xmm1, %%xmm8\n\t" /* xmm8 ^= key[0] */ + "pxor %%xmm1, %%xmm9\n\t" /* xmm9 ^= key[0] */ + "pxor %%xmm1, %%xmm10\n\t" /* xmm10 ^= key[0] */ + "pxor %%xmm1, %%xmm11\n\t" /* xmm11 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm1\n\t" + "cmpl $12, %[rounds]\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x20(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x30(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x40(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x50(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x60(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + 
"movdqa 0x70(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x80(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0x90(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xa0(%[key]), %%xmm1\n\t" + "jb .Lenclast%=\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xb0(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xc0(%[key]), %%xmm1\n\t" + "je .Lenclast%=\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xd0(%[key]), %%xmm1\n\t" + "aesenc %%xmm1, %%xmm0\n\t" + "aesenc %%xmm1, %%xmm2\n\t" + "aesenc %%xmm1, %%xmm3\n\t" + "aesenc %%xmm1, %%xmm4\n\t" + "aesenc %%xmm1, %%xmm8\n\t" + "aesenc %%xmm1, %%xmm9\n\t" + "aesenc %%xmm1, %%xmm10\n\t" + "aesenc %%xmm1, %%xmm11\n\t" + "movdqa 0xe0(%[key]), %%xmm1\n" + + ".Lenclast%=:\n\t" + "aesenclast %%xmm1, %%xmm0\n\t" + "aesenclast %%xmm1, %%xmm2\n\t" + "aesenclast %%xmm1, %%xmm3\n\t" + "aesenclast %%xmm1, %%xmm4\n\t" + "aesenclast %%xmm1, %%xmm8\n\t" + "aesenclast %%xmm1, %%xmm9\n\t" + "aesenclast %%xmm1, %%xmm10\n\t" + "aesenclast %%xmm1, %%xmm11\n\t" + : + : [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); + + asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1. */ + "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2. */ + "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3. */ + "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4. */ + "movdqu 4*16(%[src]), %%xmm1\n\t" /* Get block 5. */ + "pxor %%xmm12, %%xmm0\n\t" /* EncCTR-1 ^= input */ + "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6. */ + "pxor %%xmm13, %%xmm2\n\t" /* EncCTR-2 ^= input */ + "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7. */ + "pxor %%xmm14, %%xmm3\n\t" /* EncCTR-3 ^= input */ + "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8. */ + "pxor %%xmm15, %%xmm4\n\t" /* EncCTR-4 ^= input */ + "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1 */ + "pxor %%xmm1, %%xmm8\n\t" /* EncCTR-5 ^= input */ + "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1 */ + "pxor %%xmm12, %%xmm9\n\t" /* EncCTR-6 ^= input */ + "movdqu %%xmm2, 1*16(%[dst])\n\t" /* Store block 2. */ + "pxor %%xmm13, %%xmm10\n\t" /* EncCTR-7 ^= input */ + "movdqu %%xmm3, 2*16(%[dst])\n\t" /* Store block 3. 
*/ + "pxor %%xmm14, %%xmm11\n\t" /* EncCTR-8 ^= input */ + "movdqu %%xmm4, 3*16(%[dst])\n\t" /* Store block 4. */ + "movdqu %%xmm8, 4*16(%[dst])\n\t" /* Store block 8. */ + "movdqu %%xmm9, 5*16(%[dst])\n\t" /* Store block 9. */ + "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10. */ + "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 11. */ + : + : [src] "r" (a), + [dst] "r" (b) + : "memory"); +} + +#endif /* __x86_64__ */ + + unsigned int _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) @@ -1123,7 +1796,25 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, [ctr] "m" (*ctr) : "memory"); - for ( ;nblocks > 3 ; nblocks -= 4 ) +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf); + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_7_15(); + } +#endif + + for ( ;nblocks >= 4 ; nblocks -= 4 ) { do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf); outbuf += 4*BLOCKSIZE; @@ -1175,6 +1866,76 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, : "memory" ); /* CFB decryption can be parallelized */ + +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + for ( ;nblocks >= 8; nblocks -= 8) + { + asm volatile + ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */ + "movdqu 0*16(%[inbuf]), %%xmm2\n\t" + "movdqu 1*16(%[inbuf]), %%xmm3\n\t" + "movdqu 2*16(%[inbuf]), %%xmm4\n\t" + "movdqu 3*16(%[inbuf]), %%xmm8\n\t" + "movdqu 4*16(%[inbuf]), %%xmm9\n\t" + "movdqu 5*16(%[inbuf]), %%xmm10\n\t" + "movdqu 6*16(%[inbuf]), %%xmm11\n\t" + + "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */ + + "movdqa %%xmm2, %%xmm12\n\t" + "movdqa %%xmm3, %%xmm13\n\t" + "movdqa %%xmm4, %%xmm14\n\t" + "movdqa %%xmm8, %%xmm15\n\t" + : /* No output */ + : [inbuf] "r" (inbuf) + : "memory"); + + do_aesni_enc_vec8 (ctx); + + asm volatile + ( + "pxor %%xmm12, %%xmm1\n\t" + "movdqu 4*16(%[inbuf]), %%xmm12\n\t" + "pxor %%xmm13, %%xmm2\n\t" + "movdqu 5*16(%[inbuf]), %%xmm13\n\t" + "pxor %%xmm14, %%xmm3\n\t" + "movdqu 6*16(%[inbuf]), %%xmm14\n\t" + "pxor %%xmm15, %%xmm4\n\t" + "movdqu 7*16(%[inbuf]), %%xmm15\n\t" + + "pxor %%xmm12, %%xmm8\n\t" + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + "pxor %%xmm13, %%xmm9\n\t" + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + "pxor %%xmm14, %%xmm10\n\t" + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + "pxor %%xmm15, %%xmm11\n\t" + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + + "movdqu %%xmm8, 4*16(%[outbuf])\n\t" + "movdqu %%xmm9, 5*16(%[outbuf])\n\t" + "movdqu %%xmm10, 6*16(%[outbuf])\n\t" + "movdqu %%xmm11, 7*16(%[outbuf])\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf), + [outbuf] "r" (outbuf) + : "memory"); + + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_7_15(); + } +#endif + for ( ;nblocks >= 4; nblocks -= 4) { asm volatile @@ -1260,7 +2021,76 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, : [iv] "m" (*iv) : "memory"); - for ( ;nblocks > 3 ; nblocks -= 4 ) +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + asm volatile + ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ + "movdqu 1*16(%[inbuf]), %%xmm2\n\t" + "movdqu 2*16(%[inbuf]), %%xmm3\n\t" + "movdqu 3*16(%[inbuf]), %%xmm4\n\t" + "movdqu 4*16(%[inbuf]), %%xmm8\n\t" + "movdqu 5*16(%[inbuf]), %%xmm9\n\t" + "movdqu 
6*16(%[inbuf]), %%xmm10\n\t" + "movdqu 7*16(%[inbuf]), %%xmm11\n\t" + + "movdqa %%xmm1, %%xmm12\n\t" + "movdqa %%xmm2, %%xmm13\n\t" + "movdqa %%xmm3, %%xmm14\n\t" + "movdqa %%xmm4, %%xmm15\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf) + : "memory"); + + do_aesni_dec_vec8 (ctx); + + asm volatile + ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */ + + "pxor %%xmm12, %%xmm2\n\t" /* xor IV with output */ + "movdqu 4*16(%[inbuf]), %%xmm12\n\t" + + "pxor %%xmm13, %%xmm3\n\t" /* xor IV with output */ + "movdqu 5*16(%[inbuf]), %%xmm13\n\t" + + "pxor %%xmm14, %%xmm4\n\t" /* xor IV with output */ + "movdqu 6*16(%[inbuf]), %%xmm14\n\t" + + "pxor %%xmm15, %%xmm8\n\t" /* xor IV with output */ + "movdqu 7*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm12, %%xmm9\n\t" /* xor IV with output */ + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + "pxor %%xmm13, %%xmm10\n\t" /* xor IV with output */ + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + "pxor %%xmm14, %%xmm11\n\t" /* xor IV with output */ + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + "movdqu %%xmm8, 4*16(%[outbuf])\n\t" + "movdqu %%xmm9, 5*16(%[outbuf])\n\t" + "movdqu %%xmm10, 6*16(%[outbuf])\n\t" + "movdqu %%xmm11, 7*16(%[outbuf])\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf), + [outbuf] "r" (outbuf) + : "memory"); + + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_7_15(); + } +#endif + + for ( ;nblocks >= 4 ; nblocks -= 4 ) { asm volatile ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ @@ -1386,7 +2216,142 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, outbuf += BLOCKSIZE; } - for ( ;nblocks > 3 ; nblocks -= 4 ) +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + asm volatile ("movdqu %[l0], %%xmm7\n\t" + : + : [l0] "m" (*c->u_mode.ocb.L[0]) + : "memory" ); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + n += 4; + l = ocb_get_l(c, n); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + + asm volatile ("movdqu %[l1], %%xmm10\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm1, %%xmm6\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqa %%xmm5, %%xmm12\n\t" + : + : [l1] "m" (*c->u_mode.ocb.L[1]), + [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm10, %%xmm5\n\t" + "pxor %%xmm2, %%xmm6\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqa %%xmm5, %%xmm13\n\t" + : + : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqa %%xmm5, %%xmm14\n\t" + : + : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm15\n\t" + "movdqu %[inbuf3], %%xmm4\n\t" + "pxor %%xmm15, %%xmm5\n\t" + "pxor %%xmm4, %%xmm6\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqa %%xmm5, %%xmm15\n\t" + : + : [l3] "m" (*l), + [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) + : "memory" ); + + n += 4; + l = ocb_get_l(c, n); + + asm volatile ("movdqu %[inbuf4], %%xmm8\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm8, %%xmm6\n\t" + "pxor %%xmm5, %%xmm8\n\t" + "movdqu %%xmm5, %[outbuf4]\n\t" + : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)) + : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[inbuf5], %%xmm9\n\t" + "pxor %%xmm10, %%xmm5\n\t" + "pxor %%xmm9, %%xmm6\n\t" + "pxor %%xmm5, %%xmm9\n\t" + "movdqu %%xmm5, 
%[outbuf5]\n\t" + : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)) + : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm10, %%xmm6\n\t" + "pxor %%xmm5, %%xmm10\n\t" + "movdqu %%xmm5, %[outbuf6]\n\t" + : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)) + : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l7], %%xmm11\n\t" + "pxor %%xmm11, %%xmm5\n\t" + "movdqu %[inbuf7], %%xmm11\n\t" + "pxor %%xmm11, %%xmm6\n\t" + "pxor %%xmm5, %%xmm11\n\t" + : + : [l7] "m" (*l), + [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)) + : "memory" ); + + do_aesni_enc_vec8 (ctx); + + asm volatile ("pxor %%xmm12, %%xmm1\n\t" + "pxor %%xmm13, %%xmm2\n\t" + "movdqu %[outbuf4],%%xmm0\n\t" + "movdqu %[outbuf5],%%xmm12\n\t" + "movdqu %[outbuf6],%%xmm13\n\t" + "pxor %%xmm14, %%xmm3\n\t" + "pxor %%xmm15, %%xmm4\n\t" + "pxor %%xmm0, %%xmm8\n\t" + "pxor %%xmm12, %%xmm9\n\t" + "pxor %%xmm13, %%xmm10\n\t" + "pxor %%xmm5, %%xmm11\n\t" + "movdqu %%xmm1, %[outbuf0]\n\t" + "movdqu %%xmm2, %[outbuf1]\n\t" + "movdqu %%xmm3, %[outbuf2]\n\t" + "movdqu %%xmm4, %[outbuf3]\n\t" + "movdqu %%xmm8, %[outbuf4]\n\t" + "movdqu %%xmm9, %[outbuf5]\n\t" + "movdqu %%xmm10, %[outbuf6]\n\t" + "movdqu %%xmm11, %[outbuf7]\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), + [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), + [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), + [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)), + [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)), + [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)), + [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE)) + : + : "memory" ); + + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_7_15(); + } +#endif + + for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = ocb_get_l(c, n); @@ -1394,9 +2359,9 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" + asm volatile ("movdqu %[l0], %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" @@ -1414,19 +2379,17 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, : [l1] "m" (*c->u_mode.ocb.L[1]), [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[inbuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" + asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) - : [l2] "m" (*c->u_mode.ocb.L[0]), - [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) + : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l3], %%xmm0\n\t" + asm volatile ("movdqu %[l3], %%xmm4\n\t" + "pxor %%xmm4, %%xmm5\n\t" "movdqu %[inbuf3], %%xmm4\n\t" - "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm4, %%xmm6\n\t" "pxor %%xmm5, %%xmm4\n\t" : @@ -1551,7 +2514,142 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, outbuf += BLOCKSIZE; } - for ( ;nblocks > 3 ; nblocks -= 4 ) +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + asm volatile ("movdqu %[l0], %%xmm7\n\t" + : + : [l0] "m" (*c->u_mode.ocb.L[0]) + : "memory" ); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + n += 4; + 
l = ocb_get_l(c, n); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + + asm volatile ("movdqu %[l1], %%xmm10\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqa %%xmm5, %%xmm12\n\t" + : + : [l1] "m" (*c->u_mode.ocb.L[1]), + [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm10, %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqa %%xmm5, %%xmm13\n\t" + : + : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqa %%xmm5, %%xmm14\n\t" + : + : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm0\n\t" + "movdqu %[inbuf3], %%xmm4\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqa %%xmm5, %%xmm15\n\t" + : + : [l3] "m" (*l), + [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) + : "memory" ); + + n += 4; + l = ocb_get_l(c, n); + + asm volatile ("movdqu %[inbuf4], %%xmm8\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm5, %%xmm8\n\t" + "movdqu %%xmm5, %[outbuf4]\n\t" + : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)) + : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[inbuf5], %%xmm9\n\t" + "pxor %%xmm10, %%xmm5\n\t" + "pxor %%xmm5, %%xmm9\n\t" + "movdqu %%xmm5, %[outbuf5]\n\t" + : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)) + : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm5, %%xmm10\n\t" + "movdqu %%xmm5, %[outbuf6]\n\t" + : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)) + : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l7], %%xmm0\n\t" + "movdqu %[inbuf7], %%xmm11\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm11\n\t" + : + : [l7] "m" (*l), + [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)) + : "memory" ); + + do_aesni_dec_vec8 (ctx); + + asm volatile ("pxor %%xmm12, %%xmm1\n\t" + "pxor %%xmm13, %%xmm2\n\t" + "movdqu %[outbuf4],%%xmm0\n\t" + "movdqu %[outbuf5],%%xmm12\n\t" + "movdqu %[outbuf6],%%xmm13\n\t" + "pxor %%xmm14, %%xmm3\n\t" + "pxor %%xmm15, %%xmm4\n\t" + "pxor %%xmm0, %%xmm8\n\t" + "pxor %%xmm12, %%xmm9\n\t" + "pxor %%xmm13, %%xmm10\n\t" + "pxor %%xmm5, %%xmm11\n\t" + "movdqu %%xmm1, %[outbuf0]\n\t" + "movdqu %%xmm2, %[outbuf1]\n\t" + "movdqu %%xmm3, %[outbuf2]\n\t" + "movdqu %%xmm4, %[outbuf3]\n\t" + "movdqu %%xmm8, %[outbuf4]\n\t" + "movdqu %%xmm9, %[outbuf5]\n\t" + "movdqu %%xmm10, %[outbuf6]\n\t" + "movdqu %%xmm11, %[outbuf7]\n\t" + "pxor %%xmm2, %%xmm1\n\t" + "pxor %%xmm4, %%xmm1\n\t" + "pxor %%xmm9, %%xmm1\n\t" + "pxor %%xmm11, %%xmm1\n\t" + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm8, %%xmm6\n\t" + "pxor %%xmm10, %%xmm6\n\t" + "pxor %%xmm1, %%xmm6\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), + [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), + [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), + [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)), + [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)), + [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)), + [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE)) + : + : "memory" ); + + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_7_15(); + } +#endif + + for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = ocb_get_l(c, n); @@ -1559,9 +2657,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, 
void *outbuf_arg, /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ /* Checksum_i = Checksum_{i-1} xor P_i */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" + asm volatile ("movdqu %[l0], %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) @@ -1577,14 +2675,12 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, : [l1] "m" (*c->u_mode.ocb.L[1]), [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[inbuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" + asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) - : [l2] "m" (*c->u_mode.ocb.L[0]), - [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) + : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l3], %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" @@ -1722,16 +2818,115 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, abuf += BLOCKSIZE; } - for ( ;nblocks > 3 ; nblocks -= 4 ) +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_7_15_variable; + + aesni_prepare_7_15(); + + asm volatile ("movdqu %[l0], %%xmm7\n\t" + "movdqu %[l1], %%xmm12\n\t" + : + : [l0] "m" (*c->u_mode.ocb.L[0]), + [l1] "m" (*c->u_mode.ocb.L[1]) + : "memory" ); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + n += 4; + l = ocb_get_l(c, n); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + asm volatile ("movdqu %[abuf0], %%xmm1\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + : + : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[abuf1], %%xmm2\n\t" + "pxor %%xmm12, %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + : + : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[abuf2], %%xmm3\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + : + : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm0\n\t" + "movdqu %[abuf3], %%xmm4\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + : + : [l3] "m" (*l), + [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) + : "memory" ); + + n += 4; + l = ocb_get_l(c, n); + + asm volatile ("movdqu %[abuf4], %%xmm8\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm5, %%xmm8\n\t" + : + : [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[abuf5], %%xmm9\n\t" + "pxor %%xmm12, %%xmm5\n\t" + "pxor %%xmm5, %%xmm9\n\t" + : + : [abuf5] "m" (*(abuf + 5 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[abuf6], %%xmm10\n\t" + "pxor %%xmm7, %%xmm5\n\t" + "pxor %%xmm5, %%xmm10\n\t" + : + : [abuf6] "m" (*(abuf + 6 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l7], %%xmm0\n\t" + "movdqu %[abuf7], %%xmm11\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm11\n\t" + : + : [l7] "m" (*l), + [abuf7] "m" (*(abuf + 7 * BLOCKSIZE)) + : "memory" ); + + do_aesni_enc_vec8 (ctx); + + asm volatile ("pxor %%xmm2, %%xmm1\n\t" + "pxor %%xmm3, %%xmm1\n\t" + "pxor %%xmm4, %%xmm1\n\t" + "pxor %%xmm8, %%xmm1\n\t" + "pxor %%xmm9, %%xmm6\n\t" + "pxor %%xmm10, %%xmm6\n\t" + "pxor %%xmm11, %%xmm6\n\t" + "pxor %%xmm1, %%xmm6\n\t" + : + : + : "memory" ); + + abuf += 8*BLOCKSIZE; + } + + aesni_cleanup_7_15(); + } +#endif + + for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; 
l = ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" + asm volatile ("movdqu %[l0], %%xmm4\n\t" "movdqu %[abuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), @@ -1745,9 +2940,8 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, : [l1] "m" (*c->u_mode.ocb.L[1]), [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[abuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" + asm volatile ("movdqu %[abuf2], %%xmm3\n\t" + "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" : : [l2] "m" (*c->u_mode.ocb.L[0]), ----------------------------------------------------------------------- Summary of changes: cipher/cipher-internal.h | 2 +- cipher/cipher-xts.c | 3 +- cipher/cipher.c | 1 + cipher/rijndael-aesni.c | 1539 +++++++++++++++++++++++++++++++++++++++++++++- cipher/rijndael.c | 84 +++ src/cipher.h | 3 + 6 files changed, 1603 insertions(+), 29 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From wk at gnupg.org Thu Jan 11 13:30:43 2018 From: wk at gnupg.org (Werner Koch) Date: Thu, 11 Jan 2018 13:30:43 +0100 Subject: [PATCH] Make BMI2 inline assembly check more robust In-Reply-To: <151551861550.5642.12750471651801313528.stgit@localhost.localdomain> (Jussi Kivilinna's message of "Tue, 09 Jan 2018 19:23:35 +0200") References: <151551861550.5642.12750471651801313528.stgit@localhost.localdomain> Message-ID: <87h8rsh91o.fsf@wheatstone.g10code.de> Hi, and thanks for working on all these nice performance improvements. We should slowly start to think about a release plan for 1.9. My idea is to have a new release sometime in spring; 1.8 was release last July. We have not done this for quite some time but I think that an announced beta release will make sense. This should help us to recover from build problems on non-major platforms before the actual release. Jussi: Do you have more optimization in mind for 1.9? Gniibe: Do you want to get any optimization for ECC or new curves into 1.9 or should we defer that for 1.10? Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 227 bytes Desc: not available URL: From jussi.kivilinna at iki.fi Thu Jan 11 22:42:53 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 11 Jan 2018 23:42:53 +0200 Subject: [PATCH] Make BMI2 inline assembly check more robust In-Reply-To: <87h8rsh91o.fsf@wheatstone.g10code.de> References: <151551861550.5642.12750471651801313528.stgit@localhost.localdomain> <87h8rsh91o.fsf@wheatstone.g10code.de> Message-ID: On 11.01.2018 14:30, Werner Koch wrote: > Hi, > > and thanks for working on all these nice performance improvements. > > We should slowly start to think about a release plan for 1.9. My idea > is to have a new release sometime in spring; 1.8 was release last July. > > We have not done this for quite some time but I think that an announced > beta release will make sense. This should help us to recover from build > problems on non-major platforms before the actual release. 
> > Jussi: Do you have more optimization in mind for 1.9? > I have AES XTS optimization patch for ARMv8 coming later this week. -Jussi From wk at gnupg.org Fri Jan 12 10:01:05 2018 From: wk at gnupg.org (Werner Koch) Date: Fri, 12 Jan 2018 10:01:05 +0100 Subject: [PATCH] Make BMI2 inline assembly check more robust In-Reply-To: (Jussi Kivilinna's message of "Thu, 11 Jan 2018 23:42:53 +0200") References: <151551861550.5642.12750471651801313528.stgit@localhost.localdomain> <87h8rsh91o.fsf@wheatstone.g10code.de> Message-ID: <87o9lzfo32.fsf@wheatstone.g10code.de> On Thu, 11 Jan 2018 22:42, jussi.kivilinna at iki.fi said: > I have AES XTS optimization patch for ARMv8 coming later this week. No need to hurry. I think it will also be useful to implement EAX mode because we will need this for RFC4880bis. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 227 bytes Desc: not available URL: From jussi.kivilinna at iki.fi Fri Jan 12 18:32:27 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 12 Jan 2018 19:32:27 +0200 Subject: [PATCH] Add ARMv8/CE acceleration for AES-XTS Message-ID: <151577834710.21567.1458806439907940871.stgit@localhost.localdomain> * cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_xts_enc_armv8_ce) (_gcry_aes_xts_dec_armv8_ce): New. * cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_xts_enc_armv8_ce) (_gcry_aes_xts_dec_armv8_ce): New. * cipher/rijndael-armv8-ce.c (_gcry_aes_xts_enc_armv8_ce) (_gcry_aes_xts_dec_armv8_ce, xts_crypt_fn_t) (_gcry_aes_armv8_ce_xts_crypt): New. * cipher/rijndael.c (_gcry_aes_armv8_ce_xts_crypt): New. (_gcry_aes_xts_crypt) [USE_ARM_CE]: New. 
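The tweak handling in these new XTS routines is the usual doubling in GF(2^128): before each block the 128-bit tweak is multiplied by x, i.e. shifted left one bit and reduced with the polynomial x^128 + x^7 + x^2 + x + 1, hence the 0x87 constant loaded into the gfmul mask below. As a plain-C reference for what the per-tweak vshr/vadd/vand/veor sequences compute, assuming the tweak is held as two little-endian 64-bit halves (the function name xts_gfmul_x is illustrative only):

#include <stdint.h>

/* Multiply the 128-bit tweak (lo, hi) by x in GF(2^128), reducing by
 * x^128 + x^7 + x^2 + x + 1.  Branch-free: carry is either 0 or 0x87. */
static void
xts_gfmul_x (uint64_t *lo, uint64_t *hi)
{
  uint64_t carry = -(*hi >> 63) & 0x87;

  *hi = (*hi << 1) | (*lo >> 63);
  *lo = (*lo << 1) ^ carry;
}

This is the same update as in the plain-C _gcry_aes_xts_crypt fallback quoted earlier in this archive; the four-block paths in the assembly simply precompute the next tweaks so the GF doublings can overlap the AES rounds.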
-- Benchmark on Cortex-A53 (AArch64, 1152 Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 4.88 ns/B 195.5 MiB/s 5.62 c/B XTS dec | 4.94 ns/B 192.9 MiB/s 5.70 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 5.55 ns/B 171.8 MiB/s 6.39 c/B XTS dec | 5.61 ns/B 169.9 MiB/s 6.47 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 6.22 ns/B 153.3 MiB/s 7.17 c/B XTS dec | 6.29 ns/B 151.7 MiB/s 7.24 c/B = After (~2.6x faster): AES | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 1.83 ns/B 520.9 MiB/s 2.11 c/B XTS dec | 1.82 ns/B 524.9 MiB/s 2.09 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 1.97 ns/B 483.3 MiB/s 2.27 c/B XTS dec | 1.96 ns/B 486.9 MiB/s 2.26 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 2.11 ns/B 450.9 MiB/s 2.44 c/B XTS dec | 2.10 ns/B 453.8 MiB/s 2.42 c/B = Benchmark on Cortex-A53 (AArch32, 1152 Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 6.52 ns/B 146.2 MiB/s 7.51 c/B XTS dec | 6.57 ns/B 145.2 MiB/s 7.57 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 7.10 ns/B 134.3 MiB/s 8.18 c/B XTS dec | 7.11 ns/B 134.2 MiB/s 8.19 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 7.30 ns/B 130.7 MiB/s 8.41 c/B XTS dec | 7.38 ns/B 129.3 MiB/s 8.50 c/B = After (~2.7x faster): Cipher: AES | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 2.33 ns/B 409.6 MiB/s 2.68 c/B XTS dec | 2.35 ns/B 405.3 MiB/s 2.71 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 2.53 ns/B 377.6 MiB/s 2.91 c/B XTS dec | 2.54 ns/B 375.5 MiB/s 2.93 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 2.75 ns/B 346.8 MiB/s 3.17 c/B XTS dec | 2.76 ns/B 345.2 MiB/s 3.18 c/B = Signed-off-by: Jussi Kivilinna --- cipher/rijndael-armv8-aarch32-ce.S | 311 ++++++++++++++++++++++++++++++++++++ cipher/rijndael-armv8-aarch64-ce.S | 274 ++++++++++++++++++++++++++++++++ cipher/rijndael-armv8-ce.c | 28 +++ cipher/rijndael.c | 12 + 4 files changed, 625 insertions(+) diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 5c8fa3c09..66440bd4e 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -1517,6 +1517,317 @@ _gcry_aes_ocb_auth_armv8_ce: .size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce; + +/* + * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_enc_armv8_ce +.type _gcry_aes_xts_enc_armv8_ce,%function; +_gcry_aes_xts_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 + beq .Lxts_enc_skip + + cmp r5, #12 + + vld1.8 {q0}, [r3] /* load tweak */ + mov r7, #0x87; + + aes_preload_keys(r0, r6); + + beq .Lxts_enc_entry_192 + bhi .Lxts_enc_entry_256 + +#define CTR_XTS(bits, ...) 
\ + .Lxts_enc_entry_##bits: \ + cmp r4, #4; \ + blo .Lxts_enc_loop_##bits; \ + \ + .Lxts_enc_loop4_##bits: \ + sub r4, r4, #4; \ + veor q9, q9, q9; \ + \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + veor q1, q1, q0; \ + cmp r4, #4; \ + vmov.u32 d18[0], r7; \ + vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + veor q2, q2, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q3, q3, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q4, q4, q0; \ + vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ + sub r1, r1, #48; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ + sub r1, r1, #32; \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lxts_enc_loop4_##bits; \ + cmp r4, #0; \ + beq .Lxts_enc_done; \ + \ + .Lxts_enc_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + \ + veor q9, q9, q9; \ + veor q1, q1, q0; \ + vmov.u32 d18[0], r7; \ + vmov q2, q0; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q2; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lxts_enc_loop_##bits; \ + b .Lxts_enc_done; + + CTR_XTS(128re, r0, r6) + CTR_XTS(192, r0, r6) + CTR_XTS(256, r0, r6) + +#undef CTR_XTS + +.Lxts_enc_done: + vst1.8 {q0}, [r3] /* store tweak */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lxts_enc_skip: + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce; + + +/* + * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_dec_armv8_ce +.type _gcry_aes_xts_dec_armv8_ce,%function; +_gcry_aes_xts_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 + beq .Lxts_dec_skip + + cmp r5, #12 + + vld1.8 {q0}, [r3] /* load tweak */ + mov r7, #0x87; + + aes_preload_keys(r0, r6); + + beq .Lxts_dec_entry_192 + bhi .Lxts_dec_entry_256 + +#define CTR_XTS(bits, ...) 
\ + .Lxts_dec_entry_##bits: \ + cmp r4, #4; \ + blo .Lxts_dec_loop_##bits; \ + \ + .Lxts_dec_loop4_##bits: \ + sub r4, r4, #4; \ + veor q9, q9, q9; \ + \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + veor q1, q1, q0; \ + cmp r4, #4; \ + vmov.u32 d18[0], r7; \ + vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + veor q2, q2, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q3, q3, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q4, q4, q0; \ + vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ + sub r1, r1, #48; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ + sub r1, r1, #32; \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lxts_dec_loop4_##bits; \ + cmp r4, #0; \ + beq .Lxts_dec_done; \ + \ + .Lxts_dec_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + \ + veor q9, q9, q9; \ + veor q1, q1, q0; \ + vmov.u32 d18[0], r7; \ + vmov q2, q0; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q2; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lxts_dec_loop_##bits; \ + b .Lxts_dec_done; + + CTR_XTS(128re, r0, r6) + CTR_XTS(192, r0, r6) + CTR_XTS(256, r0, r6) + +#undef CTR_XTS + +.Lxts_dec_done: + vst1.8 {q0}, [r3] /* store tweak */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lxts_dec_skip: + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce; + + /* * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); */ diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 708ef340d..40097a710 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -1276,6 +1276,280 @@ _gcry_aes_ocb_auth_armv8_ce: .size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce; +/* + * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *tweak, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_enc_armv8_ce +.type _gcry_aes_xts_enc_armv8_ce,%function; +_gcry_aes_xts_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: tweak + * x4: nblocks + * w5: nrounds + */ + + cbz x4, .Lxts_enc_skip + + /* load tweak */ + ld1 {v0.16b}, [x3] + + /* load gfmul mask */ + mov x6, #0x87 + mov x7, #0x01 + mov v16.D[0], x6 + mov v16.D[1], x7 + + aes_preload_keys(x0, w5); + + b.eq .Lxts_enc_entry_192 + b.hi 
.Lxts_enc_entry_256 + +#define XTS_ENC(bits) \ + .Lxts_enc_entry_##bits: \ + cmp x4, #4; \ + b.lo .Lxts_enc_loop_##bits; \ + \ + .Lxts_enc_loop4_##bits: \ + \ + ext v4.16b, v0.16b, v0.16b, #8; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v5.2d, v0.2d, v0.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v5.16b, v5.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v6.2d, v5.2d, v5.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v6.16b, v6.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v7.2d, v6.2d, v6.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v7.16b, v7.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v3.2d, v7.2d, v7.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v3.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ + st1 {v3.16b}, [x3]; \ + sub x4, x4, #4; \ + eor v1.16b, v1.16b, v0.16b; \ + \ + ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ + cmp x4, #4; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + \ + do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + \ + eor v1.16b, v1.16b, v0.16b; \ + ld1 {v0.16b}, [x3]; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lxts_enc_loop4_##bits; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + CLEAR_REG(v7); \ + cbz x4, .Lxts_enc_done; \ + \ + .Lxts_enc_loop_##bits: \ + \ + ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + ext v3.16b, v0.16b, v0.16b, #8; \ + mov v2.16b, v0.16b; \ + sshr v3.2d, v3.2d, #63; \ + add v0.2d, v0.2d, v0.2d; \ + and v3.16b, v3.16b, v16.16b; \ + eor v1.16b, v1.16b, v2.16b; \ + eor v0.16b, v0.16b, v3.16b; \ + sub x4, x4, #1; \ + \ + do_aes_one##bits(e, mc, v1, v1); \ + \ + eor v1.16b, v1.16b, v2.16b; \ + st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + \ + cbnz x4, .Lxts_enc_loop_##bits; \ + b .Lxts_enc_done; + + XTS_ENC(128) + XTS_ENC(192) + XTS_ENC(256) + +#undef XTS_ENC + +.Lxts_enc_done: + aes_clear_keys(w5) + + st1 {v0.16b}, [x3] /* store tweak */ + + CLEAR_REG(v0) + CLEAR_REG(v1) + CLEAR_REG(v2) + +.Lxts_enc_skip: + ret + +.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce; + + +/* + * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *tweak, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_dec_armv8_ce +.type _gcry_aes_xts_dec_armv8_ce,%function; +_gcry_aes_xts_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: tweak + * x4: nblocks + * w5: nrounds + */ + + cbz x4, .Lxts_dec_skip + + /* load tweak */ + ld1 {v0.16b}, [x3] + + /* load gfmul mask */ + mov x6, #0x87 + mov x7, #0x01 + mov v16.D[0], x6 + mov v16.D[1], x7 + + aes_preload_keys(x0, w5); + + b.eq .Lxts_dec_entry_192 + b.hi .Lxts_dec_entry_256 + +#define XTS_DEC(bits) \ + .Lxts_dec_entry_##bits: \ + cmp x4, #4; \ + b.lo .Lxts_dec_loop_##bits; \ + \ + .Lxts_dec_loop4_##bits: \ + \ + ext v4.16b, v0.16b, v0.16b, #8; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v5.2d, v0.2d, v0.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v5.16b, v5.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v6.2d, v5.2d, v5.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v6.16b, v6.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v7.2d, v6.2d, v6.2d; \ + and 
v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v7.16b, v7.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v3.2d, v7.2d, v7.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v3.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ + st1 {v3.16b}, [x3]; \ + sub x4, x4, #4; \ + eor v1.16b, v1.16b, v0.16b; \ + \ + ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ + cmp x4, #4; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + \ + do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ + \ + eor v1.16b, v1.16b, v0.16b; \ + ld1 {v0.16b}, [x3]; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lxts_dec_loop4_##bits; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + CLEAR_REG(v7); \ + cbz x4, .Lxts_dec_done; \ + \ + .Lxts_dec_loop_##bits: \ + \ + ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + ext v3.16b, v0.16b, v0.16b, #8; \ + mov v2.16b, v0.16b; \ + sshr v3.2d, v3.2d, #63; \ + add v0.2d, v0.2d, v0.2d; \ + and v3.16b, v3.16b, v16.16b; \ + eor v1.16b, v1.16b, v2.16b; \ + eor v0.16b, v0.16b, v3.16b; \ + sub x4, x4, #1; \ + \ + do_aes_one##bits(d, imc, v1, v1); \ + \ + eor v1.16b, v1.16b, v2.16b; \ + st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + \ + cbnz x4, .Lxts_dec_loop_##bits; \ + b .Lxts_dec_done; + + XTS_DEC(128) + XTS_DEC(192) + XTS_DEC(256) + +#undef XTS_DEC + +.Lxts_dec_done: + aes_clear_keys(w5) + + st1 {v0.16b}, [x3] /* store tweak */ + + CLEAR_REG(v0) + CLEAR_REG(v1) + CLEAR_REG(v2) + +.Lxts_dec_skip: + ret + +.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce; + + /* * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); */ diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index 334cf6848..6af7108f8 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -101,6 +101,16 @@ extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, size_t nblocks, unsigned int nrounds, unsigned int blkn); +extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *tweak, + size_t nblocks, unsigned int nrounds); +extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *tweak, + size_t nblocks, unsigned int nrounds); typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, @@ -108,6 +118,11 @@ typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); +typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *tweak, size_t nblocks, + unsigned int nrounds); + void _gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key) { @@ -361,4 +376,17 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg, nblocks, nrounds, (unsigned int)blkn); } +void +_gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks, int encrypt) +{ + const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; + xts_crypt_fn_t crypt_fn = encrypt ? 
_gcry_aes_xts_enc_armv8_ce + : _gcry_aes_xts_dec_armv8_ce; + unsigned int nrounds = ctx->rounds; + + crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds); +} + #endif /* USE_ARM_CE */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index df1363f28..0f676fe14 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -211,6 +211,11 @@ extern void _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, int encrypt); extern void _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +extern void _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, + unsigned char *tweak, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, int encrypt); #endif /*USE_ARM_ASM*/ static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, @@ -1473,6 +1478,13 @@ _gcry_aes_xts_crypt (void *context, unsigned char *tweak, burn_depth = 0; } #endif /*USE_AESNI*/ +#ifdef USE_ARM_CE + else if (ctx->use_arm_ce) + { + _gcry_aes_armv8_ce_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt); + burn_depth = 0; + } +#endif /*USE_ARM_CE*/ else { tweak_next_lo = buf_get_le64 (tweak + 0); From cvs at cvs.gnupg.org Fri Jan 12 18:36:54 2018 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Fri, 12 Jan 2018 18:36:54 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-35-gc3d60ac Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via c3d60acc3ab5c6d60c2258882175bf31351cc998 (commit) via a518b6680ea80a4325731028545a701c1d71fc02 (commit) via 135250e3060e79be698d4f36a819aa8a880789f8 (commit) from a00c5b2988cea256c7823a76ce601febf02c790f (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit c3d60acc3ab5c6d60c2258882175bf31351cc998 Author: Jussi Kivilinna Date: Sat Jan 6 23:21:44 2018 +0200 rijndael-ssse3: call assembly functions directly * cipher/rijndael-ssse3-amd64-asm.S (_gcry_aes_ssse3_enc_preload) (_gcry_aes_ssse3_dec_preload, _gcry_aes_ssse3_encrypt_core) (_gcry_aes_ssse3_decrypt_core, _gcry_aes_schedule_core): Add ENTER_SYSV_FUNC_PARAMS_* at function entry and EXIT_SYSV_FUNC at exit. (_gcry_aes_ssse3_encrypt_core, _gcry_aes_ssse3_decrypt_core): Change to input parameters to RDI and RSI registers. * cipher/rijndael-ssse3-amd64.c (_gcry_aes_ssse3_encrypt_core) (_gcry_aes_ssse3_decrypt_core, _gcry_aes_schedule_core): Add parameters for function prototypes. (PUSH_STACK_PTR, POP_STACK_PTR): Remove. (vpaes_ssse3_prepare_enc, vpaes_ssse3_prepare_dec) (_gcry_aes_ssse3_do_setkey, _gcry_aes_ssse3_prepare_decryption) (do_vpaes_ssse3_enc, do_vpaes_ssse3_dec): Remove inline assembly to call functions, and call directly instead. -- Instead of using inline assembly to call assembly functions in AES SSSE3 implementation, change assembly functions so that they can be called directly instead. 
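In C terms, the change at each call site looks like the following sketch. The "before" fragment is the inline-assembly trampoline pattern removed from do_vpaes_ssse3_enc in rijndael-ssse3-amd64.c (shown here without the PUSH_STACK_PTR/POP_STACK_PTR fixup macros), and the "after" lines show the direct call that replaces it; this is an illustrative fragment, not a standalone compilable unit:

  /* Before: no usable C prototype, so the helper was reached through an
   * indirect call in inline assembly with arguments pinned to registers. */
  unsigned int middle_rounds = nrounds - 1;
  const void *keysched = ctx->keyschenc32;
  asm volatile ("callq *%q[core]" "\n\t"
                : "+a" (middle_rounds), "+d" (keysched)
                : [core] "r" (_gcry_aes_ssse3_encrypt_core)
                : "rcx", "rsi", "rdi", "cc", "memory");

  /* After: the .S side now sets up the registers itself, so an ordinary
   * prototype and call suffice. */
  extern void _gcry_aes_ssse3_encrypt_core (const void *key, u64 nrounds);
  _gcry_aes_ssse3_encrypt_core (ctx->keyschenc32, nrounds);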
Signed-off-by: Jussi Kivilinna diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index 3ae55e8..ffce5df 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S @@ -40,11 +40,7 @@ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -# define ELF(...) -#else -# define ELF(...) __VA_ARGS__ -#endif +#include "asm-common-amd64.h" .text @@ -54,6 +50,7 @@ ELF(.type _gcry_aes_ssse3_enc_preload, at function) .globl _gcry_aes_ssse3_enc_preload _gcry_aes_ssse3_enc_preload: + ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F movdqa .Lk_inv (%rax), %xmm10 # inv @@ -62,6 +59,7 @@ _gcry_aes_ssse3_enc_preload: movdqa .Lk_sb1+16(%rax), %xmm12 # sb1t movdqa .Lk_sb2 (%rax), %xmm15 # sb2u movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t + EXIT_SYSV_FUNC ret ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) @@ -71,6 +69,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ELF(.type _gcry_aes_ssse3_dec_preload, at function) .globl _gcry_aes_ssse3_dec_preload _gcry_aes_ssse3_dec_preload: + ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F movdqa .Lk_inv (%rax), %xmm10 # inv @@ -80,6 +79,7 @@ _gcry_aes_ssse3_dec_preload: movdqa .Lk_dsbd (%rax), %xmm15 # sbdu movdqa .Lk_dsbb (%rax), %xmm14 # sbbu movdqa .Lk_dsbe (%rax), %xmm8 # sbeu + EXIT_SYSV_FUNC ret ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) @@ -98,11 +98,11 @@ ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in .Laes_preheat -## (%rdx) = scheduled keys -## %rax = nrounds - 1 +## (%rdi) = scheduled keys +## %rsi = nrounds ## ## Output in %xmm0 -## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx +## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx, %rdx ## Preserves %xmm6 - %xmm7 so you get some local vectors ## ## @@ -111,6 +111,9 @@ ELF(.type _gcry_aes_ssse3_encrypt_core, at function) .globl _gcry_aes_ssse3_encrypt_core _gcry_aes_ssse3_encrypt_core: _aes_encrypt_core: + ENTER_SYSV_FUNC_PARAMS_0_4 + mov %rdi, %rdx + leaq -1(%rsi), %rax lea .Laes_consts(%rip), %rcx leaq .Lk_mc_backward(%rcx), %rdi mov $16, %rsi @@ -185,6 +188,7 @@ _aes_encrypt_core: pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 + EXIT_SYSV_FUNC ret ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) @@ -198,8 +202,11 @@ ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) ELF(.type _gcry_aes_ssse3_decrypt_core, at function) _gcry_aes_ssse3_decrypt_core: _aes_decrypt_core: + ENTER_SYSV_FUNC_PARAMS_0_4 + mov %rdi, %rdx lea .Laes_consts(%rip), %rcx - movl %eax, %esi + subl $1, %esi + movl %esi, %eax shll $4, %esi xorl $48, %esi andl $48, %esi @@ -288,6 +295,7 @@ _aes_decrypt_core: pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 + EXIT_SYSV_FUNC ret ELF(.size _aes_decrypt_core,.-_aes_decrypt_core) @@ -306,6 +314,8 @@ _aes_schedule_core: # rsi = size in bits # rdx = buffer # rcx = direction. 
0=encrypt, 1=decrypt + # r8 = rotoffs + ENTER_SYSV_FUNC_PARAMS_5 # load the tables lea .Laes_consts(%rip), %r10 @@ -659,8 +669,9 @@ _aes_schedule_core: pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 pxor %xmm8, %xmm8 + EXIT_SYSV_FUNC ret -ELF(.size _aes_schedule_core,.-_aes_schedule_core) +ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) ######################################################## ## ## diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c index da5339e..98660ec 100644 --- a/cipher/rijndael-ssse3-amd64.c +++ b/cipher/rijndael-ssse3-amd64.c @@ -58,13 +58,14 @@ /* Assembly functions in rijndael-ssse3-amd64-asm.S. Note that these - have custom calling convention and need to be called from assembly - blocks, not directly. */ + have custom calling convention (additional XMM parameters). */ extern void _gcry_aes_ssse3_enc_preload(void); extern void _gcry_aes_ssse3_dec_preload(void); -extern void _gcry_aes_ssse3_schedule_core(void); -extern void _gcry_aes_ssse3_encrypt_core(void); -extern void _gcry_aes_ssse3_decrypt_core(void); +extern void _gcry_aes_ssse3_schedule_core(const void *key, u64 keybits, + void *buffer, u64 decrypt, + u64 rotoffs); +extern void _gcry_aes_ssse3_encrypt_core(const void *key, u64 nrounds); +extern void _gcry_aes_ssse3_decrypt_core(const void *key, u64 nrounds); @@ -110,8 +111,6 @@ extern void _gcry_aes_ssse3_decrypt_core(void); : \ : "r" (ssse3_state) \ : "memory" ) -# define PUSH_STACK_PTR -# define POP_STACK_PTR #else # define SSSE3_STATE_SIZE 1 # define vpaes_ssse3_prepare() (void)ssse3_state @@ -126,31 +125,15 @@ extern void _gcry_aes_ssse3_decrypt_core(void); "pxor %%xmm7, %%xmm7 \n\t" \ "pxor %%xmm8, %%xmm8 \n\t" \ ::: "memory" ) -/* Old GCC versions use red-zone of AMD64 SYSV ABI and stack pointer is - * not properly adjusted for assembly block. Therefore stack pointer - * needs to be manually corrected. */ -# define PUSH_STACK_PTR "subq $128, %%rsp;\n\t" -# define POP_STACK_PTR "addq $128, %%rsp;\n\t" #endif #define vpaes_ssse3_prepare_enc() \ vpaes_ssse3_prepare(); \ - asm volatile (PUSH_STACK_PTR \ - "callq *%q[core] \n\t" \ - POP_STACK_PTR \ - : \ - : [core] "r" (_gcry_aes_ssse3_enc_preload) \ - : "rax", "cc", "memory" ) + _gcry_aes_ssse3_enc_preload(); #define vpaes_ssse3_prepare_dec() \ vpaes_ssse3_prepare(); \ - asm volatile (PUSH_STACK_PTR \ - "callq *%q[core] \n\t" \ - POP_STACK_PTR \ - : \ - : [core] "r" (_gcry_aes_ssse3_dec_preload) \ - : "rax", "cc", "memory" ) - + _gcry_aes_ssse3_dec_preload(); void @@ -161,23 +144,7 @@ _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key) vpaes_ssse3_prepare(); - asm volatile ("leaq %q[key], %%rdi" "\n\t" - "movl %[bits], %%esi" "\n\t" - "leaq %[buf], %%rdx" "\n\t" - "movl %[dir], %%ecx" "\n\t" - "movl %[rotoffs], %%r8d" "\n\t" - PUSH_STACK_PTR - "callq *%q[core]" "\n\t" - POP_STACK_PTR - : - : [core] "r" (&_gcry_aes_ssse3_schedule_core), - [key] "m" (*key), - [bits] "g" (keybits), - [buf] "m" (ctx->keyschenc32[0][0]), - [dir] "g" (0), - [rotoffs] "g" (48) - : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi", - "cc", "memory"); + _gcry_aes_ssse3_schedule_core(key, keybits, &ctx->keyschenc32[0][0], 0, 48); /* Save key for setting up decryption. 
*/ if (keybits > 192) @@ -216,23 +183,9 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx) vpaes_ssse3_prepare(); - asm volatile ("leaq %q[key], %%rdi" "\n\t" - "movl %[bits], %%esi" "\n\t" - "leaq %[buf], %%rdx" "\n\t" - "movl %[dir], %%ecx" "\n\t" - "movl %[rotoffs], %%r8d" "\n\t" - PUSH_STACK_PTR - "callq *%q[core]" "\n\t" - POP_STACK_PTR - : - : [core] "r" (_gcry_aes_ssse3_schedule_core), - [key] "m" (ctx->keyschdec32[0][0]), - [bits] "g" (keybits), - [buf] "m" (ctx->keyschdec32[ctx->rounds][0]), - [dir] "g" (1), - [rotoffs] "g" ((keybits == 192) ? 0 : 32) - : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi", - "cc", "memory"); + _gcry_aes_ssse3_schedule_core(&ctx->keyschdec32[0][0], keybits, + &ctx->keyschdec32[ctx->rounds][0], 1, + (keybits == 192) ? 0 : 32); vpaes_ssse3_cleanup(); } @@ -243,15 +196,7 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx) static inline void do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds) { - unsigned int middle_rounds = nrounds - 1; - const void *keysched = ctx->keyschenc32; - - asm volatile (PUSH_STACK_PTR - "callq *%q[core]" "\n\t" - POP_STACK_PTR - : "+a" (middle_rounds), "+d" (keysched) - : [core] "r" (_gcry_aes_ssse3_encrypt_core) - : "rcx", "rsi", "rdi", "cc", "memory"); + _gcry_aes_ssse3_encrypt_core(ctx->keyschenc32, nrounds); } @@ -260,15 +205,7 @@ do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds) static inline void do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds) { - unsigned int middle_rounds = nrounds - 1; - const void *keysched = ctx->keyschdec32; - - asm volatile (PUSH_STACK_PTR - "callq *%q[core]" "\n\t" - POP_STACK_PTR - : "+a" (middle_rounds), "+d" (keysched) - : [core] "r" (_gcry_aes_ssse3_decrypt_core) - : "rcx", "rsi", "cc", "memory"); + _gcry_aes_ssse3_decrypt_core(ctx->keyschdec32, nrounds); } commit a518b6680ea80a4325731028545a701c1d71fc02 Author: Jussi Kivilinna Date: Sat Jan 6 22:19:56 2018 +0200 Move AMD64 MS to SysV calling convention conversion to assembly side * cipher/Makefile.am: Add 'asm-common-amd64.h'. * cipher/asm-common-amd64.h: New. * cipher/blowfish-amd64.S: Add ENTER_SYSV_FUNC_* and EXIT_SYSV_FUNC for each global function from 'asm-common-amd64.h'. * cipher/cast5-amd64.S: Ditto. * cipher/des-amd64.S: Ditto. * cipher/rijndael-amd64.S: Ditto. * cipher/twofish-amd64.S: Ditto. * cipher/arcfour-amd64.S: Ditto. * cipher/blowfish.c [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): Remove. * cipher/cast5.c [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): Remove. * cipher/twofish.c [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn, call_sysv_fn5, call_sysv_fn6): Remove. * cipher/rijndael.c (do_encrypt, do_decrypt) [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]: Remove assembly block for calling SysV ABI function. * cipher/arcfour.c [USE_AMD64_ASM] (encrypt_stream): Ditto. -- Old approach was to convert MS ABI to SysV ABI calling convention for AMD64 assembly functions at caller side. This patch moves calling convention conversion to assembly/callee side. 
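The conversion itself is a small register shuffle done once at function entry and exit by the new macros in cipher/asm-common-amd64.h (added below): Win64 passes the first four integer arguments in rcx, rdx, r8 and r9, while the SysV AMD64 ABI expects rdi, rsi, rdx and rcx, so ENTER_SYSV_FUNC_PARAMS_0_4 saves the Win64-callee-saved rdi/rsi and moves the arguments across, the _5/_6 variants additionally fetch the fifth and sixth arguments from the stack, and EXIT_SYSV_FUNC restores rdi/rsi. At the C call sites this removes one trampoline per cipher module; as a sketch of the pattern (both fragments taken from the blowfish.c hunk in the diff below):

  /* Removed: caller-side conversion, duplicated in blowfish.c, cast5.c,
   * des.c, twofish.c, ... */
  static inline void
  call_sysv_fn (const void *fn, const void *arg1, const void *arg2,
                const void *arg3, const void *arg4)
  {
    asm volatile ("callq *%0\n\t"
                  : "+a" (fn), "+D" (arg1), "+S" (arg2),
                    "+d" (arg3), "+c" (arg4)
                  :
                  : "cc", "memory", "r8", "r9", "r10", "r11");
  }

  /* Kept: the wrapper now simply forwards to the assembly routine. */
  static inline void
  blowfish_amd64_ctr_enc (BLOWFISH_context *ctx, byte *out, const byte *in,
                          byte *ctr)
  {
    _gcry_blowfish_amd64_ctr_enc (ctx, out, in, ctr);
  }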
Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 3c4eae0..bba815b 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -61,6 +61,7 @@ dsa-common.c rsa-common.c \ sha1.h EXTRA_libcipher_la_SOURCES = \ +asm-common-amd64.h \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S index 2e52ea0..c08f345 100644 --- a/cipher/arcfour-amd64.S +++ b/cipher/arcfour-amd64.S @@ -18,17 +18,14 @@ #if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text .align 16 .globl _gcry_arcfour_amd64 ELF(.type _gcry_arcfour_amd64, at function) _gcry_arcfour_amd64: + ENTER_SYSV_FUNC_PARAMS_0_4 push %rbp push %rbx mov %rdi, %rbp # key = ARG(key) @@ -96,6 +93,7 @@ _gcry_arcfour_amd64: movb %dl, (4*256+4)(%rbp) # key->x = x pop %rbx pop %rbp + EXIT_SYSV_FUNC ret .L__gcry_arcfour_amd64_end: ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64) diff --git a/cipher/arcfour.c b/cipher/arcfour.c index 44e8ef4..085df9b 100644 --- a/cipher/arcfour.c +++ b/cipher/arcfour.c @@ -54,21 +54,7 @@ static void encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - const void *fn = _gcry_arcfour_amd64; - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (context), - "+S" (length), - "+d" (inbuf), - "+c" (outbuf) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -#else _gcry_arcfour_amd64 (context, length, inbuf, outbuf ); -#endif } #else /*!USE_AMD64_ASM*/ diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h new file mode 100644 index 0000000..7eb4264 --- /dev/null +++ b/cipher/asm-common-amd64.h @@ -0,0 +1,90 @@ +/* asm-common-amd64.h - Common macros for AMD64 assembly + * + * Copyright (C) 2018 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef GCRY_ASM_COMMON_AMD64_H +#define GCRY_ASM_COMMON_AMD64_H + +#include + +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) 
/*_*/ +#endif + +#ifdef __PIC__ +# define rRIP (%rip) +#else +# define rRIP +#endif + +#ifdef __PIC__ +# define RIP %rip +#else +# define RIP +#endif + +#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__) +# define GET_EXTERN_POINTER(name, reg) movabsq $name, reg +#else +# ifdef __code_model_large__ +# define GET_EXTERN_POINTER(name, reg) \ + pushq %r15; \ + pushq %r14; \ + 1: leaq 1b(%rip), reg; \ + movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \ + movabsq $name at GOT, %r15; \ + addq %r14, reg; \ + popq %r14; \ + movq (reg, %r15), reg; \ + popq %r15; +# else +# define GET_EXTERN_POINTER(name, reg) movq name at GOTPCREL(%rip), reg +# endif +#endif + +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ENTER_SYSV_FUNC_PARAMS_0_4 \ + pushq %rdi; \ + pushq %rsi; \ + movq %rcx, %rdi; \ + movq %rdx, %rsi; \ + movq %r8, %rdx; \ + movq %r9, %rcx; \ + +# define ENTER_SYSV_FUNC_PARAMS_5 \ + ENTER_SYSV_FUNC_PARAMS_0_4; \ + movq 0x38(%rsp), %r8; + +# define ENTER_SYSV_FUNC_PARAMS_6 \ + ENTER_SYSV_FUNC_PARAMS_5; \ + movq 0x40(%rsp), %r9; + +# define EXIT_SYSV_FUNC \ + popq %rsi; \ + popq %rdi; +#else +# define ENTER_SYSV_FUNC_PARAMS_0_4 +# define ENTER_SYSV_FUNC_PARAMS_5 +# define ENTER_SYSV_FUNC_PARAMS_6 +# define EXIT_SYSV_FUNC +#endif + +#endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index 21b63fc..02d3b71 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -24,11 +24,7 @@ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -165,6 +161,8 @@ _gcry_blowfish_amd64_do_encrypt: * %rsi: u32 *ret_xl * %rdx: u32 *ret_xr */ + ENTER_SYSV_FUNC_PARAMS_0_4 + movl (%rdx), RX0d; shlq $32, RX0; movl (%rsi), RT3d; @@ -178,6 +176,7 @@ _gcry_blowfish_amd64_do_encrypt: shrq $32, RX0; movl RX0d, (RX2); + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) @@ -191,6 +190,7 @@ _gcry_blowfish_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 movq %rsi, %r10; @@ -202,6 +202,7 @@ _gcry_blowfish_amd64_encrypt_block: movq %r10, RIO; write_block(); + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) @@ -215,6 +216,8 @@ _gcry_blowfish_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + movq %rbp, %r11; movq %rsi, %r10; @@ -238,6 +241,7 @@ _gcry_blowfish_amd64_decrypt_block: movq %r11, %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;) @@ -392,6 +396,8 @@ _gcry_blowfish_amd64_ctr_enc: * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -436,6 +442,7 @@ _gcry_blowfish_amd64_ctr_enc: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) @@ -449,6 +456,8 @@ _gcry_blowfish_amd64_cbc_dec: * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -484,6 +493,7 @@ _gcry_blowfish_amd64_cbc_dec: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) @@ -497,6 +507,8 @@ _gcry_blowfish_amd64_cfb_dec: * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; 
pushq %rbx; pushq %r12; @@ -534,6 +546,8 @@ _gcry_blowfish_amd64_cfb_dec: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;) diff --git a/cipher/blowfish.c b/cipher/blowfish.c index a3fc26c..724d64e 100644 --- a/cipher/blowfish.c +++ b/cipher/blowfish.c @@ -281,87 +281,43 @@ extern void _gcry_blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out, extern void _gcry_blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out, const byte *in, byte *iv); -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -static inline void -call_sysv_fn (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -} -#endif - static void do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr ) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_do_encrypt, bc, ret_xl, ret_xr, NULL); -#else _gcry_blowfish_amd64_do_encrypt (bc, ret_xl, ret_xr); -#endif } static void do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_encrypt_block, context, outbuf, inbuf, - NULL); -#else _gcry_blowfish_amd64_encrypt_block (context, outbuf, inbuf); -#endif } static void do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_decrypt_block, context, outbuf, inbuf, - NULL); -#else _gcry_blowfish_amd64_decrypt_block (context, outbuf, inbuf); -#endif } static inline void blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out, const byte *in, byte *ctr) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_ctr_enc, ctx, out, in, ctr); -#else _gcry_blowfish_amd64_ctr_enc(ctx, out, in, ctr); -#endif } static inline void blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_cbc_dec, ctx, out, in, iv); -#else _gcry_blowfish_amd64_cbc_dec(ctx, out, in, iv); -#endif } static inline void blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_blowfish_amd64_cfb_dec, ctx, out, in, iv); -#else _gcry_blowfish_amd64_cfb_dec(ctx, out, in, iv); -#endif } static unsigned int diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index c04015a..1a1d43f 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -23,30 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5) -#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__) -# define GET_EXTERN_POINTER(name, reg) movabsq $name, reg -#else -# ifdef __code_model_large__ -# define GET_EXTERN_POINTER(name, reg) \ - pushq %r15; \ - pushq %r14; \ - 1: leaq 1b(%rip), reg; \ - movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \ - movabsq $name at GOT, %r15; \ - addq %r14, reg; \ - popq %r14; \ - movq (reg, %r15), reg; \ - popq %r15; -# else -# define GET_EXTERN_POINTER(name, reg) movq name at GOTPCREL(%rip), reg -# endif -#endif 
- -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -206,6 +183,8 @@ _gcry_cast5_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; @@ -233,6 +212,8 @@ _gcry_cast5_amd64_encrypt_block: popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) @@ -246,6 +227,8 @@ _gcry_cast5_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; @@ -273,6 +256,8 @@ _gcry_cast5_amd64_decrypt_block: popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) @@ -444,6 +429,7 @@ _gcry_cast5_amd64_ctr_enc: * %rdx: src (8 blocks) * %rcx: iv (big endian, 64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -489,6 +475,8 @@ _gcry_cast5_amd64_ctr_enc: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) @@ -502,6 +490,7 @@ _gcry_cast5_amd64_cbc_dec: * %rdx: src (8 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -542,6 +531,8 @@ _gcry_cast5_amd64_cbc_dec: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) @@ -556,6 +547,7 @@ _gcry_cast5_amd64_cfb_dec: * %rdx: src (8 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -597,6 +589,8 @@ _gcry_cast5_amd64_cfb_dec: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) diff --git a/cipher/cast5.c b/cipher/cast5.c index 94dcee7..d23882b 100644 --- a/cipher/cast5.c +++ b/cipher/cast5.c @@ -373,72 +373,34 @@ extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv); -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -static inline void -call_sysv_fn (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. 
*/ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -} -#endif - static void do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_encrypt_block, context, outbuf, inbuf, NULL); -#else _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf); -#endif } static void do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_decrypt_block, context, outbuf, inbuf, NULL); -#else _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf); -#endif } static void cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out, const byte *in, byte *ctr) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_ctr_enc, ctx, out, in, ctr); -#else _gcry_cast5_amd64_ctr_enc (ctx, out, in, ctr); -#endif } static void cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_cbc_dec, ctx, out, in, iv); -#else _gcry_cast5_amd64_cbc_dec (ctx, out, in, iv); -#endif } static void cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_cast5_amd64_cfb_dec, ctx, out, in, iv); -#else _gcry_cast5_amd64_cfb_dec (ctx, out, in, iv); -#endif } static unsigned int diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index 1b7cfba..f25573d 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -23,17 +23,7 @@ #if defined(USE_DES) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -200,6 +190,8 @@ _gcry_3des_amd64_crypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -208,7 +200,7 @@ _gcry_3des_amd64_crypt_block: pushq %r15; pushq %rsi; /*dst*/ - leaq .L_s1 RIP, SBOXES; + leaq .L_s1 rRIP, SBOXES; read_block(%rdx, RL0, RR0); initial_permutation(RL0, RR0); @@ -277,6 +269,7 @@ _gcry_3des_amd64_crypt_block: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) @@ -473,7 +466,7 @@ _gcry_3des_amd64_crypt_blk3: * RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks */ - leaq .L_s1 RIP, SBOXES; + leaq .L_s1 rRIP, SBOXES; initial_permutation3(RL, RR); @@ -547,6 +540,7 @@ _gcry_3des_amd64_cbc_dec: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -610,6 +604,7 @@ _gcry_3des_amd64_cbc_dec: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) @@ -623,6 +618,7 @@ _gcry_3des_amd64_ctr_enc: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; pushq %rbx; @@ -688,6 +684,7 @@ _gcry_3des_amd64_ctr_enc: popq %rbx; popq %rbp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) @@ -701,6 +698,8 @@ _gcry_3des_amd64_cfb_dec: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + pushq %rbp; pushq %rbx; pushq %r12; @@ -763,6 +762,8 @@ _gcry_3des_amd64_cfb_dec: popq %r12; popq %rbx; popq %rbp; + + EXIT_SYSV_FUNC ret; ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) diff --git a/cipher/des.c b/cipher/des.c index 5c99f50..7801b08 100644 --- a/cipher/des.c +++ b/cipher/des.c @@ -772,23 +772,6 @@ extern void _gcry_3des_amd64_cfb_dec(const void *keys, byte *out, #define TRIPLEDES_ECB_BURN_STACK (8 * sizeof(void *)) -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -static inline void -call_sysv_fn (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -} -#endif /* * Electronic Codebook Mode Triple-DES encryption/decryption of data @@ -803,11 +786,7 @@ tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from, keys = mode ? 
ctx->decrypt_subkeys : ctx->encrypt_subkeys; -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_3des_amd64_crypt_block, keys, to, from, NULL); -#else _gcry_3des_amd64_crypt_block(keys, to, from); -#endif return 0; } @@ -815,31 +794,19 @@ tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from, static inline void tripledes_amd64_ctr_enc(const void *keys, byte *out, const byte *in, byte *ctr) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_3des_amd64_ctr_enc, keys, out, in, ctr); -#else _gcry_3des_amd64_ctr_enc(keys, out, in, ctr); -#endif } static inline void tripledes_amd64_cbc_dec(const void *keys, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_3des_amd64_cbc_dec, keys, out, in, iv); -#else _gcry_3des_amd64_cbc_dec(keys, out, in, iv); -#endif } static inline void tripledes_amd64_cfb_dec(const void *keys, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn (_gcry_3des_amd64_cfb_dec, keys, out, in, iv); -#else _gcry_3des_amd64_cfb_dec(keys, out, in, iv); -#endif } #else /*USE_AMD64_ASM*/ diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index b149e94..798ff51 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -23,17 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_AES) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -222,6 +212,8 @@ _gcry_aes_amd64_encrypt_block: * %ecx: number of rounds.. 10, 12 or 14 * %r8: encryption tables */ + ENTER_SYSV_FUNC_PARAMS_5 + subq $(5 * 8), %rsp; movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); @@ -265,6 +257,8 @@ _gcry_aes_amd64_encrypt_block: addq $(5 * 8), %rsp; movl $(6 * 8), %eax; + + EXIT_SYSV_FUNC ret; .align 4 @@ -382,6 +376,8 @@ _gcry_aes_amd64_decrypt_block: * %ecx: number of rounds.. 10, 12 or 14 * %r8: decryption tables */ + ENTER_SYSV_FUNC_PARAMS_5 + subq $(5 * 8), %rsp; movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); @@ -426,6 +422,8 @@ _gcry_aes_amd64_decrypt_block: addq $(5 * 8), %rsp; movl $(6 * 8), %eax; + + EXIT_SYSV_FUNC ret; .align 4 diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 548bfa0..df1363f 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -740,27 +740,8 @@ do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM -# ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT); -# else - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. 
*/ - const void *key = ctx->keyschenc; - uintptr_t rounds = ctx->rounds; - uintptr_t ret; - asm volatile ("movq %[encT], %%r8\n\t" - "callq *%[ret]\n\t" - : [ret] "=a" (ret), - "+D" (key), - "+S" (bx), - "+d" (ax), - "+c" (rounds) - : "0" (_gcry_aes_amd64_encrypt_block), - [encT] "r" (encT) - : "cc", "memory", "r8", "r9", "r10", "r11"); - return ret; -# endif /* HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS */ #elif defined(USE_ARM_ASM) return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT); #else @@ -1123,27 +1104,8 @@ do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM -# ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, &dec_tables); -# else - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - const void *key = ctx->keyschdec; - uintptr_t rounds = ctx->rounds; - uintptr_t ret; - asm volatile ("movq %[dectabs], %%r8\n\t" - "callq *%[ret]\n\t" - : [ret] "=a" (ret), - "+D" (key), - "+S" (bx), - "+d" (ax), - "+c" (rounds) - : "0" (_gcry_aes_amd64_decrypt_block), - [dectabs] "r" (&dec_tables) - : "cc", "memory", "r8", "r9", "r10", "r11"); - return ret; -# endif /* HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS */ #elif defined(USE_ARM_ASM) return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, &dec_tables); diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index aa964e0..7a83646 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -23,17 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP %rip -#else -# define RIP -#endif +#include "asm-common-amd64.h" .text @@ -181,6 +171,8 @@ _gcry_twofish_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(3 * 8), %rsp; movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); @@ -211,6 +203,7 @@ _gcry_twofish_amd64_encrypt_block: movq (1 * 8)(%rsp), %rbp; addq $(3 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) @@ -224,6 +217,8 @@ _gcry_twofish_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(3 * 8), %rsp; movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); @@ -254,6 +249,7 @@ _gcry_twofish_amd64_decrypt_block: movq (1 * 8)(%rsp), %rbp; addq $(3 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) @@ -530,6 +526,8 @@ _gcry_twofish_amd64_ctr_enc: * %rdx: src (3 blocks) * %rcx: iv (big endian, 128bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -599,6 +597,7 @@ _gcry_twofish_amd64_ctr_enc: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) @@ -612,6 +611,8 @@ _gcry_twofish_amd64_cbc_dec: * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(9 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -665,6 +666,7 @@ _gcry_twofish_amd64_cbc_dec: movq (5 * 8)(%rsp), %r15; addq $(9 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) @@ -678,6 +680,8 @@ _gcry_twofish_amd64_cfb_dec: * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + ENTER_SYSV_FUNC_PARAMS_0_4 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -731,6 +735,7 @@ _gcry_twofish_amd64_cfb_dec: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) @@ -746,6 +751,8 @@ _gcry_twofish_amd64_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + ENTER_SYSV_FUNC_PARAMS_6 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -838,6 +845,7 @@ _gcry_twofish_amd64_ocb_enc: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) @@ -853,6 +861,8 @@ _gcry_twofish_amd64_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + ENTER_SYSV_FUNC_PARAMS_6 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -953,6 +963,7 @@ _gcry_twofish_amd64_ocb_dec: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) @@ -967,6 +978,8 @@ _gcry_twofish_amd64_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[3]) */ + ENTER_SYSV_FUNC_PARAMS_5 + subq $(8 * 8), %rsp; movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); @@ -1039,6 +1052,7 @@ _gcry_twofish_amd64_ocb_auth: movq (5 * 8)(%rsp), %r15; addq $(8 * 8), %rsp; + EXIT_SYSV_FUNC ret; ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;) diff --git a/cipher/twofish.c b/cipher/twofish.c index 942e8d4..48feaae 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -829,145 +829,58 @@ extern void _gcry_twofish_amd64_ocb_auth(const TWOFISH_context *ctx, const byte *abuf, byte *offset, byte *checksum, const u64 Ls[3]); -#ifdef 
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -static inline void -call_sysv_fn (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : - : "cc", "memory", "r8", "r9", "r10", "r11"); -} - -static inline void -call_sysv_fn5 (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4, const void *arg5) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("movq %[arg5], %%r8\n\t" - "callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : [arg5] "g" (arg5) - : "cc", "memory", "r8", "r9", "r10", "r11"); -} - -static inline void -call_sysv_fn6 (const void *fn, const void *arg1, const void *arg2, - const void *arg3, const void *arg4, const void *arg5, - const void *arg6) -{ - /* Call SystemV ABI function without storing non-volatile XMM registers, - * as target function does not use vector instruction sets. */ - asm volatile ("movq %[arg5], %%r8\n\t" - "movq %[arg6], %%r9\n\t" - "callq *%0\n\t" - : "+a" (fn), - "+D" (arg1), - "+S" (arg2), - "+d" (arg3), - "+c" (arg4) - : [arg5] "g" (arg5), - [arg6] "g" (arg6) - : "cc", "memory", "r8", "r9", "r10", "r11"); -} -#endif - static inline void twofish_amd64_encrypt_block(const TWOFISH_context *c, byte *out, const byte *in) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_encrypt_block, c, out, in, NULL); -#else _gcry_twofish_amd64_encrypt_block(c, out, in); -#endif } static inline void twofish_amd64_decrypt_block(const TWOFISH_context *c, byte *out, const byte *in) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_decrypt_block, c, out, in, NULL); -#else _gcry_twofish_amd64_decrypt_block(c, out, in); -#endif } static inline void twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, const byte *in, byte *ctr) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_ctr_enc, c, out, in, ctr); -#else _gcry_twofish_amd64_ctr_enc(c, out, in, ctr); -#endif } static inline void twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_cbc_dec, c, out, in, iv); -#else _gcry_twofish_amd64_cbc_dec(c, out, in, iv); -#endif } static inline void twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, const byte *in, byte *iv) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn(_gcry_twofish_amd64_cfb_dec, c, out, in, iv); -#else _gcry_twofish_amd64_cfb_dec(c, out, in, iv); -#endif } static inline void twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out, const byte *in, byte *offset, byte *checksum, const u64 Ls[3]) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn6(_gcry_twofish_amd64_ocb_enc, ctx, out, in, offset, checksum, Ls); -#else _gcry_twofish_amd64_ocb_enc(ctx, out, in, offset, checksum, Ls); -#endif } static inline void twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out, const byte *in, byte *offset, byte *checksum, const u64 Ls[3]) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn6(_gcry_twofish_amd64_ocb_dec, ctx, out, in, offset, checksum, Ls); -#else 
_gcry_twofish_amd64_ocb_dec(ctx, out, in, offset, checksum, Ls); -#endif } static inline void twofish_amd64_ocb_auth(const TWOFISH_context *ctx, const byte *abuf, byte *offset, byte *checksum, const u64 Ls[3]) { -#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS - call_sysv_fn5(_gcry_twofish_amd64_ocb_auth, ctx, abuf, offset, checksum, Ls); -#else _gcry_twofish_amd64_ocb_auth(ctx, abuf, offset, checksum, Ls); -#endif } #elif defined(USE_ARM_ASM) commit 135250e3060e79be698d4f36a819aa8a880789f8 Author: Jussi Kivilinna Date: Sat Jan 6 20:26:52 2018 +0200 Make BMI2 inline assembly check more robust * configure.ac (gcry_cv_gcc_inline_asm_bmi2): New assembly test. -- Use actual assembly snippets from keccak.c to check that compiler has proper support for used BMI2 instructions. GnuPG-bug-id: 3408 Signed-off-by: Jussi Kivilinna diff --git a/configure.ac b/configure.ac index 42cd4c2..aaf3c82 100644 --- a/configure.ac +++ b/configure.ac @@ -1403,8 +1403,15 @@ AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], else gcry_cv_gcc_inline_asm_bmi2=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( - [[void a(void) { - __asm__("rorxl \$23, %%eax, %%edx\\n\\t":::"memory"); + [[unsigned int a(unsigned int x, unsigned int y) { + unsigned int tmp1, tmp2; + asm ("rorxl %2, %1, %0" + : "=r" (tmp1) + : "rm0" (x), "J" (32 - ((23) & 31))); + asm ("andnl %2, %1, %0" + : "=r" (tmp2) + : "r0" (x), "rm" (y)); + return tmp1 + tmp2; }]])], [gcry_cv_gcc_inline_asm_bmi2=yes]) fi]) ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 1 + cipher/arcfour-amd64.S | 8 ++-- cipher/arcfour.c | 14 ------ cipher/asm-common-amd64.h | 90 ++++++++++++++++++++++++++++++++++++++ cipher/blowfish-amd64.S | 24 ++++++++--- cipher/blowfish.c | 44 ------------------- cipher/cast5-amd64.S | 42 ++++++++---------- cipher/cast5.c | 38 ---------------- cipher/des-amd64.S | 27 ++++++------ cipher/des.c | 33 -------------- cipher/rijndael-amd64.S | 20 ++++----- cipher/rijndael-ssse3-amd64-asm.S | 31 ++++++++----- cipher/rijndael-ssse3-amd64.c | 91 ++++++--------------------------------- cipher/rijndael.c | 38 ---------------- cipher/twofish-amd64.S | 36 +++++++++++----- cipher/twofish.c | 87 ------------------------------------- configure.ac | 11 ++++- 17 files changed, 223 insertions(+), 412 deletions(-) create mode 100644 cipher/asm-common-amd64.h hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From laxminarsaiah.bandla at gmail.com Sat Jan 13 21:04:46 2018 From: laxminarsaiah.bandla at gmail.com (Laxmi Narsaiah Bandla) Date: Sun, 14 Jan 2018 01:34:46 +0530 Subject: libgcrypt's scrypt algorithm has N and p values wrongly set Message-ID: Hi All, In the API gcry_kdf_scrypt implementation i see the below code. 240 gcry_err_code_t 241 _gcry_kdf_scrypt (const unsigned char *passwd, size_t passwdlen, 242 int algo, int subalgo, 243 const unsigned char *salt, size_t saltlen, 244 unsigned long iterations, 245 size_t dkLen, unsigned char *DK) 246 { 247 u64 N = subalgo; /* CPU/memory cost parameter. */ 248 u32 r; /* Block size. */ 249 u32 p = iterations; /* Parallelization parameter. 
*/ <<<<<<<<<<< 250 251 gpg_err_code_t ec; 252 u32 i; 253 unsigned char *B = NULL; 254 unsigned char *tmp1 = NULL; 255 unsigned char *tmp2 = NULL; 256 size_t r128; 257 size_t nbytes; Here, iterations should have been assigned to N (the CPU/memory cost parameter, which is supposed to be the large value), but it is assigned to p (parallelization). The iterations argument of the same wrapper API, gcry_kdf_derive(), is used correctly in _gcry_kdf_pbkdf2(). When I set iterations to 20,000 (for PBKDF2) and 16384 (for scrypt) in gcry_kdf_derive(), PBKDF2 took less than a second on my machine to generate the key/hash, whereas scrypt took almost 35 minutes. When I set iterations = 1 and subalgo = 16384, it took less than a second to generate the key. Suggested fix: 1. Either we should document the arguments properly, OR 2. u64 N = iterations; /* CPU/memory cost parameter. */ u32 r; /* Block size. */ u32 p = subalgo; /* Parallelization parameter. */ <<<<<<<<<<< Please let me know. Thanks. P.S.: I have raised a task on gnupg: https://dev.gnupg.org/T3737
From jussi.kivilinna at iki.fi Thu Jan 18 21:44:44 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 18 Jan 2018 22:44:44 +0200 Subject: [PATCH] Make BMI2 inline assembly check more robust In-Reply-To: <87o9lzfo32.fsf@wheatstone.g10code.de> References: <151551861550.5642.12750471651801313528.stgit@localhost.localdomain> <87h8rsh91o.fsf@wheatstone.g10code.de> <87o9lzfo32.fsf@wheatstone.g10code.de> Message-ID: <61e2c259-5d65-e30a-ddb1-310436c9008c@iki.fi> On 12.01.2018 11:01, Werner Koch wrote: > On Thu, 11 Jan 2018 22:42, jussi.kivilinna at iki.fi said: > >> I have AES XTS optimization patch for ARMv8 coming later this week. > > No need to hurry. > > I think it will also be useful to implement EAX mode because we will > need this for RFC4880bis. > Well, if there is no hurry, I can look into implementing EAX mode. -Jussi
From wk at gnupg.org Fri Jan 19 09:26:57 2018 From: wk at gnupg.org (Werner Koch) Date: Fri, 19 Jan 2018 09:26:57 +0100 Subject: [PATCH] Make BMI2 inline assembly check more robust In-Reply-To: <61e2c259-5d65-e30a-ddb1-310436c9008c@iki.fi> (Jussi Kivilinna's message of "Thu, 18 Jan 2018 22:44:44 +0200") References: <151551861550.5642.12750471651801313528.stgit@localhost.localdomain> <87h8rsh91o.fsf@wheatstone.g10code.de> <87o9lzfo32.fsf@wheatstone.g10code.de> <61e2c259-5d65-e30a-ddb1-310436c9008c@iki.fi> Message-ID: <874lni6ypa.fsf@wheatstone.g10code.de> On Thu, 18 Jan 2018 21:44, jussi.kivilinna at iki.fi said: > Well, if there is no hurry, I can look into implementing EAX mode. That would of course be appreciated :-) Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz.
From jussi.kivilinna at iki.fi Sat Jan 20 21:04:45 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 20 Jan 2018 22:04:45 +0200 Subject: [PATCH 1/3] cipher: constify spec arrays Message-ID: <151647868538.5266.2761365923121807285.stgit@localhost.localdomain> * cipher/cipher.c (cipher_list): Constify array. * cipher/mac.c (mac_list): Constify array. * cipher/md.c (digest_list): Constify array. * cipher/pubkey.c (pubkey_list): Constify array.
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher.c b/cipher/cipher.c index 063c13dab..18b25911a 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -33,7 +33,7 @@ /* This is the list of the default ciphers, which are included in libgcrypt. */ -static gcry_cipher_spec_t *cipher_list[] = +static gcry_cipher_spec_t * const cipher_list[] = { #if USE_BLOWFISH &_gcry_cipher_spec_blowfish, diff --git a/cipher/mac.c b/cipher/mac.c index 46be7b7b9..4a7a47df2 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -29,7 +29,7 @@ /* This is the list of the digest implementations included in libgcrypt. */ -static gcry_mac_spec_t *mac_list[] = { +static gcry_mac_spec_t * const mac_list[] = { #if USE_SHA1 &_gcry_mac_type_spec_hmac_sha1, #endif diff --git a/cipher/md.c b/cipher/md.c index 94f1b5d64..efbffe18e 100644 --- a/cipher/md.c +++ b/cipher/md.c @@ -31,7 +31,7 @@ /* This is the list of the digest implementations included in libgcrypt. */ -static gcry_md_spec_t *digest_list[] = +static gcry_md_spec_t * const digest_list[] = { #if USE_CRC &_gcry_digest_spec_crc32, diff --git a/cipher/pubkey.c b/cipher/pubkey.c index 8ec15fd41..4c07e33bf 100644 --- a/cipher/pubkey.c +++ b/cipher/pubkey.c @@ -34,7 +34,7 @@ /* This is the list of the public-key algorithms included in Libgcrypt. */ -static gcry_pk_spec_t *pubkey_list[] = +static gcry_pk_spec_t * const pubkey_list[] = { #if USE_ECC &_gcry_pubkey_spec_ecc, From jussi.kivilinna at iki.fi Sat Jan 20 21:04:55 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 20 Jan 2018 22:04:55 +0200 Subject: [PATCH 3/3] doc: fix double "See" in front of reference In-Reply-To: <151647868538.5266.2761365923121807285.stgit@localhost.localdomain> References: <151647868538.5266.2761365923121807285.stgit@localhost.localdomain> Message-ID: <151647869546.5266.2962771135689419444.stgit@localhost.localdomain> * doc/gcrypt.texi: Change @xref to @ref when text already has 'see' in the front. -- @xref references start with `See ...'. Use @ref instead when text already has 'see' in front. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index ccb4b820b..bba07a4d2 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1743,7 +1743,7 @@ other cipher functions and returns a handle to it in `hd'. In case of an error, an according error code is returned. The ID of algorithm to use must be specified via @var{algo}. See - at xref{Available ciphers}, for a list of supported ciphers and the + at ref{Available ciphers}, for a list of supported ciphers and the according constants. Besides using the constants directly, the function @@ -1751,7 +1751,7 @@ Besides using the constants directly, the function an algorithm into the according numeric ID. The cipher mode to use must be specified via @var{mode}. See - at xref{Available cipher modes}, for a list of supported cipher modes + at ref{Available cipher modes}, for a list of supported cipher modes and the according constants. Note that some modes are incompatible with some algorithms - in particular, stream mode (@code{GCRY_CIPHER_MODE_STREAM}) only works with stream ciphers. @@ -3310,7 +3310,7 @@ may be given as @code{0} if the algorithms to use are later set using @code{gcry_md_enable}. @var{hd} is guaranteed to either receive a valid handle or NULL. -For a list of supported algorithms, see @xref{Available hash +For a list of supported algorithms, see @ref{Available hash algorithms}. 
The flags allowed for @var{mode} are: @@ -3329,7 +3329,7 @@ algorithm is not an extendable-output function. Note that the function @code{gcry_md_setkey} must be used to set the MAC key. The size of the MAC is equal to the message digest of the underlying hash algorithm. If you want CBC message authentication codes based on a cipher, -see @xref{Working with cipher handles}. +see @ref{Working with cipher handles}. @item GCRY_MD_FLAG_BUGEMU1 @cindex bug emulation @@ -3847,7 +3847,7 @@ bitwise OR of constants described below. @var{hd} is guaranteed to either receive a valid handle or NULL. @var{ctx} is context object to associate MAC object with. @var{ctx} maybe set to NULL. -For a list of supported algorithms, see @xref{Available MAC algorithms}. +For a list of supported algorithms, see @ref{Available MAC algorithms}. The flags allowed for @var{mode} are: @@ -5626,7 +5626,7 @@ self-contained functions. Due to the wide variety of parameters required by different algorithms S-expressions, as flexible way to convey these parameters, are used. There is a set of helper functions to work with these S-expressions. - at c see @xref{S-expression Subsystem Architecture}. + at c see @ref{S-expression Subsystem Architecture}. Aside of functions to register new algorithms, map algorithms names to algorithms identifiers and to lookup properties of a key, the From cvs at cvs.gnupg.org Sat Jan 20 21:16:46 2018 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sat, 20 Jan 2018 21:16:46 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-36-g93503c1 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 93503c127a52c1f6a193750e2bf181a744ba3e6b (commit) from c3d60acc3ab5c6d60c2258882175bf31351cc998 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 93503c127a52c1f6a193750e2bf181a744ba3e6b Author: Jussi Kivilinna Date: Sat Jan 20 22:05:19 2018 +0200 Add ARMv8/CE acceleration for AES-XTS * cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_xts_enc_armv8_ce) (_gcry_aes_xts_dec_armv8_ce): New. * cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_xts_enc_armv8_ce) (_gcry_aes_xts_dec_armv8_ce): New. * cipher/rijndael-armv8-ce.c (_gcry_aes_xts_enc_armv8_ce) (_gcry_aes_xts_dec_armv8_ce, xts_crypt_fn_t) (_gcry_aes_armv8_ce_xts_crypt): New. * cipher/rijndael.c (_gcry_aes_armv8_ce_xts_crypt): New. (_gcry_aes_xts_crypt) [USE_ARM_CE]: New. 
-- Benchmark on Cortex-A53 (AArch64, 1152 Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 4.88 ns/B 195.5 MiB/s 5.62 c/B XTS dec | 4.94 ns/B 192.9 MiB/s 5.70 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 5.55 ns/B 171.8 MiB/s 6.39 c/B XTS dec | 5.61 ns/B 169.9 MiB/s 6.47 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 6.22 ns/B 153.3 MiB/s 7.17 c/B XTS dec | 6.29 ns/B 151.7 MiB/s 7.24 c/B = After (~2.6x faster): AES | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 1.83 ns/B 520.9 MiB/s 2.11 c/B XTS dec | 1.82 ns/B 524.9 MiB/s 2.09 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 1.97 ns/B 483.3 MiB/s 2.27 c/B XTS dec | 1.96 ns/B 486.9 MiB/s 2.26 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 2.11 ns/B 450.9 MiB/s 2.44 c/B XTS dec | 2.10 ns/B 453.8 MiB/s 2.42 c/B = Benchmark on Cortex-A53 (AArch32, 1152 Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 6.52 ns/B 146.2 MiB/s 7.51 c/B XTS dec | 6.57 ns/B 145.2 MiB/s 7.57 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 7.10 ns/B 134.3 MiB/s 8.18 c/B XTS dec | 7.11 ns/B 134.2 MiB/s 8.19 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 7.30 ns/B 130.7 MiB/s 8.41 c/B XTS dec | 7.38 ns/B 129.3 MiB/s 8.50 c/B = After (~2.7x faster): Cipher: AES | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 2.33 ns/B 409.6 MiB/s 2.68 c/B XTS dec | 2.35 ns/B 405.3 MiB/s 2.71 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 2.53 ns/B 377.6 MiB/s 2.91 c/B XTS dec | 2.54 ns/B 375.5 MiB/s 2.93 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte XTS enc | 2.75 ns/B 346.8 MiB/s 3.17 c/B XTS dec | 2.76 ns/B 345.2 MiB/s 3.18 c/B = Signed-off-by: Jussi Kivilinna diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 5c8fa3c..66440bd 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -1517,6 +1517,317 @@ _gcry_aes_ocb_auth_armv8_ce: .size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce; + +/* + * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_enc_armv8_ce +.type _gcry_aes_xts_enc_armv8_ce,%function; +_gcry_aes_xts_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 + beq .Lxts_enc_skip + + cmp r5, #12 + + vld1.8 {q0}, [r3] /* load tweak */ + mov r7, #0x87; + + aes_preload_keys(r0, r6); + + beq .Lxts_enc_entry_192 + bhi .Lxts_enc_entry_256 + +#define CTR_XTS(bits, ...) 
\ + .Lxts_enc_entry_##bits: \ + cmp r4, #4; \ + blo .Lxts_enc_loop_##bits; \ + \ + .Lxts_enc_loop4_##bits: \ + sub r4, r4, #4; \ + veor q9, q9, q9; \ + \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + veor q1, q1, q0; \ + cmp r4, #4; \ + vmov.u32 d18[0], r7; \ + vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + veor q2, q2, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q3, q3, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q4, q4, q0; \ + vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ + sub r1, r1, #48; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ + sub r1, r1, #32; \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lxts_enc_loop4_##bits; \ + cmp r4, #0; \ + beq .Lxts_enc_done; \ + \ + .Lxts_enc_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + \ + veor q9, q9, q9; \ + veor q1, q1, q0; \ + vmov.u32 d18[0], r7; \ + vmov q2, q0; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q2; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lxts_enc_loop_##bits; \ + b .Lxts_enc_done; + + CTR_XTS(128re, r0, r6) + CTR_XTS(192, r0, r6) + CTR_XTS(256, r0, r6) + +#undef CTR_XTS + +.Lxts_enc_done: + vst1.8 {q0}, [r3] /* store tweak */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lxts_enc_skip: + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce; + + +/* + * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_dec_armv8_ce +.type _gcry_aes_xts_dec_armv8_ce,%function; +_gcry_aes_xts_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 + beq .Lxts_dec_skip + + cmp r5, #12 + + vld1.8 {q0}, [r3] /* load tweak */ + mov r7, #0x87; + + aes_preload_keys(r0, r6); + + beq .Lxts_dec_entry_192 + bhi .Lxts_dec_entry_256 + +#define CTR_XTS(bits, ...) 
\ + .Lxts_dec_entry_##bits: \ + cmp r4, #4; \ + blo .Lxts_dec_loop_##bits; \ + \ + .Lxts_dec_loop4_##bits: \ + sub r4, r4, #4; \ + veor q9, q9, q9; \ + \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + veor q1, q1, q0; \ + cmp r4, #4; \ + vmov.u32 d18[0], r7; \ + vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + veor q2, q2, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q3, q3, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q4, q4, q0; \ + vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ + sub r1, r1, #48; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ + sub r1, r1, #32; \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lxts_dec_loop4_##bits; \ + cmp r4, #0; \ + beq .Lxts_dec_done; \ + \ + .Lxts_dec_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + \ + veor q9, q9, q9; \ + veor q1, q1, q0; \ + vmov.u32 d18[0], r7; \ + vmov q2, q0; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q2; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lxts_dec_loop_##bits; \ + b .Lxts_dec_done; + + CTR_XTS(128re, r0, r6) + CTR_XTS(192, r0, r6) + CTR_XTS(256, r0, r6) + +#undef CTR_XTS + +.Lxts_dec_done: + vst1.8 {q0}, [r3] /* store tweak */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lxts_dec_skip: + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce; + + /* * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); */ diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 708ef34..40097a7 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -1277,6 +1277,280 @@ _gcry_aes_ocb_auth_armv8_ce: /* + * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *tweak, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_enc_armv8_ce +.type _gcry_aes_xts_enc_armv8_ce,%function; +_gcry_aes_xts_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: tweak + * x4: nblocks + * w5: nrounds + */ + + cbz x4, .Lxts_enc_skip + + /* load tweak */ + ld1 {v0.16b}, [x3] + + /* load gfmul mask */ + mov x6, #0x87 + mov x7, #0x01 + mov v16.D[0], x6 + mov v16.D[1], x7 + + aes_preload_keys(x0, w5); + + b.eq .Lxts_enc_entry_192 + b.hi .Lxts_enc_entry_256 + +#define XTS_ENC(bits) \ + .Lxts_enc_entry_##bits: \ + cmp 
x4, #4; \ + b.lo .Lxts_enc_loop_##bits; \ + \ + .Lxts_enc_loop4_##bits: \ + \ + ext v4.16b, v0.16b, v0.16b, #8; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v5.2d, v0.2d, v0.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v5.16b, v5.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v6.2d, v5.2d, v5.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v6.16b, v6.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v7.2d, v6.2d, v6.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v7.16b, v7.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v3.2d, v7.2d, v7.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v3.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ + st1 {v3.16b}, [x3]; \ + sub x4, x4, #4; \ + eor v1.16b, v1.16b, v0.16b; \ + \ + ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ + cmp x4, #4; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + \ + do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + \ + eor v1.16b, v1.16b, v0.16b; \ + ld1 {v0.16b}, [x3]; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lxts_enc_loop4_##bits; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + CLEAR_REG(v7); \ + cbz x4, .Lxts_enc_done; \ + \ + .Lxts_enc_loop_##bits: \ + \ + ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + ext v3.16b, v0.16b, v0.16b, #8; \ + mov v2.16b, v0.16b; \ + sshr v3.2d, v3.2d, #63; \ + add v0.2d, v0.2d, v0.2d; \ + and v3.16b, v3.16b, v16.16b; \ + eor v1.16b, v1.16b, v2.16b; \ + eor v0.16b, v0.16b, v3.16b; \ + sub x4, x4, #1; \ + \ + do_aes_one##bits(e, mc, v1, v1); \ + \ + eor v1.16b, v1.16b, v2.16b; \ + st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + \ + cbnz x4, .Lxts_enc_loop_##bits; \ + b .Lxts_enc_done; + + XTS_ENC(128) + XTS_ENC(192) + XTS_ENC(256) + +#undef XTS_ENC + +.Lxts_enc_done: + aes_clear_keys(w5) + + st1 {v0.16b}, [x3] /* store tweak */ + + CLEAR_REG(v0) + CLEAR_REG(v1) + CLEAR_REG(v2) + +.Lxts_enc_skip: + ret + +.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce; + + +/* + * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *tweak, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_dec_armv8_ce +.type _gcry_aes_xts_dec_armv8_ce,%function; +_gcry_aes_xts_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: tweak + * x4: nblocks + * w5: nrounds + */ + + cbz x4, .Lxts_dec_skip + + /* load tweak */ + ld1 {v0.16b}, [x3] + + /* load gfmul mask */ + mov x6, #0x87 + mov x7, #0x01 + mov v16.D[0], x6 + mov v16.D[1], x7 + + aes_preload_keys(x0, w5); + + b.eq .Lxts_dec_entry_192 + b.hi .Lxts_dec_entry_256 + +#define XTS_DEC(bits) \ + .Lxts_dec_entry_##bits: \ + cmp x4, #4; \ + b.lo .Lxts_dec_loop_##bits; \ + \ + .Lxts_dec_loop4_##bits: \ + \ + ext v4.16b, v0.16b, v0.16b, #8; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v5.2d, v0.2d, v0.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v5.16b, v5.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v6.2d, v5.2d, v5.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v6.16b, v6.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v7.2d, v6.2d, v6.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v7.16b, v7.16b, 
v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v3.2d, v7.2d, v7.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v3.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ + st1 {v3.16b}, [x3]; \ + sub x4, x4, #4; \ + eor v1.16b, v1.16b, v0.16b; \ + \ + ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ + cmp x4, #4; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + \ + do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ + \ + eor v1.16b, v1.16b, v0.16b; \ + ld1 {v0.16b}, [x3]; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lxts_dec_loop4_##bits; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + CLEAR_REG(v7); \ + cbz x4, .Lxts_dec_done; \ + \ + .Lxts_dec_loop_##bits: \ + \ + ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + ext v3.16b, v0.16b, v0.16b, #8; \ + mov v2.16b, v0.16b; \ + sshr v3.2d, v3.2d, #63; \ + add v0.2d, v0.2d, v0.2d; \ + and v3.16b, v3.16b, v16.16b; \ + eor v1.16b, v1.16b, v2.16b; \ + eor v0.16b, v0.16b, v3.16b; \ + sub x4, x4, #1; \ + \ + do_aes_one##bits(d, imc, v1, v1); \ + \ + eor v1.16b, v1.16b, v2.16b; \ + st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + \ + cbnz x4, .Lxts_dec_loop_##bits; \ + b .Lxts_dec_done; + + XTS_DEC(128) + XTS_DEC(192) + XTS_DEC(256) + +#undef XTS_DEC + +.Lxts_dec_done: + aes_clear_keys(w5) + + st1 {v0.16b}, [x3] /* store tweak */ + + CLEAR_REG(v0) + CLEAR_REG(v1) + CLEAR_REG(v2) + +.Lxts_dec_skip: + ret + +.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce; + + +/* * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); */ .align 3 diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index 334cf68..6af7108 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -101,6 +101,16 @@ extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, size_t nblocks, unsigned int nrounds, unsigned int blkn); +extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *tweak, + size_t nblocks, unsigned int nrounds); +extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *tweak, + size_t nblocks, unsigned int nrounds); typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, @@ -108,6 +118,11 @@ typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); +typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *tweak, size_t nblocks, + unsigned int nrounds); + void _gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key) { @@ -361,4 +376,17 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg, nblocks, nrounds, (unsigned int)blkn); } +void +_gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks, int encrypt) +{ + const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; + xts_crypt_fn_t crypt_fn = encrypt ? 
_gcry_aes_xts_enc_armv8_ce + : _gcry_aes_xts_dec_armv8_ce; + unsigned int nrounds = ctx->rounds; + + crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds); +} + #endif /* USE_ARM_CE */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index df1363f..0f676fe 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -211,6 +211,11 @@ extern void _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, int encrypt); extern void _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +extern void _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, + unsigned char *tweak, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, int encrypt); #endif /*USE_ARM_ASM*/ static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, @@ -1473,6 +1478,13 @@ _gcry_aes_xts_crypt (void *context, unsigned char *tweak, burn_depth = 0; } #endif /*USE_AESNI*/ +#ifdef USE_ARM_CE + else if (ctx->use_arm_ce) + { + _gcry_aes_armv8_ce_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt); + burn_depth = 0; + } +#endif /*USE_ARM_CE*/ else { tweak_next_lo = buf_get_le64 (tweak + 0); ----------------------------------------------------------------------- Summary of changes: cipher/rijndael-armv8-aarch32-ce.S | 311 +++++++++++++++++++++++++++++++++++++ cipher/rijndael-armv8-aarch64-ce.S | 274 ++++++++++++++++++++++++++++++++ cipher/rijndael-armv8-ce.c | 28 ++++ cipher/rijndael.c | 12 ++ 4 files changed, 625 insertions(+) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Sat Jan 20 21:04:50 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 20 Jan 2018 22:04:50 +0200 Subject: [PATCH 2/3] Add EAX mode In-Reply-To: <151647868538.5266.2761365923121807285.stgit@localhost.localdomain> References: <151647868538.5266.2761365923121807285.stgit@localhost.localdomain> Message-ID: <151647869043.5266.8325824367607322184.stgit@localhost.localdomain> * cipher/Makefile.am: Add 'cipher-eax.c'. * cipher/cipher-cmac.c (cmac_write): Rename to ... (_gcry_cmac_write): ... this; Take CMAC context as new input parameter; Return error code. (cmac_generate_subkeys): Rename to ... (_gcry_cmac_generate_subkeys): ... this; Take CMAC context as new input parameter; Return error code. (cmac_final): Rename to ... (_gcry_cmac_final): ... this; Take CMAC context as new input parameter; Return error code. (cmac_tag): Take CMAC context as new input parameter. (_gcry_cmac_reset): New. (_gcry_cipher_cmac_authenticate): Remove duplicate tag flag check; Adapt to changes above. (_gcry_cipher_cmac_get_tag): Adapt to changes above. (_gcry_cipher_cmac_check_tag): Ditto. (_gcry_cipher_cmac_set_subkeys): Ditto. * cipher-eax.c: New. * cipher-internal.h (gcry_cmac_context_t): New. (gcry_cipher_handle): Update u_mode.cmac; Add u_mode.eax. (_gcry_cmac_write, _gcry_cmac_generate_subkeys, _gcry_cmac_final) (_gcry_cmac_reset, _gcry_cipher_eax_encrypt, _gcry_cipher_eax_decrypt) (_gcry_cipher_eax_set_nonce, _gcry_cipher_eax_authenticate) (_gcry_cipher_eax_get_tag, _gcry_cipher_eax_check_tag) (_gcry_cipher_eax_setkey): New prototypes. * cipher/cipher.c (_gcry_cipher_open_internal, cipher_setkey) (cipher_reset, cipher_encrypt, cipher_decrypt, _gcry_cipher_setiv) (_gcry_cipher_authenticate, _gcry_cipher_gettag, _gcry_cipher_checktag) (_gcry_cipher_info): Add EAX mode. 
* doc/gcrypt.texi: Add EAX mode. * src/gcrypt.h.in (GCRY_CIPHER_MODE_EAX): New. * tests/basic.c (_check_gcm_cipher, _check_poly1305_cipher): Constify test vectors array. (_check_eax_cipher, check_eax_cipher): New. (check_ciphers, check_cipher_modes): Add EAX mode. * tests/bench-slope.c (bench_eax_encrypt_do_bench) (bench_eax_decrypt_do_bench, bench_eax_authenticate_do_bench) (eax_encrypt_ops, eax_decrypt_ops, eax_authenticate_ops): New. (cipher_modes): Add EAX mode. * tests/benchmark.c (cipher_bench): Add EAX mode. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/Makefile.am b/cipher/Makefile.am index bba815bbe..6e6c5ac03 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -44,7 +44,7 @@ cipher.c cipher-internal.h \ cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \ cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ -cipher-poly1305.c cipher-ocb.c cipher-xts.c \ +cipher-poly1305.c cipher-ocb.c cipher-xts.c cipher-eax.c \ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ diff --git a/cipher/cipher-cmac.c b/cipher/cipher-cmac.c index da3ef7592..30567b7fc 100644 --- a/cipher/cipher-cmac.c +++ b/cipher/cipher-cmac.c @@ -1,5 +1,5 @@ /* cmac.c - CMAC, Cipher-based MAC. - * Copyright (C) 2013 Jussi Kivilinna + * Copyright (C) 2013,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -33,8 +33,9 @@ (burn) = (burn) > __nburn ? (burn) : __nburn; } while (0) -static void -cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) +gcry_err_code_t +_gcry_cmac_write (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx, + const byte * inbuf, size_t inlen) { gcry_cipher_encrypt_t enc_fn = c->spec->encrypt; const unsigned int blocksize = c->spec->blocksize; @@ -42,31 +43,37 @@ cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) unsigned int burn = 0; unsigned int nblocks; + if (ctx->tag) + return GPG_ERR_INV_STATE; + /* Tell compiler that we require a cipher with a 64bit or 128 bit block * length, to allow better optimization of this function. */ if (blocksize > 16 || blocksize < 8 || blocksize & (8 - 1)) - return; + return GPG_ERR_INV_CIPHER_MODE; - if (!inlen || !inbuf) - return; + if (!inbuf) + return GPG_ERR_INV_ARG; + + if (inlen == 0) + return 0; /* Last block is needed for cmac_final. 
*/ - if (c->unused + inlen <= blocksize) + if (ctx->mac_unused + inlen <= blocksize) { - for (; inlen && c->unused < blocksize; inlen--) - c->lastiv[c->unused++] = *inbuf++; - return; + for (; inlen && ctx->mac_unused < blocksize; inlen--) + ctx->macbuf[ctx->mac_unused++] = *inbuf++; + return 0; } - if (c->unused) + if (ctx->mac_unused) { - for (; inlen && c->unused < blocksize; inlen--) - c->lastiv[c->unused++] = *inbuf++; + for (; inlen && ctx->mac_unused < blocksize; inlen--) + ctx->macbuf[ctx->mac_unused++] = *inbuf++; - buf_xor (c->u_iv.iv, c->u_iv.iv, c->lastiv, blocksize); - set_burn (burn, enc_fn (&c->context.c, c->u_iv.iv, c->u_iv.iv)); + buf_xor (ctx->u_iv.iv, ctx->u_iv.iv, ctx->macbuf, blocksize); + set_burn (burn, enc_fn (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv)); - c->unused = 0; + ctx->mac_unused = 0; } if (c->bulk.cbc_enc && inlen > blocksize) @@ -74,7 +81,7 @@ cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) nblocks = inlen / blocksize; nblocks -= (nblocks * blocksize == inlen); - c->bulk.cbc_enc (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks, 1); + c->bulk.cbc_enc (&c->context.c, ctx->u_iv.iv, outbuf, inbuf, nblocks, 1); inbuf += nblocks * blocksize; inlen -= nblocks * blocksize; @@ -83,8 +90,8 @@ cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) else while (inlen > blocksize) { - buf_xor (c->u_iv.iv, c->u_iv.iv, inbuf, blocksize); - set_burn (burn, enc_fn (&c->context.c, c->u_iv.iv, c->u_iv.iv)); + buf_xor (ctx->u_iv.iv, ctx->u_iv.iv, inbuf, blocksize); + set_burn (burn, enc_fn (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv)); inlen -= blocksize; inbuf += blocksize; } @@ -93,16 +100,18 @@ cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) if (inlen == 0) BUG (); - for (; inlen && c->unused < blocksize; inlen--) - c->lastiv[c->unused++] = *inbuf++; + for (; inlen && ctx->mac_unused < blocksize; inlen--) + ctx->macbuf[ctx->mac_unused++] = *inbuf++; if (burn) _gcry_burn_stack (burn + 4 * sizeof (void *)); + + return 0; } -static void -cmac_generate_subkeys (gcry_cipher_hd_t c) +gcry_err_code_t +_gcry_cmac_generate_subkeys (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx) { const unsigned int blocksize = c->spec->blocksize; byte rb, carry, t, bi; @@ -117,7 +126,7 @@ cmac_generate_subkeys (gcry_cipher_hd_t c) /* Tell compiler that we require a cipher with a 64bit or 128 bit block * length, to allow better optimization of this function. */ if (blocksize > 16 || blocksize < 8 || blocksize & (8 - 1)) - return; + return GPG_ERR_INV_CIPHER_MODE; if (MAX_BLOCKSIZE < blocksize) BUG (); @@ -127,7 +136,7 @@ cmac_generate_subkeys (gcry_cipher_hd_t c) burn = c->spec->encrypt (&c->context.c, u.buf, u.buf); /* Currently supported blocksizes are 16 and 8. */ - rb = blocksize == 16 ? 0x87 : 0x1B /*blocksize == 8 */ ; + rb = blocksize == 16 ? 0x87 : 0x1B /* blocksize == 8 */ ; for (j = 0; j < 2; j++) { @@ -139,93 +148,113 @@ cmac_generate_subkeys (gcry_cipher_hd_t c) t = carry | (bi << 1); carry = bi >> 7; u.buf[i] = t & 0xff; - c->u_mode.cmac.subkeys[j][i] = u.buf[i]; + ctx->subkeys[j][i] = u.buf[i]; } u.buf[blocksize - 1] ^= carry ? 
rb : 0; - c->u_mode.cmac.subkeys[j][blocksize - 1] = u.buf[blocksize - 1]; + ctx->subkeys[j][blocksize - 1] = u.buf[blocksize - 1]; } wipememory (&u, sizeof (u)); if (burn) _gcry_burn_stack (burn + 4 * sizeof (void *)); + + return 0; } -static void -cmac_final (gcry_cipher_hd_t c) +gcry_err_code_t +_gcry_cmac_final (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx) { const unsigned int blocksize = c->spec->blocksize; - unsigned int count = c->unused; + unsigned int count = ctx->mac_unused; unsigned int burn; byte *subkey; /* Tell compiler that we require a cipher with a 64bit or 128 bit block * length, to allow better optimization of this function. */ if (blocksize > 16 || blocksize < 8 || blocksize & (8 - 1)) - return; + return GPG_ERR_INV_CIPHER_MODE; if (count == blocksize) - subkey = c->u_mode.cmac.subkeys[0]; /* K1 */ + subkey = ctx->subkeys[0]; /* K1 */ else { - subkey = c->u_mode.cmac.subkeys[1]; /* K2 */ - c->lastiv[count++] = 0x80; + subkey = ctx->subkeys[1]; /* K2 */ + ctx->macbuf[count++] = 0x80; while (count < blocksize) - c->lastiv[count++] = 0; + ctx->macbuf[count++] = 0; } - buf_xor (c->lastiv, c->lastiv, subkey, blocksize); + buf_xor (ctx->macbuf, ctx->macbuf, subkey, blocksize); - buf_xor (c->u_iv.iv, c->u_iv.iv, c->lastiv, blocksize); - burn = c->spec->encrypt (&c->context.c, c->u_iv.iv, c->u_iv.iv); + buf_xor (ctx->u_iv.iv, ctx->u_iv.iv, ctx->macbuf, blocksize); + burn = c->spec->encrypt (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv); if (burn) _gcry_burn_stack (burn + 4 * sizeof (void *)); - c->unused = 0; + ctx->mac_unused = 0; + + return 0; } static gcry_err_code_t -cmac_tag (gcry_cipher_hd_t c, unsigned char *tag, size_t taglen, int check) +cmac_tag (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx, + unsigned char *tag, size_t taglen, int check) { + gcry_err_code_t ret; + if (!tag || taglen == 0 || taglen > c->spec->blocksize) return GPG_ERR_INV_ARG; - if (!c->u_mode.cmac.tag) + if (!ctx->tag) { - cmac_final (c); - c->u_mode.cmac.tag = 1; + ret = _gcry_cmac_final (c, ctx); + if (ret != 0) + return ret; + + ctx->tag = 1; } if (!check) { - memcpy (tag, c->u_iv.iv, taglen); + memcpy (tag, ctx->u_iv.iv, taglen); return GPG_ERR_NO_ERROR; } else { - return buf_eq_const (tag, c->u_iv.iv, taglen) ? + return buf_eq_const (tag, ctx->u_iv.iv, taglen) ? GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM; } } +void +_gcry_cmac_reset (gcry_cmac_context_t *ctx) +{ + char tmp_buf[sizeof(ctx->subkeys)]; + + /* Only keep subkeys when reseting context. */ + + buf_cpy (tmp_buf, ctx->subkeys, sizeof(ctx->subkeys)); + memset (ctx, 0, sizeof(*ctx)); + buf_cpy (ctx->subkeys, tmp_buf, sizeof(ctx->subkeys)); + wipememory (tmp_buf, sizeof(tmp_buf)); +} + + gcry_err_code_t _gcry_cipher_cmac_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen) { if (abuflen > 0 && !abuf) return GPG_ERR_INV_ARG; - if (c->u_mode.cmac.tag) - return GPG_ERR_INV_STATE; /* To support new blocksize, update cmac_generate_subkeys() then add new blocksize here. 
*/ if (c->spec->blocksize != 16 && c->spec->blocksize != 8) return GPG_ERR_INV_CIPHER_MODE; - cmac_write (c, abuf, abuflen); - - return GPG_ERR_NO_ERROR; + return _gcry_cmac_write (c, &c->u_mode.cmac, abuf, abuflen); } @@ -233,7 +262,7 @@ gcry_err_code_t _gcry_cipher_cmac_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen) { - return cmac_tag (c, outtag, taglen, 0); + return cmac_tag (c, &c->u_mode.cmac, outtag, taglen, 0); } @@ -241,13 +270,11 @@ gcry_err_code_t _gcry_cipher_cmac_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen) { - return cmac_tag (c, (unsigned char *) intag, taglen, 1); + return cmac_tag (c, &c->u_mode.cmac, (unsigned char *) intag, taglen, 1); } gcry_err_code_t _gcry_cipher_cmac_set_subkeys (gcry_cipher_hd_t c) { - cmac_generate_subkeys (c); - - return GPG_ERR_NO_ERROR; + return _gcry_cmac_generate_subkeys (c, &c->u_mode.cmac); } diff --git a/cipher/cipher-eax.c b/cipher/cipher-eax.c new file mode 100644 index 000000000..1ce479755 --- /dev/null +++ b/cipher/cipher-eax.c @@ -0,0 +1,248 @@ +/* cipher-eax.c - EAX implementation + * Copyright (C) 2018 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include +#include +#include +#include +#include + +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "./cipher-internal.h" + + +gcry_err_code_t +_gcry_cipher_eax_encrypt (gcry_cipher_hd_t c, + byte *outbuf, size_t outbuflen, + const byte *inbuf, size_t inbuflen) +{ + gcry_err_code_t err; + + if (outbuflen < inbuflen) + return GPG_ERR_BUFFER_TOO_SHORT; + if (c->marks.tag) + return GPG_ERR_INV_STATE; + + if (!c->marks.iv) + { + err = _gcry_cipher_eax_set_nonce (c, NULL, 0); + if (err != 0) + return err; + } + + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); + if (err != 0) + return err; + + return _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, outbuf, inbuflen); +} + + +gcry_err_code_t +_gcry_cipher_eax_decrypt (gcry_cipher_hd_t c, + byte *outbuf, size_t outbuflen, + const byte *inbuf, size_t inbuflen) +{ + gcry_err_code_t err; + + if (outbuflen < inbuflen) + return GPG_ERR_BUFFER_TOO_SHORT; + if (c->marks.tag) + return GPG_ERR_INV_STATE; + + if (!c->marks.iv) + { + err = _gcry_cipher_eax_set_nonce (c, NULL, 0); + if (err != 0) + return err; + } + + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf, inbuflen); + if (err != 0) + return err; + + return _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); +} + + +gcry_err_code_t +_gcry_cipher_eax_authenticate (gcry_cipher_hd_t c, + const byte * aadbuf, size_t aadbuflen) +{ + gcry_err_code_t err; + + if (c->marks.tag) + return GPG_ERR_INV_STATE; + + if (!c->marks.iv) + { + err = _gcry_cipher_eax_set_nonce (c, NULL, 0); + if (err != 0) + return err; + } + + return _gcry_cmac_write (c, &c->u_mode.eax.cmac_header, aadbuf, aadbuflen); +} + + +gcry_err_code_t +_gcry_cipher_eax_setkey (gcry_cipher_hd_t c) +{ + gcry_err_code_t err; + + err = _gcry_cmac_generate_subkeys (c, &c->u_mode.eax.cmac_header); + if (err != 0) + return err; + + buf_cpy (c->u_mode.eax.cmac_ciphertext.subkeys, + c->u_mode.eax.cmac_header.subkeys, + sizeof(c->u_mode.eax.cmac_header.subkeys)); + + return 0; +} + + +gcry_err_code_t +_gcry_cipher_eax_set_nonce (gcry_cipher_hd_t c, const byte *nonce, + size_t noncelen) +{ + gcry_cmac_context_t nonce_cmac; + unsigned char initbuf[MAX_BLOCKSIZE]; + gcry_err_code_t err; + + c->marks.iv = 0; + c->marks.tag = 0; + + _gcry_cmac_reset (&c->u_mode.eax.cmac_header); + _gcry_cmac_reset (&c->u_mode.eax.cmac_ciphertext); + + /* Calculate nonce CMAC */ + + memset(&nonce_cmac, 0, sizeof(nonce_cmac)); + memset(&initbuf, 0, sizeof(initbuf)); + + buf_cpy (&nonce_cmac.subkeys, c->u_mode.eax.cmac_header.subkeys, + sizeof(c->u_mode.eax.cmac_header.subkeys)); + + err = _gcry_cmac_write (c, &nonce_cmac, initbuf, c->spec->blocksize); + if (err != 0) + return err; + + if (noncelen != 0) + { + err = _gcry_cmac_write (c, &nonce_cmac, nonce, noncelen); + if (err != 0) + return err; + } + + err = _gcry_cmac_final (c, &nonce_cmac); + if (err != 0) + return err; + + buf_cpy (c->u_iv.iv, nonce_cmac.u_iv.iv, MAX_BLOCKSIZE); + buf_cpy (c->u_ctr.ctr, nonce_cmac.u_iv.iv, MAX_BLOCKSIZE); + + wipememory (&nonce_cmac, sizeof(nonce_cmac)); + + /* Prepare header CMAC */ + + initbuf[c->spec->blocksize - 1] = 1; + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_header, initbuf, + c->spec->blocksize); + if (err != 0) + return err; + + /* Prepare ciphertext CMAC */ + + initbuf[c->spec->blocksize - 1] = 2; + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, initbuf, + c->spec->blocksize); + if (err != 0) + return err; + + c->marks.iv = 1; + c->marks.tag = 0; + + return 0; +} + + 
+static gcry_err_code_t +_gcry_cipher_eax_tag (gcry_cipher_hd_t c, + byte *outbuf, size_t outbuflen, int check) +{ + gcry_err_code_t err; + + if (!c->marks.tag) + { + err = _gcry_cmac_final (c, &c->u_mode.eax.cmac_header); + if (err != 0) + return err; + + err = _gcry_cmac_final (c, &c->u_mode.eax.cmac_ciphertext); + if (err != 0) + return err; + + buf_xor_1 (c->u_iv.iv, c->u_mode.eax.cmac_header.u_iv.iv, MAX_BLOCKSIZE); + buf_xor_1 (c->u_iv.iv, c->u_mode.eax.cmac_ciphertext.u_iv.iv, + MAX_BLOCKSIZE); + + _gcry_cmac_reset (&c->u_mode.eax.cmac_header); + _gcry_cmac_reset (&c->u_mode.eax.cmac_ciphertext); + + c->marks.tag = 1; + } + + if (!check) + { + if (outbuflen > c->spec->blocksize) + outbuflen = c->spec->blocksize; + + /* NB: We already checked that OUTBUF is large enough to hold + * the result or has valid truncated length. */ + memcpy (outbuf, c->u_iv.iv, outbuflen); + } + else + { + /* OUTBUFLEN gives the length of the user supplied tag in OUTBUF + * and thus we need to compare its length first. */ + if (!(outbuflen <= c->spec->blocksize) + || !buf_eq_const (outbuf, c->u_iv.iv, outbuflen)) + return GPG_ERR_CHECKSUM; + } + + return 0; +} + + +gcry_err_code_t +_gcry_cipher_eax_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, + size_t taglen) +{ + return _gcry_cipher_eax_tag (c, outtag, taglen, 0); +} + +gcry_err_code_t +_gcry_cipher_eax_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, + size_t taglen) +{ + return _gcry_cipher_eax_tag (c, (unsigned char *) intag, taglen, 1); +} diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 8c897d7b5..a0ede5e03 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -109,6 +109,25 @@ typedef union } cipher_context_alignment_t; +/* Storage structure for CMAC, for CMAC and EAX modes. */ +typedef struct { + /* The initialization vector. Also contains tag after finalization. */ + union { + cipher_context_alignment_t iv_align; + unsigned char iv[MAX_BLOCKSIZE]; + } u_iv; + + /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */ + unsigned char subkeys[2][MAX_BLOCKSIZE]; + + /* Space to save partial input lengths for MAC. */ + unsigned char macbuf[MAX_BLOCKSIZE]; + + int mac_unused; /* Number of unprocessed bytes in MACBUF. */ + unsigned int tag:1; /* Set to 1 if tag has been finalized. */ +} gcry_cmac_context_t; + + /* The handle structure. */ struct gcry_cipher_handle { @@ -197,7 +216,7 @@ struct gcry_cipher_handle unsigned char s0[GCRY_CCM_BLOCK_LEN]; - unsigned int nonce:1;/* Set to 1 if nonce has been set. */ + unsigned int nonce:1; /* Set to 1 if nonce has been set. */ unsigned int lengths:1; /* Set to 1 if CCM length parameters has been processed. */ } ccm; @@ -217,12 +236,16 @@ struct gcry_cipher_handle } poly1305; /* Mode specific storage for CMAC mode. */ + gcry_cmac_context_t cmac; + + /* Mode specific storage for EAX mode. */ struct { - unsigned int tag:1; /* Set to 1 if tag has been finalized. */ + /* CMAC for header (AAD). */ + gcry_cmac_context_t cmac_header; - /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */ - unsigned char subkeys[2][MAX_BLOCKSIZE]; - } cmac; + /* CMAC for ciphertext. */ + gcry_cmac_context_t cmac_ciphertext; + } eax; /* Mode specific storage for GCM mode. */ struct { @@ -236,7 +259,6 @@ struct gcry_cipher_handle unsigned char macbuf[GCRY_CCM_BLOCK_LEN]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ - /* byte counters for GCM */ u32 aadlen[2]; u32 datalen[2]; @@ -309,7 +331,6 @@ struct gcry_cipher_handle processed. 
*/ unsigned int data_finalized:1; unsigned int aad_finalized:1; - } ocb; /* Mode specific storage for XTS mode. */ @@ -406,6 +427,42 @@ gcry_err_code_t _gcry_cipher_ccm_check_tag const unsigned char *intag, size_t taglen); +/*-- cipher-cmac.c --*/ +gcry_err_code_t _gcry_cmac_generate_subkeys +/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx); +gcry_err_code_t _gcry_cmac_write +/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx, + const byte * inbuf, size_t inlen); +gcry_err_code_t _gcry_cmac_final +/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx); +void _gcry_cmac_reset (gcry_cmac_context_t *ctx); + + +/*-- cipher-eax.c --*/ +gcry_err_code_t _gcry_cipher_eax_encrypt +/* */ (gcry_cipher_hd_t c, + unsigned char *outbuf, size_t outbuflen, + const unsigned char *inbuf, size_t inbuflen); +gcry_err_code_t _gcry_cipher_eax_decrypt +/* */ (gcry_cipher_hd_t c, + unsigned char *outbuf, size_t outbuflen, + const unsigned char *inbuf, size_t inbuflen); +gcry_err_code_t _gcry_cipher_eax_set_nonce +/* */ (gcry_cipher_hd_t c, + const unsigned char *nonce, size_t noncelen); +gcry_err_code_t _gcry_cipher_eax_authenticate +/* */ (gcry_cipher_hd_t c, + const unsigned char *aadbuf, size_t aadbuflen); +gcry_err_code_t _gcry_cipher_eax_get_tag +/* */ (gcry_cipher_hd_t c, + unsigned char *outtag, size_t taglen); +gcry_err_code_t _gcry_cipher_eax_check_tag +/* */ (gcry_cipher_hd_t c, + const unsigned char *intag, size_t taglen); +gcry_err_code_t _gcry_cipher_eax_setkey +/* */ (gcry_cipher_hd_t c); + + /*-- cipher-gcm.c --*/ gcry_err_code_t _gcry_cipher_gcm_encrypt /* */ (gcry_cipher_hd_t c, diff --git a/cipher/cipher.c b/cipher/cipher.c index 18b25911a..1bef766cb 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -420,6 +420,7 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, case GCRY_CIPHER_MODE_CTR: case GCRY_CIPHER_MODE_AESWRAP: case GCRY_CIPHER_MODE_CMAC: + case GCRY_CIPHER_MODE_EAX: case GCRY_CIPHER_MODE_GCM: if (!spec->encrypt || !spec->decrypt) err = GPG_ERR_INV_CIPHER_MODE; @@ -688,7 +689,11 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen) switch (c->mode) { case GCRY_CIPHER_MODE_CMAC: - _gcry_cipher_cmac_set_subkeys (c); + rc = _gcry_cipher_cmac_set_subkeys (c); + break; + + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_setkey (c); break; case GCRY_CIPHER_MODE_GCM: @@ -782,8 +787,12 @@ cipher_reset (gcry_cipher_hd_t c) switch (c->mode) { case GCRY_CIPHER_MODE_CMAC: - /* Only clear 'tag' for cmac, keep subkeys. 
*/ - c->u_mode.cmac.tag = 0; + _gcry_cmac_reset(&c->u_mode.cmac); + break; + + case GCRY_CIPHER_MODE_EAX: + _gcry_cmac_reset(&c->u_mode.eax.cmac_header); + _gcry_cmac_reset(&c->u_mode.eax.cmac_ciphertext); break; case GCRY_CIPHER_MODE_GCM: @@ -929,6 +938,10 @@ cipher_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, rc = GPG_ERR_INV_CIPHER_MODE; break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); break; @@ -1060,6 +1073,10 @@ cipher_decrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, rc = GPG_ERR_INV_CIPHER_MODE; break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_decrypt (c, outbuf, outbuflen, inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_decrypt (c, outbuf, outbuflen, inbuf, inbuflen); break; @@ -1158,6 +1175,10 @@ _gcry_cipher_setiv (gcry_cipher_hd_t hd, const void *iv, size_t ivlen) rc = _gcry_cipher_ccm_set_nonce (hd, iv, ivlen); break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_set_nonce (hd, iv, ivlen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_setiv (hd, iv, ivlen); break; @@ -1226,6 +1247,10 @@ _gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf, rc = _gcry_cipher_cmac_authenticate (hd, abuf, abuflen); break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_authenticate (hd, abuf, abuflen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_authenticate (hd, abuf, abuflen); break; @@ -1263,6 +1288,10 @@ _gcry_cipher_gettag (gcry_cipher_hd_t hd, void *outtag, size_t taglen) rc = _gcry_cipher_cmac_get_tag (hd, outtag, taglen); break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_get_tag (hd, outtag, taglen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_get_tag (hd, outtag, taglen); break; @@ -1300,6 +1329,10 @@ _gcry_cipher_checktag (gcry_cipher_hd_t hd, const void *intag, size_t taglen) rc = _gcry_cipher_cmac_check_tag (hd, intag, taglen); break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_check_tag (hd, intag, taglen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_check_tag (hd, intag, taglen); break; @@ -1501,6 +1534,10 @@ _gcry_cipher_info (gcry_cipher_hd_t h, int cmd, void *buffer, size_t *nbytes) *nbytes = h->u_mode.ccm.authlen; break; + case GCRY_CIPHER_MODE_EAX: + *nbytes = h->spec->blocksize; + break; + case GCRY_CIPHER_MODE_GCM: *nbytes = GCRY_GCM_BLOCK_LEN; break; diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 78315052b..ccb4b820b 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1722,6 +1722,12 @@ value is automatically incremented after each call of Auto-increment allows avoiding need of setting IV between processing of sequential data units. + at item GCRY_CIPHER_MODE_EAX + at cindex EAX, EAX mode +EAX is an Authenticated Encryption with Associated Data (AEAD) block cipher +mode by Bellare, Rogaway, and Wagner (see + at uref{http://web.cs.ucdavis.edu/~rogaway/papers/eax.html}). + @end table @node Working with cipher handles @@ -1752,12 +1758,13 @@ with some algorithms - in particular, stream mode Poly1305 AEAD mode (@code{GCRY_CIPHER_MODE_POLY1305}) only works with ChaCha20 stream cipher. The block cipher modes (@code{GCRY_CIPHER_MODE_ECB}, @code{GCRY_CIPHER_MODE_CBC}, - at code{GCRY_CIPHER_MODE_CFB}, @code{GCRY_CIPHER_MODE_OFB} and - at code{GCRY_CIPHER_MODE_CTR}) will work with any block cipher -algorithm. 
GCM mode (@code{GCRY_CIPHER_MODE_CCM}), CCM mode -(@code{GCRY_CIPHER_MODE_GCM}), OCB mode (@code{GCRY_CIPHER_MODE_OCB}), -and XTS mode (@code{GCRY_CIPHER_MODE_XTS}) will only work -with block cipher algorithms which have the block size of 16 bytes. + at code{GCRY_CIPHER_MODE_CFB}, @code{GCRY_CIPHER_MODE_OFB}, + at code{GCRY_CIPHER_MODE_CTR} and @code{GCRY_CIPHER_MODE_EAX}) will work +with any block cipher algorithm. GCM mode +(@code{GCRY_CIPHER_MODE_CCM}), CCM mode (@code{GCRY_CIPHER_MODE_GCM}), +OCB mode (@code{GCRY_CIPHER_MODE_OCB}), and XTS mode +(@code{GCRY_CIPHER_MODE_XTS}) will only work with block cipher +algorithms which have the block size of 16 bytes. The third argument @var{flags} can either be passed as @code{0} or as the bit-wise OR of the following constants. diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 1eb3d7c0f..83f94b687 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -971,7 +971,8 @@ enum gcry_cipher_modes GCRY_CIPHER_MODE_POLY1305 = 10, /* Poly1305 based AEAD mode. */ GCRY_CIPHER_MODE_OCB = 11, /* OCB3 mode. */ GCRY_CIPHER_MODE_CFB8 = 12, /* Cipher feedback (8 bit mode). */ - GCRY_CIPHER_MODE_XTS = 13 /* XTS mode. */ + GCRY_CIPHER_MODE_XTS = 13, /* XTS mode. */ + GCRY_CIPHER_MODE_EAX = 14 /* EAX mode. */ }; /* Flags used with the open function. */ diff --git a/tests/basic.c b/tests/basic.c index c2b42082a..c883eb39f 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -1347,7 +1347,7 @@ check_ofb_cipher (void) static void _check_gcm_cipher (unsigned int step) { - struct tv + static const struct tv { int algo; char key[MAX_DATA_LEN]; @@ -1890,10 +1890,543 @@ check_gcm_cipher (void) } +static void +_check_eax_cipher (unsigned int step) +{ + static const struct tv + { + int algo; + char key[MAX_DATA_LEN]; + char nonce[MAX_DATA_LEN]; + int noncelen; + unsigned char header[MAX_DATA_LEN]; + int headerlen; + unsigned char plaintext[MAX_DATA_LEN]; + int inlen; + char out[MAX_DATA_LEN]; + char tag[MAX_DATA_LEN]; + int taglen; + int should_fail; + } tv[] = + { + /* Test vectors from http://www.cs.ucdavis.edu/~rogaway/papers/eax.pdf */ + { GCRY_CIPHER_AES, + "\x23\x39\x52\xDE\xE4\xD5\xED\x5F\x9B\x9C\x6D\x6F\xF8\x0F\xF4\x78", + "\x62\xEC\x67\xF9\xC3\xA4\xA4\x07\xFC\xB2\xA8\xC4\x90\x31\xA8\xB3", 16, + "\x6B\xFB\x91\x4F\xD0\x7E\xAE\x6B", 8, + "", + 0, + "", + "\xE0\x37\x83\x0E\x83\x89\xF2\x7B\x02\x5A\x2D\x65\x27\xE7\x9D\x01", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x91\x94\x5D\x3F\x4D\xCB\xEE\x0B\xF4\x5E\xF5\x22\x55\xF0\x95\xA4", + "\xBE\xCA\xF0\x43\xB0\xA2\x3D\x84\x31\x94\xBA\x97\x2C\x66\xDE\xBD", 16, + "\xFA\x3B\xFD\x48\x06\xEB\x53\xFA", 8, + "\xF7\xFB", + 2, + "\x19\xDD", + "\x5C\x4C\x93\x31\x04\x9D\x0B\xDA\xB0\x27\x74\x08\xF6\x79\x67\xE5", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x01\xF7\x4A\xD6\x40\x77\xF2\xE7\x04\xC0\xF6\x0A\xDA\x3D\xD5\x23", + "\x70\xC3\xDB\x4F\x0D\x26\x36\x84\x00\xA1\x0E\xD0\x5D\x2B\xFF\x5E", 16, + "\x23\x4A\x34\x63\xC1\x26\x4A\xC6", 8, + "\x1A\x47\xCB\x49\x33", + 5, + "\xD8\x51\xD5\xBA\xE0", + "\x3A\x59\xF2\x38\xA2\x3E\x39\x19\x9D\xC9\x26\x66\x26\xC4\x0F\x80", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\xD0\x7C\xF6\xCB\xB7\xF3\x13\xBD\xDE\x66\xB7\x27\xAF\xD3\xC5\xE8", + "\x84\x08\xDF\xFF\x3C\x1A\x2B\x12\x92\xDC\x19\x9E\x46\xB7\xD6\x17", 16, + "\x33\xCC\xE2\xEA\xBF\xF5\xA7\x9D", 8, + "\x48\x1C\x9E\x39\xB1", + 5, + "\x63\x2A\x9D\x13\x1A", + "\xD4\xC1\x68\xA4\x22\x5D\x8E\x1F\xF7\x55\x93\x99\x74\xA7\xBE\xDE", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x35\xB6\xD0\x58\x00\x05\xBB\xC1\x2B\x05\x87\x12\x45\x57\xD2\xC2", + 
"\xFD\xB6\xB0\x66\x76\xEE\xDC\x5C\x61\xD7\x42\x76\xE1\xF8\xE8\x16", 16, + "\xAE\xB9\x6E\xAE\xBE\x29\x70\xE9", 8, + "\x40\xD0\xC0\x7D\xA5\xE4", + 6, + "\x07\x1D\xFE\x16\xC6\x75", + "\xCB\x06\x77\xE5\x36\xF7\x3A\xFE\x6A\x14\xB7\x4E\xE4\x98\x44\xDD", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\xBD\x8E\x6E\x11\x47\x5E\x60\xB2\x68\x78\x4C\x38\xC6\x2F\xEB\x22", + "\x6E\xAC\x5C\x93\x07\x2D\x8E\x85\x13\xF7\x50\x93\x5E\x46\xDA\x1B", 16, + "\xD4\x48\x2D\x1C\xA7\x8D\xCE\x0F", 8, + "\x4D\xE3\xB3\x5C\x3F\xC0\x39\x24\x5B\xD1\xFB\x7D", + 12, + "\x83\x5B\xB4\xF1\x5D\x74\x3E\x35\x0E\x72\x84\x14", + "\xAB\xB8\x64\x4F\xD6\xCC\xB8\x69\x47\xC5\xE1\x05\x90\x21\x0A\x4F", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x7C\x77\xD6\xE8\x13\xBE\xD5\xAC\x98\xBA\xA4\x17\x47\x7A\x2E\x7D", + "\x1A\x8C\x98\xDC\xD7\x3D\x38\x39\x3B\x2B\xF1\x56\x9D\xEE\xFC\x19", 16, + "\x65\xD2\x01\x79\x90\xD6\x25\x28", 8, + "\x8B\x0A\x79\x30\x6C\x9C\xE7\xED\x99\xDA\xE4\xF8\x7F\x8D\xD6\x16\x36", + 17, + "\x02\x08\x3E\x39\x79\xDA\x01\x48\x12\xF5\x9F\x11\xD5\x26\x30\xDA\x30", + "\x13\x73\x27\xD1\x06\x49\xB0\xAA\x6E\x1C\x18\x1D\xB6\x17\xD7\xF2", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x5F\xFF\x20\xCA\xFA\xB1\x19\xCA\x2F\xC7\x35\x49\xE2\x0F\x5B\x0D", + "\xDD\xE5\x9B\x97\xD7\x22\x15\x6D\x4D\x9A\xFF\x2B\xC7\x55\x98\x26", 16, + "\x54\xB9\xF0\x4E\x6A\x09\x18\x9A", 8, + "\x1B\xDA\x12\x2B\xCE\x8A\x8D\xBA\xF1\x87\x7D\x96\x2B\x85\x92\xDD" + "\x2D\x56", + 18, + "\x2E\xC4\x7B\x2C\x49\x54\xA4\x89\xAF\xC7\xBA\x48\x97\xED\xCD\xAE" + "\x8C\xC3", + "\x3B\x60\x45\x05\x99\xBD\x02\xC9\x63\x82\x90\x2A\xEF\x7F\x83\x2A", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\xA4\xA4\x78\x2B\xCF\xFD\x3E\xC5\xE7\xEF\x6D\x8C\x34\xA5\x61\x23", + "\xB7\x81\xFC\xF2\xF7\x5F\xA5\xA8\xDE\x97\xA9\xCA\x48\xE5\x22\xEC", 16, + "\x89\x9A\x17\x58\x97\x56\x1D\x7E", 8, + "\x6C\xF3\x67\x20\x87\x2B\x85\x13\xF6\xEA\xB1\xA8\xA4\x44\x38\xD5" + "\xEF\x11", + 18, + "\x0D\xE1\x8F\xD0\xFD\xD9\x1E\x7A\xF1\x9F\x1D\x8E\xE8\x73\x39\x38" + "\xB1\xE8", + "\xE7\xF6\xD2\x23\x16\x18\x10\x2F\xDB\x7F\xE5\x5F\xF1\x99\x17\x00", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x83\x95\xFC\xF1\xE9\x5B\xEB\xD6\x97\xBD\x01\x0B\xC7\x66\xAA\xC3", + "\x22\xE7\xAD\xD9\x3C\xFC\x63\x93\xC5\x7E\xC0\xB3\xC1\x7D\x6B\x44", 16, + "\x12\x67\x35\xFC\xC3\x20\xD2\x5A", 8, + "\xCA\x40\xD7\x44\x6E\x54\x5F\xFA\xED\x3B\xD1\x2A\x74\x0A\x65\x9F" + "\xFB\xBB\x3C\xEA\xB7", + 21, + "\xCB\x89\x20\xF8\x7A\x6C\x75\xCF\xF3\x96\x27\xB5\x6E\x3E\xD1\x97" + "\xC5\x52\xD2\x95\xA7", + "\xCF\xC4\x6A\xFC\x25\x3B\x46\x52\xB1\xAF\x37\x95\xB1\x24\xAB\x6E", 16, + 0 + }, + /* Negative test for bad tag. */ + { GCRY_CIPHER_AES, + "\x23\x39\x52\xDE\xE4\xD5\xED\x5F\x9B\x9C\x6D\x6F\xF8\x0F\xF4\x78", + "\x62\xEC\x67\xF9\xC3\xA4\xA4\x07\xFC\xB2\xA8\xC4\x90\x31\xA8\xB3", 16, + "\x6B\xFB\x91\x4F\xD0\x7E\xAE\x6B", 8, + "", + 0, + "", + "\x00\x37\x83\x0E\x83\x89\xF2\x7B\x02\x5A\x2D\x65\x27\xE7\x9D\x01", 16, + 1 + }, + /* Test vectors from libtomcrypt. 
*/ + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "", 0, + "", 0, + "", + 0, + "", + "\x9a\xd0\x7e\x7d\xbf\xf3\x01\xf5\x05\xde\x59\x6b\x96\x15\xdf\xff", 16, + 0 + }, + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16, + "", 0, + "", + 0, + "", + "\x1c\xe1\x0d\x3e\xff\xd4\xca\xdb\xe2\xe4\x4b\x58\xd6\x0a\xb9\xec", 16, + 0 + }, + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "", 0, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16, + "", + 0, + "", + "\x3a\x69\x8f\x7a\x27\x0e\x51\xb0\xf6\x5b\x3d\x3e\x47\x19\x3c\xff", 16, + 0 + }, + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", + 32, + "\x29\xd8\x78\xd1\xa3\xbe\x85\x7b\x6f\xb8\xc8\xea\x59\x50\xa7\x78" + "\x33\x1f\xbf\x2c\xcf\x33\x98\x6f\x35\xe8\xcf\x12\x1d\xcb\x30\xbc", + "\x4f\xbe\x03\x38\xbe\x1c\x8c\x7e\x1d\x7a\xe7\xe4\x5b\x92\xc5\x87", 16, + 0 + }, + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e", 15, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d", 14, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c", + 29, + "\xdd\x25\xc7\x54\xc5\xb1\x7c\x59\x28\xb6\x9b\x73\x15\x5f\x7b\xb8" + "\x88\x8f\xaf\x37\x09\x1a\xd9\x2c\x8a\x24\xdb\x86\x8b", + "\x0d\x1a\x14\xe5\x22\x24\xff\xd2\x3a\x05\xfa\x02\xcd\xef\x52\xda", 16, + 0 + }, + }; + + gcry_cipher_hd_t hde, hdd; + unsigned char out[MAX_DATA_LEN]; + unsigned char tag[16]; + int i, keylen; + gcry_error_t err = 0; + size_t pos, poslen, taglen2; + int byteNum; + + if (verbose) + fprintf (stderr, " Starting EAX checks.\n"); + + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) + { + if (gcry_cipher_test_algo (tv[i].algo) && in_fips_mode) + { + if (verbose) + fprintf (stderr, " algorithm %d not available in fips mode\n", + tv[i].algo); + continue; + } + + if (verbose) + fprintf (stderr, " checking EAX mode for %s [%i]\n", + gcry_cipher_algo_name (tv[i].algo), + tv[i].algo); + err = gcry_cipher_open (&hde, tv[i].algo, GCRY_CIPHER_MODE_EAX, 0); + if (!err) + err = gcry_cipher_open (&hdd, tv[i].algo, GCRY_CIPHER_MODE_EAX, 0); + if (err) + { + fail ("aes-eax, gcry_cipher_open failed: %s\n", gpg_strerror (err)); + return; + } + + keylen = gcry_cipher_get_algo_keylen(tv[i].algo); + if (!keylen) + { + fail ("aes-eax, gcry_cipher_get_algo_keylen failed\n"); + return; + } + + err = gcry_cipher_setkey (hde, tv[i].key, keylen); + if (!err) + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); + if (err) + { + fail ("aes-eax, gcry_cipher_setkey failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_setiv (hde, tv[i].nonce, tv[i].noncelen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].nonce, tv[i].noncelen); + if (err) + { + fail ("aes-eax, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_info (hde, 
GCRYCTL_GET_TAGLEN, NULL, &taglen2); + if (err) + { + fail ("cipher-eax, gcryctl_get_taglen failed (tv %d): %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + if (taglen2 != 16) + { + fail ("cipher-eax, gcryctl_get_taglen returned bad length" + " (tv %d): got=%zu want=%d\n", + i, taglen2, 16); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + for (pos = 0; pos < tv[i].headerlen; pos += step) + { + poslen = (pos + step < tv[i].headerlen) ? + step : tv[i].headerlen - pos; + + err = gcry_cipher_authenticate(hde, tv[i].header + pos, poslen); + if (err) + { + fail ("aes-eax, gcry_cipher_authenticate (%d) (%lu:%d) failed: " + "%s\n", i, (unsigned long) pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + err = gcry_cipher_authenticate(hdd, tv[i].header + pos, poslen); + if (err) + { + fail ("aes-eax, de gcry_cipher_authenticate (%d) (%lu:%d) failed: " + "%s\n", i, (unsigned long) pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + for (pos = 0; pos < tv[i].inlen; pos += step) + { + poslen = (pos + step < tv[i].inlen) ? step : tv[i].inlen - pos; + + err = gcry_cipher_encrypt (hde, out + pos, poslen, + tv[i].plaintext + pos, poslen); + if (err) + { + fail ("aes-eax, gcry_cipher_encrypt (%d) (%lu:%d) failed: %s\n", + i, (unsigned long) pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("aes-eax, encrypt mismatch entry %d (step %d)\n", i, step); + + for (pos = 0; pos < tv[i].inlen; pos += step) + { + poslen = (pos + step < tv[i].inlen) ? step : tv[i].inlen - pos; + + err = gcry_cipher_decrypt (hdd, out + pos, poslen, NULL, 0); + if (err) + { + fail ("aes-eax, gcry_cipher_decrypt (%d) (%lu:%d) failed: %s\n", + i, (unsigned long) pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("aes-eax, decrypt mismatch entry %d (step %d)\n", i, step); + + taglen2 = tv[i].taglen ? 
tv[i].taglen : 16; + + err = gcry_cipher_gettag (hde, out, taglen2); + if (err) + { + if (tv[i].should_fail) + goto next_tv; + + fail ("aes-eax, gcry_cipher_gettag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if ((memcmp (tv[i].tag, out, taglen2) != 0) ^ tv[i].should_fail) + fail ("aes-eax, encrypt tag mismatch entry %d\n", i); + + err = gcry_cipher_checktag (hdd, tv[i].tag, taglen2); + if (err) + { + if (tv[i].should_fail) + goto next_tv; + + fail ("aes-eax, gcry_cipher_checktag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_reset(hde); + if (!err) + err = gcry_cipher_reset(hdd); + if (err) + { + fail ("aes-eax, gcry_cipher_reset (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* gcry_cipher_reset clears the IV */ + err = gcry_cipher_setiv (hde, tv[i].nonce, tv[i].noncelen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].nonce, tv[i].noncelen); + if (err) + { + fail ("aes-eax, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* this time we authenticate, encrypt and decrypt one byte at a time */ + for (byteNum = 0; byteNum < tv[i].headerlen; ++byteNum) + { + err = gcry_cipher_authenticate(hde, tv[i].header + byteNum, 1); + if (err) + { + fail ("aes-eax, gcry_cipher_authenticate (%d) (byte-buf) failed: " + "%s\n", i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + err = gcry_cipher_authenticate(hdd, tv[i].header + byteNum, 1); + if (err) + { + fail ("aes-eax, de gcry_cipher_authenticate (%d) (byte-buf) " + "failed: %s\n", i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_encrypt (hde, out+byteNum, 1, + (tv[i].plaintext) + byteNum, + 1); + if (err) + { + fail ("aes-eax, gcry_cipher_encrypt (%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("aes-eax, encrypt mismatch entry %d, (byte-buf)\n", i); + + /* Test output to larger than 16-byte buffer. */ + taglen2 = tv[i].taglen ? tv[i].taglen : 16 + 1; + + err = gcry_cipher_gettag (hde, tag, taglen2); + if (err) + { + if (tv[i].should_fail) + goto next_tv; + + fail ("aes-eax, gcry_cipher_gettag(%d, %lu) (byte-buf) failed: %s\n", + i, (unsigned long) taglen2, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + taglen2 = tv[i].taglen ? 
tv[i].taglen : 16; + + if ((memcmp (tv[i].tag, tag, taglen2) != 0) ^ tv[i].should_fail) + fail ("aes-eax, encrypt tag mismatch entry %d, (byte-buf)\n", i); + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_decrypt (hdd, out+byteNum, 1, NULL, 0); + if (err) + { + fail ("aes-eax, gcry_cipher_decrypt (%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("aes-eax, decrypt mismatch entry %d\n", i); + + err = gcry_cipher_checktag (hdd, tv[i].tag, taglen2); + if (err) + { + if (tv[i].should_fail) + goto next_tv; + + fail ("aes-eax, gcry_cipher_checktag(%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_checktag (hdd, tag, 17); + if (!err) + { + fail ("aes-eax, gcry_cipher_checktag(%d) did not fail for invalid " + " tag length of '%d'\n", i, 17); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (tv[i].should_fail) + { + fail ("aes-eax, negative test succeeded %d\n", i); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + next_tv: + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + } + if (verbose) + fprintf (stderr, " Completed EAX checks.\n"); +} + + +static void +check_eax_cipher (void) +{ + /* Large buffers, no splitting. */ + _check_eax_cipher(0xffffffff); + /* Split input to one byte buffers. */ + _check_eax_cipher(1); + /* Split input to 7 byte buffers. */ + _check_eax_cipher(7); + /* Split input to 16 byte buffers. */ + _check_eax_cipher(16); +} + + static void _check_poly1305_cipher (unsigned int step) { - struct tv + static const struct tv { int algo; const char *key; @@ -5813,6 +6346,7 @@ get_algo_mode_blklen (int algo, int mode) case GCRY_CIPHER_MODE_CTR: case GCRY_CIPHER_MODE_CCM: case GCRY_CIPHER_MODE_GCM: + case GCRY_CIPHER_MODE_EAX: case GCRY_CIPHER_MODE_POLY1305: return 1; } @@ -5894,7 +6428,7 @@ check_one_cipher_core (int algo, int mode, int flags, if ((mode == GCRY_CIPHER_MODE_CBC && (flags & GCRY_CIPHER_CBC_CTS)) || mode == GCRY_CIPHER_MODE_XTS) { - /* Input cannot be split in to multiple operations with CTS . */ + /* Input cannot be split in to multiple operations with CTS. 
*/ blklen = nplain; } @@ -6281,6 +6815,7 @@ check_ciphers (void) check_one_cipher (algos[i], GCRY_CIPHER_MODE_CBC, 0); check_one_cipher (algos[i], GCRY_CIPHER_MODE_CBC, GCRY_CIPHER_CBC_CTS); check_one_cipher (algos[i], GCRY_CIPHER_MODE_CTR, 0); + check_one_cipher (algos[i], GCRY_CIPHER_MODE_EAX, 0); if (gcry_cipher_get_algo_blklen (algos[i]) == GCRY_CCM_BLOCK_LEN) check_one_cipher (algos[i], GCRY_CIPHER_MODE_CCM, 0); if (gcry_cipher_get_algo_blklen (algos[i]) == GCRY_GCM_BLOCK_LEN) @@ -6333,6 +6868,7 @@ check_cipher_modes(void) check_poly1305_cipher (); check_ocb_cipher (); check_xts_cipher (); + check_eax_cipher (); check_gost28147_cipher (); check_stream_cipher (); check_stream_cipher_large_block (); diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 75e6e43d3..e34104f7b 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -1231,6 +1231,53 @@ static struct bench_ops ocb_authenticate_ops = { &bench_ocb_authenticate_do_bench }; +static void +bench_eax_encrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[16] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, + 0x00, 0x00, 0x01, 0x00 }; + bench_aead_encrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_eax_decrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[16] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, + 0x00, 0x00, 0x01, 0x00 }; + bench_aead_decrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_eax_authenticate_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[16] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, + 0x00, 0x00, 0x01, 0x00 }; + bench_aead_authenticate_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static struct bench_ops eax_encrypt_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_eax_encrypt_do_bench +}; + +static struct bench_ops eax_decrypt_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_eax_decrypt_do_bench +}; + +static struct bench_ops eax_authenticate_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_eax_authenticate_do_bench +}; static void bench_poly1305_encrypt_do_bench (struct bench_obj *obj, void *buf, @@ -1291,6 +1338,9 @@ static struct bench_cipher_mode cipher_modes[] = { {GCRY_CIPHER_MODE_CCM, "CCM enc", &ccm_encrypt_ops}, {GCRY_CIPHER_MODE_CCM, "CCM dec", &ccm_decrypt_ops}, {GCRY_CIPHER_MODE_CCM, "CCM auth", &ccm_authenticate_ops}, + {GCRY_CIPHER_MODE_EAX, "EAX enc", &eax_encrypt_ops}, + {GCRY_CIPHER_MODE_EAX, "EAX dec", &eax_decrypt_ops}, + {GCRY_CIPHER_MODE_EAX, "EAX auth", &eax_authenticate_ops}, {GCRY_CIPHER_MODE_GCM, "GCM enc", &gcm_encrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM dec", &gcm_decrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM auth", &gcm_authenticate_ops}, diff --git a/tests/benchmark.c b/tests/benchmark.c index 44a8711d9..59ea32c66 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -779,6 +779,8 @@ cipher_bench ( const char *algoname ) NULL, GCRY_GCM_BLOCK_LEN, GCRY_GCM_BLOCK_LEN }, { GCRY_CIPHER_MODE_OCB, " OCB", 1, NULL, 16, 16, 15 }, + { GCRY_CIPHER_MODE_EAX, " EAX", 0, + NULL, 0, 8, 8 }, { GCRY_CIPHER_MODE_STREAM, "", 0 }, {0} }; From wk at gnupg.org Sun Jan 21 16:53:11 2018 From: wk at gnupg.org (Werner Koch) Date: Sun, 21 Jan 2018 16:53:11 +0100 Subject: [PATCH 2/3] Add EAX mode In-Reply-To: <151647869043.5266.8325824367607322184.stgit@localhost.localdomain> (Jussi Kivilinna's 
message of "Sat, 20 Jan 2018 22:04:50 +0200") References: <151647868538.5266.2761365923121807285.stgit@localhost.localdomain> <151647869043.5266.8325824367607322184.stgit@localhost.localdomain> Message-ID: <87bmhn43a0.fsf@wheatstone.g10code.de> Cool! I noticed this right after I pushed my AEAD implementation for gpg. Meanwhile I added support for EAX and encryption seems to work. There is some bug in the decryption, likely due to the different nonce size. Need to check. The OpenPGP folks will be glad to see a fully free mode implemented. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 227 bytes Desc: not available URL: From laxminarsaiah.bandla at gmail.com Mon Jan 22 07:26:11 2018 From: laxminarsaiah.bandla at gmail.com (Laxmi Narsaiah Bandla) Date: Mon, 22 Jan 2018 11:56:11 +0530 Subject: libgcrypt's scrypt algorithm has N and p values wrongly set In-Reply-To: References: Message-ID: Any comments on this please? Thanks Laxmi Narsaiah On 14 January 2018 at 01:34, Laxmi Narsaiah Bandla < laxminarsaiah.bandla at gmail.com> wrote: > > Hi All, > > In the API gcry_kdf_scrypt implementation I see the code below. > > 240 gcry_err_code_t > 241 _gcry_kdf_scrypt (const unsigned char *passwd, size_t passwdlen, > 242 int algo, int subalgo, > 243 const unsigned char *salt, size_t saltlen, > 244 unsigned long iterations, > 245 size_t dkLen, unsigned char *DK) > 246 { > 247 u64 N = subalgo; /* CPU/memory cost parameter. */ > 248 u32 r; /* Block size. */ > 249 u32 p = iterations; /* Parallelization parameter. */ <<<<<<<<<<< > 250 > 251 gpg_err_code_t ec; > 252 u32 i; > 253 unsigned char *B = NULL; > 254 unsigned char *tmp1 = NULL; > 255 unsigned char *tmp2 = NULL; > 256 size_t r128; > 257 size_t nbytes; > > Here iterations should have been assigned to N (the CPU/memory cost, which is supposed > to be the larger value), but it is assigned to p (parallelization). The iterations argument of the same wrapper > API gcry_kdf_derive() has been used correctly in > _gcry_kdf_pkdf2(). > > When I set iterations to 20,000 (for PBKDF2) and 16384 (for scrypt) in > gcry_kdf_derive(), on my machine PBKDF2 took less than a second to generate > the key/hash, whereas scrypt took almost 35 minutes. > > When I set iterations = 1 and subalgo = 16384, it took less than > a second to generate the key. > > Suggested fix: > > 1. Either we should document the arguments properly. > > OR > > 2. > > u64 N = iterations; /* CPU/memory cost parameter. */ > u32 r; /* Block size. */ > u32 p = subalgo; /* Parallelization parameter. */ <<<<<<<<<<< > > Please let me know. > > Thanks > > P.S.: I have raised a task on gnupg: https://dev.gnupg.org/T3737 > -- regards RAJ -------------- next part -------------- An HTML attachment was scrubbed... URL: From jussi.kivilinna at iki.fi Mon Jan 22 21:18:54 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 22 Jan 2018 22:18:54 +0200 Subject: [PATCH] Fix use of AVX instructions in Chaha20 SSSE3 implementation Message-ID: <151665233485.8419.9669533565187132486.stgit@localhost.localdomain> * cipher/chacha20-amd64-ssse3.S: Replace two 'vmovdqa' instructions with 'movdqa'.
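
For background on the fix: 'vmovdqa' is the VEX-encoded (AVX) form of the aligned 128-bit
vector load, while 'movdqa' is the legacy SSE2 encoding of the same operation. The
_gcry_chacha20_amd64_ssse3_blocks4 routine is selected when the CPU reports SSSE3, and
SSSE3 does not imply AVX, so a stray VEX-encoded instruction in that path can raise an
invalid-opcode fault on SSSE3-only hardware. Below is only a minimal standalone C sketch
of that feature distinction, using GCC's __builtin_cpu_supports rather than libgcrypt's
own hardware-feature detection:

  /* Standalone illustration: SSSE3 and AVX are independent CPU features,
     so an SSSE3-selected code path must not contain VEX-encoded
     instructions such as vmovdqa.  Build with gcc on x86/x86-64. */
  #include <stdio.h>

  int
  main (void)
  {
    int have_ssse3, have_avx;

    __builtin_cpu_init ();
    have_ssse3 = __builtin_cpu_supports ("ssse3");
    have_avx = __builtin_cpu_supports ("avx");

    printf ("ssse3: %d, avx: %d\n", have_ssse3, have_avx);
    if (have_ssse3 && !have_avx)
      printf ("vmovdqa would fault here; movdqa must be used instead.\n");
    return 0;
  }
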
-- Signed-off-by: Jussi Kivilinna --- cipher/chacha20-amd64-ssse3.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 7ad1c0ae3..f23722814 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -190,8 +190,8 @@ _gcry_chacha20_amd64_ssse3_blocks4: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ - vmovdqa .Linc_counter RIP, X0; - vmovdqa .Lunsigned_cmp RIP, X2; + movdqa .Linc_counter RIP, X0; + movdqa .Lunsigned_cmp RIP, X2; pbroadcastd((12 * 4)(INPUT), X12); pbroadcastd((13 * 4)(INPUT), X13); paddd X0, X12; From cvs at cvs.gnupg.org Mon Jan 22 21:23:17 2018 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Mon, 22 Jan 2018 21:23:17 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-40-g0b55f34 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 0b55f349a8b8f4b0ac9ed724c2d5b8dcc9f5401c (commit) via bd75f0e89817b5708c57efab49e3eb4e035186e2 (commit) via e8629e535bd0e9711b07904d4501de8ad57aaecd (commit) via cd7ed2e3546b12dd98df4211949f1cdbf5827013 (commit) from 93503c127a52c1f6a193750e2bf181a744ba3e6b (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 0b55f349a8b8f4b0ac9ed724c2d5b8dcc9f5401c Author: Jussi Kivilinna Date: Mon Jan 22 22:17:50 2018 +0200 Fix use of AVX instructions in Chaha20 SSSE3 implementation * cipher/chacha20-amd64-ssse3.S: Replace two 'vmovdqa' instructions with 'movdqa'. -- Signed-off-by: Jussi Kivilinna diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 7ad1c0a..f237228 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -190,8 +190,8 @@ _gcry_chacha20_amd64_ssse3_blocks4: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ - vmovdqa .Linc_counter RIP, X0; - vmovdqa .Lunsigned_cmp RIP, X2; + movdqa .Linc_counter RIP, X0; + movdqa .Lunsigned_cmp RIP, X2; pbroadcastd((12 * 4)(INPUT), X12); pbroadcastd((13 * 4)(INPUT), X13); paddd X0, X12; commit bd75f0e89817b5708c57efab49e3eb4e035186e2 Author: Jussi Kivilinna Date: Sat Jan 20 21:12:12 2018 +0200 doc: fix double "See" in front of reference * doc/gcrypt.texi: Change @xref to @ref when text already has 'see' in the front. -- @xref references start with `See ...'. Use @ref instead when text already has 'see' in front. Signed-off-by: Jussi Kivilinna diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index ccb4b82..bba07a4 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1743,7 +1743,7 @@ other cipher functions and returns a handle to it in `hd'. In case of an error, an according error code is returned. The ID of algorithm to use must be specified via @var{algo}. See - at xref{Available ciphers}, for a list of supported ciphers and the + at ref{Available ciphers}, for a list of supported ciphers and the according constants. Besides using the constants directly, the function @@ -1751,7 +1751,7 @@ Besides using the constants directly, the function an algorithm into the according numeric ID. The cipher mode to use must be specified via @var{mode}. 
See - at xref{Available cipher modes}, for a list of supported cipher modes + at ref{Available cipher modes}, for a list of supported cipher modes and the according constants. Note that some modes are incompatible with some algorithms - in particular, stream mode (@code{GCRY_CIPHER_MODE_STREAM}) only works with stream ciphers. @@ -3310,7 +3310,7 @@ may be given as @code{0} if the algorithms to use are later set using @code{gcry_md_enable}. @var{hd} is guaranteed to either receive a valid handle or NULL. -For a list of supported algorithms, see @xref{Available hash +For a list of supported algorithms, see @ref{Available hash algorithms}. The flags allowed for @var{mode} are: @@ -3329,7 +3329,7 @@ algorithm is not an extendable-output function. Note that the function @code{gcry_md_setkey} must be used to set the MAC key. The size of the MAC is equal to the message digest of the underlying hash algorithm. If you want CBC message authentication codes based on a cipher, -see @xref{Working with cipher handles}. +see @ref{Working with cipher handles}. @item GCRY_MD_FLAG_BUGEMU1 @cindex bug emulation @@ -3847,7 +3847,7 @@ bitwise OR of constants described below. @var{hd} is guaranteed to either receive a valid handle or NULL. @var{ctx} is context object to associate MAC object with. @var{ctx} maybe set to NULL. -For a list of supported algorithms, see @xref{Available MAC algorithms}. +For a list of supported algorithms, see @ref{Available MAC algorithms}. The flags allowed for @var{mode} are: @@ -5626,7 +5626,7 @@ self-contained functions. Due to the wide variety of parameters required by different algorithms S-expressions, as flexible way to convey these parameters, are used. There is a set of helper functions to work with these S-expressions. - at c see @xref{S-expression Subsystem Architecture}. + at c see @ref{S-expression Subsystem Architecture}. Aside of functions to register new algorithms, map algorithms names to algorithms identifiers and to lookup properties of a key, the commit e8629e535bd0e9711b07904d4501de8ad57aaecd Author: Jussi Kivilinna Date: Sat Jan 20 21:08:37 2018 +0200 Add EAX mode * cipher/Makefile.am: Add 'cipher-eax.c'. * cipher/cipher-cmac.c (cmac_write): Rename to ... (_gcry_cmac_write): ... this; Take CMAC context as new input parameter; Return error code. (cmac_generate_subkeys): Rename to ... (_gcry_cmac_generate_subkeys): ... this; Take CMAC context as new input parameter; Return error code. (cmac_final): Rename to ... (_gcry_cmac_final): ... this; Take CMAC context as new input parameter; Return error code. (cmac_tag): Take CMAC context as new input parameter. (_gcry_cmac_reset): New. (_gcry_cipher_cmac_authenticate): Remove duplicate tag flag check; Adapt to changes above. (_gcry_cipher_cmac_get_tag): Adapt to changes above. (_gcry_cipher_cmac_check_tag): Ditto. (_gcry_cipher_cmac_set_subkeys): Ditto. * cipher-eax.c: New. * cipher-internal.h (gcry_cmac_context_t): New. (gcry_cipher_handle): Update u_mode.cmac; Add u_mode.eax. (_gcry_cmac_write, _gcry_cmac_generate_subkeys, _gcry_cmac_final) (_gcry_cmac_reset, _gcry_cipher_eax_encrypt, _gcry_cipher_eax_decrypt) (_gcry_cipher_eax_set_nonce, _gcry_cipher_eax_authenticate) (_gcry_cipher_eax_get_tag, _gcry_cipher_eax_check_tag) (_gcry_cipher_eax_setkey): New prototypes. * cipher/cipher.c (_gcry_cipher_open_internal, cipher_setkey) (cipher_reset, cipher_encrypt, cipher_decrypt, _gcry_cipher_setiv) (_gcry_cipher_authenticate, _gcry_cipher_gettag, _gcry_cipher_checktag) (_gcry_cipher_info): Add EAX mode. 
* doc/gcrypt.texi: Add EAX mode. * src/gcrypt.h.in (GCRY_CIPHER_MODE_EAX): New. * tests/basic.c (_check_gcm_cipher, _check_poly1305_cipher): Constify test vectors array. (_check_eax_cipher, check_eax_cipher): New. (check_ciphers, check_cipher_modes): Add EAX mode. * tests/bench-slope.c (bench_eax_encrypt_do_bench) (bench_eax_decrypt_do_bench, bench_eax_authenticate_do_bench) (eax_encrypt_ops, eax_decrypt_ops, eax_authenticate_ops): New. (cipher_modes): Add EAX mode. * tests/benchmark.c (cipher_bench): Add EAX mode. -- Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index bba815b..6e6c5ac 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -44,7 +44,7 @@ cipher.c cipher-internal.h \ cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \ cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ -cipher-poly1305.c cipher-ocb.c cipher-xts.c \ +cipher-poly1305.c cipher-ocb.c cipher-xts.c cipher-eax.c \ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ diff --git a/cipher/cipher-cmac.c b/cipher/cipher-cmac.c index da3ef75..30567b7 100644 --- a/cipher/cipher-cmac.c +++ b/cipher/cipher-cmac.c @@ -1,5 +1,5 @@ /* cmac.c - CMAC, Cipher-based MAC. - * Copyright (C) 2013 Jussi Kivilinna + * Copyright (C) 2013,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -33,8 +33,9 @@ (burn) = (burn) > __nburn ? (burn) : __nburn; } while (0) -static void -cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) +gcry_err_code_t +_gcry_cmac_write (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx, + const byte * inbuf, size_t inlen) { gcry_cipher_encrypt_t enc_fn = c->spec->encrypt; const unsigned int blocksize = c->spec->blocksize; @@ -42,31 +43,37 @@ cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) unsigned int burn = 0; unsigned int nblocks; + if (ctx->tag) + return GPG_ERR_INV_STATE; + /* Tell compiler that we require a cipher with a 64bit or 128 bit block * length, to allow better optimization of this function. */ if (blocksize > 16 || blocksize < 8 || blocksize & (8 - 1)) - return; + return GPG_ERR_INV_CIPHER_MODE; - if (!inlen || !inbuf) - return; + if (!inbuf) + return GPG_ERR_INV_ARG; + + if (inlen == 0) + return 0; /* Last block is needed for cmac_final. 
*/ - if (c->unused + inlen <= blocksize) + if (ctx->mac_unused + inlen <= blocksize) { - for (; inlen && c->unused < blocksize; inlen--) - c->lastiv[c->unused++] = *inbuf++; - return; + for (; inlen && ctx->mac_unused < blocksize; inlen--) + ctx->macbuf[ctx->mac_unused++] = *inbuf++; + return 0; } - if (c->unused) + if (ctx->mac_unused) { - for (; inlen && c->unused < blocksize; inlen--) - c->lastiv[c->unused++] = *inbuf++; + for (; inlen && ctx->mac_unused < blocksize; inlen--) + ctx->macbuf[ctx->mac_unused++] = *inbuf++; - buf_xor (c->u_iv.iv, c->u_iv.iv, c->lastiv, blocksize); - set_burn (burn, enc_fn (&c->context.c, c->u_iv.iv, c->u_iv.iv)); + buf_xor (ctx->u_iv.iv, ctx->u_iv.iv, ctx->macbuf, blocksize); + set_burn (burn, enc_fn (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv)); - c->unused = 0; + ctx->mac_unused = 0; } if (c->bulk.cbc_enc && inlen > blocksize) @@ -74,7 +81,7 @@ cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) nblocks = inlen / blocksize; nblocks -= (nblocks * blocksize == inlen); - c->bulk.cbc_enc (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks, 1); + c->bulk.cbc_enc (&c->context.c, ctx->u_iv.iv, outbuf, inbuf, nblocks, 1); inbuf += nblocks * blocksize; inlen -= nblocks * blocksize; @@ -83,8 +90,8 @@ cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) else while (inlen > blocksize) { - buf_xor (c->u_iv.iv, c->u_iv.iv, inbuf, blocksize); - set_burn (burn, enc_fn (&c->context.c, c->u_iv.iv, c->u_iv.iv)); + buf_xor (ctx->u_iv.iv, ctx->u_iv.iv, inbuf, blocksize); + set_burn (burn, enc_fn (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv)); inlen -= blocksize; inbuf += blocksize; } @@ -93,16 +100,18 @@ cmac_write (gcry_cipher_hd_t c, const byte * inbuf, size_t inlen) if (inlen == 0) BUG (); - for (; inlen && c->unused < blocksize; inlen--) - c->lastiv[c->unused++] = *inbuf++; + for (; inlen && ctx->mac_unused < blocksize; inlen--) + ctx->macbuf[ctx->mac_unused++] = *inbuf++; if (burn) _gcry_burn_stack (burn + 4 * sizeof (void *)); + + return 0; } -static void -cmac_generate_subkeys (gcry_cipher_hd_t c) +gcry_err_code_t +_gcry_cmac_generate_subkeys (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx) { const unsigned int blocksize = c->spec->blocksize; byte rb, carry, t, bi; @@ -117,7 +126,7 @@ cmac_generate_subkeys (gcry_cipher_hd_t c) /* Tell compiler that we require a cipher with a 64bit or 128 bit block * length, to allow better optimization of this function. */ if (blocksize > 16 || blocksize < 8 || blocksize & (8 - 1)) - return; + return GPG_ERR_INV_CIPHER_MODE; if (MAX_BLOCKSIZE < blocksize) BUG (); @@ -127,7 +136,7 @@ cmac_generate_subkeys (gcry_cipher_hd_t c) burn = c->spec->encrypt (&c->context.c, u.buf, u.buf); /* Currently supported blocksizes are 16 and 8. */ - rb = blocksize == 16 ? 0x87 : 0x1B /*blocksize == 8 */ ; + rb = blocksize == 16 ? 0x87 : 0x1B /* blocksize == 8 */ ; for (j = 0; j < 2; j++) { @@ -139,93 +148,113 @@ cmac_generate_subkeys (gcry_cipher_hd_t c) t = carry | (bi << 1); carry = bi >> 7; u.buf[i] = t & 0xff; - c->u_mode.cmac.subkeys[j][i] = u.buf[i]; + ctx->subkeys[j][i] = u.buf[i]; } u.buf[blocksize - 1] ^= carry ? 
rb : 0; - c->u_mode.cmac.subkeys[j][blocksize - 1] = u.buf[blocksize - 1]; + ctx->subkeys[j][blocksize - 1] = u.buf[blocksize - 1]; } wipememory (&u, sizeof (u)); if (burn) _gcry_burn_stack (burn + 4 * sizeof (void *)); + + return 0; } -static void -cmac_final (gcry_cipher_hd_t c) +gcry_err_code_t +_gcry_cmac_final (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx) { const unsigned int blocksize = c->spec->blocksize; - unsigned int count = c->unused; + unsigned int count = ctx->mac_unused; unsigned int burn; byte *subkey; /* Tell compiler that we require a cipher with a 64bit or 128 bit block * length, to allow better optimization of this function. */ if (blocksize > 16 || blocksize < 8 || blocksize & (8 - 1)) - return; + return GPG_ERR_INV_CIPHER_MODE; if (count == blocksize) - subkey = c->u_mode.cmac.subkeys[0]; /* K1 */ + subkey = ctx->subkeys[0]; /* K1 */ else { - subkey = c->u_mode.cmac.subkeys[1]; /* K2 */ - c->lastiv[count++] = 0x80; + subkey = ctx->subkeys[1]; /* K2 */ + ctx->macbuf[count++] = 0x80; while (count < blocksize) - c->lastiv[count++] = 0; + ctx->macbuf[count++] = 0; } - buf_xor (c->lastiv, c->lastiv, subkey, blocksize); + buf_xor (ctx->macbuf, ctx->macbuf, subkey, blocksize); - buf_xor (c->u_iv.iv, c->u_iv.iv, c->lastiv, blocksize); - burn = c->spec->encrypt (&c->context.c, c->u_iv.iv, c->u_iv.iv); + buf_xor (ctx->u_iv.iv, ctx->u_iv.iv, ctx->macbuf, blocksize); + burn = c->spec->encrypt (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv); if (burn) _gcry_burn_stack (burn + 4 * sizeof (void *)); - c->unused = 0; + ctx->mac_unused = 0; + + return 0; } static gcry_err_code_t -cmac_tag (gcry_cipher_hd_t c, unsigned char *tag, size_t taglen, int check) +cmac_tag (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx, + unsigned char *tag, size_t taglen, int check) { + gcry_err_code_t ret; + if (!tag || taglen == 0 || taglen > c->spec->blocksize) return GPG_ERR_INV_ARG; - if (!c->u_mode.cmac.tag) + if (!ctx->tag) { - cmac_final (c); - c->u_mode.cmac.tag = 1; + ret = _gcry_cmac_final (c, ctx); + if (ret != 0) + return ret; + + ctx->tag = 1; } if (!check) { - memcpy (tag, c->u_iv.iv, taglen); + memcpy (tag, ctx->u_iv.iv, taglen); return GPG_ERR_NO_ERROR; } else { - return buf_eq_const (tag, c->u_iv.iv, taglen) ? + return buf_eq_const (tag, ctx->u_iv.iv, taglen) ? GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM; } } +void +_gcry_cmac_reset (gcry_cmac_context_t *ctx) +{ + char tmp_buf[sizeof(ctx->subkeys)]; + + /* Only keep subkeys when reseting context. */ + + buf_cpy (tmp_buf, ctx->subkeys, sizeof(ctx->subkeys)); + memset (ctx, 0, sizeof(*ctx)); + buf_cpy (ctx->subkeys, tmp_buf, sizeof(ctx->subkeys)); + wipememory (tmp_buf, sizeof(tmp_buf)); +} + + gcry_err_code_t _gcry_cipher_cmac_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen) { if (abuflen > 0 && !abuf) return GPG_ERR_INV_ARG; - if (c->u_mode.cmac.tag) - return GPG_ERR_INV_STATE; /* To support new blocksize, update cmac_generate_subkeys() then add new blocksize here. 
*/ if (c->spec->blocksize != 16 && c->spec->blocksize != 8) return GPG_ERR_INV_CIPHER_MODE; - cmac_write (c, abuf, abuflen); - - return GPG_ERR_NO_ERROR; + return _gcry_cmac_write (c, &c->u_mode.cmac, abuf, abuflen); } @@ -233,7 +262,7 @@ gcry_err_code_t _gcry_cipher_cmac_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen) { - return cmac_tag (c, outtag, taglen, 0); + return cmac_tag (c, &c->u_mode.cmac, outtag, taglen, 0); } @@ -241,13 +270,11 @@ gcry_err_code_t _gcry_cipher_cmac_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen) { - return cmac_tag (c, (unsigned char *) intag, taglen, 1); + return cmac_tag (c, &c->u_mode.cmac, (unsigned char *) intag, taglen, 1); } gcry_err_code_t _gcry_cipher_cmac_set_subkeys (gcry_cipher_hd_t c) { - cmac_generate_subkeys (c); - - return GPG_ERR_NO_ERROR; + return _gcry_cmac_generate_subkeys (c, &c->u_mode.cmac); } diff --git a/cipher/cipher-eax.c b/cipher/cipher-eax.c new file mode 100644 index 0000000..1ce4797 --- /dev/null +++ b/cipher/cipher-eax.c @@ -0,0 +1,248 @@ +/* cipher-eax.c - EAX implementation + * Copyright (C) 2018 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include +#include +#include +#include +#include + +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "./cipher-internal.h" + + +gcry_err_code_t +_gcry_cipher_eax_encrypt (gcry_cipher_hd_t c, + byte *outbuf, size_t outbuflen, + const byte *inbuf, size_t inbuflen) +{ + gcry_err_code_t err; + + if (outbuflen < inbuflen) + return GPG_ERR_BUFFER_TOO_SHORT; + if (c->marks.tag) + return GPG_ERR_INV_STATE; + + if (!c->marks.iv) + { + err = _gcry_cipher_eax_set_nonce (c, NULL, 0); + if (err != 0) + return err; + } + + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); + if (err != 0) + return err; + + return _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, outbuf, inbuflen); +} + + +gcry_err_code_t +_gcry_cipher_eax_decrypt (gcry_cipher_hd_t c, + byte *outbuf, size_t outbuflen, + const byte *inbuf, size_t inbuflen) +{ + gcry_err_code_t err; + + if (outbuflen < inbuflen) + return GPG_ERR_BUFFER_TOO_SHORT; + if (c->marks.tag) + return GPG_ERR_INV_STATE; + + if (!c->marks.iv) + { + err = _gcry_cipher_eax_set_nonce (c, NULL, 0); + if (err != 0) + return err; + } + + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf, inbuflen); + if (err != 0) + return err; + + return _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); +} + + +gcry_err_code_t +_gcry_cipher_eax_authenticate (gcry_cipher_hd_t c, + const byte * aadbuf, size_t aadbuflen) +{ + gcry_err_code_t err; + + if (c->marks.tag) + return GPG_ERR_INV_STATE; + + if (!c->marks.iv) + { + err = _gcry_cipher_eax_set_nonce (c, NULL, 0); + if (err != 0) + return err; + } + + return _gcry_cmac_write (c, &c->u_mode.eax.cmac_header, aadbuf, aadbuflen); +} + + +gcry_err_code_t +_gcry_cipher_eax_setkey (gcry_cipher_hd_t c) +{ + gcry_err_code_t err; + + err = _gcry_cmac_generate_subkeys (c, &c->u_mode.eax.cmac_header); + if (err != 0) + return err; + + buf_cpy (c->u_mode.eax.cmac_ciphertext.subkeys, + c->u_mode.eax.cmac_header.subkeys, + sizeof(c->u_mode.eax.cmac_header.subkeys)); + + return 0; +} + + +gcry_err_code_t +_gcry_cipher_eax_set_nonce (gcry_cipher_hd_t c, const byte *nonce, + size_t noncelen) +{ + gcry_cmac_context_t nonce_cmac; + unsigned char initbuf[MAX_BLOCKSIZE]; + gcry_err_code_t err; + + c->marks.iv = 0; + c->marks.tag = 0; + + _gcry_cmac_reset (&c->u_mode.eax.cmac_header); + _gcry_cmac_reset (&c->u_mode.eax.cmac_ciphertext); + + /* Calculate nonce CMAC */ + + memset(&nonce_cmac, 0, sizeof(nonce_cmac)); + memset(&initbuf, 0, sizeof(initbuf)); + + buf_cpy (&nonce_cmac.subkeys, c->u_mode.eax.cmac_header.subkeys, + sizeof(c->u_mode.eax.cmac_header.subkeys)); + + err = _gcry_cmac_write (c, &nonce_cmac, initbuf, c->spec->blocksize); + if (err != 0) + return err; + + if (noncelen != 0) + { + err = _gcry_cmac_write (c, &nonce_cmac, nonce, noncelen); + if (err != 0) + return err; + } + + err = _gcry_cmac_final (c, &nonce_cmac); + if (err != 0) + return err; + + buf_cpy (c->u_iv.iv, nonce_cmac.u_iv.iv, MAX_BLOCKSIZE); + buf_cpy (c->u_ctr.ctr, nonce_cmac.u_iv.iv, MAX_BLOCKSIZE); + + wipememory (&nonce_cmac, sizeof(nonce_cmac)); + + /* Prepare header CMAC */ + + initbuf[c->spec->blocksize - 1] = 1; + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_header, initbuf, + c->spec->blocksize); + if (err != 0) + return err; + + /* Prepare ciphertext CMAC */ + + initbuf[c->spec->blocksize - 1] = 2; + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, initbuf, + c->spec->blocksize); + if (err != 0) + return err; + + c->marks.iv = 1; + c->marks.tag = 0; + + return 0; +} + + 
+static gcry_err_code_t +_gcry_cipher_eax_tag (gcry_cipher_hd_t c, + byte *outbuf, size_t outbuflen, int check) +{ + gcry_err_code_t err; + + if (!c->marks.tag) + { + err = _gcry_cmac_final (c, &c->u_mode.eax.cmac_header); + if (err != 0) + return err; + + err = _gcry_cmac_final (c, &c->u_mode.eax.cmac_ciphertext); + if (err != 0) + return err; + + buf_xor_1 (c->u_iv.iv, c->u_mode.eax.cmac_header.u_iv.iv, MAX_BLOCKSIZE); + buf_xor_1 (c->u_iv.iv, c->u_mode.eax.cmac_ciphertext.u_iv.iv, + MAX_BLOCKSIZE); + + _gcry_cmac_reset (&c->u_mode.eax.cmac_header); + _gcry_cmac_reset (&c->u_mode.eax.cmac_ciphertext); + + c->marks.tag = 1; + } + + if (!check) + { + if (outbuflen > c->spec->blocksize) + outbuflen = c->spec->blocksize; + + /* NB: We already checked that OUTBUF is large enough to hold + * the result or has valid truncated length. */ + memcpy (outbuf, c->u_iv.iv, outbuflen); + } + else + { + /* OUTBUFLEN gives the length of the user supplied tag in OUTBUF + * and thus we need to compare its length first. */ + if (!(outbuflen <= c->spec->blocksize) + || !buf_eq_const (outbuf, c->u_iv.iv, outbuflen)) + return GPG_ERR_CHECKSUM; + } + + return 0; +} + + +gcry_err_code_t +_gcry_cipher_eax_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, + size_t taglen) +{ + return _gcry_cipher_eax_tag (c, outtag, taglen, 0); +} + +gcry_err_code_t +_gcry_cipher_eax_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, + size_t taglen) +{ + return _gcry_cipher_eax_tag (c, (unsigned char *) intag, taglen, 1); +} diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 8c897d7..a0ede5e 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -109,6 +109,25 @@ typedef union } cipher_context_alignment_t; +/* Storage structure for CMAC, for CMAC and EAX modes. */ +typedef struct { + /* The initialization vector. Also contains tag after finalization. */ + union { + cipher_context_alignment_t iv_align; + unsigned char iv[MAX_BLOCKSIZE]; + } u_iv; + + /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */ + unsigned char subkeys[2][MAX_BLOCKSIZE]; + + /* Space to save partial input lengths for MAC. */ + unsigned char macbuf[MAX_BLOCKSIZE]; + + int mac_unused; /* Number of unprocessed bytes in MACBUF. */ + unsigned int tag:1; /* Set to 1 if tag has been finalized. */ +} gcry_cmac_context_t; + + /* The handle structure. */ struct gcry_cipher_handle { @@ -197,7 +216,7 @@ struct gcry_cipher_handle unsigned char s0[GCRY_CCM_BLOCK_LEN]; - unsigned int nonce:1;/* Set to 1 if nonce has been set. */ + unsigned int nonce:1; /* Set to 1 if nonce has been set. */ unsigned int lengths:1; /* Set to 1 if CCM length parameters has been processed. */ } ccm; @@ -217,12 +236,16 @@ struct gcry_cipher_handle } poly1305; /* Mode specific storage for CMAC mode. */ + gcry_cmac_context_t cmac; + + /* Mode specific storage for EAX mode. */ struct { - unsigned int tag:1; /* Set to 1 if tag has been finalized. */ + /* CMAC for header (AAD). */ + gcry_cmac_context_t cmac_header; - /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */ - unsigned char subkeys[2][MAX_BLOCKSIZE]; - } cmac; + /* CMAC for ciphertext. */ + gcry_cmac_context_t cmac_ciphertext; + } eax; /* Mode specific storage for GCM mode. */ struct { @@ -236,7 +259,6 @@ struct gcry_cipher_handle unsigned char macbuf[GCRY_CCM_BLOCK_LEN]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ - /* byte counters for GCM */ u32 aadlen[2]; u32 datalen[2]; @@ -309,7 +331,6 @@ struct gcry_cipher_handle processed. 
*/ unsigned int data_finalized:1; unsigned int aad_finalized:1; - } ocb; /* Mode specific storage for XTS mode. */ @@ -406,6 +427,42 @@ gcry_err_code_t _gcry_cipher_ccm_check_tag const unsigned char *intag, size_t taglen); +/*-- cipher-cmac.c --*/ +gcry_err_code_t _gcry_cmac_generate_subkeys +/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx); +gcry_err_code_t _gcry_cmac_write +/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx, + const byte * inbuf, size_t inlen); +gcry_err_code_t _gcry_cmac_final +/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx); +void _gcry_cmac_reset (gcry_cmac_context_t *ctx); + + +/*-- cipher-eax.c --*/ +gcry_err_code_t _gcry_cipher_eax_encrypt +/* */ (gcry_cipher_hd_t c, + unsigned char *outbuf, size_t outbuflen, + const unsigned char *inbuf, size_t inbuflen); +gcry_err_code_t _gcry_cipher_eax_decrypt +/* */ (gcry_cipher_hd_t c, + unsigned char *outbuf, size_t outbuflen, + const unsigned char *inbuf, size_t inbuflen); +gcry_err_code_t _gcry_cipher_eax_set_nonce +/* */ (gcry_cipher_hd_t c, + const unsigned char *nonce, size_t noncelen); +gcry_err_code_t _gcry_cipher_eax_authenticate +/* */ (gcry_cipher_hd_t c, + const unsigned char *aadbuf, size_t aadbuflen); +gcry_err_code_t _gcry_cipher_eax_get_tag +/* */ (gcry_cipher_hd_t c, + unsigned char *outtag, size_t taglen); +gcry_err_code_t _gcry_cipher_eax_check_tag +/* */ (gcry_cipher_hd_t c, + const unsigned char *intag, size_t taglen); +gcry_err_code_t _gcry_cipher_eax_setkey +/* */ (gcry_cipher_hd_t c); + + /*-- cipher-gcm.c --*/ gcry_err_code_t _gcry_cipher_gcm_encrypt /* */ (gcry_cipher_hd_t c, diff --git a/cipher/cipher.c b/cipher/cipher.c index 18b2591..1bef766 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -420,6 +420,7 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, case GCRY_CIPHER_MODE_CTR: case GCRY_CIPHER_MODE_AESWRAP: case GCRY_CIPHER_MODE_CMAC: + case GCRY_CIPHER_MODE_EAX: case GCRY_CIPHER_MODE_GCM: if (!spec->encrypt || !spec->decrypt) err = GPG_ERR_INV_CIPHER_MODE; @@ -688,7 +689,11 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen) switch (c->mode) { case GCRY_CIPHER_MODE_CMAC: - _gcry_cipher_cmac_set_subkeys (c); + rc = _gcry_cipher_cmac_set_subkeys (c); + break; + + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_setkey (c); break; case GCRY_CIPHER_MODE_GCM: @@ -782,8 +787,12 @@ cipher_reset (gcry_cipher_hd_t c) switch (c->mode) { case GCRY_CIPHER_MODE_CMAC: - /* Only clear 'tag' for cmac, keep subkeys. 
*/ - c->u_mode.cmac.tag = 0; + _gcry_cmac_reset(&c->u_mode.cmac); + break; + + case GCRY_CIPHER_MODE_EAX: + _gcry_cmac_reset(&c->u_mode.eax.cmac_header); + _gcry_cmac_reset(&c->u_mode.eax.cmac_ciphertext); break; case GCRY_CIPHER_MODE_GCM: @@ -929,6 +938,10 @@ cipher_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, rc = GPG_ERR_INV_CIPHER_MODE; break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); break; @@ -1060,6 +1073,10 @@ cipher_decrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, rc = GPG_ERR_INV_CIPHER_MODE; break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_decrypt (c, outbuf, outbuflen, inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_decrypt (c, outbuf, outbuflen, inbuf, inbuflen); break; @@ -1158,6 +1175,10 @@ _gcry_cipher_setiv (gcry_cipher_hd_t hd, const void *iv, size_t ivlen) rc = _gcry_cipher_ccm_set_nonce (hd, iv, ivlen); break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_set_nonce (hd, iv, ivlen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_setiv (hd, iv, ivlen); break; @@ -1226,6 +1247,10 @@ _gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf, rc = _gcry_cipher_cmac_authenticate (hd, abuf, abuflen); break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_authenticate (hd, abuf, abuflen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_authenticate (hd, abuf, abuflen); break; @@ -1263,6 +1288,10 @@ _gcry_cipher_gettag (gcry_cipher_hd_t hd, void *outtag, size_t taglen) rc = _gcry_cipher_cmac_get_tag (hd, outtag, taglen); break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_get_tag (hd, outtag, taglen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_get_tag (hd, outtag, taglen); break; @@ -1300,6 +1329,10 @@ _gcry_cipher_checktag (gcry_cipher_hd_t hd, const void *intag, size_t taglen) rc = _gcry_cipher_cmac_check_tag (hd, intag, taglen); break; + case GCRY_CIPHER_MODE_EAX: + rc = _gcry_cipher_eax_check_tag (hd, intag, taglen); + break; + case GCRY_CIPHER_MODE_GCM: rc = _gcry_cipher_gcm_check_tag (hd, intag, taglen); break; @@ -1501,6 +1534,10 @@ _gcry_cipher_info (gcry_cipher_hd_t h, int cmd, void *buffer, size_t *nbytes) *nbytes = h->u_mode.ccm.authlen; break; + case GCRY_CIPHER_MODE_EAX: + *nbytes = h->spec->blocksize; + break; + case GCRY_CIPHER_MODE_GCM: *nbytes = GCRY_GCM_BLOCK_LEN; break; diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 7831505..ccb4b82 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1722,6 +1722,12 @@ value is automatically incremented after each call of Auto-increment allows avoiding need of setting IV between processing of sequential data units. + at item GCRY_CIPHER_MODE_EAX + at cindex EAX, EAX mode +EAX is an Authenticated Encryption with Associated Data (AEAD) block cipher +mode by Bellare, Rogaway, and Wagner (see + at uref{http://web.cs.ucdavis.edu/~rogaway/papers/eax.html}). + @end table @node Working with cipher handles @@ -1752,12 +1758,13 @@ with some algorithms - in particular, stream mode Poly1305 AEAD mode (@code{GCRY_CIPHER_MODE_POLY1305}) only works with ChaCha20 stream cipher. The block cipher modes (@code{GCRY_CIPHER_MODE_ECB}, @code{GCRY_CIPHER_MODE_CBC}, - at code{GCRY_CIPHER_MODE_CFB}, @code{GCRY_CIPHER_MODE_OFB} and - at code{GCRY_CIPHER_MODE_CTR}) will work with any block cipher -algorithm. 
GCM mode (@code{GCRY_CIPHER_MODE_CCM}), CCM mode -(@code{GCRY_CIPHER_MODE_GCM}), OCB mode (@code{GCRY_CIPHER_MODE_OCB}), -and XTS mode (@code{GCRY_CIPHER_MODE_XTS}) will only work -with block cipher algorithms which have the block size of 16 bytes. + at code{GCRY_CIPHER_MODE_CFB}, @code{GCRY_CIPHER_MODE_OFB}, + at code{GCRY_CIPHER_MODE_CTR} and @code{GCRY_CIPHER_MODE_EAX}) will work +with any block cipher algorithm. GCM mode +(@code{GCRY_CIPHER_MODE_CCM}), CCM mode (@code{GCRY_CIPHER_MODE_GCM}), +OCB mode (@code{GCRY_CIPHER_MODE_OCB}), and XTS mode +(@code{GCRY_CIPHER_MODE_XTS}) will only work with block cipher +algorithms which have the block size of 16 bytes. The third argument @var{flags} can either be passed as @code{0} or as the bit-wise OR of the following constants. diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 1eb3d7c..83f94b6 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -971,7 +971,8 @@ enum gcry_cipher_modes GCRY_CIPHER_MODE_POLY1305 = 10, /* Poly1305 based AEAD mode. */ GCRY_CIPHER_MODE_OCB = 11, /* OCB3 mode. */ GCRY_CIPHER_MODE_CFB8 = 12, /* Cipher feedback (8 bit mode). */ - GCRY_CIPHER_MODE_XTS = 13 /* XTS mode. */ + GCRY_CIPHER_MODE_XTS = 13, /* XTS mode. */ + GCRY_CIPHER_MODE_EAX = 14 /* EAX mode. */ }; /* Flags used with the open function. */ diff --git a/tests/basic.c b/tests/basic.c index c2b4208..c883eb3 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -1347,7 +1347,7 @@ check_ofb_cipher (void) static void _check_gcm_cipher (unsigned int step) { - struct tv + static const struct tv { int algo; char key[MAX_DATA_LEN]; @@ -1891,9 +1891,542 @@ check_gcm_cipher (void) static void +_check_eax_cipher (unsigned int step) +{ + static const struct tv + { + int algo; + char key[MAX_DATA_LEN]; + char nonce[MAX_DATA_LEN]; + int noncelen; + unsigned char header[MAX_DATA_LEN]; + int headerlen; + unsigned char plaintext[MAX_DATA_LEN]; + int inlen; + char out[MAX_DATA_LEN]; + char tag[MAX_DATA_LEN]; + int taglen; + int should_fail; + } tv[] = + { + /* Test vectors from http://www.cs.ucdavis.edu/~rogaway/papers/eax.pdf */ + { GCRY_CIPHER_AES, + "\x23\x39\x52\xDE\xE4\xD5\xED\x5F\x9B\x9C\x6D\x6F\xF8\x0F\xF4\x78", + "\x62\xEC\x67\xF9\xC3\xA4\xA4\x07\xFC\xB2\xA8\xC4\x90\x31\xA8\xB3", 16, + "\x6B\xFB\x91\x4F\xD0\x7E\xAE\x6B", 8, + "", + 0, + "", + "\xE0\x37\x83\x0E\x83\x89\xF2\x7B\x02\x5A\x2D\x65\x27\xE7\x9D\x01", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x91\x94\x5D\x3F\x4D\xCB\xEE\x0B\xF4\x5E\xF5\x22\x55\xF0\x95\xA4", + "\xBE\xCA\xF0\x43\xB0\xA2\x3D\x84\x31\x94\xBA\x97\x2C\x66\xDE\xBD", 16, + "\xFA\x3B\xFD\x48\x06\xEB\x53\xFA", 8, + "\xF7\xFB", + 2, + "\x19\xDD", + "\x5C\x4C\x93\x31\x04\x9D\x0B\xDA\xB0\x27\x74\x08\xF6\x79\x67\xE5", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x01\xF7\x4A\xD6\x40\x77\xF2\xE7\x04\xC0\xF6\x0A\xDA\x3D\xD5\x23", + "\x70\xC3\xDB\x4F\x0D\x26\x36\x84\x00\xA1\x0E\xD0\x5D\x2B\xFF\x5E", 16, + "\x23\x4A\x34\x63\xC1\x26\x4A\xC6", 8, + "\x1A\x47\xCB\x49\x33", + 5, + "\xD8\x51\xD5\xBA\xE0", + "\x3A\x59\xF2\x38\xA2\x3E\x39\x19\x9D\xC9\x26\x66\x26\xC4\x0F\x80", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\xD0\x7C\xF6\xCB\xB7\xF3\x13\xBD\xDE\x66\xB7\x27\xAF\xD3\xC5\xE8", + "\x84\x08\xDF\xFF\x3C\x1A\x2B\x12\x92\xDC\x19\x9E\x46\xB7\xD6\x17", 16, + "\x33\xCC\xE2\xEA\xBF\xF5\xA7\x9D", 8, + "\x48\x1C\x9E\x39\xB1", + 5, + "\x63\x2A\x9D\x13\x1A", + "\xD4\xC1\x68\xA4\x22\x5D\x8E\x1F\xF7\x55\x93\x99\x74\xA7\xBE\xDE", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x35\xB6\xD0\x58\x00\x05\xBB\xC1\x2B\x05\x87\x12\x45\x57\xD2\xC2", + 
"\xFD\xB6\xB0\x66\x76\xEE\xDC\x5C\x61\xD7\x42\x76\xE1\xF8\xE8\x16", 16, + "\xAE\xB9\x6E\xAE\xBE\x29\x70\xE9", 8, + "\x40\xD0\xC0\x7D\xA5\xE4", + 6, + "\x07\x1D\xFE\x16\xC6\x75", + "\xCB\x06\x77\xE5\x36\xF7\x3A\xFE\x6A\x14\xB7\x4E\xE4\x98\x44\xDD", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\xBD\x8E\x6E\x11\x47\x5E\x60\xB2\x68\x78\x4C\x38\xC6\x2F\xEB\x22", + "\x6E\xAC\x5C\x93\x07\x2D\x8E\x85\x13\xF7\x50\x93\x5E\x46\xDA\x1B", 16, + "\xD4\x48\x2D\x1C\xA7\x8D\xCE\x0F", 8, + "\x4D\xE3\xB3\x5C\x3F\xC0\x39\x24\x5B\xD1\xFB\x7D", + 12, + "\x83\x5B\xB4\xF1\x5D\x74\x3E\x35\x0E\x72\x84\x14", + "\xAB\xB8\x64\x4F\xD6\xCC\xB8\x69\x47\xC5\xE1\x05\x90\x21\x0A\x4F", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x7C\x77\xD6\xE8\x13\xBE\xD5\xAC\x98\xBA\xA4\x17\x47\x7A\x2E\x7D", + "\x1A\x8C\x98\xDC\xD7\x3D\x38\x39\x3B\x2B\xF1\x56\x9D\xEE\xFC\x19", 16, + "\x65\xD2\x01\x79\x90\xD6\x25\x28", 8, + "\x8B\x0A\x79\x30\x6C\x9C\xE7\xED\x99\xDA\xE4\xF8\x7F\x8D\xD6\x16\x36", + 17, + "\x02\x08\x3E\x39\x79\xDA\x01\x48\x12\xF5\x9F\x11\xD5\x26\x30\xDA\x30", + "\x13\x73\x27\xD1\x06\x49\xB0\xAA\x6E\x1C\x18\x1D\xB6\x17\xD7\xF2", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x5F\xFF\x20\xCA\xFA\xB1\x19\xCA\x2F\xC7\x35\x49\xE2\x0F\x5B\x0D", + "\xDD\xE5\x9B\x97\xD7\x22\x15\x6D\x4D\x9A\xFF\x2B\xC7\x55\x98\x26", 16, + "\x54\xB9\xF0\x4E\x6A\x09\x18\x9A", 8, + "\x1B\xDA\x12\x2B\xCE\x8A\x8D\xBA\xF1\x87\x7D\x96\x2B\x85\x92\xDD" + "\x2D\x56", + 18, + "\x2E\xC4\x7B\x2C\x49\x54\xA4\x89\xAF\xC7\xBA\x48\x97\xED\xCD\xAE" + "\x8C\xC3", + "\x3B\x60\x45\x05\x99\xBD\x02\xC9\x63\x82\x90\x2A\xEF\x7F\x83\x2A", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\xA4\xA4\x78\x2B\xCF\xFD\x3E\xC5\xE7\xEF\x6D\x8C\x34\xA5\x61\x23", + "\xB7\x81\xFC\xF2\xF7\x5F\xA5\xA8\xDE\x97\xA9\xCA\x48\xE5\x22\xEC", 16, + "\x89\x9A\x17\x58\x97\x56\x1D\x7E", 8, + "\x6C\xF3\x67\x20\x87\x2B\x85\x13\xF6\xEA\xB1\xA8\xA4\x44\x38\xD5" + "\xEF\x11", + 18, + "\x0D\xE1\x8F\xD0\xFD\xD9\x1E\x7A\xF1\x9F\x1D\x8E\xE8\x73\x39\x38" + "\xB1\xE8", + "\xE7\xF6\xD2\x23\x16\x18\x10\x2F\xDB\x7F\xE5\x5F\xF1\x99\x17\x00", 16, + 0 + }, + { GCRY_CIPHER_AES, + "\x83\x95\xFC\xF1\xE9\x5B\xEB\xD6\x97\xBD\x01\x0B\xC7\x66\xAA\xC3", + "\x22\xE7\xAD\xD9\x3C\xFC\x63\x93\xC5\x7E\xC0\xB3\xC1\x7D\x6B\x44", 16, + "\x12\x67\x35\xFC\xC3\x20\xD2\x5A", 8, + "\xCA\x40\xD7\x44\x6E\x54\x5F\xFA\xED\x3B\xD1\x2A\x74\x0A\x65\x9F" + "\xFB\xBB\x3C\xEA\xB7", + 21, + "\xCB\x89\x20\xF8\x7A\x6C\x75\xCF\xF3\x96\x27\xB5\x6E\x3E\xD1\x97" + "\xC5\x52\xD2\x95\xA7", + "\xCF\xC4\x6A\xFC\x25\x3B\x46\x52\xB1\xAF\x37\x95\xB1\x24\xAB\x6E", 16, + 0 + }, + /* Negative test for bad tag. */ + { GCRY_CIPHER_AES, + "\x23\x39\x52\xDE\xE4\xD5\xED\x5F\x9B\x9C\x6D\x6F\xF8\x0F\xF4\x78", + "\x62\xEC\x67\xF9\xC3\xA4\xA4\x07\xFC\xB2\xA8\xC4\x90\x31\xA8\xB3", 16, + "\x6B\xFB\x91\x4F\xD0\x7E\xAE\x6B", 8, + "", + 0, + "", + "\x00\x37\x83\x0E\x83\x89\xF2\x7B\x02\x5A\x2D\x65\x27\xE7\x9D\x01", 16, + 1 + }, + /* Test vectors from libtomcrypt. 
*/ + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "", 0, + "", 0, + "", + 0, + "", + "\x9a\xd0\x7e\x7d\xbf\xf3\x01\xf5\x05\xde\x59\x6b\x96\x15\xdf\xff", 16, + 0 + }, + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16, + "", 0, + "", + 0, + "", + "\x1c\xe1\x0d\x3e\xff\xd4\xca\xdb\xe2\xe4\x4b\x58\xd6\x0a\xb9\xec", 16, + 0 + }, + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "", 0, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16, + "", + 0, + "", + "\x3a\x69\x8f\x7a\x27\x0e\x51\xb0\xf6\x5b\x3d\x3e\x47\x19\x3c\xff", 16, + 0 + }, + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", + 32, + "\x29\xd8\x78\xd1\xa3\xbe\x85\x7b\x6f\xb8\xc8\xea\x59\x50\xa7\x78" + "\x33\x1f\xbf\x2c\xcf\x33\x98\x6f\x35\xe8\xcf\x12\x1d\xcb\x30\xbc", + "\x4f\xbe\x03\x38\xbe\x1c\x8c\x7e\x1d\x7a\xe7\xe4\x5b\x92\xc5\x87", 16, + 0 + }, + { + GCRY_CIPHER_AES, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e", 15, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d", 14, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c", + 29, + "\xdd\x25\xc7\x54\xc5\xb1\x7c\x59\x28\xb6\x9b\x73\x15\x5f\x7b\xb8" + "\x88\x8f\xaf\x37\x09\x1a\xd9\x2c\x8a\x24\xdb\x86\x8b", + "\x0d\x1a\x14\xe5\x22\x24\xff\xd2\x3a\x05\xfa\x02\xcd\xef\x52\xda", 16, + 0 + }, + }; + + gcry_cipher_hd_t hde, hdd; + unsigned char out[MAX_DATA_LEN]; + unsigned char tag[16]; + int i, keylen; + gcry_error_t err = 0; + size_t pos, poslen, taglen2; + int byteNum; + + if (verbose) + fprintf (stderr, " Starting EAX checks.\n"); + + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) + { + if (gcry_cipher_test_algo (tv[i].algo) && in_fips_mode) + { + if (verbose) + fprintf (stderr, " algorithm %d not available in fips mode\n", + tv[i].algo); + continue; + } + + if (verbose) + fprintf (stderr, " checking EAX mode for %s [%i]\n", + gcry_cipher_algo_name (tv[i].algo), + tv[i].algo); + err = gcry_cipher_open (&hde, tv[i].algo, GCRY_CIPHER_MODE_EAX, 0); + if (!err) + err = gcry_cipher_open (&hdd, tv[i].algo, GCRY_CIPHER_MODE_EAX, 0); + if (err) + { + fail ("aes-eax, gcry_cipher_open failed: %s\n", gpg_strerror (err)); + return; + } + + keylen = gcry_cipher_get_algo_keylen(tv[i].algo); + if (!keylen) + { + fail ("aes-eax, gcry_cipher_get_algo_keylen failed\n"); + return; + } + + err = gcry_cipher_setkey (hde, tv[i].key, keylen); + if (!err) + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); + if (err) + { + fail ("aes-eax, gcry_cipher_setkey failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_setiv (hde, tv[i].nonce, tv[i].noncelen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].nonce, tv[i].noncelen); + if (err) + { + fail ("aes-eax, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_info (hde, 
GCRYCTL_GET_TAGLEN, NULL, &taglen2); + if (err) + { + fail ("cipher-eax, gcryctl_get_taglen failed (tv %d): %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + if (taglen2 != 16) + { + fail ("cipher-eax, gcryctl_get_taglen returned bad length" + " (tv %d): got=%zu want=%d\n", + i, taglen2, 16); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + for (pos = 0; pos < tv[i].headerlen; pos += step) + { + poslen = (pos + step < tv[i].headerlen) ? + step : tv[i].headerlen - pos; + + err = gcry_cipher_authenticate(hde, tv[i].header + pos, poslen); + if (err) + { + fail ("aes-eax, gcry_cipher_authenticate (%d) (%lu:%d) failed: " + "%s\n", i, (unsigned long) pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + err = gcry_cipher_authenticate(hdd, tv[i].header + pos, poslen); + if (err) + { + fail ("aes-eax, de gcry_cipher_authenticate (%d) (%lu:%d) failed: " + "%s\n", i, (unsigned long) pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + for (pos = 0; pos < tv[i].inlen; pos += step) + { + poslen = (pos + step < tv[i].inlen) ? step : tv[i].inlen - pos; + + err = gcry_cipher_encrypt (hde, out + pos, poslen, + tv[i].plaintext + pos, poslen); + if (err) + { + fail ("aes-eax, gcry_cipher_encrypt (%d) (%lu:%d) failed: %s\n", + i, (unsigned long) pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("aes-eax, encrypt mismatch entry %d (step %d)\n", i, step); + + for (pos = 0; pos < tv[i].inlen; pos += step) + { + poslen = (pos + step < tv[i].inlen) ? step : tv[i].inlen - pos; + + err = gcry_cipher_decrypt (hdd, out + pos, poslen, NULL, 0); + if (err) + { + fail ("aes-eax, gcry_cipher_decrypt (%d) (%lu:%d) failed: %s\n", + i, (unsigned long) pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("aes-eax, decrypt mismatch entry %d (step %d)\n", i, step); + + taglen2 = tv[i].taglen ? 
tv[i].taglen : 16; + + err = gcry_cipher_gettag (hde, out, taglen2); + if (err) + { + if (tv[i].should_fail) + goto next_tv; + + fail ("aes-eax, gcry_cipher_gettag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if ((memcmp (tv[i].tag, out, taglen2) != 0) ^ tv[i].should_fail) + fail ("aes-eax, encrypt tag mismatch entry %d\n", i); + + err = gcry_cipher_checktag (hdd, tv[i].tag, taglen2); + if (err) + { + if (tv[i].should_fail) + goto next_tv; + + fail ("aes-eax, gcry_cipher_checktag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_reset(hde); + if (!err) + err = gcry_cipher_reset(hdd); + if (err) + { + fail ("aes-eax, gcry_cipher_reset (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* gcry_cipher_reset clears the IV */ + err = gcry_cipher_setiv (hde, tv[i].nonce, tv[i].noncelen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].nonce, tv[i].noncelen); + if (err) + { + fail ("aes-eax, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* this time we authenticate, encrypt and decrypt one byte at a time */ + for (byteNum = 0; byteNum < tv[i].headerlen; ++byteNum) + { + err = gcry_cipher_authenticate(hde, tv[i].header + byteNum, 1); + if (err) + { + fail ("aes-eax, gcry_cipher_authenticate (%d) (byte-buf) failed: " + "%s\n", i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + err = gcry_cipher_authenticate(hdd, tv[i].header + byteNum, 1); + if (err) + { + fail ("aes-eax, de gcry_cipher_authenticate (%d) (byte-buf) " + "failed: %s\n", i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_encrypt (hde, out+byteNum, 1, + (tv[i].plaintext) + byteNum, + 1); + if (err) + { + fail ("aes-eax, gcry_cipher_encrypt (%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("aes-eax, encrypt mismatch entry %d, (byte-buf)\n", i); + + /* Test output to larger than 16-byte buffer. */ + taglen2 = tv[i].taglen ? tv[i].taglen : 16 + 1; + + err = gcry_cipher_gettag (hde, tag, taglen2); + if (err) + { + if (tv[i].should_fail) + goto next_tv; + + fail ("aes-eax, gcry_cipher_gettag(%d, %lu) (byte-buf) failed: %s\n", + i, (unsigned long) taglen2, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + taglen2 = tv[i].taglen ? 
tv[i].taglen : 16; + + if ((memcmp (tv[i].tag, tag, taglen2) != 0) ^ tv[i].should_fail) + fail ("aes-eax, encrypt tag mismatch entry %d, (byte-buf)\n", i); + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_decrypt (hdd, out+byteNum, 1, NULL, 0); + if (err) + { + fail ("aes-eax, gcry_cipher_decrypt (%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("aes-eax, decrypt mismatch entry %d\n", i); + + err = gcry_cipher_checktag (hdd, tv[i].tag, taglen2); + if (err) + { + if (tv[i].should_fail) + goto next_tv; + + fail ("aes-eax, gcry_cipher_checktag(%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_checktag (hdd, tag, 17); + if (!err) + { + fail ("aes-eax, gcry_cipher_checktag(%d) did not fail for invalid " + " tag length of '%d'\n", i, 17); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (tv[i].should_fail) + { + fail ("aes-eax, negative test succeeded %d\n", i); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + next_tv: + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + } + if (verbose) + fprintf (stderr, " Completed EAX checks.\n"); +} + + +static void +check_eax_cipher (void) +{ + /* Large buffers, no splitting. */ + _check_eax_cipher(0xffffffff); + /* Split input to one byte buffers. */ + _check_eax_cipher(1); + /* Split input to 7 byte buffers. */ + _check_eax_cipher(7); + /* Split input to 16 byte buffers. */ + _check_eax_cipher(16); +} + + +static void _check_poly1305_cipher (unsigned int step) { - struct tv + static const struct tv { int algo; const char *key; @@ -5813,6 +6346,7 @@ get_algo_mode_blklen (int algo, int mode) case GCRY_CIPHER_MODE_CTR: case GCRY_CIPHER_MODE_CCM: case GCRY_CIPHER_MODE_GCM: + case GCRY_CIPHER_MODE_EAX: case GCRY_CIPHER_MODE_POLY1305: return 1; } @@ -5894,7 +6428,7 @@ check_one_cipher_core (int algo, int mode, int flags, if ((mode == GCRY_CIPHER_MODE_CBC && (flags & GCRY_CIPHER_CBC_CTS)) || mode == GCRY_CIPHER_MODE_XTS) { - /* Input cannot be split in to multiple operations with CTS . */ + /* Input cannot be split in to multiple operations with CTS. 
*/ blklen = nplain; } @@ -6281,6 +6815,7 @@ check_ciphers (void) check_one_cipher (algos[i], GCRY_CIPHER_MODE_CBC, 0); check_one_cipher (algos[i], GCRY_CIPHER_MODE_CBC, GCRY_CIPHER_CBC_CTS); check_one_cipher (algos[i], GCRY_CIPHER_MODE_CTR, 0); + check_one_cipher (algos[i], GCRY_CIPHER_MODE_EAX, 0); if (gcry_cipher_get_algo_blklen (algos[i]) == GCRY_CCM_BLOCK_LEN) check_one_cipher (algos[i], GCRY_CIPHER_MODE_CCM, 0); if (gcry_cipher_get_algo_blklen (algos[i]) == GCRY_GCM_BLOCK_LEN) @@ -6333,6 +6868,7 @@ check_cipher_modes(void) check_poly1305_cipher (); check_ocb_cipher (); check_xts_cipher (); + check_eax_cipher (); check_gost28147_cipher (); check_stream_cipher (); check_stream_cipher_large_block (); diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 75e6e43..e34104f 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -1231,6 +1231,53 @@ static struct bench_ops ocb_authenticate_ops = { &bench_ocb_authenticate_do_bench }; +static void +bench_eax_encrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[16] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, + 0x00, 0x00, 0x01, 0x00 }; + bench_aead_encrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_eax_decrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[16] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, + 0x00, 0x00, 0x01, 0x00 }; + bench_aead_decrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_eax_authenticate_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[16] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, + 0x00, 0x00, 0x01, 0x00 }; + bench_aead_authenticate_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static struct bench_ops eax_encrypt_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_eax_encrypt_do_bench +}; + +static struct bench_ops eax_decrypt_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_eax_decrypt_do_bench +}; + +static struct bench_ops eax_authenticate_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_eax_authenticate_do_bench +}; static void bench_poly1305_encrypt_do_bench (struct bench_obj *obj, void *buf, @@ -1291,6 +1338,9 @@ static struct bench_cipher_mode cipher_modes[] = { {GCRY_CIPHER_MODE_CCM, "CCM enc", &ccm_encrypt_ops}, {GCRY_CIPHER_MODE_CCM, "CCM dec", &ccm_decrypt_ops}, {GCRY_CIPHER_MODE_CCM, "CCM auth", &ccm_authenticate_ops}, + {GCRY_CIPHER_MODE_EAX, "EAX enc", &eax_encrypt_ops}, + {GCRY_CIPHER_MODE_EAX, "EAX dec", &eax_decrypt_ops}, + {GCRY_CIPHER_MODE_EAX, "EAX auth", &eax_authenticate_ops}, {GCRY_CIPHER_MODE_GCM, "GCM enc", &gcm_encrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM dec", &gcm_decrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM auth", &gcm_authenticate_ops}, diff --git a/tests/benchmark.c b/tests/benchmark.c index 44a8711..59ea32c 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -779,6 +779,8 @@ cipher_bench ( const char *algoname ) NULL, GCRY_GCM_BLOCK_LEN, GCRY_GCM_BLOCK_LEN }, { GCRY_CIPHER_MODE_OCB, " OCB", 1, NULL, 16, 16, 15 }, + { GCRY_CIPHER_MODE_EAX, " EAX", 0, + NULL, 0, 8, 8 }, { GCRY_CIPHER_MODE_STREAM, "", 0 }, {0} }; commit cd7ed2e3546b12dd98df4211949f1cdbf5827013 Author: Jussi Kivilinna Date: Sun Jan 7 22:19:13 2018 +0200 cipher: constify spec arrays * cipher/cipher.c (cipher_list): Constify array. * cipher/mac.c (mac_list): Constify array. 
* cipher/md.c (digest_list): Constify array. * cipher/pubkey.c (pubkey_list): Constify array. -- Signed-off-by: Jussi Kivilinna diff --git a/cipher/cipher.c b/cipher/cipher.c index 063c13d..18b2591 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -33,7 +33,7 @@ /* This is the list of the default ciphers, which are included in libgcrypt. */ -static gcry_cipher_spec_t *cipher_list[] = +static gcry_cipher_spec_t * const cipher_list[] = { #if USE_BLOWFISH &_gcry_cipher_spec_blowfish, diff --git a/cipher/mac.c b/cipher/mac.c index 46be7b7..4a7a47d 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -29,7 +29,7 @@ /* This is the list of the digest implementations included in libgcrypt. */ -static gcry_mac_spec_t *mac_list[] = { +static gcry_mac_spec_t * const mac_list[] = { #if USE_SHA1 &_gcry_mac_type_spec_hmac_sha1, #endif diff --git a/cipher/md.c b/cipher/md.c index 94f1b5d..efbffe1 100644 --- a/cipher/md.c +++ b/cipher/md.c @@ -31,7 +31,7 @@ /* This is the list of the digest implementations included in libgcrypt. */ -static gcry_md_spec_t *digest_list[] = +static gcry_md_spec_t * const digest_list[] = { #if USE_CRC &_gcry_digest_spec_crc32, diff --git a/cipher/pubkey.c b/cipher/pubkey.c index 8ec15fd..4c07e33 100644 --- a/cipher/pubkey.c +++ b/cipher/pubkey.c @@ -34,7 +34,7 @@ /* This is the list of the public-key algorithms included in Libgcrypt. */ -static gcry_pk_spec_t *pubkey_list[] = +static gcry_pk_spec_t * const pubkey_list[] = { #if USE_ECC &_gcry_pubkey_spec_ecc, ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 2 +- cipher/chacha20-amd64-ssse3.S | 4 +- cipher/cipher-cmac.c | 137 ++++++----- cipher/cipher-eax.c | 248 +++++++++++++++++++ cipher/cipher-internal.h | 71 +++++- cipher/cipher.c | 45 +++- cipher/mac.c | 2 +- cipher/md.c | 2 +- cipher/pubkey.c | 2 +- doc/gcrypt.texi | 31 ++- src/gcrypt.h.in | 3 +- tests/basic.c | 542 +++++++++++++++++++++++++++++++++++++++++- tests/bench-slope.c | 50 ++++ tests/benchmark.c | 2 + 14 files changed, 1053 insertions(+), 88 deletions(-) create mode 100644 cipher/cipher-eax.c hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From Clemens.Lang at bmw.de Mon Jan 29 18:03:42 2018 From: Clemens.Lang at bmw.de (Clemens.Lang at bmw.de) Date: Mon, 29 Jan 2018 17:03:42 +0000 Subject: Possibly incorrect counter overflow handling for AES-GCM Message-ID: Hi, I believe we have found what seems to be a bug in counter overflow handling in AES-GCM in libgcrypt's implementation. This leads to incorrect results when using a non-12-byte IV and decrypting payloads encrypted with other AES-GCM implementations, such as OpenSSL. According to the NIST Special Publication 800-38D "Recommendation for Block Cipher Modes of Operation: Galois/Counter Mode (GCM) and GMAC", section 7.1, algorithm 4, step 3 [NIST38D], the counter increment is defined as inc_32. 
Section 6.2 of the same document defines the incrementing function inc_s for positive integers s as follows:
| the function increments the right-most s bits of the string, regarded
| as the binary representation of an integer, modulo 2^s; the remaining,
| left-most len(X) - s bits remain unchanged
(X is the complete counter value in this case)
This problem does not occur when using a 12-byte IV, because AES-GCM has a special case for the initial counter value with 12-byte IVs:
| If len(IV)=96, then J_0 = IV || 0^31 || 1
i.e., one would have to encrypt (UINT_MAX - 1) * blocksize of data to hit an overflow. However, for non-12-byte IVs, the initial counter value is the output of a hash function, which makes hitting an overflow much more likely.
In practice, we have found that using
iv = 9e 79 18 8c ff 09 56 1e c9 90 99 cc 6d 5d f6 d3
key = 26 56 e5 73 76 03 c6 95 0d 22 07 31 5d 32 5c 6b a5 54 5f 40 23 98 60 f6 f7 06 6f 7a 4f c2 ca 40
will reliably trigger an overflow when encrypting 10 MiB of data. It seems that this is caused by re-using the AES-CTR implementation for incrementing the counter.
I am attaching a small utility to encrypt and decrypt data with both OpenSSL and libgcrypt using AES-256-GCM while allowing to pass in the key and IV. The problem can be demonstrated as follows:
$ openssl dgst -sha1 payload
SHA1(payload)= 33354423e3e0df0c306ba35bb3d7c720deeccd2a
$ ./cryptohelper libgcrypt encrypt payload key.bin iv.bin encrypted-libgcrypt
$ ./cryptohelper openssl decrypt encrypted-libgcrypt key.bin iv.bin decrypted-payload
$ openssl dgst -sha1 decrypted-payload
SHA1(decrypted-payload)= 7c57a358f8d9d63a69055ae32dd1e78ae068ebbe
Can anybody confirm our findings? Should I file a ticket for this?
[NIST38D]: http://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
-- Clemens Lang – Development Specialist BMW Car IT GmbH – Lise-Meitner-Str. 14 – 89081 Ulm – http://bmw-carit.com
------------------------------------------------------------------------- BMW Car IT GmbH Geschäftsführer: Kai-Uwe Balszuweit und Christian Salzmann Sitz und Registergericht: München HRB 134810 -------------------------------------------------------------------------
-------------- next part -------------- A non-text attachment was scrubbed... Name: cryptohelper.cpp Type: application/octet-stream Size: 9244 bytes Desc: cryptohelper.cpp URL:
From gniibe at fsij.org Tue Jan 30 03:43:47 2018 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 30 Jan 2018 11:43:47 +0900 Subject: Possibly incorrect counter overflow handling for AES-GCM In-Reply-To: References: Message-ID: <87r2q8awws.fsf@iwagami.gniibe.org>
Hello, Thanks for your report. I tried to test your example program with 20MB (not MiB, but MB) payload. Given the IV, it works correctly for me. I use libgcrypt version 1.8.1.
Clemens.Lang at bmw.de wrote:
> I believe we have found what seems to be a bug in counter overflow
> handling in AES-GCM in libgcrypt's implementation.
No, it's your example program which must be wrong. libgcrypt does the increment correctly by the function gcm_add32_be128 in cipher/cipher-gcm.c. Well, I admit that the function name is misleading, but it _does_ inc_32, indeed.
I think that your line:
std::size_t ciphertextsize = ((plaintext.size() - 1) | (kBlockSize - 1)) + 1 + kAuthTagSize;
... it doesn't work well for plaintext size not multiple of 16.
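For comparison, here is a minimal sketch of the buffer sizing I would expect for GCM (purely illustrative, not taken from the attached cryptohelper.cpp; the helper name and parameters are made up). GCM produces ciphertext of exactly the plaintext length, and the 16-byte tag is fetched separately with gcry_cipher_gettag, so nothing needs to be rounded up to the block size:

  #include <gcrypt.h>

  /* Illustrative sketch only: encrypt plainlen bytes with AES-256-GCM.
     The ciphertext buffer needs exactly plainlen bytes; the tag is
     returned separately, so no rounding to the block size is done.  */
  static gcry_error_t
  gcm_encrypt_sketch (const void *key, size_t keylen,
                      const void *iv, size_t ivlen,
                      const void *plain, size_t plainlen,
                      unsigned char *cipher,   /* plainlen bytes */
                      unsigned char tag[16])
  {
    gcry_cipher_hd_t hd = NULL;
    gcry_error_t err;

    err = gcry_cipher_open (&hd, GCRY_CIPHER_AES256, GCRY_CIPHER_MODE_GCM, 0);
    if (err)
      return err;
    err = gcry_cipher_setkey (hd, key, keylen);
    if (!err)
      err = gcry_cipher_setiv (hd, iv, ivlen);
    if (!err)
      err = gcry_cipher_encrypt (hd, cipher, plainlen, plain, plainlen);
    if (!err)
      err = gcry_cipher_gettag (hd, tag, 16);
    gcry_cipher_close (hd);
    return err;
  }

With sizes like these, the encrypt call passes exactly matching input and output lengths for any plaintext size.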
Another line:
if ((err = gcry_cipher_encrypt(hd, ciphertext.data(), ciphertext.size() - kAuthTagSize, plaintext.data(), plaintext.size())))
This doesn't work either for plaintext size not multiple of 16.
--
From Clemens.Lang at bmw.de Tue Jan 30 10:26:17 2018 From: Clemens.Lang at bmw.de (Clemens.Lang at bmw.de) Date: Tue, 30 Jan 2018 09:26:17 +0000 Subject: Possibly incorrect counter overflow handling for AES-GCM In-Reply-To: <87r2q8awws.fsf@iwagami.gniibe.org> References: <87r2q8awws.fsf@iwagami.gniibe.org> Message-ID:
Hi,
> On 30. Jan 2018, at 03:43, NIIBE Yutaka wrote:
> > Thanks for your report. I tried to test your example program with 20MB > (not MiB, but MB) payload. Given the IV, it works correctly for me. I > use libgcrypt version 1.8.1.
Note that you will also have to use the same key K to trigger the behavior. This is because the initial counter value J_0 is calculated from the given IV using the GHASH function, which uses the hash subkey H = CIPH_K(0^128).
Just for the record, I tested this with 1.8.2 and 1.7.6.
> No, it's your example program which must be wrong. > > libgcrypt does the increment correctly by the function gcm_add32_be128 > in cipher/cipher-gcm.c. Well, I admit that the function name is > misleading, but it _does_ inc_32, indeed.
gcm_add32_be128 is only used for the first counter increment in _gcry_cipher_gcm_initiv, which is called from _gcry_cipher_gcm_setiv. The counter increase while encrypting is delegated from _gcry_cipher_gcm_encrypt to _gcry_cipher_ctr_encrypt, which is implemented in cipher/cipher-ctr.c. Ignoring the bulk methods for now, the code used to increase the counter in that function is
for (i = blocksize; i > 0; i--) { c->u_ctr.ctr[i-1]++; if (c->u_ctr.ctr[i-1] != 0) break; }
where blocksize is the length of the entire counter, so it will not do inc_32 here. The bulk method for AES256-GCM would be _gcry_aes_ctr_enc from cipher/rijndael.c which uses counter increment code equivalent to the block cited above (see also do_aesni_ctr in cipher/rijndael-aesni.c).
> I think that your line: > > std::size_t ciphertextsize = ((plaintext.size() - 1) | (kBlockSize - 1)) + 1 + kAuthTagSize; > > ... it doesn't work well for plaintext size not multiple of 16.
You're right, that's a bug in my reproducer. I've checked our original implementation – it does not have this issue. Regardless of that, we see the problem with plaintext sizes that are multiples of 16, where this code should have worked.
Clemens
-- Clemens Lang – Development Specialist BMW Car IT GmbH – Lise-Meitner-Str. 14 – 89081 Ulm – http://bmw-carit.com
------------------------------------------------------------------------- BMW Car IT GmbH Geschäftsführer: Kai-Uwe Balszuweit und Christian Salzmann Sitz und Registergericht: München HRB 134810 -------------------------------------------------------------------------
From gniibe at fsij.org Tue Jan 30 12:21:04 2018 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 30 Jan 2018 20:21:04 +0900 Subject: Possibly incorrect counter overflow handling for AES-GCM In-Reply-To: References: <87r2q8awws.fsf@iwagami.gniibe.org> Message-ID: <87o9lb384f.fsf@fsij.org>
Clemens.Lang at bmw.de wrote: > Note that you will also have to use the same key K to trigger the behavior. This is because the initial counter value J_0 is calculated from the given IV using the GHASH function, which uses the hash subkey H = CIPH_K(0^128). > > Just for the record, I tested this with 1.8.2 and 1.7.6.
Ah, I see.
I created a ticket: https://dev.gnupg.org/T3764 I was naively read your previous message which addressed section 7.1, algorithm 4, step 3 of the document. It is actually section 6.5, algorithm 3, step 5, which matters. I'll create a test case in libgcrypt/tests/ and fix. Since I'm going to travel, it will be next week. -- From jussi.kivilinna at iki.fi Tue Jan 30 21:06:54 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 30 Jan 2018 22:06:54 +0200 Subject: Possibly incorrect counter overflow handling for AES-GCM In-Reply-To: References: Message-ID: Hello, On 29.01.2018 19:03, Clemens.Lang at bmw.de wrote: > Hi, > > I believe we have found what seems to be a bug in counter overflow > handling in AES-GCM in libgcrypt's implementation. This leads to > incorrect results when using a non-12-byte IV and decrypting payloads > encrypted with other AES-GCM implementations, such as OpenSSL. > > According to the NIST Special Publication 800-38D "Recommendation for > Block Cipher Modes of Operation: Galois/Counter Mode (GCM) and GMAC", > section 7.1, algorithm 4, step 3 [NIST38D], the counter increment is > defined as inc_32. Section 6.2 of the same document defines the > incrementing function inc_s for positive integers s as follows: > > | the function increments the right-most s bits of the string, regarded > | as the binary representation of an integer, modulo 2^s; the remaining, > | left-most len(X) - s bits remain unchanged > > (X is the complete counter value in this case) > > This problem does not occur when using a 12-byte IV, because AES-GCM has > a special case for the inital counter value with 12-byte IVs: > > | If len(IV)=96, then J_0 = IV || 0^31 || 1 > > i.e., one would have to encrypt (UINT_MAX - 1) * blocksize of data to > hit an overflow. However, for non-12-byte IVs, the initial counter value > is the output of a hash function, which makes hitting an overflow much > more likely. You're right, counter overflow is handled wrong and this can be seen with non-12-byte IVs. Input plaintext length is limited to 2^32-2 blocks, so overflow does not happen with 12-byte IV. > > In practice, we have found that using > > iv = 9e 79 18 8c ff 09 56 1e c9 90 99 cc 6d 5d f6 d3 > key = 26 56 e5 73 76 03 c6 95 0d 22 07 31 5d 32 5c 6b a5 54 5f 40 23 98 60 f6 f7 06 6f 7a 4f c2 ca 40 > > will reliably trigger an overflow when encrypting 10 MiB of data. It > seems that this is caused by re-using the AES-CTR implementation for > incrementing the counter. Following key/iv gives overflow after 16 bytes of input and encryption goes wrong after that. AES256, ctr_low: ffffffff key: 0000000000000000000000000000000000000000000000000000000000000000 iv: 00000000000000000000000086dd40e7 -Jussi From jussi.kivilinna at iki.fi Tue Jan 30 23:12:26 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 31 Jan 2018 00:12:26 +0200 Subject: Possibly incorrect counter overflow handling for AES-GCM In-Reply-To: <87o9lb384f.fsf@fsij.org> References: <87r2q8awws.fsf@iwagami.gniibe.org> <87o9lb384f.fsf@fsij.org> Message-ID: <761a1106-692b-9ac6-c39b-7466d508887a@iki.fi> Hello, On 30.01.2018 13:21, NIIBE Yutaka wrote: > Clemens.Lang at bmw.de wrote: >> Note that you will also have to use the same key K to trigger the behavior. This is because the initial counter value J_0 is calculated from the given IV using the GHASH function, which uses the hash subkey H = CIPH_K(0^128). >> >> Just for the record, I tested this with 1.8.2 and 1.7.6. > > Ah, I see. 
I created a ticket: > > https://dev.gnupg.org/T3764 > > I was naively read your previous message which addressed section 7.1, > algorithm 4, step 3 of the document. It is actually section 6.5, > algorithm 3, step 5, which matters. > > I'll create a test case in libgcrypt/tests/ and fix. > > Since I'm going to travel, it will be next week. > I can do the fix for this one, if that's ok. -Jussi From gniibe at fsij.org Tue Jan 30 23:42:53 2018 From: gniibe at fsij.org (NIIBE Yutaka) Date: Wed, 31 Jan 2018 07:42:53 +0900 Subject: Possibly incorrect counter overflow handling for AES-GCM In-Reply-To: <761a1106-692b-9ac6-c39b-7466d508887a@iki.fi> References: <87r2q8awws.fsf@iwagami.gniibe.org> <87o9lb384f.fsf@fsij.org> <761a1106-692b-9ac6-c39b-7466d508887a@iki.fi> Message-ID: <874ln3rms2.fsf@fsij.org> Jussi Kivilinna wrote: > I can do the fix for this one, if that's ok. Yes, that's good. Please go ahead. -- From jussi.kivilinna at iki.fi Wed Jan 31 19:42:36 2018 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 31 Jan 2018 20:42:36 +0200 Subject: [PATCH] Fix incorrect counter overflow handling for GCM In-Reply-To: References: Message-ID: <151742415635.22208.9926136198558789073.stgit@localhost.localdomain> * cipher/cipher-gcm.c (gcm_ctr_encrypt): New function to handle 32-bit CTR increment for GCM. (_gcry_cipher_gcm_encrypt, _gcry_cipher_gcm_decrypt): Do not use generic CTR implementation directly, use gcm_ctr_encrypt instead. * tests/basic.c (_check_gcm_cipher): Add test-vectors for 32-bit CTR overflow. (check_gcm_cipher): Add 'split input to 15 bytes and 17 bytes' test-runs. -- Reported-by: Clemens Lang > I believe we have found what seems to be a bug in counter overflow > handling in AES-GCM in libgcrypt's implementation. This leads to > incorrect results when using a non-12-byte IV and decrypting payloads > encrypted with other AES-GCM implementations, such as OpenSSL. > > According to the NIST Special Publication 800-38D "Recommendation for > Block Cipher Modes of Operation: Galois/Counter Mode (GCM) and GMAC", > section 7.1, algorithm 4, step 3 [NIST38D], the counter increment is > defined as inc_32. Section 6.2 of the same document defines the > incrementing function inc_s for positive integers s as follows: > > | the function increments the right-most s bits of the string, regarded > | as the binary representation of an integer, modulo 2^s; the remaining, > | left-most len(X) - s bits remain unchanged > > (X is the complete counter value in this case) > > This problem does not occur when using a 12-byte IV, because AES-GCM has > a special case for the inital counter value with 12-byte IVs: > > | If len(IV)=96, then J_0 = IV || 0^31 || 1 > > i.e., one would have to encrypt (UINT_MAX - 1) * blocksize of data to > hit an overflow. However, for non-12-byte IVs, the initial counter value > is the output of a hash function, which makes hitting an overflow much > more likely. > > In practice, we have found that using > > iv = 9e 79 18 8c ff 09 56 1e c9 90 99 cc 6d 5d f6 d3 > key = 26 56 e5 73 76 03 c6 95 0d 22 07 31 5d 32 5c 6b a5 54 5f 40 23 98 60 f6 f7 06 6f 7a 4f c2 ca 40 > > will reliably trigger an overflow when encrypting 10 MiB of data. It > seems that this is caused by re-using the AES-CTR implementation for > incrementing the counter. Bug was introduced by commit bd4bd23a2511a4bce63c3217cca0d4ecf0c79532 "GCM: Use counter mode code for speed-up". 
GnuPG-bug-id: 3764 Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm.c | 77 +++++- tests/basic.c | 689 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 759 insertions(+), 7 deletions(-) diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 2b8b454b2..6169d1427 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -1,6 +1,6 @@ /* cipher-gcm.c - Generic Galois Counter Mode implementation * Copyright (C) 2013 Dmitry Eremin-Solenikov - * Copyright (C) 2013 Jussi Kivilinna + * Copyright (C) 2013, 2018 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -556,6 +556,77 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, } +static gcry_err_code_t +gcm_ctr_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, + const byte *inbuf, size_t inbuflen) +{ + gcry_err_code_t err = 0; + + while (inbuflen) + { + u32 nblocks_to_overflow; + u32 num_ctr_increments; + u32 curr_ctr_low; + size_t currlen = inbuflen; + byte ctr_copy[GCRY_GCM_BLOCK_LEN]; + int fix_ctr = 0; + + /* GCM CTR increments only least significant 32-bits, without carry + * to upper 96-bits of counter. Using generic CTR implementation + * directly would carry 32-bit overflow to upper 96-bit. Detect + * if input length is long enough to cause overflow, and limit + * input length so that CTR overflow happen but updated CTR value is + * not used to encrypt further input. After overflow, upper 96 bits + * of CTR are restored to cancel out modification done by generic CTR + * encryption. */ + + if (inbuflen > c->unused) + { + curr_ctr_low = gcm_add32_be128 (c->u_ctr.ctr, 0); + + /* Number of CTR increments this inbuflen would cause. */ + num_ctr_increments = (inbuflen - c->unused) / GCRY_GCM_BLOCK_LEN + + !!((inbuflen - c->unused) % GCRY_GCM_BLOCK_LEN); + + if ((u32)(num_ctr_increments + curr_ctr_low) < curr_ctr_low) + { + nblocks_to_overflow = 0xffffffffU - curr_ctr_low + 1; + currlen = nblocks_to_overflow * GCRY_GCM_BLOCK_LEN + c->unused; + if (currlen > inbuflen) + { + currlen = inbuflen; + } + + fix_ctr = 1; + buf_cpy(ctr_copy, c->u_ctr.ctr, GCRY_GCM_BLOCK_LEN); + } + } + + err = _gcry_cipher_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen); + if (err != 0) + return err; + + if (fix_ctr) + { + /* Lower 32-bits of CTR should now be zero. */ + gcry_assert(gcm_add32_be128 (c->u_ctr.ctr, 0) == 0); + + /* Restore upper part of CTR. 
*/ + buf_cpy(c->u_ctr.ctr, ctr_copy, GCRY_GCM_BLOCK_LEN - sizeof(u32)); + + wipememory(ctr_copy, sizeof(ctr_copy)); + } + + inbuflen -= currlen; + inbuf += currlen; + outbuflen -= currlen; + outbuf += currlen; + } + + return err; +} + + gcry_err_code_t _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, @@ -595,7 +666,7 @@ _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - err = _gcry_cipher_ctr_encrypt(c, outbuf, outbuflen, inbuf, inbuflen); + err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, inbuflen); if (err != 0) return err; @@ -642,7 +713,7 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c, do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, inbuf, inbuflen, 0); - return _gcry_cipher_ctr_encrypt(c, outbuf, outbuflen, inbuf, inbuflen); + return gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, inbuflen); } diff --git a/tests/basic.c b/tests/basic.c index c883eb39f..42ee819e7 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -1347,6 +1347,7 @@ check_ofb_cipher (void) static void _check_gcm_cipher (unsigned int step) { +#define MAX_GCM_DATA_LEN (256 + 32) static const struct tv { int algo; @@ -1355,9 +1356,9 @@ _check_gcm_cipher (unsigned int step) int ivlen; unsigned char aad[MAX_DATA_LEN]; int aadlen; - unsigned char plaintext[MAX_DATA_LEN]; + unsigned char plaintext[MAX_GCM_DATA_LEN]; int inlen; - char out[MAX_DATA_LEN]; + char out[MAX_GCM_DATA_LEN]; char tag[MAX_DATA_LEN]; int taglen; int should_fail; @@ -1551,11 +1552,687 @@ _check_gcm_cipher (unsigned int step) "\xee\xb2\xb2\x2a\xaf\xde\x64\x19\xa0\x58\xab\x4f\x6f\x74\x6b\xf4" "\x0f\xc0\xc3\xb7\x80\xf2\x44\x45\x2d\xa3\xeb\xf1\xc5\xd8\x2c\xde" "\xa2\x41\x89\x97\x20\x0e\xf8\x2e\x44\xae\x7e\x3f", - "\xa4\x4a\x82\x66\xee\x1c\x8e\xb0\xc8\xb5\xd4\xcf\x5a\xe9\xf1\x9a" } + "\xa4\x4a\x82\x66\xee\x1c\x8e\xb0\xc8\xb5\xd4\xcf\x5a\xe9\xf1\x9a" }, + /* Test vectors for overflowing CTR. 
*/ + /* After setiv, ctr_low: 0xffffffff */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x86\xdd\x40\xe7", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x7d\x6e\x38\xfd\xd0\x04\x9d\x28\xdf\x4a\x10\x3f\xa3\x9e\xf8\xf8" + "\x6c\x2c\x10\xa7\x91\xab\xc0\x86\xd4\x6d\x69\xea\x58\xc4\xf9\xc0" + "\xd4\xee\xc2\xb0\x9d\x36\xae\xe7\xc9\xa9\x1f\x71\xa8\xee\xa2\x1d" + "\x20\xfd\x79\xc7\xd9\xc4\x90\x51\x38\x97\xb6\x9f\x55\xea\xf3\xf0" + "\x78\xb4\xd3\x8c\xa9\x9b\x32\x7d\x19\x36\x96\xbc\x8e\xab\x80\x9f" + "\x61\x56\xcc\xbd\x3a\x80\xc6\x69\x37\x0a\x89\x89\x21\x82\xb7\x79" + "\x6d\xe9\xb4\x34\xc4\x31\xe0\xbe\x71\xad\xf3\x50\x05\xb2\x61\xab" + "\xb3\x1a\x80\x57\xcf\xe1\x11\x26\xcb\xa9\xd1\xf6\x58\x46\xf1\x69" + "\xa2\xb8\x42\x3c\xe8\x28\x13\xca\x58\xd9\x28\x99\xf8\xc8\x17\x32" + "\x4a\xf9\xb3\x4c\x7a\x47\xad\xe4\x77\x64\xec\x70\xa1\x01\x0b\x88" + "\xe7\x30\x0b\xbd\x66\x25\x39\x1e\x51\x67\xee\xec\xdf\xb8\x24\x5d" + "\x7f\xcb\xee\x7a\x4e\xa9\x93\xf0\xa1\x84\x7b\xfe\x5a\xe3\x86\xb2" + "\xfb\xcd\x39\xe7\x1e\x5e\x48\x65\x4b\x50\x2b\x4a\x99\x46\x3f\x6f" + "\xdb\xd9\x97\xdb\xe5\x6d\xa4\xdd\x6c\x18\x64\x5e\xae\x7e\x2c\xd3" + "\xb4\xf3\x57\x5c\xb5\xf8\x7f\xe5\x87\xb5\x35\xdb\x80\x38\x6e\x2c" + "\x5c\xdd\xeb\x7c\x63\xac\xe4\xb5\x5a\x6a\x40\x6d\x72\x69\x9a\xa9" + "\x8f\x5e\x93\x91\x4d\xce\xeb\x87\xf5\x25\xed\x75\x6b\x3b\x1a\xf2" + "\x0c\xd2\xa4\x10\x45\xd2\x87\xae\x29\x6d\xeb\xea\x66\x5f\xa0\xc2", + "\x8c\x22\xe3\xda\x9d\x94\x8a\xbe\x8a\xbc\x55\x2c\x94\x63\x44\x40" }, + /* After setiv, ctr_low: 0xfffffffe */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x8d\xd1\xc1\xdf", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\xac\x6a\x10\x3f\xe2\x8d\xed\x27\x55\x14\xca\x1f\x03\x67\x0a\xa8" + "\xa1\x07\xbf\x00\x73\x5b\x64\xef\xac\x30\x83\x81\x48\x4c\xaa\xd5" + "\xff\xca\xef\x2f\x77\xbe\xfe\x1b\x20\x5c\x86\x19\xc7\xf9\x11\x99" + "\x27\xc5\x57\xa7\x0a\xc2\xa8\x05\xd9\x07\x2b\xb9\x38\xa4\xef\x58" + "\x92\x74\xcf\x89\xc7\xba\xfc\xb9\x70\xac\x86\xe2\x31\xba\x7c\xf9" + "\xc4\xe2\xe0\x4c\x1b\xe4\x3f\x75\x83\x5c\x40\x0e\xa4\x13\x8b\x04" + "\x60\x78\x57\x29\xbb\xe6\x61\x93\xe3\x16\xf9\x58\x07\x75\xd0\x96" + "\xfb\x8f\x6d\x1e\x49\x0f\xd5\x31\x9e\xee\x31\xe6\x0a\x85\x93\x49" + "\x22\xcf\xd6\x1b\x40\x44\x63\x9c\x95\xaf\xf0\x44\x23\x51\x37\x92" + "\x0d\xa0\x22\x37\xb9\x6d\x13\xf9\x78\xba\x27\x27\xed\x08\x7e\x35" + "\xe4\xe2\x28\xeb\x0e\xbe\x3d\xce\x89\x93\x35\x84\x0f\xa0\xf9\x8d" + "\x94\xe9\x5a\xec\xd4\x0d\x1f\x5c\xbe\x6f\x8e\x6a\x4d\x10\x65\xbb" + "\xc7\x0b\xa0\xd5\x5c\x20\x80\x0b\x4a\x43\xa6\xe1\xb0\xe0\x56\x6a" + "\xde\x90\xe0\x6a\x45\xe7\xc2\xd2\x69\x9b\xc6\x62\x11\xe3\x2b\xa5" + "\x45\x98\xb0\x80\xd3\x57\x4d\x1f\x09\x83\x58\xd4\x4d\xa6\xc5\x95" + "\x87\x59\xb0\x58\x6c\x81\x49\xc5\x95\x18\x23\x1b\x6f\x10\x86\xa2" + "\xd9\x56\x19\x30\xec\xd3\x4a\x4b\xe8\x1c\x11\x37\xfb\x31\x60\x4d" + "\x4f\x9b\xc4\x95\xba\xda\x49\x43\x6c\xc7\x3d\x5b\x13\xf9\x91\xf8", + "\xcd\x2b\x83\xd5\x5b\x5a\x8e\x0b\x2e\x77\x0d\x97\xbf\xf7\xaa\xab" }, + /* After setiv, ctr_low: 0xfffffffd */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x76\x8c\x18\x92", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x3d\x6f\x4e\xf6\xd2\x6f\x4e\xce\xa6\xb4\x4a\x9e\xcb\x57\x13\x90" + "\x51\x3b\xf6\xb2\x40\x55\x0c\x2c\xa2\x85\x44\x72\xf2\x90\xaf\x6b" + "\x86\x8c\x75\x2a\x9c\xd6\x52\x50\xee\xc6\x5f\x59\xbc\x8d\x18\xd7" + "\x87\xa5\x7f\xa0\x13\xd1\x5d\x54\x77\x30\xe2\x5d\x1b\x4f\x87\x9f" + "\x3a\x41\xcb\x6a\xdf\x44\x4f\xa2\x1a\xbc\xfb\x4b\x16\x67\xed\x59" + "\x65\xf0\x77\x48\xca\xfd\xf0\xb6\x90\x65\xca\x23\x09\xca\x83\x43" + "\x8f\xf0\x78\xb4\x5f\x96\x2a\xfd\x29\xae\xda\x62\x85\xc5\x87\x4b" + "\x2a\x3f\xba\xbe\x15\x5e\xb0\x4e\x8e\xe7\x66\xae\xb4\x80\x66\x90" + "\x10\x9d\x81\xb9\x64\xd3\x36\x00\xb2\x95\xa8\x7d\xaf\x54\xf8\xbd" + "\x8f\x7a\xb1\xa1\xde\x09\x0d\x10\xc8\x8e\x1e\x18\x2c\x1e\x73\x71" + "\x2f\x1e\xfd\x16\x6e\xbe\xe1\x3e\xe5\xb4\xb5\xbf\x03\x63\xf4\x5a" + "\x0d\xeb\xff\xe0\x61\x80\x67\x51\xb4\xa3\x1f\x18\xa5\xa9\xf1\x9a" + "\xeb\x2a\x7f\x56\xb6\x01\x88\x82\x78\xdb\xec\xb7\x92\xfd\xef\x56" + "\x55\xd3\x72\x35\xcd\xa4\x0d\x19\x6a\xb6\x79\x91\xd5\xcb\x0e\x3b" + "\xfb\xea\xa3\x55\x9f\x77\xfb\x75\xc2\x3e\x09\x02\x73\x7a\xff\x0e" + "\xa5\xf0\x83\x11\xeb\xe7\xff\x3b\xd0\xfd\x7a\x07\x53\x63\x43\x89" + "\xf5\x7b\xc4\x7d\x3b\x2c\x9b\xca\x1c\xf6\xb2\xab\x13\xf5\xc4\x2a" + "\xbf\x46\x77\x3b\x09\xdd\xd1\x80\xef\x55\x11\x3e\xd8\xe4\x42\x22", + "\xa3\x86\xa1\x5f\xe3\x4f\x3b\xed\x12\x23\xeb\x5c\xb8\x0c\xad\x4a" }, + /* After setiv, ctr_low: 0xfffffffc */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x9b\xc8\xc3\xaf", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x33\x5f\xdc\x8d\x5d\x77\x7b\x78\xc1\x5b\x7b\xb3\xd9\x08\x9a\x0c" + "\xce\x63\x4e\xef\x19\xf8\x8c\x7a\xcb\x31\x39\x93\x69\x7a\x2c\x97" + "\x3a\xb4\x52\x45\x9e\x7b\x78\xbc\xa9\xad\x54\x7f\x88\xa6\xae\xd5" + "\xc0\x8b\x7a\xe4\x23\x6b\xb2\x29\x98\xea\x25\x7a\xae\x11\x0c\xc9" + "\xf3\x77\xa1\x74\x82\xde\x0c\xec\x68\xce\x94\xfd\xb0\xa0\xc5\x32" + "\xd6\xbb\xc3\xe7\xed\x3c\x6f\x0b\x53\x9d\xf3\xc8\xeb\x4e\xee\x99" + "\x19\xc7\x16\xd1\xa5\x59\x1d\xa9\xd3\xe6\x43\x52\x74\x61\x28\xe6" + 
"\xac\xd8\x47\x63\xc2\xb7\x53\x39\xc1\x9a\xb0\xa3\xa4\x26\x14\xd0" + "\x88\xa9\x8c\xc5\x6d\xe9\x21\x7c\xb9\xa5\xab\x67\xe3\x8d\xe9\x1d" + "\xe3\x1c\x7b\xcd\xa4\x12\x0c\xd7\xa6\x5d\x41\xcf\xdd\x3d\xfc\xbc" + "\x2a\xbb\xa2\x7a\x9c\x4b\x3a\x42\x6c\x98\x1d\x50\x99\x9c\xfb\xda" + "\x21\x09\x2a\x31\xff\x05\xeb\xa5\xf1\xba\x65\x78\xbe\x15\x8e\x84" + "\x35\xdd\x45\x29\xcc\xcd\x32\x2d\x27\xe9\xa8\x94\x4b\x16\x16\xcc" + "\xab\xf2\xec\xfb\xa0\xb5\x9d\x39\x81\x3e\xec\x5e\x3d\x13\xd1\x83" + "\x04\x79\x2d\xbb\x2c\x76\x76\x93\x28\x77\x27\x13\xdd\x1d\x3e\x89" + "\x3e\x37\x46\x4c\xb8\x34\xbe\xbf\x9f\x4f\x9f\x37\xff\x0c\xe6\x14" + "\x14\x66\x52\x41\x18\xa9\x39\x2b\x0c\xe5\x44\x04\xb0\x93\x06\x64" + "\x67\xf7\xa0\x19\xa7\x61\xcf\x03\x7b\xcb\xc8\xb3\x88\x28\xe4\xe7", + "\xe6\xe8\x0a\xe3\x72\xfc\xe0\x07\x69\x09\xf2\xeb\xbc\xc8\x6a\xf0" }, + /* After setiv, ctr_low: 0xfffffffb */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x60\x95\x1a\xe2", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\xd8\x32\x5a\xe3\x55\x8e\xb3\xc2\x51\x84\x2b\x09\x01\x5e\x6c\xfb" + "\x4a\xc4\x88\xa0\x33\xe7\x3e\xbf\xe5\x7c\xd2\x00\x4c\x1a\x85\x32" + "\x34\xec\x38\x9d\x18\x5f\xf1\x50\x61\x82\xee\xf3\x84\x5a\x84\x4e" + "\xeb\x29\x08\x4c\x7b\xb5\x27\xec\x7d\x79\x77\xd7\xa1\x68\x91\x32" + "\x2d\xf3\x38\xa9\xd6\x27\x16\xfb\x7d\x8b\x09\x5e\xcf\x1b\x74\x6d" + "\xcf\x51\x91\x91\xa1\xe7\x40\x19\x43\x7b\x0d\xa5\xa9\xa5\xf4\x2e" + "\x7f\x1c\xc7\xba\xa2\xea\x00\xdd\x24\x01\xa8\x66\x1e\x88\xf1\xf6" + "\x0c\x9a\xd6\x2b\xda\x3f\x3e\xb2\x98\xea\x89\xc7\xc6\x63\x27\xb7" + "\x6a\x48\x9a\xee\x1e\x70\xa0\xc8\xec\x3d\xc3\x3e\xb5\xf0\xc2\xb1" + "\xb9\x71\x1a\x69\x9d\xdd\x72\x1e\xfe\x72\xa0\x21\xb8\x9f\x18\x96" + "\x26\xcf\x89\x2e\x92\xf1\x02\x65\xa5\xb4\x2e\xb7\x4e\x12\xbd\xa0" + "\x48\xbe\xf6\x5c\xef\x7e\xf3\x0a\xcf\x9d\x1f\x1e\x14\x70\x3e\xa0" + "\x01\x0f\x14\xbf\x38\x10\x3a\x3f\x3f\xc2\x76\xe0\xb0\xe0\x7c\xc6" + "\x77\x6d\x7f\x69\x8e\xa0\x4b\x00\xc3\x9d\xf9\x0b\x7f\x8a\x8e\xd3" + "\x17\x58\x40\xfe\xaf\xf4\x16\x3a\x65\xff\xce\x85\xbb\x80\xfa\xb8" + "\x34\xc9\xef\x3a\xdd\x04\x46\xca\x8f\x70\x48\xbc\x1c\x71\x4d\x6a" + "\x17\x30\x32\x87\x2e\x2e\x54\x9e\x3f\x15\xed\x17\xd7\xa1\xcf\x6c" + 
"\x5d\x0f\x3c\xee\xf5\x96\xf1\x8f\x68\x1c\xbc\x27\xdc\x10\x3c\x3c", + "\x8c\x31\x06\xbb\xf8\x18\x2d\x9d\xd1\x0d\x03\x56\x2b\x28\x25\x9b" }, + /* After setiv, ctr_low: 0xfffffffa */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x6b\x99\x9b\xda", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x7a\x74\x57\xe7\xc1\xb8\x7e\xcf\x91\x98\xf4\x1a\xa4\xdb\x4d\x2c" + "\x6e\xdc\x05\x0b\xd1\x16\xdf\x25\xa8\x1e\x42\xa6\xf9\x09\x36\xfb" + "\x02\x8a\x10\x7d\xa1\x07\x88\x40\xb7\x41\xfd\x64\xf6\xe3\x92\x20" + "\xfd\xc9\xde\xbd\x88\x46\xd3\x1f\x20\x14\x73\x86\x09\xb6\x68\x61" + "\x64\x90\xda\x24\xa8\x0f\x6a\x10\xc5\x01\xbf\x52\x8a\xee\x23\x44" + "\xd5\xb0\xd8\x68\x5e\x77\xc3\x62\xed\xcb\x3c\x1b\x0c\x1f\x13\x92" + "\x2c\x74\x6d\xee\x40\x1b\x6b\xfe\xbe\x3c\xb8\x02\xdd\x24\x9d\xd3" + "\x3d\x4e\xd3\x9b\x18\xfd\xd6\x8f\x95\xef\xa3\xbf\xa9\x2f\x33\xa8" + "\xc2\x37\x69\x58\x92\x42\x3a\x30\x46\x12\x1b\x2c\x04\xf0\xbf\xa9" + "\x79\x55\xcd\xac\x45\x36\x79\xc0\xb4\xb2\x5f\x82\x88\x49\xe8\xa3" + "\xbf\x33\x41\x7a\xcb\xc4\x11\x0e\xcc\x61\xed\xd1\x6b\x59\x5f\x9d" + "\x20\x6f\x85\x01\xd0\x16\x2a\x51\x1b\x79\x35\x42\x5e\x49\xdf\x6f" + "\x64\x68\x31\xac\x49\x34\xfb\x2b\xbd\xb1\xd9\x12\x4e\x4b\x16\xc5" + "\xa6\xfe\x15\xd3\xaf\xac\x51\x08\x95\x1f\x8c\xd2\x52\x37\x8b\x88" + "\xf3\x20\xe2\xf7\x09\x55\x82\x83\x1c\x38\x5f\x17\xfc\x37\x26\x21" + "\xb8\xf1\xfe\xa9\xac\x54\x1e\x53\x83\x53\x3f\x43\xe4\x67\x22\xd5" + "\x86\xec\xf2\xb6\x4a\x8b\x8a\x66\xea\xe0\x92\x50\x3b\x51\xe4\x00" + "\x25\x2a\x7a\x64\x14\xd6\x09\xe1\x6c\x75\x32\x28\x53\x5e\xb3\xab", + "\x5d\x4b\xb2\x8f\xfe\xa5\x7f\x01\x6d\x78\x6c\x13\x58\x08\xe4\x94" }, + /* After setiv, ctr_low: 0xfffffff9 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x90\xc4\x42\x97", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\xf5\xc1\xed\xb8\x7f\x55\x7b\xb5\x47\xed\xaa\x42\xd2\xda\x33\x41" + "\x4a\xe0\x36\x6d\x51\x28\x40\x9c\x35\xfb\x11\x65\x18\x83\x9c\xb5" + "\x02\xb2\xa7\xe5\x52\x27\xa4\xe8\x57\x3d\xb3\xf5\xea\xcb\x21\x07" + "\x67\xbe\xbe\x0f\xf6\xaa\x32\xa1\x4b\x5e\x79\x4f\x50\x67\xcd\x80" + "\xfc\xf1\x65\xf2\x6c\xd0\xdb\x17\xcc\xf9\x52\x93\xfd\x5e\xa6\xb9" + "\x5c\x9f\xa8\xc6\x36\xb7\x80\x80\x6a\xea\x62\xdc\x61\x13\x45\xbe" + "\xab\x8f\xd8\x99\x17\x51\x9b\x29\x04\x6e\xdb\x3e\x9f\x83\xc6\x35" + "\xb3\x90\xce\xcc\x74\xec\xcb\x04\x41\xac\xb1\x92\xde\x20\xb1\x67" + "\xb0\x38\x14\xaa\x7d\xee\x3c\xb2\xd3\xbb\x2f\x88\x0b\x73\xcf\x7b" + "\x69\xc1\x55\x5b\x2b\xf2\xd4\x38\x2b\x3c\xef\x04\xc9\x14\x7c\x31" + "\xd6\x61\x88\xa8\xb3\x8c\x69\xb4\xbc\xaa\x0d\x15\xd2\xd5\x27\x63" + "\xc4\xa4\x80\xe9\x2b\xe9\xd2\x34\xc9\x0e\x3f\x7b\xd3\x43\x0d\x47" + "\x5d\x37\x8e\x42\xa4\x4e\xef\xcd\xbb\x3a\x5b\xa4\xe1\xb0\x8d\x64" + "\xb7\x0b\x58\x52\xec\x55\xd0\xef\x23\xfe\xf2\x8d\xe0\xd1\x6a\x2c" + "\xaa\x1c\x03\xc7\x3e\x58\x4c\x61\x72\x07\xc6\xfd\x0e\xbc\xd4\x6b" + "\x99\x4f\x91\xda\xff\x6f\xea\x81\x0c\x76\x85\x5d\x0c\x7f\x1c\xb8" + "\x84\x8c\x2f\xe1\x36\x3e\x68\xa0\x57\xf5\xdf\x13\x0a\xd6\xe1\xcd" + "\xae\x23\x99\x4e\xed\x7a\x72\x1b\x7c\xe5\x65\xd1\xb7\xcf\x2f\x73", + "\x1e\x2f\xcf\x3c\x95\x9a\x29\xec\xd3\x37\x90\x8c\x84\x8a\xfb\x95" }, + /* After setiv, ctr_low: 0xfffffff8 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xb7\xfa\xc7\x4f", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x14\x33\xc6\x9d\x04\xd3\x48\x29\x0c\x6a\x24\x27\xdf\x5f\x0a\xd2" + "\x71\xd6\xd0\x18\x04\xc0\x9f\x72\x0a\x60\xb7\x10\x52\x56\xf7\xae" + "\x64\xb0\x28\xd4\xfd\x25\x93\x8e\x67\x7e\xac\xc2\x93\xc7\x54\x2e" + "\x82\x93\x88\x6a\xb9\x8b\x73\xbc\x88\xec\x27\xdd\x4f\x9b\x21\x9e" + "\x77\x98\x70\x0b\xf4\xd8\x55\xfe\xf4\xc3\x3a\xcb\xca\x3a\xfb\xd4" + "\x52\x72\x2f\xf8\xac\xa9\x6a\xf5\x13\xab\x7a\x2e\x9f\x52\x41\xbd" + "\x87\x90\x68\xad\x17\xbd\x5a\xff\xc3\xc6\x10\x4d\xc1\xfe\xfc\x72" + "\x21\xb5\x53\x4a\x3f\xe0\x15\x9f\x29\x36\x23\xc0\x9a\x31\xb2\x0f" + "\xcd\x2f\xa6\xd0\xfc\xe6\x4d\xed\x68\xb3\x3d\x26\x67\xab\x40\xf0" + "\xab\xcf\x72\xc0\x50\xb1\x1e\x86\x38\xe2\xe0\x46\x3a\x2e\x3e\x1d" + "\x07\xd6\x9d\xe8\xfc\xa3\xe7\xac\xc9\xa0\xb3\x22\x05\xbc\xbf\xd2" + "\x63\x44\x66\xfc\xb4\x7b\xb4\x70\x7e\x96\xa9\x16\x1b\xb2\x7d\x93" + "\x44\x92\x5e\xbd\x16\x34\xa7\x11\xd0\xdf\x52\xad\x6f\xbd\x23\x3c" + "\x3d\x58\x16\xaf\x99\x8b\xbb\xa0\xdc\x3a\xff\x17\xda\x56\xba\x77" + "\xae\xc4\xb1\x51\xe2\x61\x4f\xf0\x66\x1b\x4c\xac\x79\x34\x1c\xfd" + "\x6c\x5f\x9a\x2c\x60\xfc\x47\x00\x5f\x2d\x81\xcc\xa9\xdd\x2b\xf4" + "\x5b\x53\x44\x61\xd4\x13\x5a\xf3\x93\xf0\xc9\x24\xd4\xe6\x60\x6f" + "\x78\x02\x0c\x75\x9d\x0d\x23\x97\x35\xe2\x06\x8a\x49\x5e\xe5\xbe", + "\x23\xc0\x4a\x2f\x98\x93\xca\xbd\x2e\x44\xde\x05\xcc\xe7\xf1\xf5" }, + /* After setiv, ctr_low: 0xfffffff7 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x4c\xa7\x1e\x02", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x51\x51\x64\x89\xeb\x9f\xf9\xd6\xb1\xa6\x73\x5f\xf1\x62\xb5\xe4" + "\x00\x80\xdb\x4c\x1c\xce\xe5\x00\xeb\xea\x6c\x57\xe4\x27\xfc\x71" + "\x08\x8c\xa1\xfc\x59\x1d\x07\x45\x3c\xc9\x4e\x0f\xb6\xea\x96\x90" + "\xae\xf7\x81\x1e\x7e\x6c\x5e\x50\xaf\x34\x3e\xa0\x55\x59\x8e\xe7" + "\xc1\xba\x48\xfa\x9e\x07\xf6\x6a\x24\x54\x3e\x9b\xa5\xfe\x31\x16" + 
"\x3d\x4d\x9c\xc4\xe1\xec\x26\xa0\x8b\x59\xa6\xf3\x94\xf8\x88\xda" + "\x1f\x88\x23\x5f\xfb\xfd\x79\xa2\xd3\x62\x30\x66\x69\xd9\x0d\x05" + "\xc0\x75\x4c\xb8\x48\x34\x1d\x97\xcf\x29\x6a\x12\x1c\x26\x54\x1d" + "\x80\xa9\x06\x74\x86\xff\xc6\xb4\x72\xee\x34\xe2\x56\x06\x6c\xf5" + "\x11\xe7\x26\x71\x47\x6b\x05\xbd\xe4\x0b\x40\x78\x84\x3c\xf9\xf2" + "\x78\x34\x2b\x3c\x5f\x0e\x4c\xfb\x17\x39\xdc\x59\x6b\xd1\x56\xac" + "\xe4\x1f\xb9\x19\xbc\xec\xb1\xd0\x6d\x47\x3b\x37\x4d\x0d\x6b\x65" + "\x7c\x70\xe9\xec\x58\xcc\x09\xd4\xd9\xbf\x9f\xe0\x6c\x7f\x60\x28" + "\xd8\xdf\x8e\xd1\x6a\x73\x42\xf3\x50\x01\x79\x68\x41\xc3\xba\x19" + "\x1e\x2d\x30\xc2\x81\x2c\x9f\x11\x8b\xd0\xdc\x31\x3b\x01\xfe\x53" + "\xa5\x11\x13\x22\x89\x40\xb9\x1b\x12\x89\xef\x9a\xcb\xa8\x03\x4f" + "\x54\x1a\x15\x6d\x11\xba\x05\x09\xd3\xdb\xbf\x05\x42\x3a\x5a\x27" + "\x3b\x34\x5c\x58\x8a\x5c\xa4\xc2\x28\xdc\xb2\x3a\xe9\x99\x01\xd6", + "\x30\xb2\xb5\x11\x8a\x3a\x8d\x70\x67\x71\x14\xde\xed\xa7\x43\xb5" }, + /* After setiv, ctr_low: 0xfffffff6 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x47\xab\x9f\x3a", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x05\x72\x44\xa0\x99\x11\x1d\x2c\x4b\x03\x4f\x20\x92\x88\xbe\x55" + "\xee\x31\x2c\xd9\xc0\xc1\x64\x77\x79\xd7\x3e\xfa\x5a\x7d\xf0\x48" + "\xf8\xc8\xfe\x81\x8f\x89\x92\xa6\xc2\x07\xdc\x9f\x3f\xb2\xc8\xf2" + "\xf3\xe9\xe1\xd3\xed\x55\xb4\xab\xc3\x22\xed\x8f\x00\xde\x32\x95" + "\x91\xc0\xc5\xf3\xd3\x93\xf0\xee\x56\x14\x8f\x96\xff\xd0\x6a\xbd" + "\xfc\x57\xc2\xc3\x7b\xc1\x1d\x56\x48\x3f\xa6\xc7\x92\x47\xf7\x2f" + "\x0b\x85\x1c\xff\x87\x29\xe1\xbb\x9b\x14\x6c\xac\x51\x0a\xc0\x7b" + "\x22\x25\xb0\x48\x92\xad\x09\x09\x6e\x39\x8e\x96\x13\x05\x55\x92" + "\xbd\xd7\x5d\x95\x35\xdd\x8a\x9d\x05\x59\x60\xae\xbb\xc0\x85\x92" + "\x4c\x8b\xa0\x3f\xa2\x4a\xe5\x2e\xde\x85\x1a\x39\x10\x22\x11\x1b" + "\xdd\xcc\x96\xf4\x93\x97\xf5\x81\x85\xf3\x33\xda\xa1\x9a\xba\xfd" + "\xb8\xaf\x60\x81\x37\xf1\x02\x88\x54\x15\xeb\x21\xd1\x19\x1a\x1f" + "\x28\x9f\x02\x27\xca\xce\x97\xda\xdc\xd2\x0f\xc5\x0e\x2e\xdd\x4f" + "\x1d\x24\x62\xe4\x6e\x4a\xbe\x96\x95\x38\x0c\xe9\x26\x14\xf3\xf0" + "\x92\xbc\x97\xdc\x38\xeb\x64\xc3\x04\xc1\xa2\x6c\xad\xbd\xf8\x03" + 
"\xa0\xa4\x68\xaa\x9d\x1f\x09\xe6\x62\x95\xa2\x1c\x32\xef\x62\x28" + "\x7e\x54\x6d\x4b\x6a\xcc\x4a\xd0\x82\x47\x46\x0d\x45\x3c\x36\x03" + "\x86\x90\x44\x65\x18\xac\x19\x75\xe6\xba\xb1\x9a\xb4\x5d\x84\x9b", + "\x31\x22\x2b\x11\x6e\x2b\x94\x56\x37\x9d\xc3\xa5\xde\xe7\x6e\xc9" }, + /* After setiv, ctr_low: 0xfffffff5 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xbc\xf6\x46\x77", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x6e\x32\xdb\x04\x32\x57\x15\x78\x0e\x4c\x70\x66\x5c\x91\x43\x0c" + "\x63\x73\xb8\x86\xad\xb0\xf1\x34\x0f\x0c\x7e\xd3\x4e\xcb\xc9\xea" + "\x19\x3c\xb8\x14\xd0\xab\x9e\x9b\x22\xda\x7a\x96\xa7\xf5\xa2\x99" + "\x58\xe3\xd6\x72\x0f\xf5\xdf\x88\xd1\x33\xb1\xe5\x03\x72\x62\x1c" + "\xa7\xf2\x67\x50\x0e\x70\xc3\x7a\x6c\x4a\x90\xba\x78\x9e\xd2\x0b" + "\x29\xd4\xc8\xa7\x57\x06\xf2\xf4\x01\x4b\x30\x53\xea\xf7\xde\xbf" + "\x1c\x12\x03\xcf\x9f\xcf\x80\x8b\x77\xfd\x73\x48\x79\x19\xbe\x38" + "\x75\x0b\x6d\x78\x7d\x79\x05\x98\x65\x3b\x35\x8f\x68\xff\x30\x7a" + "\x6e\xf7\x10\x9e\x11\x25\xc4\x95\x97\x7d\x92\x0f\xbf\x38\x95\xbd" + "\x5d\x2a\xf2\x06\x2c\xd9\x5a\x80\x91\x4e\x22\x7d\x5f\x69\x85\x03" + "\xa7\x5d\xda\x22\x09\x2b\x8d\x29\x67\x7c\x8c\xf6\xb6\x49\x20\x63" + "\xb9\xb6\x4d\xb6\x37\xa3\x7b\x19\xa4\x28\x90\x83\x55\x3d\x4e\x18" + "\xc8\x65\xbc\xd1\xe7\xb5\xcf\x65\x28\xea\x19\x11\x5c\xea\x83\x8c" + "\x44\x1f\xac\xc5\xf5\x3a\x4b\x1c\x2b\xbf\x76\xd8\x98\xdb\x50\xeb" + "\x64\x45\xae\xa5\x39\xb7\xc8\xdf\x5a\x73\x6d\x2d\x0f\x4a\x5a\x17" + "\x37\x66\x1c\x3d\x27\xd5\xd6\x7d\xe1\x08\x7f\xba\x4d\x43\xc2\x29" + "\xf7\xbe\x83\xec\xd0\x3b\x2e\x19\x9e\xf7\xbf\x1b\x16\x34\xd8\xfa" + "\x32\x17\x2a\x90\x55\x93\xd5\x3e\x14\x8d\xd6\xa1\x40\x45\x09\x52", + "\x89\xf2\xae\x78\x38\x8e\xf2\xd2\x52\xa8\xba\xb6\xf2\x5d\x7c\xfc" }, + /* After setiv, ctr_low: 0xfffffff4 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x51\xb2\x9d\x4a", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x1d\xb8\x77\xcd\xcd\xfe\xde\x07\x97\xcb\x97\x3a\x4f\xa0\xd0\xe6" + "\xcc\xcf\x8b\x71\xd5\x65\x3d\xc4\x17\x52\xe7\x1d\x6a\x68\x4a\x77" + "\xca\x04\x4a\xef\x8e\x7e\xce\x79\xa1\x80\x0d\x9e\xd5\xf4\xce\x66" + "\x4d\x54\xb1\x09\xd1\xb6\xb0\x43\x28\xe8\x53\xe2\x24\x9c\x76\xc5" + "\x4d\x22\xf3\x6e\x13\xf3\xd7\xe0\x85\xb8\x9e\x0b\x17\x22\xc0\x79" + "\x2b\x72\x57\xaa\xbd\x43\xc3\xf7\xde\xce\x22\x41\x3c\x7e\x37\x1a" + "\x55\x2e\x36\x0e\x7e\xdc\xb3\xde\xd7\x33\x36\xc9\xc8\x56\x93\x51" + "\x68\x77\x9a\xb0\x08\x5c\x22\x35\xef\x5c\x9b\xbf\x3e\x20\x8a\x84" + "\x3d\xb3\x60\x10\xe1\x97\x30\xd7\xb3\x6f\x40\x5a\x2c\xe0\xe5\x52" + "\x19\xb6\x2b\xed\x6e\x8e\x18\xb4\x8d\x78\xbd\xc4\x9f\x4f\xbd\x82" + "\x98\xd6\x71\x3d\x71\x5b\x78\x73\xee\x8e\x4b\x37\x88\x9e\x21\xca" + "\x00\x6c\xc2\x96\x8d\xf0\xcd\x09\x58\x54\x5a\x58\x59\x8e\x9b\xf8" + "\x72\x93\xd7\xa0\xf9\xc4\xdc\x48\x89\xaa\x31\x95\xda\x4e\x2f\x79" + "\x1e\x37\x49\x92\x2e\x32\x2e\x76\x54\x2a\x64\xa8\x96\x67\xe9\x75" + "\x10\xa6\xeb\xad\xc6\xa8\xec\xb7\x18\x0a\x32\x26\x8d\x6e\x03\x74" + "\x0e\x1f\xfc\xde\x76\xff\x6e\x96\x42\x2d\x80\x0a\xc6\x78\x70\xc4" + "\xd8\x56\x7b\xa6\x38\x2f\xf6\xc0\x9b\xd7\x21\x6e\x88\x5d\xc8\xe5" + "\x02\x6a\x09\x1e\xb3\x46\x44\x80\x82\x5b\xd1\x66\x06\x61\x4f\xb8", + "\x16\x0e\x73\xa3\x14\x43\xdb\x15\x9c\xb0\x0d\x30\x6d\x9b\xe1\xb1" }, + /* After setiv, ctr_low: 0xfffffff3 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa\xef\x44\x07", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x42\x71\x54\xe2\xdb\x50\x5d\x3c\x10\xbd\xf8\x60\xbd\xdb\x26\x14" + "\x7d\x13\x59\x98\x28\xfb\x43\x42\xca\x72\xe6\xd8\x58\x00\xa2\x1b" + "\x6a\x61\xb4\x3a\x80\x6b\x9e\x14\xbd\x11\x33\xab\xe9\xb9\x91\x95" + "\xd7\x5d\xc3\x98\x1f\x7f\xcb\xa8\xf0\xec\x31\x26\x51\xea\x2e\xdf" + "\xd9\xde\x70\xf5\x84\x27\x3a\xac\x22\x05\xb9\xce\x2a\xfb\x2a\x83" + "\x1e\xce\x0e\xb2\x31\x35\xc6\xe6\xc0\xd7\xb0\x5f\xf5\xca\xdb\x13" + "\xa7\xfe\x4f\x85\xa3\x4f\x94\x5c\xc1\x04\x12\xde\x6f\xa1\xdb\x41" + "\x59\x82\x22\x22\x65\x97\x6d\xc8\x67\xab\xf3\x90\xeb\xa4\x00\xb3" + "\x7d\x94\x3d\x7b\x2a\xe2\x85\x36\x87\x16\xb8\x19\x92\x02\xe0\x43" + "\x42\x85\xa1\xe6\xb8\x11\x30\xcc\x2c\xd8\x63\x09\x0e\x53\x5f\xa3" + "\xe0\xd4\xee\x0e\x04\xee\x65\x61\x96\x84\x42\x0c\x68\x8d\xb7\x48" + "\xa3\x02\xb4\x82\x69\xf2\x35\xe4\xce\x3b\xe3\x44\xce\xad\x49\x32" + "\xab\xda\x04\xea\x06\x60\xa6\x2a\x7d\xee\x0f\xb8\x95\x90\x22\x62" + "\x9c\x78\x59\xd3\x7b\x61\x02\x65\x63\x96\x9f\x67\x50\xa0\x61\x43" + "\x53\xb2\x3f\x22\xed\x8c\x42\x39\x97\xd9\xbc\x6e\x81\xb9\x21\x97" + "\xc6\x5b\x68\xd7\x7f\xd0\xc5\x4a\xfb\x74\xc4\xfd\x9a\x2a\xb8\x9b" + "\x48\xe0\x00\xea\x6d\xf5\x30\x26\x61\x8f\xa5\x45\x70\xc9\x3a\xea" + "\x6d\x19\x11\x57\x0f\x21\xe6\x0a\x53\x94\xe3\x0c\x99\xb0\x2f\xc5", + "\x92\x92\x89\xcd\x4f\x3c\x6d\xbc\xe8\xb3\x70\x14\x5b\x3c\x12\xe4" }, + /* After setiv, ctr_low: 0xfffffff2 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xa1\xe3\xc5\x3f", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\x41\xc3\xcb\xd7\x6e\xde\x2a\xc6\x15\x05\xc6\xba\x27\xae\xcd\x37" + "\xc0\xe5\xbf\xb9\x5c\xdc\xd6\xad\x1a\xe1\x35\x7c\xc0\x85\x85\x51" + "\x8c\x98\x06\xc0\x72\x43\x71\x7a\x2d\x7c\x81\x3c\xe7\xd6\x32\x8e" + 
"\x22\x2b\x46\x95\x6a\xde\x45\x40\x56\xe9\x63\x32\x68\xbf\xb6\x78" + "\xb7\x86\x00\x9d\x2c\x9e\xed\x67\xc1\x9b\x09\x9e\xd9\x0a\x56\xcb" + "\x57\xc9\x48\x14\x23\x4e\x97\x04\xb5\x85\x25\x1d\xcb\x1a\x79\x9b" + "\x54\x06\x95\xad\x16\x81\x84\x3a\x38\xec\x41\x90\x2a\xfa\x50\xe0" + "\xb9\x20\xa6\xeb\xfe\x2e\x5c\xa1\xf6\x3c\x69\x4c\xce\xf8\x30\xe0" + "\x87\x68\xa2\x3a\x9d\xad\x75\xd4\xa5\x6b\x0a\x90\x65\xa2\x27\x64" + "\x9d\xf5\xa0\x6f\xd0\xd3\x62\xa5\x2d\xae\x02\x89\xb4\x1a\xfa\x32" + "\x9b\xa0\x44\xdd\x50\xde\xaf\x41\xa9\x89\x1e\xb0\x41\xbc\x9c\x41" + "\xb0\x35\x5e\xf1\x9a\xd9\xab\x57\x53\x21\xca\x39\xfc\x8b\xb4\xd4" + "\xb2\x19\x8a\xe9\xb2\x24\x1e\xce\x2e\x19\xb0\xd2\x93\x30\xc4\x70" + "\xe2\xf8\x6a\x8a\x99\x3b\xed\x71\x7e\x9e\x98\x99\x2a\xc6\xdd\xcf" + "\x43\x32\xdb\xfb\x27\x22\x89\xa4\xc5\xe0\xa2\x94\xe9\xcf\x9d\x48" + "\xab\x3f\xfa\x4f\x75\x63\x46\xdd\xfe\xfa\xf0\xbf\x6e\xa1\xf9\xca" + "\xb1\x77\x79\x35\x6c\x33\xe1\x57\x68\x50\xe9\x78\x4e\xe4\xe2\xf0" + "\xcf\xe4\x23\xde\xf4\xa7\x34\xb3\x44\x97\x38\xd2\xbd\x27\x44\x0e", + "\x75\x0a\x41\x3b\x87\xe3\xc7\xf6\xd6\xe3\xab\xfa\x4b\xbe\x2e\x56" }, + /* After setiv, ctr_low: 0xfffffff1 */ + { GCRY_CIPHER_AES256, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x5a\xbe\x1c\x72", + 16, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 288, + "\xf1\x3c\x7a\xa4\xa9\xaf\xe7\x49\x19\x7d\xad\x50\xc1\x6a\x84\x87" + "\xf5\x69\xe4\xe5\xc2\x0a\x90\x33\xc3\xeb\x76\x63\x5f\x9b\x1d\xf9" + "\x53\x4a\x2a\x6d\x6b\x61\xe0\x5d\xed\xcb\x98\x0d\xf2\x57\x33\x12" + "\xd1\x44\xaa\x7a\x7e\x4e\x41\x0e\xe6\xa7\x9f\x17\x92\x28\x91\xad" + "\xca\xce\xf2\xa8\x73\x4a\xad\x89\x62\x73\x0b\x9a\x68\x91\xa8\x11" + "\x44\x01\xfd\x57\xe4\xf8\x84\x55\x2b\x66\xdb\xb9\xd6\xee\x83\xe5" + "\x57\xea\x5c\x6a\x23\x87\xdd\x0a\x45\x63\xb4\x0c\x8f\xc5\x9f\x22" + "\xf3\x4f\x4e\x6f\x7b\x14\x62\xf7\x80\x59\x4a\xc5\xc8\xae\x8a\x6f" + "\x5e\xe3\x1e\xe6\xae\xec\x99\x77\x6b\x88\x14\xe3\x58\x88\x61\x74" + "\x38\x91\xa1\x32\xb8\xd2\x39\x6b\xe2\xcb\x8e\x77\xde\x92\x36\x78" + "\xad\x50\xcf\x08\xb8\xfa\x29\x59\xb4\x68\x1b\x23\x10\x57\x32\x92" + "\xf8\xec\xe1\x97\xdb\x30\x85\x22\xb5\x68\x2f\xf2\x98\xda\x06\xee" + "\x65\x02\xe7\xf9\xc8\xc1\xca\x8f\xd3\xed\x4a\x3c\x09\xdd\xde\x64" + 
"\xd9\x85\x17\x2c\x62\x41\x35\x24\xed\x6b\x87\x78\x1e\xb5\x7a\x9b" + "\xa3\x90\xa3\x99\xc7\x39\x51\x10\xb7\x6a\x12\x3b\x64\xfe\x32\x3c" + "\xb6\x84\x9a\x3f\x95\xd3\xcb\x22\x69\x9c\xf9\xb7\xc2\x8b\xf4\x55" + "\x68\x60\x11\x20\xc5\x3e\x0a\xc0\xba\x00\x0e\x88\x96\x66\xfa\xf0" + "\x75\xbc\x2b\x9c\xff\xc5\x33\x7b\xaf\xb2\xa6\x34\x78\x44\x9c\xa7", + "\x01\x24\x0e\x17\x17\xe5\xfc\x90\x07\xfa\x78\xd5\x5d\x66\xa3\xf5" }, }; gcry_cipher_hd_t hde, hdd; - unsigned char out[MAX_DATA_LEN]; + unsigned char out[MAX_GCM_DATA_LEN]; unsigned char tag[GCRY_GCM_BLOCK_LEN]; int i, keylen; gcry_error_t err = 0; @@ -1885,8 +2562,12 @@ check_gcm_cipher (void) _check_gcm_cipher(1); /* Split input to 7 byte buffers. */ _check_gcm_cipher(7); + /* Split input to 15 byte buffers. */ + _check_gcm_cipher(15); /* Split input to 16 byte buffers. */ _check_gcm_cipher(16); + /* Split input to 17 byte buffers. */ + _check_gcm_cipher(17); }