[PATCH 1/8] rijndael: add ECB acceleration (for benchmarking purposes)

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Sun Oct 23 18:16:01 CEST 2022


* cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'.
* cipher/cipher.c (do_ecb_crypt): Use bulk function if available.
* cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label
'.Ldeclast' to '.Lenclast'.
(_gcry_aes_aesni_ecb_crypt): New.
* cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce)
(_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change
return value from void to size_t.
(ocb_crypt_fn_t, xts_crypt_fn_t): Remove.
(_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove
indirect function call; Return value from called function (allows tail
call optimization).
(_gcry_aes_armv8_ce_ocb_auth): Return value from called function (allows
tail call optimization).
(_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce)
(_gcry_aes_armv8_ce_ecb_crypt): New.
* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ecb_crypt_amd64): New.
* cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64)
(_gcry_aes_vaes_ecb_crypt): New.
* cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt)
(_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New.
(do_setkey): Set up ECB bulk function for x86 AESNI/VAES and ARM CE.
--

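ECB does no chaining, IV or tweak handling, so it exposes the raw AES
core throughput; that is what makes a bulk ECB path useful for
benchmarking even though ECB is rarely an appropriate mode for
protecting data. The new path is reached through the regular cipher
API; a minimal caller-side sketch (plain libgcrypt usage, error
handling omitted, not part of the patch itself):

  #include <string.h>
  #include <gcrypt.h>

  int
  main (void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0 };
    unsigned char buf[64 * 16]; /* 64 blocks; ECB input must be a
                                   multiple of the block size */

    gcry_check_version (NULL); /* initialize libgcrypt */
    memset (buf, 0, sizeof buf);

    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0);
    gcry_cipher_setkey (hd, key, sizeof key);

    /* do_ecb_crypt() dispatches to bulk.ecb_crypt when the backend has
       installed one (x86 AESNI/VAES, ARM CE); otherwise it falls back
       to the per-block spec->encrypt/decrypt loop. */
    gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0); /* in-place */

    gcry_cipher_close (hd);
    return 0;
  }
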
Benchmark on AMD Ryzen 9 7900X:

Before (OCB for reference):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.128 ns/B      7460 MiB/s     0.720 c/B      5634±1
        ECB dec |     0.134 ns/B      7103 MiB/s     0.753 c/B      5608
        OCB enc |     0.029 ns/B     32930 MiB/s     0.163 c/B      5625
        OCB dec |     0.029 ns/B     32738 MiB/s     0.164 c/B      5625

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.028 ns/B     33761 MiB/s     0.159 c/B      5625
        ECB dec |     0.028 ns/B     33917 MiB/s     0.158 c/B      5625
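
The tables are in the output format of libgcrypt's tests/bench-slope
tool; something like "tests/bench-slope --cpu-mhz auto cipher aes"
should reproduce the measurement (exact figures depend on the machine,
and the option spelling may differ between versions).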

GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/cipher-internal.h           |   2 +
 cipher/cipher.c                    |  41 ++-
 cipher/rijndael-aesni.c            | 160 ++++++++++-
 cipher/rijndael-armv8-aarch32-ce.S | 152 +++++++++-
 cipher/rijndael-armv8-aarch64-ce.S | 125 ++++++++-
 cipher/rijndael-armv8-ce.c         | 124 +++++----
 cipher/rijndael-vaes-avx2-amd64.S  | 432 ++++++++++++++++++++++++++++-
 cipher/rijndael-vaes.c             |  26 ++
 cipher/rijndael.c                  |  12 +
 9 files changed, 997 insertions(+), 77 deletions(-)

diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index 66b75955..4e022f38 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -161,6 +161,8 @@ typedef struct cipher_mode_ops
    not NULL.  */
 typedef struct cipher_bulk_ops
 {
+  void (*ecb_crypt)(void *context, void *outbuf_arg, const void *inbuf_arg,
+		    size_t nblocks, int encrypt);
   void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg,
 		  const void *inbuf_arg, size_t nblocks);
   void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg,
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 6c335aec..026c1511 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -983,14 +983,11 @@ cipher_reset (gcry_cipher_hd_t c)
 
 

 static gcry_err_code_t
-do_ecb_crypt (gcry_cipher_hd_t c,
-              unsigned char *outbuf, size_t outbuflen,
-              const unsigned char *inbuf, size_t inbuflen,
-              gcry_cipher_encrypt_t crypt_fn)
+do_ecb_crypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
+	      const unsigned char *inbuf, size_t inbuflen, int encrypt)
 {
   unsigned int blocksize = c->spec->blocksize;
   size_t n, nblocks;
-  unsigned int burn, nburn;
 
   if (outbuflen < inbuflen)
     return GPG_ERR_BUFFER_TOO_SHORT;
@@ -998,18 +995,32 @@ do_ecb_crypt (gcry_cipher_hd_t c,
     return GPG_ERR_INV_LENGTH;
 
   nblocks = inbuflen / blocksize;
-  burn = 0;
 
-  for (n=0; n < nblocks; n++ )
+  if (nblocks == 0)
+    return 0;
+
+  if (c->bulk.ecb_crypt)
     {
-      nburn = crypt_fn (&c->context.c, outbuf, inbuf);
-      burn = nburn > burn ? nburn : burn;
-      inbuf  += blocksize;
-      outbuf += blocksize;
+      c->bulk.ecb_crypt (&c->context.c, outbuf, inbuf, nblocks, encrypt);
     }
+  else
+    {
+      gcry_cipher_encrypt_t crypt_fn =
+          encrypt ? c->spec->encrypt : c->spec->decrypt;
+      unsigned int burn = 0;
+      unsigned int nburn;
 
-  if (burn > 0)
-    _gcry_burn_stack (burn + 4 * sizeof(void *));
+      for (n = 0; n < nblocks; n++)
+	{
+	  nburn = crypt_fn (&c->context.c, outbuf, inbuf);
+	  burn = nburn > burn ? nburn : burn;
+	  inbuf  += blocksize;
+	  outbuf += blocksize;
+	}
+
+      if (burn > 0)
+	_gcry_burn_stack (burn + 4 * sizeof(void *));
+    }
 
   return 0;
 }
@@ -1019,7 +1030,7 @@ do_ecb_encrypt (gcry_cipher_hd_t c,
                 unsigned char *outbuf, size_t outbuflen,
                 const unsigned char *inbuf, size_t inbuflen)
 {
-  return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->encrypt);
+  return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 1);
 }
 
 static gcry_err_code_t
@@ -1027,7 +1038,7 @@ do_ecb_decrypt (gcry_cipher_hd_t c,
                 unsigned char *outbuf, size_t outbuflen,
                 const unsigned char *inbuf, size_t inbuflen)
 {
-  return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->decrypt);
+  return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 0);
 }
 
 
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 156af015..906737a6 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -870,7 +870,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
                 "aesenc %%xmm0, %%xmm10\n\t"
                 "aesenc %%xmm0, %%xmm11\n\t"
                 "movdqa 0xa0(%[key]), %%xmm0\n\t"
-                "jb .Ldeclast%=\n\t"
+                "jb .Lenclast%=\n\t"
                 "aesenc %%xmm0, %%xmm1\n\t"
                 "aesenc %%xmm0, %%xmm2\n\t"
                 "aesenc %%xmm0, %%xmm3\n\t"
@@ -889,7 +889,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
                 "aesenc %%xmm0, %%xmm10\n\t"
                 "aesenc %%xmm0, %%xmm11\n\t"
                 "movdqa 0xc0(%[key]), %%xmm0\n\t"
-                "je .Ldeclast%=\n\t"
+                "je .Lenclast%=\n\t"
                 "aesenc %%xmm0, %%xmm1\n\t"
                 "aesenc %%xmm0, %%xmm2\n\t"
                 "aesenc %%xmm0, %%xmm3\n\t"
@@ -909,7 +909,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
                 "aesenc %%xmm0, %%xmm11\n\t"
                 "movdqa 0xe0(%[key]), %%xmm0\n"
 
-                ".Ldeclast%=:\n\t"
+                ".Lenclast%=:\n\t"
                 : /* no output */
                 : [key] "r" (ctx->keyschenc),
                   [rounds] "r" (ctx->rounds)
@@ -1717,6 +1717,160 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
 }
 
 
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_ecb_crypt (RIJNDAEL_context *ctx, unsigned char *dst,
+			   const unsigned char *src, size_t nblocks,
+			   int encrypt)
+{
+  aesni_prepare_2_7_variable;
+
+  aesni_prepare ();
+  aesni_prepare_2_7();
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      do_aesni_prepare_decryption ( ctx );
+      ctx->decryption_prepared = 1;
+    }
+
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      const void *key = encrypt ? ctx->keyschenc : ctx->keyschdec;
+      aesni_prepare_8_15_variable;
+
+      aesni_prepare_8_15();
+
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  asm volatile
+	    ("movdqa (%[key]), %%xmm0\n\t"
+	     "movdqu 0*16(%[src]), %%xmm1\n\t"
+	     "movdqu 1*16(%[src]), %%xmm2\n\t"
+	     "movdqu 2*16(%[src]), %%xmm3\n\t"
+	     "movdqu 3*16(%[src]), %%xmm4\n\t"
+	     "movdqu 4*16(%[src]), %%xmm8\n\t"
+	     "movdqu 5*16(%[src]), %%xmm9\n\t"
+	     "movdqu 6*16(%[src]), %%xmm10\n\t"
+	     "movdqu 7*16(%[src]), %%xmm11\n\t"
+	     "pxor   %%xmm0, %%xmm1\n\t"
+	     "pxor   %%xmm0, %%xmm2\n\t"
+	     "pxor   %%xmm0, %%xmm3\n\t"
+	     "pxor   %%xmm0, %%xmm4\n\t"
+	     "pxor   %%xmm0, %%xmm8\n\t"
+	     "pxor   %%xmm0, %%xmm9\n\t"
+	     "pxor   %%xmm0, %%xmm10\n\t"
+	     "pxor   %%xmm0, %%xmm11\n\t"
+	     : /* No output */
+	     : [src] "r" (src),
+	       [key] "r" (key)
+	     : "memory");
+
+	  if (encrypt)
+	    {
+	      do_aesni_enc_vec8 (ctx);
+	      asm volatile
+		("aesenclast %%xmm0, %%xmm1\n\t"
+		 "aesenclast %%xmm0, %%xmm2\n\t"
+		 "aesenclast %%xmm0, %%xmm3\n\t"
+		 "aesenclast %%xmm0, %%xmm4\n\t"
+		 "aesenclast %%xmm0, %%xmm8\n\t"
+		 "aesenclast %%xmm0, %%xmm9\n\t"
+		 "aesenclast %%xmm0, %%xmm10\n\t"
+		 "aesenclast %%xmm0, %%xmm11\n\t"
+		 ::: "memory" );
+	    }
+	  else
+	    {
+	      do_aesni_dec_vec8 (ctx);
+	      asm volatile
+		("aesdeclast %%xmm0, %%xmm1\n\t"
+		 "aesdeclast %%xmm0, %%xmm2\n\t"
+		 "aesdeclast %%xmm0, %%xmm3\n\t"
+		 "aesdeclast %%xmm0, %%xmm4\n\t"
+		 "aesdeclast %%xmm0, %%xmm8\n\t"
+		 "aesdeclast %%xmm0, %%xmm9\n\t"
+		 "aesdeclast %%xmm0, %%xmm10\n\t"
+		 "aesdeclast %%xmm0, %%xmm11\n\t"
+		 ::: "memory" );
+	    }
+
+	  asm volatile
+	    ("movdqu %%xmm1, 0*16(%[dst])\n\t"
+	     "movdqu %%xmm2, 1*16(%[dst])\n\t"
+	     "movdqu %%xmm3, 2*16(%[dst])\n\t"
+	     "movdqu %%xmm4, 3*16(%[dst])\n\t"
+	     "movdqu %%xmm8, 4*16(%[dst])\n\t"
+	     "movdqu %%xmm9, 5*16(%[dst])\n\t"
+	     "movdqu %%xmm10, 6*16(%[dst])\n\t"
+	     "movdqu %%xmm11, 7*16(%[dst])\n\t"
+	     : /* No output */
+	     : [dst] "r" (dst)
+	     : "memory");
+
+	  dst += 8*BLOCKSIZE;
+	  src += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_8_15();
+    }
+#endif
+
+  for (; nblocks >= 4; nblocks -= 4)
+    {
+      asm volatile
+	("movdqu 0*16(%[src]), %%xmm1\n\t"
+	 "movdqu 1*16(%[src]), %%xmm2\n\t"
+	 "movdqu 2*16(%[src]), %%xmm3\n\t"
+	 "movdqu 3*16(%[src]), %%xmm4\n\t"
+	 : /* No output */
+	 : [src] "r" (src)
+	 : "memory");
+
+      if (encrypt)
+	do_aesni_enc_vec4 (ctx);
+      else
+	do_aesni_dec_vec4 (ctx);
+
+      asm volatile
+	("movdqu %%xmm1, 0*16(%[dst])\n\t"
+	 "movdqu %%xmm2, 1*16(%[dst])\n\t"
+	 "movdqu %%xmm3, 2*16(%[dst])\n\t"
+	 "movdqu %%xmm4, 3*16(%[dst])\n\t"
+	 : /* No output */
+	 : [dst] "r" (dst)
+	 : "memory");
+
+      dst += 4*BLOCKSIZE;
+      src += 4*BLOCKSIZE;
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      asm volatile ("movdqu %[src], %%xmm0\n\t"
+                    :
+                    : [src] "m" (*src)
+                    : "memory" );
+
+      if (encrypt)
+	do_aesni_enc (ctx);
+      else
+	do_aesni_dec (ctx);
+
+      asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+                    : [dst] "=m" (*dst)
+                    :
+                    : "memory" );
+
+      dst += BLOCKSIZE;
+      src += BLOCKSIZE;
+    }
+
+  aesni_cleanup ();
+  aesni_cleanup_2_7 ();
+}
+
+
 void ASM_FUNC_ATTR
 _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                          unsigned char *outbuf, const unsigned char *inbuf,
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index 1eafa93e..6208652b 100644
--- a/cipher/rijndael-armv8-aarch32-ce.S
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -653,6 +653,149 @@ _gcry_aes_cbc_dec_armv8_ce:
 .size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;
 
 
+/*
+ * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_enc_armv8_ce
+.type  _gcry_aes_ecb_enc_armv8_ce,%function;
+_gcry_aes_ecb_enc_armv8_ce:
+  /* input:
+   *    r0: keysched
+   *    r1: outbuf
+   *    r2: inbuf
+   *    r3: nblocks
+   *    %st+0: nrounds => r4
+   */
+
+  push {r4-r6,lr} /* 4*4 = 16b */
+  cmp r3, #0
+  beq .Lecb_enc_skip
+  ldr r4, [sp, #(16+0)]
+  vpush {q4-q7}
+
+  cmp r4, #12
+  aes_preload_keys(r0, lr);
+
+  beq .Lecb_entry_192e
+  bhi .Lecb_entry_256e
+
+#define ECB_CRYPT(bits, e_d, mc_imc, ...) \
+  .Lecb_entry_##bits##e_d: \
+    cmp r3, #4; \
+    blo .Lecb_loop_##bits##e_d; \
+    \
+  .Lecb_loop4_##bits##e_d: \
+    vld1.8 {q1-q2}, [r2]!; /* load input */ \
+    sub r3, r3, #4; \
+    vld1.8 {q3-q4}, [r2]!; /* load input */ \
+    cmp r3, #4; \
+    \
+    do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+    \
+    vst1.8 {q1-q2}, [r1]!; /* store output */ \
+    vst1.8 {q3-q4}, [r1]!; /* store output */ \
+    \
+    bhs .Lecb_loop4_##bits##e_d; \
+    cmp r3, #0; \
+    beq .Lecb_done_##e_d; \
+    \
+  .Lecb_loop_##bits##e_d: \
+    vld1.8 {q1}, [r2]!; /* load input */ \
+    subs r3, r3, #1; \
+    \
+    do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \
+    \
+    vst1.8 {q1}, [r1]!; /* store output */ \
+    bne .Lecb_loop_##bits##e_d; \
+    b .Lecb_done_##e_d;
+
+  ECB_CRYPT(128, e, mc)
+  ECB_CRYPT(192, e, mc, r0, lr)
+  ECB_CRYPT(256, e, mc, r0, lr)
+
+.Lecb_done_e:
+  CLEAR_REG(q0)
+  CLEAR_REG(q1)
+  CLEAR_REG(q2)
+  CLEAR_REG(q3)
+  CLEAR_REG(q8)
+  CLEAR_REG(q9)
+  vpop {q4-q7}
+  CLEAR_REG(q10)
+  CLEAR_REG(q11)
+  CLEAR_REG(q12)
+  CLEAR_REG(q13)
+  CLEAR_REG(q14)
+
+.Lecb_enc_skip:
+  pop {r4-r6,pc}
+.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_dec_armv8_ce
+.type  _gcry_aes_ecb_dec_armv8_ce,%function;
+_gcry_aes_ecb_dec_armv8_ce:
+  /* input:
+   *    r0: keysched
+   *    r1: outbuf
+   *    r2: inbuf
+   *    r3: nblocks
+   *    %st+0: nrounds => r4
+   */
+
+  push {r4-r6,lr} /* 4*4 = 16b */
+  cmp r3, #0
+  beq .Lecb_dec_skip
+  ldr r4, [sp, #(16+0)]
+  vpush {q4-q7}
+
+  cmp r4, #12
+
+  aes_preload_keys(r0, lr);
+
+  beq .Lecb_entry_192d
+  bhi .Lecb_entry_256d
+
+  ECB_CRYPT(128, d, imc)
+  ECB_CRYPT(192, d, imc, r0, lr)
+  ECB_CRYPT(256, d, imc, r0, lr)
+
+#undef ECB_CRYPT
+
+.Lecb_done_d:
+  CLEAR_REG(q0)
+  CLEAR_REG(q1)
+  CLEAR_REG(q2)
+  CLEAR_REG(q3)
+  CLEAR_REG(q8)
+  CLEAR_REG(q9)
+  vpop {q4-q7}
+  CLEAR_REG(q10)
+  CLEAR_REG(q11)
+  CLEAR_REG(q12)
+  CLEAR_REG(q13)
+  CLEAR_REG(q14)
+
+.Lecb_dec_skip:
+  pop {r4-r6,pc}
+.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;
+
+
 /*
  * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
@@ -1138,7 +1281,7 @@ _gcry_aes_ctr32le_enc_armv8_ce:
 
 
 /*
- * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
@@ -1305,6 +1448,7 @@ _gcry_aes_ocb_enc_armv8_ce:
   CLEAR_REG(q13)
   CLEAR_REG(q14)
 
+  mov r0, #0
   pop {r4-r12,lr}
   vpop {q4-q7}
   bx lr
@@ -1312,7 +1456,7 @@ _gcry_aes_ocb_enc_armv8_ce:
 
 
 /*
- * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
@@ -1479,6 +1623,7 @@ _gcry_aes_ocb_dec_armv8_ce:
   CLEAR_REG(q13)
   CLEAR_REG(q14)
 
+  mov r0, #0
   pop {r4-r12,lr}
   vpop {q4-q7}
   bx lr
@@ -1486,7 +1631,7 @@ _gcry_aes_ocb_dec_armv8_ce:
 
 
 /*
- * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
  *                                   const unsigned char *abuf,
  *                                   unsigned char *offset,
  *                                   unsigned char *checksum,
@@ -1632,6 +1777,7 @@ _gcry_aes_ocb_auth_armv8_ce:
   CLEAR_REG(q13)
   CLEAR_REG(q14)
 
+  mov r0, #0
   pop {r4-r12,lr}
   vpop {q4-q7}
   bx lr
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
index 4fef0345..97d3d7eb 100644
--- a/cipher/rijndael-armv8-aarch64-ce.S
+++ b/cipher/rijndael-armv8-aarch64-ce.S
@@ -385,6 +385,119 @@ _gcry_aes_dec_armv8_ce:
 ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;)
 
 
+/*
+ * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  size_t nblocks, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_enc_armv8_ce
+ELF(.type  _gcry_aes_ecb_enc_armv8_ce,%function;)
+_gcry_aes_ecb_enc_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: nblocks
+   *    w4: nrounds
+   */
+  CFI_STARTPROC();
+
+  cbz x3, .Lecb_enc_skip
+
+  aes_preload_keys(x0, w4);
+
+  b.eq .Lecb_entry_192e
+  b.hi .Lecb_entry_256e
+
+#define ECB_CRYPT(bits, e_d, mc_imc) \
+  .Lecb_entry_##bits##e_d: \
+    cmp x3, #4; \
+    b.lo .Lecb_loop_##bits##e_d; \
+    \
+  .Lecb_loop4_##bits##e_d: \
+    sub x3, x3, #4; \
+    ld1 {v0.16b-v3.16b}, [x2], #64; /* load input */ \
+    cmp x3, #4; \
+    do_aes_4_##bits(e_d, mc_imc, v0, v1, v2, v3); \
+    st1 {v0.16b-v3.16b}, [x1], #64; /* store output */ \
+    \
+    b.hs .Lecb_loop4_##bits##e_d; \
+    CLEAR_REG(v1); \
+    CLEAR_REG(v2); \
+    CLEAR_REG(v3); \
+    cbz x3, .Lecb_done_##e_d; \
+    \
+  .Lecb_loop_##bits##e_d: \
+    ld1 {v0.16b}, [x2], #16; /* load input */ \
+    sub x3, x3, #1; \
+    do_aes_one##bits(e_d, mc_imc, v0, v0, vk0); \
+    st1 {v0.16b}, [x1], #16; /* store output */ \
+    \
+    cbnz x3, .Lecb_loop_##bits##e_d; \
+    b .Lecb_done_##e_d;
+
+  ECB_CRYPT(128, e, mc)
+  ECB_CRYPT(192, e, mc)
+  ECB_CRYPT(256, e, mc)
+
+.Lecb_done_e:
+  aes_clear_keys(w4)
+
+  CLEAR_REG(v0)
+
+.Lecb_enc_skip:
+  ret_spec_stop
+  CFI_ENDPROC();
+ELF(.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;)
+
+
+/*
+ * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  size_t nblocks, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_dec_armv8_ce
+ELF(.type  _gcry_aes_ecb_dec_armv8_ce,%function;)
+_gcry_aes_ecb_dec_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: nblocks
+   *    w4: nrounds
+   */
+  CFI_STARTPROC();
+
+  cbz x3, .Lecb_dec_skip
+
+  aes_preload_keys(x0, w4);
+
+  b.eq .Lecb_entry_192d
+  b.hi .Lecb_entry_256d
+
+  ECB_CRYPT(128, d, imc)
+  ECB_CRYPT(192, d, imc)
+  ECB_CRYPT(256, d, imc)
+
+#undef ECB_CRYPT
+
+.Lecb_done_d:
+  aes_clear_keys(w4)
+
+  CLEAR_REG(v0)
+
+.Lecb_dec_skip:
+  ret_spec_stop
+  CFI_ENDPROC();
+ELF(.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;)
+
+
 /*
  * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
@@ -471,7 +584,8 @@ ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;)
  * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
- *                                  unsigned char *iv, unsigned int nrounds);
+ *                                  unsigned char *iv,
+ *                                  size_t nblocks, unsigned int nrounds);
  */
 
 .align 3
@@ -1136,7 +1250,7 @@ ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;)
 
 
 /*
- * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
@@ -1379,13 +1493,14 @@ _gcry_aes_ocb_enc_armv8_ce:
   add sp, sp, #128;
   CFI_ADJUST_CFA_OFFSET(-128);
 
+  mov x0, #0
   ret_spec_stop
   CFI_ENDPROC();
 ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;)
 
 
 /*
- * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
@@ -1458,13 +1573,14 @@ _gcry_aes_ocb_dec_armv8_ce:
   add sp, sp, #128;
   CFI_ADJUST_CFA_OFFSET(-128);
 
+  mov x0, #0
   ret_spec_stop
   CFI_ENDPROC();
 ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;)
 
 
 /*
- * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
  *                                   const unsigned char *abuf,
  *                                   unsigned char *offset,
  *                                   unsigned char *checksum,
@@ -1605,6 +1721,7 @@ _gcry_aes_ocb_auth_armv8_ce:
   CLEAR_REG(v2)
   CLEAR_REG(v16)
 
+  mov x0, #0
   ret_spec_stop
   CFI_ENDPROC();
 ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;)
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
index c9c37654..042b7d42 100644
--- a/cipher/rijndael-armv8-ce.c
+++ b/cipher/rijndael-armv8-ce.c
@@ -80,32 +80,32 @@ extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
                                             unsigned char *iv, size_t nblocks,
                                             unsigned int nrounds);
 
-extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
-                                        unsigned char *outbuf,
-                                        const unsigned char *inbuf,
-                                        unsigned char *offset,
-                                        unsigned char *checksum,
-                                        unsigned char *L_table,
-                                        size_t nblocks,
-                                        unsigned int nrounds,
-                                        unsigned int blkn);
-extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
-                                        unsigned char *outbuf,
-                                        const unsigned char *inbuf,
-                                        unsigned char *offset,
-                                        unsigned char *checksum,
-                                        unsigned char *L_table,
-                                        size_t nblocks,
-                                        unsigned int nrounds,
-                                        unsigned int blkn);
-extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
-                                         const unsigned char *abuf,
-                                         unsigned char *offset,
-                                         unsigned char *checksum,
-                                         unsigned char *L_table,
-                                         size_t nblocks,
-                                         unsigned int nrounds,
-                                         unsigned int blkn);
+extern size_t _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+                                          unsigned char *outbuf,
+                                          const unsigned char *inbuf,
+                                          unsigned char *offset,
+                                          unsigned char *checksum,
+                                          unsigned char *L_table,
+                                          size_t nblocks,
+                                          unsigned int nrounds,
+                                          unsigned int blkn);
+extern size_t _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+                                          unsigned char *outbuf,
+                                          const unsigned char *inbuf,
+                                          unsigned char *offset,
+                                          unsigned char *checksum,
+                                          unsigned char *L_table,
+                                          size_t nblocks,
+                                          unsigned int nrounds,
+                                          unsigned int blkn);
+extern size_t _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+                                           const unsigned char *abuf,
+                                           unsigned char *offset,
+                                           unsigned char *checksum,
+                                           unsigned char *L_table,
+                                           size_t nblocks,
+                                           unsigned int nrounds,
+                                           unsigned int blkn);
 extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
                                         unsigned char *outbuf,
                                         const unsigned char *inbuf,
@@ -116,17 +116,14 @@ extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
                                         const unsigned char *inbuf,
                                         unsigned char *tweak,
                                         size_t nblocks, unsigned int nrounds);
-
-typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
-                                const unsigned char *inbuf,
-                                unsigned char *offset, unsigned char *checksum,
-                                unsigned char *L_table, size_t nblocks,
-                                unsigned int nrounds, unsigned int blkn);
-
-typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
-                                const unsigned char *inbuf,
-                                unsigned char *tweak, size_t nblocks,
-                                unsigned int nrounds);
+extern void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+                                        unsigned char *outbuf,
+                                        const unsigned char *inbuf,
+                                        size_t nblocks, unsigned int nrounds);
+extern void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+                                        unsigned char *outbuf,
+                                        const unsigned char *inbuf,
+                                        size_t nblocks, unsigned int nrounds);
 
 
 void
@@ -312,8 +309,6 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 {
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
-  ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce
-                                    : _gcry_aes_ocb_dec_armv8_ce;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned int nrounds = ctx->rounds;
@@ -327,10 +322,16 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = blkn + nblocks;
 
-  crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
-           c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn);
-
-  return 0;
+  if (encrypt)
+    return _gcry_aes_ocb_enc_armv8_ce (keysched, outbuf, inbuf,
+				       c->u_iv.iv, c->u_ctr.ctr,
+				       c->u_mode.ocb.L[0], nblocks, nrounds,
+				       (unsigned int)blkn);
+  else
+    return _gcry_aes_ocb_dec_armv8_ce (keysched, outbuf, inbuf,
+				       c->u_iv.iv, c->u_ctr.ctr,
+				       c->u_mode.ocb.L[0], nblocks, nrounds,
+				       (unsigned int)blkn);
 }
 
 size_t
@@ -345,11 +346,9 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 
   c->u_mode.ocb.aad_nblocks = blkn + nblocks;
 
-  _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
-			      c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
-			      nblocks, nrounds, (unsigned int)blkn);
-
-  return 0;
+  return _gcry_aes_ocb_auth_armv8_ce (keysched, abuf, c->u_mode.ocb.aad_offset,
+				      c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
+				      nblocks, nrounds, (unsigned int)blkn);
 }
 
 void
@@ -358,8 +357,6 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
 			      size_t nblocks, int encrypt)
 {
   const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
-  xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce
-                                    : _gcry_aes_xts_dec_armv8_ce;
   unsigned int nrounds = ctx->rounds;
 
   if ( !encrypt && !ctx->decryption_prepared )
@@ -368,7 +365,32 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
       ctx->decryption_prepared = 1;
     }
 
-  crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds);
+  if (encrypt)
+    _gcry_aes_xts_enc_armv8_ce (keysched, outbuf, inbuf, tweak,
+				nblocks, nrounds);
+  else
+    _gcry_aes_xts_dec_armv8_ce (keysched, outbuf, inbuf, tweak,
+				nblocks, nrounds);
 }
 
+void
+_gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf,
+			      const void *inbuf, size_t nblocks,
+			      int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned int nrounds = ctx->rounds;
+
+  if ( !encrypt && !ctx->decryption_prepared )
+    {
+      _gcry_aes_armv8_ce_prepare_decryption ( ctx );
+      ctx->decryption_prepared = 1;
+    }
+
+  if (encrypt)
+    _gcry_aes_ecb_enc_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds);
+  else
+    _gcry_aes_ecb_dec_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds);
+}
 #endif /* USE_ARM_CE */
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index e36e82a0..655fdf55 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -2357,7 +2357,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64)
 
 /**********************************************************************
-  CTR-mode encryption
+  XTS-mode encryption
  **********************************************************************/
 ELF(.type _gcry_vaes_avx2_xts_crypt_amd64,@function)
 .globl _gcry_vaes_avx2_xts_crypt_amd64
@@ -2873,6 +2873,436 @@ _gcry_vaes_avx2_xts_crypt_amd64:
 	CFI_ENDPROC();
 ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64)
 
+/**********************************************************************
+  ECB-mode encryption
+ **********************************************************************/
+ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64,@function)
+.globl _gcry_vaes_avx2_ecb_crypt_amd64
+_gcry_vaes_avx2_ecb_crypt_amd64:
+	/* input:
+	 *	%rdi: round keys
+	 *	%esi: encrypt
+	 *	%rdx: dst
+	 *	%rcx: src
+	 *	%r8:  nblocks
+	 *	%r9:  nrounds
+	 */
+	CFI_STARTPROC();
+
+	/* Process 16 blocks per loop. */
+.align 8
+.Lecb_blk16:
+	cmpq $16, %r8;
+	jb .Lecb_blk8;
+
+	leaq -16(%r8), %r8;
+
+	/* Load input and xor first key. */
+	vbroadcasti128 (0 * 16)(%rdi), %ymm8;
+	vmovdqu (0 * 16)(%rcx), %ymm0;
+	vmovdqu (2 * 16)(%rcx), %ymm1;
+	vmovdqu (4 * 16)(%rcx), %ymm2;
+	vmovdqu (6 * 16)(%rcx), %ymm3;
+	vmovdqu (8 * 16)(%rcx), %ymm4;
+	vmovdqu (10 * 16)(%rcx), %ymm5;
+	vmovdqu (12 * 16)(%rcx), %ymm6;
+	vmovdqu (14 * 16)(%rcx), %ymm7;
+	vpxor %ymm8, %ymm0, %ymm0;
+	vpxor %ymm8, %ymm1, %ymm1;
+	vpxor %ymm8, %ymm2, %ymm2;
+	vpxor %ymm8, %ymm3, %ymm3;
+	vpxor %ymm8, %ymm4, %ymm4;
+	vpxor %ymm8, %ymm5, %ymm5;
+	vpxor %ymm8, %ymm6, %ymm6;
+	vpxor %ymm8, %ymm7, %ymm7;
+	vbroadcasti128 (1 * 16)(%rdi), %ymm8;
+	leaq (16 * 16)(%rcx), %rcx;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk16;
+		/* AES rounds */
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+		cmpl $12, %r9d;
+		jb .Lecb_enc_blk16_last;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+		jz .Lecb_enc_blk16_last;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+		VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+	  .Lecb_enc_blk16_last:
+		vaesenclast %ymm8, %ymm0, %ymm0;
+		vaesenclast %ymm8, %ymm1, %ymm1;
+		vaesenclast %ymm8, %ymm2, %ymm2;
+		vaesenclast %ymm8, %ymm3, %ymm3;
+		vaesenclast %ymm8, %ymm4, %ymm4;
+		vaesenclast %ymm8, %ymm5, %ymm5;
+		vaesenclast %ymm8, %ymm6, %ymm6;
+		vaesenclast %ymm8, %ymm7, %ymm7;
+		jmp .Lecb_blk16_end;
+
+	  .align 8
+	  .Lecb_dec_blk16:
+		/* AES rounds */
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+		cmpl $12, %r9d;
+		jb .Lecb_dec_blk16_last;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+		jz .Lecb_dec_blk16_last;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+		VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+	  .Lecb_dec_blk16_last:
+		vaesdeclast %ymm8, %ymm0, %ymm0;
+		vaesdeclast %ymm8, %ymm1, %ymm1;
+		vaesdeclast %ymm8, %ymm2, %ymm2;
+		vaesdeclast %ymm8, %ymm3, %ymm3;
+		vaesdeclast %ymm8, %ymm4, %ymm4;
+		vaesdeclast %ymm8, %ymm5, %ymm5;
+		vaesdeclast %ymm8, %ymm6, %ymm6;
+		vaesdeclast %ymm8, %ymm7, %ymm7;
+		jmp .Lecb_blk16_end;
+
+  .align 8
+  .Lecb_blk16_end:
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	vmovdqu %ymm2, (4 * 16)(%rdx);
+	vmovdqu %ymm3, (6 * 16)(%rdx);
+	vmovdqu %ymm4, (8 * 16)(%rdx);
+	vmovdqu %ymm5, (10 * 16)(%rdx);
+	vmovdqu %ymm6, (12 * 16)(%rdx);
+	vmovdqu %ymm7, (14 * 16)(%rdx);
+	leaq (16 * 16)(%rdx), %rdx;
+
+	jmp .Lecb_blk16;
+
+	/* Handle trailing eight blocks. */
+.align 8
+.Lecb_blk8:
+	cmpq $8, %r8;
+	jb .Lecb_blk4;
+
+	leaq -8(%r8), %r8;
+
+	/* Load input and xor first key. */
+	vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+	vmovdqu (0 * 16)(%rcx), %ymm0;
+	vmovdqu (2 * 16)(%rcx), %ymm1;
+	vmovdqu (4 * 16)(%rcx), %ymm2;
+	vmovdqu (6 * 16)(%rcx), %ymm3;
+	vpxor %ymm4, %ymm0, %ymm0;
+	vpxor %ymm4, %ymm1, %ymm1;
+	vpxor %ymm4, %ymm2, %ymm2;
+	vpxor %ymm4, %ymm3, %ymm3;
+	vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+	leaq (8 * 16)(%rcx), %rcx;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk8;
+		/* AES rounds */
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+		cmpl $12, %r9d;
+		jb .Lecb_enc_blk8_last;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+		jz .Lecb_enc_blk8_last;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+	  .Lecb_enc_blk8_last:
+		vaesenclast %ymm4, %ymm0, %ymm0;
+		vaesenclast %ymm4, %ymm1, %ymm1;
+		vaesenclast %ymm4, %ymm2, %ymm2;
+		vaesenclast %ymm4, %ymm3, %ymm3;
+		vmovdqu %ymm0, (0 * 16)(%rdx);
+		vmovdqu %ymm1, (2 * 16)(%rdx);
+		vmovdqu %ymm2, (4 * 16)(%rdx);
+		vmovdqu %ymm3, (6 * 16)(%rdx);
+		leaq (8 * 16)(%rdx), %rdx;
+		jmp .Lecb_blk4;
+
+	  .align 8
+	  .Lecb_dec_blk8:
+		/* AES rounds */
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+		cmpl $12, %r9d;
+		jb .Lecb_dec_blk8_last;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+		jz .Lecb_dec_blk8_last;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+		VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+	  .Lecb_dec_blk8_last:
+		vaesdeclast %ymm4, %ymm0, %ymm0;
+		vaesdeclast %ymm4, %ymm1, %ymm1;
+		vaesdeclast %ymm4, %ymm2, %ymm2;
+		vaesdeclast %ymm4, %ymm3, %ymm3;
+		vmovdqu %ymm0, (0 * 16)(%rdx);
+		vmovdqu %ymm1, (2 * 16)(%rdx);
+		vmovdqu %ymm2, (4 * 16)(%rdx);
+		vmovdqu %ymm3, (6 * 16)(%rdx);
+		leaq (8 * 16)(%rdx), %rdx;
+
+	/* Handle trailing four blocks. */
+.align 8
+.Lecb_blk4:
+	cmpq $4, %r8;
+	jb .Lecb_blk1;
+
+	leaq -4(%r8), %r8;
+
+	/* Load input and xor first key. */
+	vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+	vmovdqu (0 * 16)(%rcx), %ymm0;
+	vmovdqu (2 * 16)(%rcx), %ymm1;
+	vpxor %ymm4, %ymm0, %ymm0;
+	vpxor %ymm4, %ymm1, %ymm1;
+	vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+	leaq (4 * 16)(%rcx), %rcx;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk4;
+		/* AES rounds */
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+		cmpl $12, %r9d;
+		jb .Lecb_enc_blk4_last;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+		jz .Lecb_enc_blk4_last;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+	  .Lecb_enc_blk4_last:
+		vaesenclast %ymm4, %ymm0, %ymm0;
+		vaesenclast %ymm4, %ymm1, %ymm1;
+		vmovdqu %ymm0, (0 * 16)(%rdx);
+		vmovdqu %ymm1, (2 * 16)(%rdx);
+		leaq (4 * 16)(%rdx), %rdx;
+		jmp .Lecb_blk1;
+
+	  .align 8
+	  .Lecb_dec_blk4:
+		/* AES rounds */
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+		cmpl $12, %r9d;
+		jb .Lecb_dec_blk4_last;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+		jz .Lecb_dec_blk4_last;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+		VAESDEC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+	  .Lecb_dec_blk4_last:
+		vaesdeclast %ymm4, %ymm0, %ymm0;
+		vaesdeclast %ymm4, %ymm1, %ymm1;
+		vmovdqu %ymm0, (0 * 16)(%rdx);
+		vmovdqu %ymm1, (2 * 16)(%rdx);
+		leaq (4 * 16)(%rdx), %rdx;
+
+	/* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lecb_blk1:
+	cmpq $1, %r8;
+	jb .Ldone_ecb;
+
+	leaq -1(%r8), %r8;
+
+	/* Load input. */
+	vmovdqu (%rcx), %xmm2;
+	leaq 16(%rcx), %rcx;
+
+	/* Xor first key. */
+	vpxor (0 * 16)(%rdi), %xmm2, %xmm0;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk1;
+		/* AES rounds. */
+		vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (10 * 16)(%rdi), %xmm1;
+		cmpl $12, %r9d;
+		jb .Lecb_enc_blk1_last;
+		vaesenc %xmm1, %xmm0, %xmm0;
+		vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (12 * 16)(%rdi), %xmm1;
+		jz .Lecb_enc_blk1_last;
+		vaesenc %xmm1, %xmm0, %xmm0;
+		vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (14 * 16)(%rdi), %xmm1;
+	  .Lecb_enc_blk1_last:
+		vaesenclast %xmm1, %xmm0, %xmm0;
+		jmp .Lecb_blk1_end;
+
+	  .align 8
+	  .Lecb_dec_blk1:
+		/* AES rounds. */
+		vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
+		vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
+		vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
+		vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
+		vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
+		vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
+		vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
+		vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
+		vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (10 * 16)(%rdi), %xmm1;
+		cmpl $12, %r9d;
+		jb .Lecb_dec_blk1_last;
+		vaesdec %xmm1, %xmm0, %xmm0;
+		vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (12 * 16)(%rdi), %xmm1;
+		jz .Lecb_dec_blk1_last;
+		vaesdec %xmm1, %xmm0, %xmm0;
+		vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (14 * 16)(%rdi), %xmm1;
+	  .Lecb_dec_blk1_last:
+		vaesdeclast %xmm1, %xmm0, %xmm0;
+		jmp .Lecb_blk1_end;
+
+  .align 8
+  .Lecb_blk1_end:
+	vmovdqu %xmm0, (%rdx);
+	leaq 16(%rdx), %rdx;
+
+	jmp .Lecb_blk1;
+
+.align 8
+.Ldone_ecb:
+	vzeroall;
+	ret_spec_stop
+	CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64)
+
 /**********************************************************************
   constants
  **********************************************************************/
diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c
index dbcf9afa..978c86da 100644
--- a/cipher/rijndael-vaes.c
+++ b/cipher/rijndael-vaes.c
@@ -91,6 +91,32 @@ extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched,
 					     unsigned int nrounds,
 					     int encrypt) ASM_FUNC_ABI;
 
+extern void _gcry_vaes_avx2_ecb_crypt_amd64 (const void *keysched,
+					     int encrypt,
+					     void *outbuf_arg,
+					     const void *inbuf_arg,
+					     size_t nblocks,
+					     unsigned int nrounds) ASM_FUNC_ABI;
+
+
+void
+_gcry_aes_vaes_ecb_crypt (void *context, void *outbuf,
+			  const void *inbuf, size_t nblocks,
+			  int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned int nrounds = ctx->rounds;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      _gcry_aes_aesni_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  _gcry_vaes_avx2_ecb_crypt_amd64 (keysched, encrypt, outbuf, inbuf,
+				   nblocks, nrounds);
+}
 
 void
 _gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv,
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index f3060ea5..84cb7109 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -102,6 +102,9 @@ extern size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg
 extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak,
                                        void *outbuf_arg, const void *inbuf_arg,
                                        size_t nblocks, int encrypt);
+extern void _gcry_aes_aesni_ecb_crypt (void *context, void *outbuf_arg,
+				       const void *inbuf_arg, size_t nblocks,
+				       int encrypt);
 #endif
 
 #ifdef USE_VAES
@@ -125,6 +128,9 @@ extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak,
 				      void *outbuf_arg, const void *inbuf_arg,
 				      size_t nblocks, int encrypt);
+extern void _gcry_aes_vaes_ecb_crypt (void *context, void *outbuf_arg,
+				      const void *inbuf_arg, size_t nblocks,
+				      int encrypt);
 #endif
 
 #ifdef USE_SSSE3
@@ -227,6 +233,9 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
                                           void *outbuf_arg,
                                           const void *inbuf_arg,
                                           size_t nblocks, int encrypt);
+extern void _gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf_arg,
+                                          const void *inbuf_arg, size_t nblocks,
+                                          int encrypt);
 #endif /*USE_ARM_ASM*/
 
 #ifdef USE_PPC_CRYPTO
@@ -524,6 +533,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt;
+      bulk_ops->ecb_crypt = _gcry_aes_aesni_ecb_crypt;
 
 #ifdef USE_VAES
       if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) &&
@@ -536,6 +546,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
 	  bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc;
 	  bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
 	  bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt;
+	  bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt;
 	}
 #endif
     }
@@ -591,6 +602,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
+      bulk_ops->ecb_crypt = _gcry_aes_armv8_ce_ecb_crypt;
     }
 #endif
 #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
-- 
2.37.2