From cvs at cvs.gnupg.org Wed Jan 2 20:35:57 2019 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Wed, 02 Jan 2019 20:35:57 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-138-g3ee6588 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 3ee6588de8311b461ef8707c70ff86d2b252966d (commit) via 4871f11745f33c5c5051bfe6f325ac1c10764b04 (commit) via edde61f325e4b345f17c47369f3b6b1400656f04 (commit) from 3028a221d39c1b593ea0c1bcbfccd33959769692 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 3ee6588de8311b461ef8707c70ff86d2b252966d Author: Jussi Kivilinna Date: Wed Jan 2 21:25:44 2019 +0200 Process CCM/EAX/GCM/Poly1305 AEAD cipher modes input in 24 KiB chunks * cipher/cipher-ccm.c (_gcry_cipher_ccm_encrypt) (_gcry_cipher_ccm_decrypt): Process data in 24 KiB chunks. * cipher/cipher-eax.c (_gcry_cipher_eax_encrypt) (_gcry_cipher_eax_decrypt): Ditto. * cipher/cipher-gcm.c (_gcry_cipher_gcm_encrypt) (_gcry_cipher_gcm_decrypt): Ditto. * cipher/cipher-poly1305.c (_gcry_cipher_poly1305_encrypt) (_gcry_cipher_poly1305_decrypt): Ditto. -- Patch changes AEAD modes to process input in 24 KiB chunks to improve cache locality when processing large buffers. Huge buffer test in tests/benchmark shows 0.7% improvement for AES-CCM and AES-EAX, 6% for AES-GCM and 4% for Chacha20-Poly1305 on Intel Core i7-4790K. Signed-off-by: Jussi Kivilinna diff --git a/cipher/cipher-ccm.c b/cipher/cipher-ccm.c index e71c6f1..fd284ca 100644 --- a/cipher/cipher-ccm.c +++ b/cipher/cipher-ccm.c @@ -319,7 +319,9 @@ _gcry_cipher_ccm_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - unsigned int burn; + gcry_err_code_t err = 0; + unsigned int burn = 0; + unsigned int nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; @@ -329,12 +331,32 @@ _gcry_cipher_ccm_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, if (inbuflen > c->u_mode.ccm.encryptlen) return GPG_ERR_INV_LENGTH; - c->u_mode.ccm.encryptlen -= inbuflen; - burn = do_cbc_mac (c, inbuf, inbuflen, 0); + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done before encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for encryption. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + c->u_mode.ccm.encryptlen -= currlen; + nburn = do_cbc_mac (c, inbuf, currlen, 0); + burn = nburn > burn ? 
nburn : burn; + + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); + if (err) + break; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } + if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); - - return _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); + return err; } @@ -343,8 +365,9 @@ _gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - gcry_err_code_t err; - unsigned int burn; + gcry_err_code_t err = 0; + unsigned int burn = 0; + unsigned int nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; @@ -354,14 +377,30 @@ _gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, if (inbuflen > c->u_mode.ccm.encryptlen) return GPG_ERR_INV_LENGTH; - err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); - if (err) - return err; + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done after decryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); + if (err) + break; + + c->u_mode.ccm.encryptlen -= currlen; + nburn = do_cbc_mac (c, outbuf, currlen, 0); + burn = nburn > burn ? nburn : burn; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } - c->u_mode.ccm.encryptlen -= inbuflen; - burn = do_cbc_mac (c, outbuf, inbuflen, 0); if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); - return err; } diff --git a/cipher/cipher-eax.c b/cipher/cipher-eax.c index 3b17bb6..08f815a 100644 --- a/cipher/cipher-eax.c +++ b/cipher/cipher-eax.c @@ -48,11 +48,31 @@ _gcry_cipher_eax_encrypt (gcry_cipher_hd_t c, return err; } - err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); - if (err != 0) - return err; + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done after encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); + if (err != 0) + return err; - return _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, outbuf, inbuflen); + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, outbuf, + currlen); + if (err != 0) + return err; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } + + return 0; } @@ -75,11 +95,31 @@ _gcry_cipher_eax_decrypt (gcry_cipher_hd_t c, return err; } - err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf, inbuflen); - if (err != 0) - return err; + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done before decryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for decryption. 
*/ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf, + currlen); + if (err != 0) + return err; - return _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); + if (err != 0) + return err; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } + + return 0; } diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 32ec9fa..f9ddbc5 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -666,11 +666,26 @@ _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, inbuflen); - if (err != 0) - return err; + while (inbuflen) + { + size_t currlen = inbuflen; - do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, outbuf, inbuflen, 0); + /* Since checksumming is done after encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen); + if (err != 0) + return err; + + do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, outbuf, currlen, 0); + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } return 0; } @@ -682,6 +697,7 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c, const byte *inbuf, size_t inbuflen) { static const unsigned char zerobuf[MAX_BLOCKSIZE]; + gcry_err_code_t err; if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; @@ -711,9 +727,28 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, inbuf, inbuflen, 0); + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done before decryption, process input in + * 24KiB chunks to keep data loaded in L1 cache for decryption. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; - return gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, inbuflen); + do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, inbuf, currlen, 0); + + err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen); + if (err) + return err; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } + + return 0; } diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c index 82537aa..607586b 100644 --- a/cipher/cipher-poly1305.c +++ b/cipher/cipher-poly1305.c @@ -164,9 +164,24 @@ _gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, inbuflen); + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done after encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. 
*/ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, currlen); - _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, inbuflen); + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, currlen); + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } return 0; } @@ -202,9 +217,25 @@ _gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, inbuflen); + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done before decryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for decryption. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, currlen); + + c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, currlen); + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } - c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, inbuflen); return 0; } commit 4871f11745f33c5c5051bfe6f325ac1c10764b04 Author: Jussi Kivilinna Date: Wed Jan 2 21:25:44 2019 +0200 tests/benchmark: add Chacha20-Poly1305 benchmarking * tests/benchmark.c (cipher_bench): Add Chacha20-Poly1305. -- Signed-off-by: Jussi Kivilinna diff --git a/tests/benchmark.c b/tests/benchmark.c index f9974fc..418f929 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -825,7 +825,7 @@ cipher_bench ( const char *algoname ) int doublekey; } modes[] = { { GCRY_CIPHER_MODE_ECB, " ECB/Stream", 1, 0xffffffffU }, - { GCRY_CIPHER_MODE_CBC, " CBC", 1, 0xffffffffU }, + { GCRY_CIPHER_MODE_CBC, " CBC/Poly1305", 1, 0xffffffffU }, { GCRY_CIPHER_MODE_CFB, " CFB", 0, 0xffffffffU }, { GCRY_CIPHER_MODE_OFB, " OFB", 0, 0xffffffffU }, { GCRY_CIPHER_MODE_CTR, " CTR", 0, 0xffffffffU }, @@ -840,6 +840,8 @@ cipher_bench ( const char *algoname ) { GCRY_CIPHER_MODE_EAX, " EAX", 0, 0xffffffffU, NULL, 0, 8, 8 }, { GCRY_CIPHER_MODE_STREAM, "", 0, 0xffffffffU }, + { GCRY_CIPHER_MODE_POLY1305, "", 0, 0xffffffffU, + NULL, 1, 16, 12 }, {0} }; int modeidx; @@ -931,9 +933,14 @@ cipher_bench ( const char *algoname ) for (modeidx=0; modes[modeidx].mode; modeidx++) { size_t modekeylen = keylen * (!!modes[modeidx].doublekey + 1); + int is_stream = modes[modeidx].mode == GCRY_CIPHER_MODE_STREAM + || modes[modeidx].mode == GCRY_CIPHER_MODE_POLY1305; - if ((blklen > 1 && modes[modeidx].mode == GCRY_CIPHER_MODE_STREAM) - || (blklen == 1 && modes[modeidx].mode != GCRY_CIPHER_MODE_STREAM)) + if ((blklen > 1 && is_stream) || (blklen == 1 && !is_stream)) + continue; + + if (modes[modeidx].mode == GCRY_CIPHER_MODE_POLY1305 + && algo != GCRY_CIPHER_CHACHA20) continue; if (modes[modeidx].req_blocksize > 0 commit edde61f325e4b345f17c47369f3b6b1400656f04 Author: Jussi Kivilinna Date: Wed Jan 2 21:25:44 2019 +0200 tests/benchmark: add --huge-buffers option for cipher tests * tests/benchmark.c (huge_buffers, cipher_encrypt, cipher_decrypt): New. (cipher_bench): Add 'max_inlen' to modes structure; add huge buffers mode selection. (main): Add '--huge-buffers'. -- Signed-off-by: Jussi Kivilinna diff --git a/tests/benchmark.c b/tests/benchmark.c index 59ea32c..f9974fc 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -37,9 +37,12 @@ #define PGM "benchmark" #include "t-common.h" -/* Do encryption tests with large buffers. */ +/* Do encryption tests with large buffers (100 KiB). */ static int large_buffers; +/* Do encryption tests with huge buffers (256 MiB). 
*/ +static int huge_buffers; + /* Number of cipher repetitions. */ static int cipher_repetitions; @@ -743,6 +746,60 @@ static void ccm_aead_init(gcry_cipher_hd_t hd, size_t buflen, int authlen) } +static gcry_error_t +cipher_encrypt (gcry_cipher_hd_t h, char *out, size_t outsize, + const char *in, size_t inlen, size_t max_inlen) +{ + gcry_error_t ret; + + while (inlen) + { + size_t currlen = inlen; + + if (currlen > max_inlen) + currlen = max_inlen; + + ret = gcry_cipher_encrypt(h, out, outsize, in, currlen); + if (ret) + return ret; + + out += currlen; + in += currlen; + outsize -= currlen; + inlen -= currlen; + } + + return 0; +} + + +static gcry_error_t +cipher_decrypt (gcry_cipher_hd_t h, char *out, size_t outsize, + const char *in, size_t inlen, size_t max_inlen) +{ + gcry_error_t ret; + + while (inlen) + { + size_t currlen = inlen; + + if (currlen > max_inlen) + currlen = max_inlen; + + ret = gcry_cipher_decrypt(h, out, outsize, in, currlen); + if (ret) + return ret; + + out += currlen; + in += currlen; + outsize -= currlen; + inlen -= currlen; + } + + return 0; +} + + static void cipher_bench ( const char *algoname ) { @@ -760,34 +817,34 @@ cipher_bench ( const char *algoname ) int mode; const char *name; int blocked; + unsigned int max_inlen; void (* const aead_init)(gcry_cipher_hd_t hd, size_t buflen, int authlen); int req_blocksize; int authlen; int noncelen; int doublekey; } modes[] = { - { GCRY_CIPHER_MODE_ECB, " ECB/Stream", 1 }, - { GCRY_CIPHER_MODE_CBC, " CBC", 1 }, - { GCRY_CIPHER_MODE_CFB, " CFB", 0 }, - { GCRY_CIPHER_MODE_OFB, " OFB", 0 }, - { GCRY_CIPHER_MODE_CTR, " CTR", 0 }, - { GCRY_CIPHER_MODE_XTS, " XTS", 0, + { GCRY_CIPHER_MODE_ECB, " ECB/Stream", 1, 0xffffffffU }, + { GCRY_CIPHER_MODE_CBC, " CBC", 1, 0xffffffffU }, + { GCRY_CIPHER_MODE_CFB, " CFB", 0, 0xffffffffU }, + { GCRY_CIPHER_MODE_OFB, " OFB", 0, 0xffffffffU }, + { GCRY_CIPHER_MODE_CTR, " CTR", 0, 0xffffffffU }, + { GCRY_CIPHER_MODE_XTS, " XTS", 0, 16 << 20, NULL, GCRY_XTS_BLOCK_LEN, 0, 0, 1 }, - { GCRY_CIPHER_MODE_CCM, " CCM", 0, - ccm_aead_init, GCRY_CCM_BLOCK_LEN, 8 }, - { GCRY_CIPHER_MODE_GCM, " GCM", 0, + { GCRY_CIPHER_MODE_CCM, " CCM", 0, 0xffffffffU, + ccm_aead_init, GCRY_CCM_BLOCK_LEN, 8, }, + { GCRY_CIPHER_MODE_GCM, " GCM", 0, 0xffffffffU, NULL, GCRY_GCM_BLOCK_LEN, GCRY_GCM_BLOCK_LEN }, - { GCRY_CIPHER_MODE_OCB, " OCB", 1, + { GCRY_CIPHER_MODE_OCB, " OCB", 1, 0xffffffffU, NULL, 16, 16, 15 }, - { GCRY_CIPHER_MODE_EAX, " EAX", 0, + { GCRY_CIPHER_MODE_EAX, " EAX", 0, 0xffffffffU, NULL, 0, 8, 8 }, - { GCRY_CIPHER_MODE_STREAM, "", 0 }, + { GCRY_CIPHER_MODE_STREAM, "", 0, 0xffffffffU }, {0} }; int modeidx; gcry_error_t err = GPG_ERR_NO_ERROR; - if (!algoname) { for (i=1; i < 400; i++) @@ -796,7 +853,12 @@ cipher_bench ( const char *algoname ) return; } - if (large_buffers) + if (huge_buffers) + { + allocated_buflen = 256 * 1024 * 1024; + repetitions = 4; + } + else if (large_buffers) { allocated_buflen = 1024 * 100; repetitions = 10; @@ -945,14 +1007,16 @@ cipher_bench ( const char *algoname ) { (*modes[modeidx].aead_init) (hd, buflen, modes[modeidx].authlen); gcry_cipher_final (hd); - err = gcry_cipher_encrypt (hd, outbuf, buflen, buf, buflen); + err = cipher_encrypt (hd, outbuf, buflen, buf, buflen, + modes[modeidx].max_inlen); if (err) break; err = gcry_cipher_gettag (hd, outbuf, modes[modeidx].authlen); } else { - err = gcry_cipher_encrypt (hd, outbuf, buflen, buf, buflen); + err = cipher_encrypt (hd, outbuf, buflen, buf, buflen, + modes[modeidx].max_inlen); } } stop_timer (); @@ -1024,7 +1088,8 @@ 
cipher_bench ( const char *algoname ) { (*modes[modeidx].aead_init) (hd, buflen, modes[modeidx].authlen); gcry_cipher_final (hd); - err = gcry_cipher_decrypt (hd, outbuf, buflen, buf, buflen); + err = cipher_decrypt (hd, outbuf, buflen, buf, buflen, + modes[modeidx].max_inlen); if (err) break; err = gcry_cipher_checktag (hd, outbuf, modes[modeidx].authlen); @@ -1034,7 +1099,8 @@ cipher_bench ( const char *algoname ) else { gcry_cipher_final (hd); - err = gcry_cipher_decrypt (hd, outbuf, buflen, buf, buflen); + err = cipher_decrypt (hd, outbuf, buflen, buf, buflen, + modes[modeidx].max_inlen); } } stop_timer (); @@ -1741,6 +1807,11 @@ main( int argc, char **argv ) large_buffers = 1; argc--; argv++; } + else if (!strcmp (*argv, "--huge-buffers")) + { + huge_buffers = 1; + argc--; argv++; + } else if (!strcmp (*argv, "--cipher-repetitions")) { argc--; argv++; ----------------------------------------------------------------------- Summary of changes: cipher/cipher-ccm.c | 65 ++++++++++++++++++++----- cipher/cipher-eax.c | 56 ++++++++++++++++++---- cipher/cipher-gcm.c | 47 ++++++++++++++++--- cipher/cipher-poly1305.c | 39 +++++++++++++-- tests/benchmark.c | 120 ++++++++++++++++++++++++++++++++++++++--------- 5 files changed, 275 insertions(+), 52 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Mon Jan 14 21:21:52 2019 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Mon, 14 Jan 2019 21:21:52 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-139-g09c2728 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 09c27280cc09798d15369b3a143036b7ab5ddd69 (commit) from 3ee6588de8311b461ef8707c70ff86d2b252966d (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 09c27280cc09798d15369b3a143036b7ab5ddd69 Author: Jussi Kivilinna Date: Mon Jan 14 22:14:24 2019 +0200 camellia-aarch64: do not export look-up table globally * cipher/camellia-aarch64.S (_gcry_camellia_arm_tables): Remove '.globl' export. 
-- Reported-by: Martin Husemann GnuPG-bug-id: 4317 Signed-off-by: Jussi Kivilinna diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S index b0e9a03..5c6ab02 100644 --- a/cipher/camellia-aarch64.S +++ b/cipher/camellia-aarch64.S @@ -289,7 +289,6 @@ _gcry_camellia_arm_decrypt_block: ELF(.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;) /* Encryption/Decryption tables */ -.globl _gcry_camellia_arm_tables ELF(.type _gcry_camellia_arm_tables, at object;) .balign 32 _gcry_camellia_arm_tables: ----------------------------------------------------------------------- Summary of changes: cipher/camellia-aarch64.S | 1 - 1 file changed, 1 deletion(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From andre at amorim.me Mon Jan 14 21:29:27 2019 From: andre at amorim.me (Andre Amorim) Date: Mon, 14 Jan 2019 20:29:27 +0000 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-139-g09c2728 In-Reply-To: References: Message-ID: I have no idea what camellia-aarch64 means ... On Mon, 14 Jan 2019 at 20:23, by Jussi Kivilinna wrote: > This is an automated email from the git hooks/post-receive script. It was > generated because a ref change was pushed to the repository containing > the project "The GNU crypto library". > > The branch, master has been updated > via 09c27280cc09798d15369b3a143036b7ab5ddd69 (commit) > from 3ee6588de8311b461ef8707c70ff86d2b252966d (commit) > > Those revisions listed above that are new to this repository have > not appeared on any other notification email; so we list those > revisions in full, below. > > - Log ----------------------------------------------------------------- > commit 09c27280cc09798d15369b3a143036b7ab5ddd69 > Author: Jussi Kivilinna > Date: Mon Jan 14 22:14:24 2019 +0200 > > camellia-aarch64: do not export look-up table globally > > * cipher/camellia-aarch64.S (_gcry_camellia_arm_tables): Remove > '.globl' export. > -- > > Reported-by: Martin Husemann > GnuPG-bug-id: 4317 > Signed-off-by: Jussi Kivilinna > > diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S > index b0e9a03..5c6ab02 100644 > --- a/cipher/camellia-aarch64.S > +++ b/cipher/camellia-aarch64.S > @@ -289,7 +289,6 @@ _gcry_camellia_arm_decrypt_block: > ELF(.size > _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;) > > /* Encryption/Decryption tables */ > -.globl _gcry_camellia_arm_tables > ELF(.type _gcry_camellia_arm_tables, at object;) > .balign 32 > _gcry_camellia_arm_tables: > > ----------------------------------------------------------------------- > > Summary of changes: > cipher/camellia-aarch64.S | 1 - > 1 file changed, 1 deletion(-) > > > hooks/post-receive > -- > The GNU crypto library > http://git.gnupg.org > > > _______________________________________________ > Gnupg-commits mailing list > Gnupg-commits at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gnupg-commits > > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > -------------- next part -------------- An HTML attachment was scrubbed... URL: From gniibe at fsij.org Tue Jan 15 02:21:41 2019 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 15 Jan 2019 10:21:41 +0900 Subject: [git] GCRYPT - branch, master, updated. 
libgcrypt-1.8.1-139-g09c2728 In-Reply-To: References: Message-ID: <87r2deyau2.fsf@fsij.org> Andre Amorim wrote: > I have no idea what camellia-aarch64 means ... Camellia is a cipher, comparable to AES. You can see the definition in RFC-3713. AArch64 is the 64-bit architecture of ARM, also known as ARM64. Most likely, you can find a machine as a smartphone. The code in question offers an optimized implementation of Camellia for AArch64. -- From cvs at cvs.gnupg.org Tue Jan 15 05:58:02 2019 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Tue, 15 Jan 2019 05:58:02 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-140-g2677d7d Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 2677d7d482bf2d078c1dce64854747c5b148924b (commit) from 09c27280cc09798d15369b3a143036b7ab5ddd69 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 2677d7d482bf2d078c1dce64854747c5b148924b Author: NIIBE Yutaka Date: Tue Jan 15 13:53:45 2019 +0900 random: Use getentropy when available for not GNU/Linux. * configure.ac: Detect getentropy. * random/rndlinux.c [__linux__] (getentropy): Macro defined. [HAVE_GETENTROPY] (_gcry_rndlinux_gather_random): Use getentropy. -- GnuPG-bug-id: 4288 Reported-by: David Carlier Signed-off-by: NIIBE Yutaka diff --git a/configure.ac b/configure.ac index 5843884..67cf1f7 100644 --- a/configure.ac +++ b/configure.ac @@ -1772,7 +1772,7 @@ AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile) -AC_CHECK_FUNCS(explicit_bzero) +AC_CHECK_FUNCS(explicit_bzero getentropy) GNUPG_CHECK_MLOCK diff --git a/random/rndlinux.c b/random/rndlinux.c index 3d41cd3..d71261c 100644 --- a/random/rndlinux.c +++ b/random/rndlinux.c @@ -32,8 +32,13 @@ #include #include #include -#if defined(__linux__) && defined(HAVE_SYSCALL) +#if defined(__linux__) || !defined(HAVE_GETENTROPY) +#ifdef HAVE_SYSCALL # include +# ifdef __NR_getrandom +# define getentropy(buf,buflen) syscall (__NR_getrandom, buf, buflen, 0) +# endif +#endif #endif #include "types.h" @@ -247,16 +252,14 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, struct timeval tv; int rc; - /* If we have a modern Linux kernel, we first try to use the new - * getrandom syscall. That call guarantees that the kernel's + /* If we have a modern operating system, we first try to use the new + * getentropy function. That call guarantees that the kernel's * RNG has been properly seeded before returning any data. This * is different from /dev/urandom which may, due to its * non-blocking semantics, return data even if the kernel has * not been properly seeded. And it differs from /dev/random by never - * blocking once the kernel is seeded. Unfortunately we need to use a - * syscall and not a new device and thus we are not able to use - * select(2) to have a timeout. */ -#if defined(__linux__) && defined(HAVE_SYSCALL) && defined(__NR_getrandom) + * blocking once the kernel is seeded. 
*/ +#if defined(HAVE_GETENTROPY) || defined(__NR_getrandom) { long ret; size_t nbytes; @@ -267,20 +270,19 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, if (nbytes > 256) nbytes = 256; _gcry_pre_syscall (); - ret = syscall (__NR_getrandom, - (void*)buffer, (size_t)nbytes, (unsigned int)0); + ret = getentropy (buffer, nbytes); _gcry_post_syscall (); } while (ret == -1 && errno == EINTR); if (ret == -1 && errno == ENOSYS) - ; /* The syscall is not supported - fallback to pulling from fd. */ + ; /* getentropy is not supported - fallback to pulling from fd. */ else - { /* The syscall is supported. Some sanity checks. */ + { /* getentropy is supported. Some sanity checks. */ if (ret == -1) - log_fatal ("unexpected error from getrandom: %s\n", + log_fatal ("unexpected error from getentropy: %s\n", strerror (errno)); else if (ret != nbytes) - log_fatal ("getrandom returned only" + log_fatal ("getentropy returned only" " %ld of %zu requested bytes\n", ret, nbytes); (*add)(buffer, nbytes, origin); ----------------------------------------------------------------------- Summary of changes: configure.ac | 2 +- random/rndlinux.c | 28 +++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Tue Jan 15 07:50:33 2019 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Tue, 15 Jan 2019 07:50:33 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-141-g17f246c Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 17f246c7044ab9ed236f6ec73fc126654257f0f9 (commit) from 2677d7d482bf2d078c1dce64854747c5b148924b (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 17f246c7044ab9ed236f6ec73fc126654257f0f9 Author: NIIBE Yutaka Date: Tue Jan 15 15:48:25 2019 +0900 random: Fix previous commit for getentropy function. * random/rndlinux.c [__NR_getrandom] (_gcry_rndlinux_gather_random): Check return value only for use of syscall. -- The function returns 0 on success. 
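For context, the two interfaces involved use different return conventions: getentropy fills the whole buffer and returns 0 on success, whereas the raw getrandom syscall returns the number of bytes it wrote, so the "ret != nbytes" sanity check only makes sense when getentropy is actually the getrandom wrapper macro. A minimal sketch of the getentropy convention, for illustration only (not part of the patch; the helper name is made up):

#include <unistd.h>   /* getentropy(); on glibc also declared in <sys/random.h> */

/* Hypothetical helper: fill BUF with LEN bytes of entropy.
 * getentropy returns 0 on success and -1 on error; it never
 * returns a byte count, unlike the getrandom syscall. */
static int
example_fill_random (void *buf, size_t len)
{
  if (len > 256)   /* getentropy rejects requests larger than 256 bytes. */
    return -1;
  return getentropy (buf, len);   /* 0 == buffer completely filled */
}
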
Signed-off-by: NIIBE Yutaka diff --git a/random/rndlinux.c b/random/rndlinux.c index d71261c..04e2a46 100644 --- a/random/rndlinux.c +++ b/random/rndlinux.c @@ -281,9 +281,11 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, if (ret == -1) log_fatal ("unexpected error from getentropy: %s\n", strerror (errno)); +#ifdef __NR_getrandom else if (ret != nbytes) log_fatal ("getentropy returned only" " %ld of %zu requested bytes\n", ret, nbytes); +#endif (*add)(buffer, nbytes, origin); length -= nbytes; ----------------------------------------------------------------------- Summary of changes: random/rndlinux.c | 2 ++ 1 file changed, 2 insertions(+) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Tue Jan 15 08:18:18 2019 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Tue, 15 Jan 2019 08:18:18 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-142-ge5c2f8a Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via e5c2f8a2cd2b89d90ea30de2dedb0e92498a5f70 (commit) from 17f246c7044ab9ed236f6ec73fc126654257f0f9 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit e5c2f8a2cd2b89d90ea30de2dedb0e92498a5f70 Author: NIIBE Yutaka Date: Tue Jan 15 16:14:51 2019 +0900 build: With LD_LIBRARY_PATH defined, use --disable-new-dtags. * configure.ac (LDADD_FOR_TESTS_KLUDGE): New for --disable-new-dtags. * tests/Makefile.am (LDADD, t_lock_LDADD): Use LDADD_FOR_TESTS_KLUDGE. -- GnuPG-bug-id: 4298 Signed-off-by: NIIBE Yutaka diff --git a/configure.ac b/configure.ac index 67cf1f7..bb3c666 100644 --- a/configure.ac +++ b/configure.ac @@ -146,6 +146,41 @@ AC_PROG_AWK AC_GNU_SOURCE +# Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE +dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH +dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has +dnl the precedence over the run path, so that if a compatible MPFR library +dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested +dnl MPFR library will be this library instead of the MPFR library from the +dnl build tree. Other OS with the same issue might be added later. +dnl +dnl References: +dnl https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732 +dnl http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html +dnl +dnl We need to check whether --disable-new-dtags is supported as alternate +dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc). 
+dnl +case $host in + *-*-linux*) + if test -n "$LD_LIBRARY_PATH"; then + saved_LDFLAGS="$LDFLAGS" + LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags" + LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE" + AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker) + AC_LINK_IFELSE([AC_LANG_SOURCE([[ +int main (void) { return 0; } + ]])], + [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))], + [AC_MSG_RESULT(no) + LDADD_FOR_TESTS_KLUDGE="" + ]) + LDFLAGS="$saved_LDFLAGS" + fi + ;; +esac +AC_SUBST([LDADD_FOR_TESTS_KLUDGE]) + VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \ mym4_minor mym4_micro) AC_SUBST(VERSION_NUMBER) diff --git a/tests/Makefile.am b/tests/Makefile.am index eee24fa..9e11797 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -62,6 +62,6 @@ EXTRA_DIST = README rsa-16k.key cavs_tests.sh cavs_driver.pl \ blake2b.h blake2s.h \ basic-disable-all-hwf.in basic_all_hwfeature_combinations.sh -LDADD = $(standard_ldadd) $(GPG_ERROR_LIBS) -t_lock_LDADD = $(standard_ldadd) $(GPG_ERROR_MT_LIBS) +LDADD = $(standard_ldadd) $(GPG_ERROR_LIBS) @LDADD_FOR_TESTS_KLUDGE@ +t_lock_LDADD = $(standard_ldadd) $(GPG_ERROR_MT_LIBS) @LDADD_FOR_TESTS_KLUDGE@ t_lock_CFLAGS = $(GPG_ERROR_MT_CFLAGS) ----------------------------------------------------------------------- Summary of changes: configure.ac | 35 +++++++++++++++++++++++++++++++++++ tests/Makefile.am | 4 ++-- 2 files changed, 37 insertions(+), 2 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Fri Jan 18 23:35:37 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 19 Jan 2019 00:35:37 +0200 Subject: [PATCH 1/4] tests/basic: check AEAD tags in check_one_cipher test Message-ID: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> * tests/basic.c (get_algo_mode_taglen): New. (check_one_cipher_core_reset): Check that tags are same with AEAD modes. 
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/basic.c b/tests/basic.c index 0afae3047..96af6c743 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -7256,6 +7256,23 @@ get_algo_mode_blklen (int algo, int mode) } +static unsigned int +get_algo_mode_taglen (int algo, int mode) +{ + switch (mode) + { + case GCRY_CIPHER_MODE_CCM: + case GCRY_CIPHER_MODE_GCM: + case GCRY_CIPHER_MODE_POLY1305: + return 16; + case GCRY_CIPHER_MODE_EAX: + return gcry_cipher_get_algo_blklen(algo); + } + + return 0; +} + + static int check_one_cipher_core_reset (gcry_cipher_hd_t hd, int algo, int mode, int pass, int nplain) @@ -7311,14 +7328,18 @@ check_one_cipher_core (int algo, int mode, int flags, gcry_cipher_hd_t hd; unsigned char in_buffer[1040+1], out_buffer[1040+1]; unsigned char enc_result[1040]; + unsigned char tag_result[16]; + unsigned char tag[16]; unsigned char *in, *out; int keylen; gcry_error_t err = 0; unsigned int blklen; unsigned int piecelen; unsigned int pos; + unsigned int taglen; blklen = get_algo_mode_blklen(algo, mode); + taglen = get_algo_mode_taglen(algo, mode); assert (nkey == 64); assert (nplain == 1040); @@ -7402,6 +7423,20 @@ check_one_cipher_core (int algo, int mode, int flags, return -1; } + if (taglen > 0) + { + err = gcry_cipher_gettag (hd, tag, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, gcry_cipher_gettag failed: %s\n", + pass, algo, mode, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + + memcpy(tag_result, tag, taglen); + } + memcpy (enc_result, out, nplain); if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0) @@ -7416,6 +7451,18 @@ check_one_cipher_core (int algo, int mode, int flags, return -1; } + if (taglen > 0) + { + err = gcry_cipher_checktag (hd, tag_result, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, gcry_cipher_checktag failed: %s\n", + pass, algo, mode, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + } + if (memcmp (plain, in, nplain)) fail ("pass %d, algo %d, mode %d, encrypt-decrypt mismatch\n", pass, algo, mode); @@ -7435,6 +7482,23 @@ check_one_cipher_core (int algo, int mode, int flags, return -1; } + if (taglen > 0) + { + err = gcry_cipher_gettag (hd, tag, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, in-place, " + "gcry_cipher_gettag failed: %s\n", + pass, algo, mode, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + + if (memcmp (tag_result, tag, taglen)) + fail ("pass %d, algo %d, mode %d, in-place, tag mismatch\n", + pass, algo, mode); + } + if (memcmp (enc_result, out, nplain)) fail ("pass %d, algo %d, mode %d, in-place, encrypt mismatch\n", pass, algo, mode); @@ -7452,6 +7516,19 @@ check_one_cipher_core (int algo, int mode, int flags, return -1; } + if (taglen > 0) + { + err = gcry_cipher_checktag (hd, tag_result, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, in-place, " + "gcry_cipher_checktag failed: %s\n", + pass, algo, mode, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + } + if (memcmp (plain, out, nplain)) fail ("pass %d, algo %d, mode %d, in-place, encrypt-decrypt mismatch\n", pass, algo, mode); @@ -7482,6 +7559,23 @@ check_one_cipher_core (int algo, int mode, int flags, piecelen = piecelen * 2 - ((piecelen != blklen) ? 
blklen : 0); } + if (taglen > 0) + { + err = gcry_cipher_gettag (hd, tag, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, " + "piecelen: %d), gcry_cipher_gettag failed: %s\n", + pass, algo, mode, pos, piecelen, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + + if (memcmp (tag_result, tag, taglen)) + fail ("pass %d, algo %d, mode %d, in-place, tag mismatch\n", + pass, algo, mode); + } + if (memcmp (enc_result, out, nplain)) fail ("pass %d, algo %d, mode %d, split-buffer, encrypt mismatch\n", pass, algo, mode); @@ -7510,6 +7604,19 @@ check_one_cipher_core (int algo, int mode, int flags, piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0); } + if (taglen > 0) + { + err = gcry_cipher_checktag (hd, tag_result, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, " + "piecelen: %d), gcry_cipher_checktag failed: %s\n", + pass, algo, mode, pos, piecelen, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + } + if (memcmp (plain, in, nplain)) fail ("pass %d, algo %d, mode %d, split-buffer, encrypt-decrypt mismatch\n", pass, algo, mode); From jussi.kivilinna at iki.fi Fri Jan 18 23:35:47 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 19 Jan 2019 00:35:47 +0200 Subject: [PATCH 3/4] Add SSSE3 optimized non-parallel ChaCha20 function In-Reply-To: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> References: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> Message-ID: <154785094751.23435.17690493289577003135.stgit@localhost.localdomain> * cipher/chacha20-amd64-ssse3.S (ROTATE_SHUF, ROTATE, WORD_SHUF) (QUARTERROUND4, _gcry_chacha20_amd64_ssse3_blocks1): New. * cipher/chacha20.c (_gcry_chacha20_amd64_ssse3_blocks1): New prototype. (chacha20_blocks): Rename to ... (do_chacha20_blocks): ... this. (chacha20_blocks): New. (chacha20_encrypt_stream): Adjust for new chacha20_blocks function. -- This patch provides SSSE3 optimized version of non-parallel ChaCha20 core block function. On Intel Haswell generic C function runs at 6.9 cycles/byte. New function runs at 5.2 cycles/byte, thus being ~32% faster. 
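For reference, QUARTERROUND4 above is the standard ChaCha20 quarter round (RFC 7539) applied to whole 128-bit rows: the 16- and 8-bit rotations are done with pshufb shuffles, the 12- and 7-bit ones with shifts, and the trailing word shuffles rotate the rows so the same macro serves both column and diagonal rounds. A scalar C sketch of one quarter round, for illustration only (not part of the patch; the names are illustrative):

#include <stdint.h>

#define ROTL32(v, c) (((v) << (c)) | ((v) >> (32 - (c))))

/* One ChaCha20 quarter round (RFC 7539, section 2.1) on four state words. */
static void
chacha20_quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
  *a += *b;  *d ^= *a;  *d = ROTL32 (*d, 16);
  *c += *d;  *b ^= *c;  *b = ROTL32 (*b, 12);
  *a += *b;  *d ^= *a;  *d = ROTL32 (*d, 8);
  *c += *d;  *b ^= *c;  *b = ROTL32 (*b, 7);
}
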
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index f23722814..0e59ff981 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -163,6 +163,8 @@ chacha20_data: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_rol8: .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.Lcounter1: + .long 1,0,0,0 .Linc_counter: .long 0,1,2,3 .Lunsigned_cmp: @@ -221,7 +223,7 @@ _gcry_chacha20_amd64_ssse3_blocks4: movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); -.Lround2: +.Lround2_4: QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; @@ -235,7 +237,7 @@ _gcry_chacha20_amd64_ssse3_blocks4: movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) sub $2, ROUND; - jnz .Lround2; + jnz .Lround2_4; /* tmp := X15 */ movdqa (STACK_TMP)(%rsp), X11; @@ -337,5 +339,111 @@ _gcry_chacha20_amd64_ssse3_blocks4: ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) +/********************************************************************** + 1-way chacha20 + **********************************************************************/ + +#define ROTATE_SHUF(v1,shuf) \ + pshufb shuf, v1; + +#define ROTATE(v1,c,tmp1) \ + movdqa v1, tmp1; \ + psrld $(32 - (c)), v1; \ + pslld $(c), tmp1; \ + paddb tmp1, v1; + +#define WORD_SHUF(v1,shuf) \ + pshufd $shuf, v1, v1; + +#define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\ + shuf_x2,shuf_x3) \ + PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \ + PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \ + PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \ + PLUS(x2, x3); \ + WORD_SHUF(x3, shuf_x3); \ + XOR(x1, x2); \ + WORD_SHUF(x2, shuf_x2); \ + ROTATE(x1, 7, tmp1); \ + WORD_SHUF(x1, shuf_x1); + +.align 8 +.globl _gcry_chacha20_amd64_ssse3_blocks1 +ELF(.type _gcry_chacha20_amd64_ssse3_blocks1, at function;) + +_gcry_chacha20_amd64_ssse3_blocks1: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks + */ + + /* Load constants */ + movdqa .Lcounter1 RIP, X4; + movdqa .Lshuf_rol8 RIP, X5; + movdqa .Lshuf_rol16 RIP, X6; + + /* Load state */ + movdqu (0 * 4)(INPUT), X10; + movdqu (4 * 4)(INPUT), X11; + movdqu (8 * 4)(INPUT), X12; + movdqu (12 * 4)(INPUT), X13; + +.Loop1: + mov $20, ROUND; + + movdqa X10, X0; + movdqa X11, X1; + movdqa X12, X2; + movdqa X13, X3; + +.Lround2_1: + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + sub $2, ROUND; + jnz .Lround2_1; + + PLUS(X0, X10); + PLUS(X1, X11); + PLUS(X2, X12); + PLUS(X3, X13); + + /* Update counter */ + paddq X4, X13; + + xor_src_dst(DST, SRC, 0 * 4, X0, X7); + xor_src_dst(DST, SRC, 4 * 4, X1, X7); + xor_src_dst(DST, SRC, 8 * 4, X2, X7); + xor_src_dst(DST, SRC, 12 * 4, X3, X7); + + lea (64)(DST), DST; + lea (64)(SRC), SRC; + + sub $1, NBLKS; + jnz .Loop1; + + /* Store counter */ + movdqu X13, (12 * 4)(INPUT); + + /* clear the used vector registers */ + clear(X0); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + + /* eax zeroed by round loop. 
*/ + ret; +ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, + .-_gcry_chacha20_amd64_ssse3_blocks1;) + #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 84a9b2b80..f1afd18e0 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -112,6 +112,10 @@ unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; + #endif /* USE_SSSE3 */ #ifdef USE_AVX2 @@ -156,7 +160,7 @@ static const char *selftest (void); buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x)) static unsigned int -chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) +do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; @@ -239,6 +243,21 @@ chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) } +static unsigned int +chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, + size_t nblks) +{ +#ifdef USE_SSSE3 + if (ctx->use_ssse3) + { + return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks); + } +#endif + + return do_chacha20_blocks (ctx->input, dst, src, nblks); +} + + static void chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) @@ -475,7 +494,7 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; - nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks); + nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; @@ -484,7 +503,7 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, if (length > 0) { - nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1); + nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); From jussi.kivilinna at iki.fi Fri Jan 18 23:35:42 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 19 Jan 2019 00:35:42 +0200 Subject: [PATCH 2/4] tests/basic: increase buffer size for check_one_cipher In-Reply-To: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> References: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> Message-ID: <154785094234.23435.17421077587711411597.stgit@localhost.localdomain> * tests/basic.c (check_one_cipher_core) (check_one_cipher): Increase buffer from 1040 to 1904 bytes. -- This is for better test coverage of highly parallel cipher implementations. 
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/basic.c b/tests/basic.c index 96af6c743..3d86e022e 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -7326,8 +7326,8 @@ check_one_cipher_core (int algo, int mode, int flags, int bufshift, int pass) { gcry_cipher_hd_t hd; - unsigned char in_buffer[1040+1], out_buffer[1040+1]; - unsigned char enc_result[1040]; + unsigned char in_buffer[1904+1], out_buffer[1904+1]; + unsigned char enc_result[1904]; unsigned char tag_result[16]; unsigned char tag[16]; unsigned char *in, *out; @@ -7342,7 +7342,7 @@ check_one_cipher_core (int algo, int mode, int flags, taglen = get_algo_mode_taglen(algo, mode); assert (nkey == 64); - assert (nplain == 1040); + assert (nplain == 1904); assert (sizeof(in_buffer) == nplain + 1); assert (sizeof(out_buffer) == sizeof(in_buffer)); assert (blklen > 0); @@ -7692,7 +7692,7 @@ static void check_one_cipher (int algo, int mode, int flags) { char key[64+1]; - unsigned char plain[1040+1]; + unsigned char plain[1904+1]; int bufshift, i; for (bufshift=0; bufshift < 4; bufshift++) @@ -7701,7 +7701,7 @@ check_one_cipher (int algo, int mode, int flags) memcpy (key, "0123456789abcdef.,;/[]{}-=ABCDEF_" "0123456789abcdef.,;/[]{}-=ABCDEF", 64); memcpy (plain, "foobar42FOOBAR17", 16); - for (i = 16; i < 1040; i += 16) + for (i = 16; i < 1904; i += 16) { memcpy (&plain[i], &plain[i-16], 16); if (!++plain[i+7]) @@ -7710,25 +7710,25 @@ check_one_cipher (int algo, int mode, int flags) plain[i+14]++; } - if (check_one_cipher_core (algo, mode, flags, key, 64, plain, 1040, + if (check_one_cipher_core (algo, mode, flags, key, 64, plain, 1904, bufshift, 0+10*bufshift)) return; /* Pass 1: Key not aligned. */ memmove (key+1, key, 64); - if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain, 1040, + if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain, 1904, bufshift, 1+10*bufshift)) return; /* Pass 2: Key not aligned and data not aligned. */ - memmove (plain+1, plain, 1040); - if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain+1, 1040, + memmove (plain+1, plain, 1904); + if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain+1, 1904, bufshift, 2+10*bufshift)) return; /* Pass 3: Key aligned and data not aligned. */ memmove (key, key+1, 64); - if (check_one_cipher_core (algo, mode, flags, key, 64, plain+1, 1040, + if (check_one_cipher_core (algo, mode, flags, key, 64, plain+1, 1904, bufshift, 3+10*bufshift)) return; } From jussi.kivilinna at iki.fi Fri Jan 18 23:35:52 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 19 Jan 2019 00:35:52 +0200 Subject: [PATCH 4/4] Add stitched ChaCha20-Poly1305 SSSE3 and AVX2 implementations In-Reply-To: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> References: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> Message-ID: <154785095267.23435.15952785945759336425.stgit@localhost.localdomain> * cipher/asm-poly1305-amd64.h: New. * cipher/Makefile.am: Add 'asm-poly1305-amd64.h'. * cipher/chacha20-amd64-avx2.S (QUATERROUND2): Add interleave operators. (_gcry_chacha20_poly1305_amd64_avx2_blocks8): New. * cipher/chacha20-amd64-ssse3.S (QUATERROUND2): Add interleave operators. (_gcry_chacha20_poly1305_amd64_ssse3_blocks4) (_gcry_chacha20_poly1305_amd64_ssse3_blocks1): New. * cipher/chacha20.c (_gcry_chacha20_poly1305_amd64_ssse3_blocks4) (_gcry_chacha20_poly1305_amd64_ssse3_blocks1) (_gcry_chacha20_poly1305_amd64_avx2_blocks8): New prototypes. (chacha20_encrypt_stream): Split tail to... 
(do_chacha20_encrypt_stream_tail): ... new function. (_gcry_chacha20_poly1305_encrypt) (_gcry_chacha20_poly1305_decrypt): New. * cipher/cipher-internal.h (_gcry_chacha20_poly1305_encrypt) (_gcry_chacha20_poly1305_decrypt): New prototypes. * cipher/cipher-poly1305.c (_gcry_cipher_poly1305_encrypt): Call '_gcry_chacha20_poly1305_encrypt' if cipher is ChaCha20. (_gcry_cipher_poly1305_decrypt): Call '_gcry_chacha20_poly1305_decrypt' if cipher is ChaCha20. * cipher/poly1305-internal.h (_gcry_cipher_poly1305_update_burn): New prototype. * cipher/poly1305.c (poly1305_blocks): Make static. (_gcry_poly1305_update): Split main function body to ... (_gcry_poly1305_update_burn): ... new function. -- Benchmark on Intel Skylake (i5-6500, 3200 Mhz): Before, 8-way AVX2: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.378 ns/B 2526 MiB/s 1.21 c/B STREAM dec | 0.373 ns/B 2560 MiB/s 1.19 c/B POLY1305 enc | 0.685 ns/B 1392 MiB/s 2.19 c/B POLY1305 dec | 0.686 ns/B 1390 MiB/s 2.20 c/B POLY1305 auth | 0.315 ns/B 3031 MiB/s 1.01 c/B After, 8-way AVX2 (~36% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 0.503 ns/B 1896 MiB/s 1.61 c/B POLY1305 dec | 0.485 ns/B 1965 MiB/s 1.55 c/B Benchmark on Intel Haswell (i7-4790K, 3998 Mhz): Before, 8-way AVX2: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.318 ns/B 2999 MiB/s 1.27 c/B STREAM dec | 0.317 ns/B 3004 MiB/s 1.27 c/B POLY1305 enc | 0.586 ns/B 1627 MiB/s 2.34 c/B POLY1305 dec | 0.586 ns/B 1627 MiB/s 2.34 c/B POLY1305 auth | 0.271 ns/B 3524 MiB/s 1.08 c/B After, 8-way AVX2 (~30% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 0.452 ns/B 2108 MiB/s 1.81 c/B POLY1305 dec | 0.440 ns/B 2167 MiB/s 1.76 c/B Before, 4-way SSSE3: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.627 ns/B 1521 MiB/s 2.51 c/B STREAM dec | 0.626 ns/B 1523 MiB/s 2.50 c/B POLY1305 enc | 0.895 ns/B 1065 MiB/s 3.58 c/B POLY1305 dec | 0.896 ns/B 1064 MiB/s 3.58 c/B POLY1305 auth | 0.271 ns/B 3521 MiB/s 1.08 c/B After, 4-way SSSE3 (~20% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 0.733 ns/B 1301 MiB/s 2.93 c/B POLY1305 dec | 0.726 ns/B 1314 MiB/s 2.90 c/B Before, 1-way SSSE3: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 1.56 ns/B 609.6 MiB/s 6.25 c/B POLY1305 dec | 1.56 ns/B 609.4 MiB/s 6.26 c/B After, 1-way SSSE3 (~18% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 1.31 ns/B 725.4 MiB/s 5.26 c/B POLY1305 dec | 1.31 ns/B 727.3 MiB/s 5.24 c/B For comparison to other libraries (on Intel i7-4790K, 3998 Mhz): bench-slope-openssl: OpenSSL 1.1.1 11 Sep 2018 Cipher: chacha20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.301 ns/B 3166.4 MiB/s 1.20 c/B STREAM dec | 0.300 ns/B 3174.7 MiB/s 1.20 c/B POLY1305 enc | 0.463 ns/B 2060.6 MiB/s 1.85 c/B POLY1305 dec | 0.462 ns/B 2063.8 MiB/s 1.85 c/B POLY1305 auth | 0.162 ns/B 5899.3 MiB/s 0.646 c/B bench-slope-nettle: Nettle 3.4 Cipher: chacha | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 1.65 ns/B 578.2 MiB/s 6.59 c/B STREAM dec | 1.65 ns/B 578.2 MiB/s 6.59 c/B POLY1305 enc | 2.05 ns/B 464.8 MiB/s 8.20 c/B POLY1305 dec | 2.05 ns/B 464.7 MiB/s 8.20 c/B POLY1305 auth | 0.404 ns/B 2359.1 MiB/s 1.62 c/B bench-slope-botan: Botan 2.6.0 Cipher: ChaCha | nanosecs/byte mebibytes/sec cycles/byte STREAM enc/dec | 0.855 ns/B 1116.0 MiB/s 3.42 c/B POLY1305 enc | 1.60 ns/B 595.4 MiB/s 6.40 c/B POLY1305 dec | 1.60 ns/B 595.8 MiB/s 6.40 c/B POLY1305 auth | 0.752 ns/B 1268.3 MiB/s 
3.01 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 98320ca5f..16066bfc6 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -72,6 +72,7 @@ libcipher_la_SOURCES = \ EXTRA_libcipher_la_SOURCES = \ asm-common-amd64.h \ asm-common-aarch64.h \ + asm-poly1305-amd64.h \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ diff --git a/cipher/asm-poly1305-amd64.h b/cipher/asm-poly1305-amd64.h new file mode 100644 index 000000000..3f99ea3e1 --- /dev/null +++ b/cipher/asm-poly1305-amd64.h @@ -0,0 +1,171 @@ +/* asm-common-amd64.h - Poly1305 macros for AMD64 assembly + * + * Copyright (C) 2019 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef GCRY_ASM_POLY1305_AMD64_H +#define GCRY_ASM_POLY1305_AMD64_H + +#include "asm-common-amd64.h" + +/********************************************************************** + poly1305 for stitched chacha20-poly1305 AMD64 implementations + **********************************************************************/ + +#define POLY_RSTATE %r8 +#define POLY_RSRC %r9 + +#define POLY_R_H0 %rbx +#define POLY_R_H1 %rcx +#define POLY_R_H2 %r10 +#define POLY_R_H2d %r10d +#define POLY_R_R0 %r11 +#define POLY_R_R1_MUL5 %r12 +#define POLY_R_X0_HI %r13 +#define POLY_R_X0_LO %r14 +#define POLY_R_X1_HI %r15 +#define POLY_R_X1_LO %rsi + +#define POLY_S_R0 (4 * 4 + 0 * 8)(POLY_RSTATE) +#define POLY_S_R1 (4 * 4 + 1 * 8)(POLY_RSTATE) +#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE) +#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE) +#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE) + +#define POLY1305_LOAD_STATE() \ + movq POLY_S_H0, POLY_R_H0; \ + movq POLY_S_H1, POLY_R_H1; \ + movl POLY_S_H2d, POLY_R_H2d; \ + movq POLY_S_R0, POLY_R_R0; \ + movq POLY_S_R1, POLY_R_R1_MUL5; \ + shrq $2, POLY_R_R1_MUL5; \ + addq POLY_S_R1, POLY_R_R1_MUL5; + +#define POLY1305_STORE_STATE() \ + movq POLY_R_H0, POLY_S_H0; \ + movq POLY_R_H1, POLY_S_H1; \ + movl POLY_R_H2d, POLY_S_H2d; + +/* a = h + m */ +#define POLY1305_BLOCK_PART1(src_offset) \ + addq ((src_offset) + 0 * 8)(POLY_RSRC), POLY_R_H0; \ + adcq ((src_offset) + 1 * 8)(POLY_RSRC), POLY_R_H1; \ + adcl $1, POLY_R_H2d; \ + \ + /* h = a * r (partial mod 2^130-5): */ \ + \ + /* h0 * r1 */ \ + movq POLY_R_H0, %rax; \ + mulq POLY_S_R1; \ + movq %rax, POLY_R_X1_LO; \ + movq %rdx, POLY_R_X1_HI; + +#define POLY1305_BLOCK_PART2() \ + \ + /* h0 * r0 */ \ + movq POLY_R_H0, %rax; \ + mulq POLY_R_R0; \ + movq %rax, POLY_R_X0_LO; \ + movq %rdx, POLY_R_X0_HI; + +#define POLY1305_BLOCK_PART3() \ + \ + /* h1 * r0 */ \ + movq POLY_R_H1, %rax; \ + mulq POLY_R_R0; \ + addq %rax, POLY_R_X1_LO; \ + adcq %rdx, POLY_R_X1_HI; \ + \ + /* h1 * r1 mod 2^130-5 */ \ + movq POLY_R_R1_MUL5, %rax; \ + mulq POLY_R_H1; + +#define POLY1305_BLOCK_PART4() \ + movq POLY_R_H2, POLY_R_H1; \ 
+ imulq POLY_R_R1_MUL5, POLY_R_H1; /* h2 * r1 mod 2^130-5 */ \ + addq %rax, POLY_R_X0_LO; \ + adcq %rdx, POLY_R_X0_HI; \ + imulq POLY_R_R0, POLY_R_H2; /* h2 * r0 */ \ + addq POLY_R_X1_LO, POLY_R_H1; \ + adcq POLY_R_X1_HI, POLY_R_H2; + +#define POLY1305_BLOCK_PART5() \ + \ + /* carry propagation */ \ + movq POLY_R_H2, POLY_R_H0; \ + andl $3, POLY_R_H2d; \ + shrq $2, POLY_R_H0; \ + leaq (POLY_R_H0, POLY_R_H0, 4), POLY_R_H0; \ + addq POLY_R_X0_LO, POLY_R_H0; \ + adcq POLY_R_X0_HI, POLY_R_H1; \ + adcl $0, POLY_R_H2d; + +#ifdef TESTING_POLY1305_ASM +/* for testing only, mixed C/asm poly1305.c is marginally faster (~2%). */ +.align 8 +.globl _gcry_poly1305_amd64_ssse3_blocks1 +ELF(.type _gcry_poly1305_amd64_ssse3_blocks1, at function;) + +_gcry_poly1305_amd64_ssse3_blocks1: + /* input: + * %rdi: poly1305-state + * %rsi: src + * %rdx: nblks + */ + pushq %rbp; + movq %rsp, %rbp; + + subq $(10 * 8), %rsp; + movq %rbx, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + movq %r14, (4 * 8)(%rsp); + movq %r15, (5 * 8)(%rsp); + + movq %rdx, (8 * 8)(%rsp); # NBLKS + + movq %rdi, POLY_RSTATE; + movq %rsi, POLY_RSRC; + + POLY1305_LOAD_STATE(); + +.L_poly1: + POLY1305_BLOCK_PART1(0 * 16); + POLY1305_BLOCK_PART2(); + POLY1305_BLOCK_PART3(); + POLY1305_BLOCK_PART4(); + POLY1305_BLOCK_PART5(); + + subq $1, (8 * 8)(%rsp); # NBLKS + leaq (16)(POLY_RSRC), POLY_RSRC; + jnz .L_poly1; + + POLY1305_STORE_STATE(); + + movq (1 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + movq (4 * 8)(%rsp), %r14; + movq (5 * 8)(%rsp), %r15; + + xorl %eax, %eax; + leave + ret; +#endif + +#endif /* GCRY_ASM_POLY1305_AMD64_H */ diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index dad9e3e96..ef02c1733 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -1,7 +1,6 @@ /* chacha20-amd64-avx2.S - AVX2 implementation of ChaCha20 cipher * - - * Copyright (C) 2017,2018 Jussi Kivilinna + * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -36,17 +35,8 @@ .text -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" +#include "asm-poly1305-amd64.h" /* register macros */ #define INPUT %rdi @@ -139,15 +129,21 @@ #define PLUS(ds,s) \ vpaddd s, ds, ds; -#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1) \ - vbroadcasti128 .Lshuf_rol16 RIP, tmp1; \ +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\ + interleave_op1,interleave_op2,\ + interleave_op3,interleave_op4) \ + vbroadcasti128 .Lshuf_rol16 rRIP, tmp1; \ + interleave_op1; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ + interleave_op2; \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12, tmp1); \ - vbroadcasti128 .Lshuf_rol8 RIP, tmp1; \ + vbroadcasti128 .Lshuf_rol8 rRIP, tmp1; \ + interleave_op3; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ + interleave_op4; \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1); @@ -189,12 +185,12 @@ _gcry_chacha20_amd64_avx2_blocks8: subq $STACK_MAX, %rsp; andq $~31, %rsp; -.Loop4: +.Loop8: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ - vpmovzxbd .Linc_counter RIP, X0; - vpbroadcastd .Lunsigned_cmp RIP, X2; + vpmovzxbd .Linc_counter rRIP, X0; + vpbroadcastd .Lunsigned_cmp rRIP, X2; vpbroadcastd (12 * 4)(INPUT), X12; vpbroadcastd (13 * 4)(INPUT), X13; vpaddd X0, X12, X12; @@ -223,14 +219,14 @@ _gcry_chacha20_amd64_avx2_blocks8: vmovdqa X15, (STACK_TMP)(%rsp); .Lround2: - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15) + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,,,,) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); - QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8) - QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8) + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,,,,) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,,,,) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15) + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,,,,) sub $2, ROUND; jnz .Lround2; @@ -302,7 +298,7 @@ _gcry_chacha20_amd64_avx2_blocks8: sub $8, NBLKS; lea (8 * 64)(DST), DST; lea (8 * 64)(SRC), SRC; - jnz .Loop4; + jnz .Loop8; /* clear the used vector registers and stack */ vpxor X0, X0, X0; @@ -319,5 +315,438 @@ _gcry_chacha20_amd64_avx2_blocks8: ELF(.size _gcry_chacha20_amd64_avx2_blocks8, .-_gcry_chacha20_amd64_avx2_blocks8;) +/********************************************************************** + 8-way stitched chacha20-poly1305 + **********************************************************************/ + +.align 8 +.globl _gcry_chacha20_poly1305_amd64_avx2_blocks8 +ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8, at function;) + +_gcry_chacha20_poly1305_amd64_avx2_blocks8: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 8) + * %r9: poly1305-state + * %r8: poly1305-src + */ + + pushq %rbp; + movq %rsp, %rbp; + + vzeroupper; + + subq $(8 * 8) + STACK_MAX + 32, %rsp; + andq $~31, %rsp; + + movq %rbx, (STACK_MAX + 0 * 8)(%rsp); + movq %r12, (STACK_MAX + 1 * 8)(%rsp); + movq %r13, (STACK_MAX + 2 * 8)(%rsp); + movq %r14, (STACK_MAX + 3 * 8)(%rsp); + movq %r15, (STACK_MAX + 4 * 8)(%rsp); + + movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC + movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST + movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS + + /* Load state */ + POLY1305_LOAD_STATE(); + 
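+	/* Each pass of .Loop_poly8 below generates 8 ChaCha20 blocks (512
+	 * bytes) and interleaves the Poly1305 update for the matching 32
+	 * 16-byte Poly1305 blocks: the five POLY1305_BLOCK_PART steps of
+	 * each block are spread over the interleave slots of the
+	 * QUARTERROUND2 calls (10 double-rounds x 4 calls x 4 slots = 160
+	 * slots = 32 blocks x 5 parts). */
+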
+.Loop_poly8: + + /* Construct counter vectors X12 and X13 */ + vpmovzxbd .Linc_counter rRIP, X0; + vpbroadcastd .Lunsigned_cmp rRIP, X2; + vpbroadcastd (12 * 4)(INPUT), X12; + vpbroadcastd (13 * 4)(INPUT), X13; + vpaddd X0, X12, X12; + vpxor X2, X0, X0; + vpxor X2, X12, X1; + vpcmpgtd X1, X0, X0; + vpsubd X0, X13, X13; + vmovdqa X12, (STACK_VEC_X12)(%rsp); + vmovdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + vpbroadcastd (0 * 4)(INPUT), X0; + vpbroadcastd (1 * 4)(INPUT), X1; + vpbroadcastd (2 * 4)(INPUT), X2; + vpbroadcastd (3 * 4)(INPUT), X3; + vpbroadcastd (4 * 4)(INPUT), X4; + vpbroadcastd (5 * 4)(INPUT), X5; + vpbroadcastd (6 * 4)(INPUT), X6; + vpbroadcastd (7 * 4)(INPUT), X7; + vpbroadcastd (8 * 4)(INPUT), X8; + vpbroadcastd (9 * 4)(INPUT), X9; + vpbroadcastd (10 * 4)(INPUT), X10; + vpbroadcastd (11 * 4)(INPUT), X11; + vpbroadcastd (14 * 4)(INPUT), X14; + vpbroadcastd (15 * 4)(INPUT), X15; + vmovdqa X15, (STACK_TMP)(%rsp); + + # rounds 0,1 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(1 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(2 * 16), + POLY1305_BLOCK_PART2()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(3 * 16)) + + # rounds 2,3 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART1(4 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(5 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(6 * 16), + POLY1305_BLOCK_PART2()) + + # rounds 4,5 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(7 * 16)) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART1(8 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(9 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + + # rounds 6,7 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + 
POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(10 * 16), + POLY1305_BLOCK_PART2()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(11 * 16)) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART1(12 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + + # rounds 8,9 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(13 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(14 * 16), + POLY1305_BLOCK_PART2()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(15 * 16)) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + + # rounds 10,11 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART1(16 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(17 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(18 * 16), + POLY1305_BLOCK_PART2()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(19 * 16)) + + # rounds 12,13 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART1(20 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(21 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(22 * 16), + POLY1305_BLOCK_PART2()) + + # rounds 14,15 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(23 * 16)) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, 
X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART1(24 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(25 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + + # rounds 16,17 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(26 * 16), + POLY1305_BLOCK_PART2()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(27 * 16)) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART1(28 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + + # rounds 18,19 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(29 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(30 * 16), + POLY1305_BLOCK_PART2()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(31 * 16)) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + + /* tmp := X15 */ + vpbroadcastd (0 * 4)(INPUT), X15; + PLUS(X0, X15); + vpbroadcastd (1 * 4)(INPUT), X15; + PLUS(X1, X15); + vpbroadcastd (2 * 4)(INPUT), X15; + PLUS(X2, X15); + vpbroadcastd (3 * 4)(INPUT), X15; + PLUS(X3, X15); + vpbroadcastd (4 * 4)(INPUT), X15; + PLUS(X4, X15); + vpbroadcastd (5 * 4)(INPUT), X15; + PLUS(X5, X15); + vpbroadcastd (6 * 4)(INPUT), X15; + PLUS(X6, X15); + vpbroadcastd (7 * 4)(INPUT), X15; + PLUS(X7, X15); + vpbroadcastd (8 * 4)(INPUT), X15; + PLUS(X8, X15); + vpbroadcastd (9 * 4)(INPUT), X15; + PLUS(X9, X15); + vpbroadcastd (10 * 4)(INPUT), X15; + PLUS(X10, X15); + vpbroadcastd (11 * 4)(INPUT), X15; + PLUS(X11, X15); + vmovdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + vmovdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X13, (STACK_TMP)(%rsp); + vpbroadcastd (14 * 4)(INPUT), X13; + PLUS(X14, X13); + vmovdqa X14, (STACK_TMP1)(%rsp); + vpbroadcastd (15 * 4)(INPUT), X13; + PLUS(X15, X13); + vmovdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $8, (12 * 4)(INPUT); + + movq (STACK_MAX + 5 * 8)(%rsp), SRC; + movq (STACK_MAX + 6 * 8)(%rsp), DST; + + transpose_4x4(X0, X1, X2, X3, X13, X14); + transpose_4x4(X4, X5, X6, X7, X13, X14); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); + 
BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); + vmovdqa (STACK_TMP)(%rsp), X13; + vmovdqa (STACK_TMP1)(%rsp), X14; + vmovdqa (STACK_TMP2)(%rsp), X15; + transpose_4x4(X8, X9, X10, X11, X0, X1); + transpose_4x4(X12, X13, X14, X15, X0, X1); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0); + + subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS + + lea (32 * 16)(POLY_RSRC), POLY_RSRC; + lea (8 * 64)(DST), DST; + lea (8 * 64)(SRC), SRC; + movq SRC, (STACK_MAX + 5 * 8)(%rsp); + movq DST, (STACK_MAX + 6 * 8)(%rsp); + + jnz .Loop_poly8; + + /* Store state */ + POLY1305_STORE_STATE(); + + /* clear the used vector registers and stack */ + vpxor X0, X0, X0; + vmovdqa X0, (STACK_VEC_X12)(%rsp); + vmovdqa X0, (STACK_VEC_X13)(%rsp); + vmovdqa X0, (STACK_TMP)(%rsp); + vmovdqa X0, (STACK_TMP1)(%rsp); + vmovdqa X0, (STACK_TMP2)(%rsp); + vzeroall; + + movq (STACK_MAX + 0 * 8)(%rsp), %rbx; + movq (STACK_MAX + 1 * 8)(%rsp), %r12; + movq (STACK_MAX + 2 * 8)(%rsp), %r13; + movq (STACK_MAX + 3 * 8)(%rsp), %r14; + movq (STACK_MAX + 4 * 8)(%rsp), %r15; + + xorl %eax, %eax; + leave; + ret; +ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8, + .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;) + #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 0e59ff981..d7faf6442 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -1,6 +1,6 @@ /* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher * - * Copyright (C) 2017,2018 Jussi Kivilinna + * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -35,17 +35,8 @@ .text -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" +#include "asm-poly1305-amd64.h" /* register macros */ #define INPUT %rdi @@ -145,13 +136,16 @@ #define PLUS(ds,s) \ paddd s, ds; -#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ - movdqa .Lshuf_rol16 RIP, tmp1; \ +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\ + interleave_op1,interleave_op2) \ + movdqa .Lshuf_rol16 rRIP, tmp1; \ + interleave_op1; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12, tmp1, tmp2); \ - movdqa .Lshuf_rol8 RIP, tmp1; \ + movdqa .Lshuf_rol8 rRIP, tmp1; \ + interleave_op2; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ @@ -192,8 +186,8 @@ _gcry_chacha20_amd64_ssse3_blocks4: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ - movdqa .Linc_counter RIP, X0; - movdqa .Lunsigned_cmp RIP, X2; + movdqa .Linc_counter rRIP, X0; + movdqa .Lunsigned_cmp rRIP, X2; pbroadcastd((12 * 4)(INPUT), X12); pbroadcastd((13 * 4)(INPUT), X13); paddd X0, X12; @@ -224,18 +218,18 @@ _gcry_chacha20_amd64_ssse3_blocks4: movdqa X15, (STACK_TMP1)(%rsp); .Lround2_4: - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,,) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); - QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) - QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,,) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,,) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,,) sub $2, ROUND; jnz .Lround2_4; @@ -380,9 +374,9 @@ _gcry_chacha20_amd64_ssse3_blocks1: */ /* Load constants */ - movdqa .Lcounter1 RIP, X4; - movdqa .Lshuf_rol8 RIP, X5; - movdqa .Lshuf_rol16 RIP, X6; + movdqa .Lcounter1 rRIP, X4; + movdqa .Lshuf_rol8 rRIP, X5; + movdqa .Lshuf_rol16 rRIP, X6; /* Load state */ movdqu (0 * 4)(INPUT), X10; @@ -445,5 +439,570 @@ _gcry_chacha20_amd64_ssse3_blocks1: ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, .-_gcry_chacha20_amd64_ssse3_blocks1;) +/********************************************************************** + 4-way stitched chacha20-poly1305 + **********************************************************************/ + +.align 8 +.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4 +ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4, at function;) + +_gcry_chacha20_poly1305_amd64_ssse3_blocks4: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 4) + * %r9: poly1305-state + * %r8: poly1305-src + */ + + pushq %rbp; + movq %rsp, %rbp; + + subq $(8 * 8) + STACK_MAX + 16, %rsp; + andq $~15, %rsp; + + movq %rbx, (STACK_MAX + 0 * 8)(%rsp); + movq %r12, (STACK_MAX + 1 * 8)(%rsp); + movq %r13, (STACK_MAX + 2 * 8)(%rsp); + movq %r14, (STACK_MAX + 3 * 8)(%rsp); + movq %r15, (STACK_MAX + 4 * 8)(%rsp); + + movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC + movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST + movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS + + /* Load state */ + 
POLY1305_LOAD_STATE(); + +.Loop_poly4: + + /* Construct counter vectors X12 and X13 */ + movdqa .Linc_counter rRIP, X0; + movdqa .Lunsigned_cmp rRIP, X2; + pbroadcastd((12 * 4)(INPUT), X12); + pbroadcastd((13 * 4)(INPUT), X13); + paddd X0, X12; + movdqa X12, X1; + pxor X2, X0; + pxor X2, X1; + pcmpgtd X1, X0; + psubd X0, X13; + movdqa X12, (STACK_VEC_X12)(%rsp); + movdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + pbroadcastd((0 * 4)(INPUT), X0); + pbroadcastd((1 * 4)(INPUT), X1); + pbroadcastd((2 * 4)(INPUT), X2); + pbroadcastd((3 * 4)(INPUT), X3); + pbroadcastd((4 * 4)(INPUT), X4); + pbroadcastd((5 * 4)(INPUT), X5); + pbroadcastd((6 * 4)(INPUT), X6); + pbroadcastd((7 * 4)(INPUT), X7); + pbroadcastd((8 * 4)(INPUT), X8); + pbroadcastd((9 * 4)(INPUT), X9); + pbroadcastd((10 * 4)(INPUT), X10); + pbroadcastd((11 * 4)(INPUT), X11); + pbroadcastd((14 * 4)(INPUT), X14); + pbroadcastd((15 * 4)(INPUT), X15); + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + + /* rounds 0,1 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(1 * 16)) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + + /* rounds 2,3 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART1(2 * 16), + POLY1305_BLOCK_PART2()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(3 * 16)) + + /* rounds 4,5 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART1(4 * 16), + POLY1305_BLOCK_PART2()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + + /* rounds 6,7 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(5 * 16)) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); 
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART1(6 * 16), + POLY1305_BLOCK_PART2()) + + /* rounds 8,9 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(7 * 16)) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + + /* rounds 10,11 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART1(8 * 16), + POLY1305_BLOCK_PART2()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(9 * 16)) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + + /* rounds 12,13 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART1(10 * 16), + POLY1305_BLOCK_PART2()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(11 * 16)) + + /* rounds 14,15 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART1(12 * 16), + POLY1305_BLOCK_PART2()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + + /* 
rounds 16,17 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(13 * 16)) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART1(14 * 16), + POLY1305_BLOCK_PART2()) + + /* rounds 18,19 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(15 * 16)) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + + /* tmp := X15 */ + movdqa (STACK_TMP)(%rsp), X11; + pbroadcastd((0 * 4)(INPUT), X15); + PLUS(X0, X15); + pbroadcastd((1 * 4)(INPUT), X15); + PLUS(X1, X15); + pbroadcastd((2 * 4)(INPUT), X15); + PLUS(X2, X15); + pbroadcastd((3 * 4)(INPUT), X15); + PLUS(X3, X15); + pbroadcastd((4 * 4)(INPUT), X15); + PLUS(X4, X15); + pbroadcastd((5 * 4)(INPUT), X15); + PLUS(X5, X15); + pbroadcastd((6 * 4)(INPUT), X15); + PLUS(X6, X15); + pbroadcastd((7 * 4)(INPUT), X15); + PLUS(X7, X15); + pbroadcastd((8 * 4)(INPUT), X15); + PLUS(X8, X15); + pbroadcastd((9 * 4)(INPUT), X15); + PLUS(X9, X15); + pbroadcastd((10 * 4)(INPUT), X15); + PLUS(X10, X15); + pbroadcastd((11 * 4)(INPUT), X15); + PLUS(X11, X15); + movdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + movdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + movdqa X13, (STACK_TMP)(%rsp); + pbroadcastd((14 * 4)(INPUT), X15); + PLUS(X14, X15); + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X14, (STACK_TMP1)(%rsp); + pbroadcastd((15 * 4)(INPUT), X13); + PLUS(X15, X13); + movdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $4, (12 * 4)(INPUT); + + movq (STACK_MAX + 5 * 8)(%rsp), SRC; + movq (STACK_MAX + 6 * 8)(%rsp), DST; + + transpose_4x4(X0, X1, X2, X3, X13, X14, X15); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); + transpose_4x4(X4, X5, X6, X7, X0, X1, X2); + movdqa (STACK_TMP)(%rsp), X13; + movdqa (STACK_TMP1)(%rsp), X14; + movdqa (STACK_TMP2)(%rsp), X15; + xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); + transpose_4x4(X8, X9, X10, X11, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); + xor_src_dst(DST, SRC, (64 
* 3 + 16 * 2), X11, X0); + transpose_4x4(X12, X13, X14, X15, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); + + subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS + + lea (16 * 16)(POLY_RSRC), POLY_RSRC; + lea (4 * 64)(DST), DST; + lea (4 * 64)(SRC), SRC; + movq SRC, (STACK_MAX + 5 * 8)(%rsp); + movq DST, (STACK_MAX + 6 * 8)(%rsp); + + jnz .Loop_poly4; + + /* Store state */ + POLY1305_STORE_STATE(); + + /* clear the used vector registers and stack */ + clear(X0); + movdqa X0, (STACK_VEC_X12)(%rsp); + movdqa X0, (STACK_VEC_X13)(%rsp); + movdqa X0, (STACK_TMP)(%rsp); + movdqa X0, (STACK_TMP1)(%rsp); + movdqa X0, (STACK_TMP2)(%rsp); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + movq (STACK_MAX + 0 * 8)(%rsp), %rbx; + movq (STACK_MAX + 1 * 8)(%rsp), %r12; + movq (STACK_MAX + 2 * 8)(%rsp), %r13; + movq (STACK_MAX + 3 * 8)(%rsp), %r14; + movq (STACK_MAX + 4 * 8)(%rsp), %r15; + + xorl %eax, %eax; + leave; + ret; +ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, + .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) + +/********************************************************************** + 1-way stitched chacha20-poly1305 + **********************************************************************/ + +.align 8 +.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1 +ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1, at function;) + +_gcry_chacha20_poly1305_amd64_ssse3_blocks1: + /* input: + * %rdi: chacha20-state + * %rsi: dst + * %rdx: src + * %rcx: nblks + * %r9: poly1305-state + * %r8: poly1305-src + */ + pushq %rbp; + movq %rsp, %rbp; + + subq $(8 * 8), %rsp; + movq %rbx, (0 * 8)(%rsp); + movq %r12, (1 * 8)(%rsp); + movq %r13, (2 * 8)(%rsp); + movq %r14, (3 * 8)(%rsp); + movq %r15, (4 * 8)(%rsp); + + movq %rdx, (5 * 8)(%rsp); # SRC + movq %rsi, (6 * 8)(%rsp); # DST + movq %rcx, (7 * 8)(%rsp); # NBLKS + + /* Load constants */ + movdqa .Lcounter1 rRIP, X4; + movdqa .Lshuf_rol8 rRIP, X5; + movdqa .Lshuf_rol16 rRIP, X6; + + /* Load state */ + movdqu (0 * 4)(INPUT), X10; + movdqu (4 * 4)(INPUT), X11; + movdqu (8 * 4)(INPUT), X12; + movdqu (12 * 4)(INPUT), X13; + + POLY1305_LOAD_STATE(); + +.Loop_poly1: + movdqa X10, X0; + movdqa X11, X1; + movdqa X12, X2; + movdqa X13, X3; + + /* Process one ChaCha20 block and four Poly1305 blocks. 
*/ + POLY1305_BLOCK_PART1(0 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(1 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART1(2 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(3 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + movq (5 * 8)(%rsp), SRC; + movq (6 * 8)(%rsp), DST; + + PLUS(X0, X10); + PLUS(X1, X11); + PLUS(X2, X12); + PLUS(X3, X13); + + /* Update counter */ + paddq X4, X13; + + xor_src_dst(DST, SRC, 0 * 4, X0, X7); + xor_src_dst(DST, SRC, 4 * 4, X1, X7); + xor_src_dst(DST, SRC, 8 * 4, X2, X7); + xor_src_dst(DST, SRC, 12 * 4, X3, X7); + + subq $1, (7 * 8)(%rsp); # NBLKS + lea (64)(POLY_RSRC), POLY_RSRC; + lea (64)(SRC), SRC; + lea (64)(DST), DST; + movq SRC, (5 * 8)(%rsp); + movq DST, (6 * 8)(%rsp); + + jnz .Loop_poly1; + + /* Store state */ + POLY1305_STORE_STATE(); + + movdqu X13, (12 * 4)(INPUT); + + /* clear the used vector registers */ + clear(X0); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + + movq (0 * 8)(%rsp), %rbx; + movq (1 * 8)(%rsp), %r12; + movq (2 * 8)(%rsp), %r13; + movq (3 * 8)(%rsp), %r14; + movq (4 * 8)(%rsp), %r15; + + xorl %eax, %eax; + leave; + ret; +ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1, + .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;) + #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index f1afd18e0..0847c20ea 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -1,5 +1,5 @@ /* chacha20.c - Bernstein's ChaCha20 cipher - * Copyright (C) 2014,2017,2018 Jussi Kivilinna + * Copyright (C) 2014,2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -36,6 +36,7 @@ #include "types.h" #include "g10lib.h" #include "cipher.h" +#include "cipher-internal.h" #include "bufhelp.h" @@ -116,6 +117,14 @@ unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4( + u32 *state, byte *dst, const byte *src, size_t nblks, + void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; + +unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1( + u32 *state, byte *dst, const byte *src, size_t nblks, + void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; + #endif /* USE_SSSE3 */ #ifdef USE_AVX2 @@ -124,6 +133,10 @@ unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8( + u32 *state, byte *dst, const byte *src, size_t nblks, + void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; + #endif /* USE_AVX2 */ #ifdef USE_ARMV7_NEON @@ -402,39 +415,13 @@ chacha20_setkey (void *context, const byte *key, unsigned int keylen, } -static void -chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, - size_t length) +static unsigned int +do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, + const byte *inbuf, size_t length) { static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; - CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; unsigned int nburn, burn = 0; - if (!length) - return; - - if (ctx->unused) - { - unsigned char *p = ctx->pad; - size_t n; - - gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); - - n = ctx->unused; - if (n > length) - n = length; - - buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); - length -= n; - outbuf += n; - inbuf += n; - ctx->unused -= n; - - if (!length) - return; - gcry_assert (!ctx->unused); - } - #ifdef USE_AVX2 if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { @@ -510,7 +497,349 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, ctx->unused = CHACHA20_BLOCK_SIZE - length; } - _gcry_burn_stack (burn); + if (burn) + burn += 5 * sizeof(void *); + + return burn; +} + + +static void +chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, + size_t length) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + unsigned int nburn, burn = 0; + + if (!length) + return; + + if (ctx->unused) + { + unsigned char *p = ctx->pad; + size_t n; + + gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); + + n = ctx->unused; + if (n > length) + n = length; + + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); + length -= n; + outbuf += n; + inbuf += n; + ctx->unused -= n; + + if (!length) + return; + gcry_assert (!ctx->unused); + } + + nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); + burn = nburn > burn ? 
nburn : burn; + + if (burn) + _gcry_burn_stack (burn); +} + + +gcry_err_code_t +_gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, + const byte *inbuf, size_t length) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) &c->context.c; + unsigned int nburn, burn = 0; + byte *authptr = NULL; + + if (!length) + return 0; + + if (ctx->unused) + { + unsigned char *p = ctx->pad; + size_t n; + + gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); + + n = ctx->unused; + if (n > length) + n = length; + + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); + nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n); + burn = nburn > burn ? nburn : burn; + length -= n; + outbuf += n; + inbuf += n; + ctx->unused -= n; + + if (!length) + { + if (burn) + _gcry_burn_stack (burn); + + return 0; + } + gcry_assert (!ctx->unused); + } + + gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); + + if (0) + { } +#ifdef USE_AVX2 + else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) + { + nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 8 * CHACHA20_BLOCK_SIZE; + outbuf += 8 * CHACHA20_BLOCK_SIZE; + inbuf += 8 * CHACHA20_BLOCK_SIZE; + } +#endif +#ifdef USE_SSSE3 + else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) + { + nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 4 * CHACHA20_BLOCK_SIZE; + outbuf += 4 * CHACHA20_BLOCK_SIZE; + inbuf += 4 * CHACHA20_BLOCK_SIZE; + } + else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE) + { + nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 1 * CHACHA20_BLOCK_SIZE; + outbuf += 1 * CHACHA20_BLOCK_SIZE; + inbuf += 1 * CHACHA20_BLOCK_SIZE; + } +#endif + + if (authptr) + { + size_t authoffset = outbuf - authptr; + +#ifdef USE_AVX2 + if (ctx->use_avx2 && + length >= 8 * CHACHA20_BLOCK_SIZE && + authoffset >= 8 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + + nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_SSSE3 + if (ctx->use_ssse3) + { + if (length >= 4 * CHACHA20_BLOCK_SIZE && + authoffset >= 4 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + + nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } + + if (length >= CHACHA20_BLOCK_SIZE && + authoffset >= CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + + nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + burn = nburn > burn ? 
nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } + } +#endif + + if (authoffset > 0) + { + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset); + authptr += authoffset; + authoffset = 0; + } + + gcry_assert(authptr == outbuf); + } + + while (length) + { + size_t currlen = length; + + /* Since checksumming is done after encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); + burn = nburn > burn ? nburn : burn; + + nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, + currlen); + burn = nburn > burn ? nburn : burn; + + outbuf += currlen; + inbuf += currlen; + length -= currlen; + } + + if (burn) + _gcry_burn_stack (burn); + + return 0; +} + + +gcry_err_code_t +_gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, + const byte *inbuf, size_t length) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) &c->context.c; + unsigned int nburn, burn = 0; + + if (!length) + return 0; + + if (ctx->unused) + { + unsigned char *p = ctx->pad; + size_t n; + + gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); + + n = ctx->unused; + if (n > length) + n = length; + + nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n); + burn = nburn > burn ? nburn : burn; + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); + length -= n; + outbuf += n; + inbuf += n; + ctx->unused -= n; + + if (!length) + { + if (burn) + _gcry_burn_stack (burn); + + return 0; + } + gcry_assert (!ctx->unused); + } + + gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); + +#ifdef USE_AVX2 + if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + + nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_SSSE3 + if (ctx->use_ssse3) + { + if (length >= 4 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + + nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } + + if (length >= CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + + nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } + } +#endif + + while (length) + { + size_t currlen = length; + + /* Since checksumming is done before decryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for decryption. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, + currlen); + burn = nburn > burn ? 
nburn : burn; + + nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); + burn = nburn > burn ? nburn : burn; + + outbuf += currlen; + inbuf += currlen; + length -= currlen; + } + + if (burn) + _gcry_burn_stack (burn); + + return 0; } diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 898869623..78f05dbb5 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -542,6 +542,15 @@ void _gcry_cipher_poly1305_setkey /* */ (gcry_cipher_hd_t c); +/*-- chacha20.c --*/ +gcry_err_code_t _gcry_chacha20_poly1305_encrypt +/* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, + size_t length); +gcry_err_code_t _gcry_chacha20_poly1305_decrypt +/* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, + size_t length); + + /*-- cipher-ocb.c --*/ gcry_err_code_t _gcry_cipher_ocb_encrypt /* */ (gcry_cipher_hd_t c, diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c index 607586b55..bb475236b 100644 --- a/cipher/cipher-poly1305.c +++ b/cipher/cipher-poly1305.c @@ -164,6 +164,11 @@ _gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } + if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20)) + { + return _gcry_chacha20_poly1305_encrypt (c, outbuf, inbuf, inbuflen); + } + while (inbuflen) { size_t currlen = inbuflen; @@ -217,6 +222,11 @@ _gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } + if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20)) + { + return _gcry_chacha20_poly1305_decrypt (c, outbuf, inbuf, inbuflen); + } + while (inbuflen) { size_t currlen = inbuflen; diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index 2405a090f..19cee5f6f 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -58,5 +58,7 @@ void _gcry_poly1305_finish (poly1305_context_t *ctx, void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *buf, size_t buflen); +unsigned int _gcry_poly1305_update_burn (poly1305_context_t *ctx, + const byte *m, size_t bytes); #endif /* G10_POLY1305_INTERNAL_H */ diff --git a/cipher/poly1305.c b/cipher/poly1305.c index 571f82862..8de6cd5e6 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -133,7 +133,7 @@ static void poly1305_init (poly1305_context_t *ctx, ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \ } while (0) -unsigned int +static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { @@ -337,7 +337,7 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \ } while (0) -unsigned int +static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { @@ -444,8 +444,9 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, #endif /* USE_MPI_32BIT */ -void -_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) +unsigned int +_gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m, + size_t bytes) { unsigned int burn = 0; @@ -460,7 +461,7 @@ _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) m += want; ctx->leftover += want; if (ctx->leftover < POLY1305_BLOCKSIZE) - return; + return 0; burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1); ctx->leftover = 0; } @@ -481,6 +482,17 @@ _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) ctx->leftover += bytes; } + return burn; +} + + +void +_gcry_poly1305_update 
(poly1305_context_t *ctx, const byte *m, size_t bytes) +{ + unsigned int burn; + + burn = _gcry_poly1305_update_burn (ctx, m, bytes); + if (burn) _gcry_burn_stack (burn); } From jussi.kivilinna at iki.fi Sun Jan 20 00:31:56 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 20 Jan 2019 01:31:56 +0200 Subject: bench-slope for other libraries... (Re: [PATCH 4/4] Add stitched ChaCha20-Poly1305 SSSE3 and AVX2 implementations) In-Reply-To: <154785095267.23435.15952785945759336425.stgit@localhost.localdomain> References: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> <154785095267.23435.15952785945759336425.stgit@localhost.localdomain> Message-ID: <24d16ceb-a0ef-ce5f-8191-aec458b1321f@iki.fi> Hello, On 19.1.2019 0.35, Jussi Kivilinna wrote: > For comparison to other libraries (on Intel i7-4790K, 3998 Mhz): > > bench-slope-openssl: OpenSSL 1.1.1 11 Sep 2018 > Cipher: > chacha20 | nanosecs/byte mebibytes/sec cycles/byte > STREAM enc | 0.301 ns/B 3166.4 MiB/s 1.20 c/B > STREAM dec | 0.300 ns/B 3174.7 MiB/s 1.20 c/B > POLY1305 enc | 0.463 ns/B 2060.6 MiB/s 1.85 c/B > POLY1305 dec | 0.462 ns/B 2063.8 MiB/s 1.85 c/B > POLY1305 auth | 0.162 ns/B 5899.3 MiB/s 0.646 c/B > > bench-slope-nettle: Nettle 3.4 > Cipher: > chacha | nanosecs/byte mebibytes/sec cycles/byte > STREAM enc | 1.65 ns/B 578.2 MiB/s 6.59 c/B > STREAM dec | 1.65 ns/B 578.2 MiB/s 6.59 c/B > POLY1305 enc | 2.05 ns/B 464.8 MiB/s 8.20 c/B > POLY1305 dec | 2.05 ns/B 464.7 MiB/s 8.20 c/B > POLY1305 auth | 0.404 ns/B 2359.1 MiB/s 1.62 c/B > > bench-slope-botan: Botan 2.6.0 > Cipher: > ChaCha | nanosecs/byte mebibytes/sec cycles/byte > STREAM enc/dec | 0.855 ns/B 1116.0 MiB/s 3.42 c/B > POLY1305 enc | 1.60 ns/B 595.4 MiB/s 6.40 c/B > POLY1305 dec | 1.60 ns/B 595.8 MiB/s 6.40 c/B > POLY1305 auth | 0.752 ns/B 1268.3 MiB/s 3.01 c/B These bench-slope versions are available at: https://github.com/jkivilin/bench-slopes Autoconf/Build system is not very polished but should work at least on latest Ubuntu. Building manually should not be too hard either (you need to compile slope.c + bench-slope-*.c|cpp). -Jussi -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 671 bytes Desc: OpenPGP digital signature URL: From jussi.kivilinna at iki.fi Mon Jan 21 21:59:11 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 21 Jan 2019 22:59:11 +0200 Subject: [PATCH 1/2] tests/bench-slope: add missing cipher context reset Message-ID: <154810435096.14673.10254021544852475355.stgit@localhost.localdomain> * tests/bench-slope.c (bench_encrypt_do_bench) (bench_decrypt_do_bench): Add call to 'gcry_cipher_reset'. -- Some non-AEAD results were negativily affected by missing state reset (~1% for aesni-ctr and chacha20-stream). 
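For illustration, the change amounts to the following per-measurement step (a
minimal sketch against the public libgcrypt API; the helper name is
illustrative and error handling is trimmed -- the real code is in
tests/bench-slope.c and covers the decrypt path the same way):

  #include <gcrypt.h>

  /* Reset the cipher state (IV/counter position) before every timed run so
   * that successive measurements start from identical state instead of a
   * counter position carried over from the previous run. */
  static int
  bench_encrypt_step (gcry_cipher_hd_t hd, void *buf, size_t buflen)
  {
    int err;

    err = gcry_cipher_reset (hd);
    if (!err)
      err = gcry_cipher_encrypt (hd, buf, buflen, buf, buflen);

    return err ? -1 : 0;
  }
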
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 5c64f229d..07282b786 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -859,7 +859,9 @@ bench_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) gcry_cipher_hd_t hd = obj->hd; int err; - err = gcry_cipher_encrypt (hd, buf, buflen, buf, buflen); + err = gcry_cipher_reset (hd); + if (!err) + err = gcry_cipher_encrypt (hd, buf, buflen, buf, buflen); if (err) { fprintf (stderr, PGM ": gcry_cipher_encrypt failed: %s\n", @@ -875,7 +877,9 @@ bench_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) gcry_cipher_hd_t hd = obj->hd; int err; - err = gcry_cipher_decrypt (hd, buf, buflen, buf, buflen); + err = gcry_cipher_reset (hd); + if (!err) + err = gcry_cipher_decrypt (hd, buf, buflen, buf, buflen); if (err) { fprintf (stderr, PGM ": gcry_cipher_encrypt failed: %s\n", From jussi.kivilinna at iki.fi Mon Jan 21 21:59:16 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 21 Jan 2019 22:59:16 +0200 Subject: [PATCH 2/2] tests/bench-slope: prevent auto-mhz detection getting stuck In-Reply-To: <154810435096.14673.10254021544852475355.stgit@localhost.localdomain> References: <154810435096.14673.10254021544852475355.stgit@localhost.localdomain> Message-ID: <154810435612.14673.4924095282245421571.stgit@localhost.localdomain> * cipher/bench-slope.c (bench_ghz, bench_ghz_diff): New static variables. (AUTO_GHZ_TARGET_DIFF): New macro. (do_slope_benchmark): Reduce target auto-mhz accuracy after repeated failures. (bench_print_result_csv, bench_print_result_std): Print auto-ghz different if 1 Mhz or more. (do_slope_benchmark, bench_print_result_csv, bench_print_result_std) (bench_print_result): Remove 'bench_ghz' parameter. (cipher_bench_one, hash_bench_one, mac_bench_one) (kdf_bench_one): Remove 'bench_ghz' variable. -- This patch prevents auto-mhz detection getting stuck on systems with high load or unstable CPU frequency. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 07282b786..2ead3c9e2 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -64,6 +64,13 @@ static char *current_algo_name; static char *current_mode_name; +/* Currently used CPU Ghz (either user input or auto-detected. */ +static double bench_ghz; + +/* Current accuracy of auto-detected CPU Ghz. */ +static double bench_ghz_diff; + + /*************************************** Default parameters for measurements. */ /* Start at small buffer size, to get reasonable timer calibration for fast @@ -82,6 +89,9 @@ static char *current_mode_name; * measurements is selected as data point further analysis. */ #define NUM_MEASUREMENT_REPETITIONS 64 +/* Target accuracy for auto-detected CPU Ghz. */ +#define AUTO_GHZ_TARGET_DIFF (5e-5) + /**************************************************** High-resolution timers. */ /* This benchmarking module needs needs high resolution timer. 
*/ @@ -540,7 +550,7 @@ get_auto_ghz (void) double -do_slope_benchmark (struct bench_obj *obj, double *bench_ghz) +do_slope_benchmark (struct bench_obj *obj) { double ret; @@ -550,14 +560,17 @@ do_slope_benchmark (struct bench_obj *obj, double *bench_ghz) ret = slope_benchmark (obj); - *bench_ghz = cpu_ghz; + bench_ghz = cpu_ghz; + bench_ghz_diff = 0; } else { + double target_diff = AUTO_GHZ_TARGET_DIFF; double cpu_auto_ghz_before; double cpu_auto_ghz_after; double nsecs_per_iteration; double diff; + unsigned int try_count = 0; /* Perform measurement with CPU frequency autodetection. */ @@ -565,6 +578,15 @@ do_slope_benchmark (struct bench_obj *obj, double *bench_ghz) { /* Repeat measurement until CPU turbo frequency has stabilized. */ + if (try_count++ > 4) + { + /* Too much frequency instability on the system, relax target + * accuracy. */ + + try_count = 0; + target_diff *= 2; + } + cpu_auto_ghz_before = get_auto_ghz (); nsecs_per_iteration = slope_benchmark (obj); @@ -574,11 +596,12 @@ do_slope_benchmark (struct bench_obj *obj, double *bench_ghz) diff = 1.0 - (cpu_auto_ghz_before / cpu_auto_ghz_after); diff = diff < 0 ? -diff : diff; } - while (diff > 5e-5); + while (diff > target_diff); ret = nsecs_per_iteration; - *bench_ghz = cpu_auto_ghz_after; + bench_ghz = (cpu_auto_ghz_before + cpu_auto_ghz_after) / 2; + bench_ghz_diff = diff; } return ret; @@ -605,14 +628,16 @@ double_to_str (char *out, size_t outlen, double value) } static void -bench_print_result_csv (double nsecs_per_byte, double bench_ghz) +bench_print_result_csv (double nsecs_per_byte) { double cycles_per_byte, mbytes_per_sec; char nsecpbyte_buf[16]; char mbpsec_buf[16]; char cpbyte_buf[16]; char mhz_buf[16]; + char mhz_diff_buf[32]; + strcpy (mhz_diff_buf, ""); *cpbyte_buf = 0; *mhz_buf = 0; @@ -624,6 +649,11 @@ bench_print_result_csv (double nsecs_per_byte, double bench_ghz) cycles_per_byte = nsecs_per_byte * bench_ghz; double_to_str (cpbyte_buf, sizeof (cpbyte_buf), cycles_per_byte); double_to_str (mhz_buf, sizeof (mhz_buf), bench_ghz * 1000); + if (auto_ghz && bench_ghz_diff * 1000 >= 1) + { + snprintf(mhz_diff_buf, sizeof(mhz_diff_buf), ",%.0f,Mhz-diff", + bench_ghz_diff * 1000); + } } mbytes_per_sec = @@ -633,14 +663,15 @@ bench_print_result_csv (double nsecs_per_byte, double bench_ghz) /* We print two empty fields to allow for future enhancements. */ if (auto_ghz) { - printf ("%s,%s,%s,,,%s,ns/B,%s,MiB/s,%s,c/B,%s,Mhz\n", + printf ("%s,%s,%s,,,%s,ns/B,%s,MiB/s,%s,c/B,%s,Mhz%s\n", current_section_name, current_algo_name? current_algo_name : "", current_mode_name? 
current_mode_name : "", nsecpbyte_buf, mbpsec_buf, cpbyte_buf, - mhz_buf); + mhz_buf, + mhz_diff_buf); } else { @@ -655,13 +686,16 @@ bench_print_result_csv (double nsecs_per_byte, double bench_ghz) } static void -bench_print_result_std (double nsecs_per_byte, double bench_ghz) +bench_print_result_std (double nsecs_per_byte) { double cycles_per_byte, mbytes_per_sec; char nsecpbyte_buf[16]; char mbpsec_buf[16]; char cpbyte_buf[16]; char mhz_buf[16]; + char mhz_diff_buf[32]; + + strcpy (mhz_diff_buf, ""); double_to_str (nsecpbyte_buf, sizeof (nsecpbyte_buf), nsecs_per_byte); @@ -671,6 +705,11 @@ bench_print_result_std (double nsecs_per_byte, double bench_ghz) cycles_per_byte = nsecs_per_byte * bench_ghz; double_to_str (cpbyte_buf, sizeof (cpbyte_buf), cycles_per_byte); double_to_str (mhz_buf, sizeof (mhz_buf), bench_ghz * 1000); + if (auto_ghz && bench_ghz_diff * 1000 >= 0.5) + { + snprintf(mhz_diff_buf, sizeof(mhz_diff_buf), "?%.0f", + bench_ghz_diff * 1000); + } } else { @@ -684,8 +723,8 @@ bench_print_result_std (double nsecs_per_byte, double bench_ghz) if (auto_ghz) { - printf ("%9s ns/B %9s MiB/s %9s c/B %9s\n", - nsecpbyte_buf, mbpsec_buf, cpbyte_buf, mhz_buf); + printf ("%9s ns/B %9s MiB/s %9s c/B %9s%s\n", + nsecpbyte_buf, mbpsec_buf, cpbyte_buf, mhz_buf, mhz_diff_buf); } else { @@ -695,12 +734,12 @@ bench_print_result_std (double nsecs_per_byte, double bench_ghz) } static void -bench_print_result (double nsecs_per_byte, double bench_ghz) +bench_print_result (double nsecs_per_byte) { if (csv_mode) - bench_print_result_csv (nsecs_per_byte, bench_ghz); + bench_print_result_csv (nsecs_per_byte); else - bench_print_result_std (nsecs_per_byte, bench_ghz); + bench_print_result_std (nsecs_per_byte); } static void @@ -1520,7 +1559,6 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) struct bench_cipher_mode mode = *pmode; struct bench_obj obj = { 0 }; double result; - double bench_ghz; unsigned int blklen; mode.algo = algo; @@ -1565,9 +1603,9 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) obj.ops = mode.ops; obj.priv = &mode; - result = do_slope_benchmark (&obj, &bench_ghz); + result = do_slope_benchmark (&obj); - bench_print_result (result, bench_ghz); + bench_print_result (result); } @@ -1685,7 +1723,6 @@ hash_bench_one (int algo, struct bench_hash_mode *pmode) { struct bench_hash_mode mode = *pmode; struct bench_obj obj = { 0 }; - double bench_ghz; double result; mode.algo = algo; @@ -1698,9 +1735,9 @@ hash_bench_one (int algo, struct bench_hash_mode *pmode) obj.ops = mode.ops; obj.priv = &mode; - result = do_slope_benchmark (&obj, &bench_ghz); + result = do_slope_benchmark (&obj); - bench_print_result (result, bench_ghz); + bench_print_result (result); } static void @@ -1852,7 +1889,6 @@ mac_bench_one (int algo, struct bench_mac_mode *pmode) { struct bench_mac_mode mode = *pmode; struct bench_obj obj = { 0 }; - double bench_ghz; double result; mode.algo = algo; @@ -1865,9 +1901,9 @@ mac_bench_one (int algo, struct bench_mac_mode *pmode) obj.ops = mode.ops; obj.priv = &mode; - result = do_slope_benchmark (&obj, &bench_ghz); + result = do_slope_benchmark (&obj); - bench_print_result (result, bench_ghz); + bench_print_result (result); } static void @@ -1970,7 +2006,6 @@ kdf_bench_one (int algo, int subalgo) struct bench_obj obj = { 0 }; double nsecs_per_iteration; double cycles_per_iteration; - double bench_ghz; char algo_name[32]; char nsecpiter_buf[16]; char cpiter_buf[16]; @@ -2008,7 +2043,7 @@ kdf_bench_one (int algo, int subalgo) obj.ops = mode.ops; 
obj.priv = &mode; - nsecs_per_iteration = do_slope_benchmark (&obj, &bench_ghz); + nsecs_per_iteration = do_slope_benchmark (&obj); strcpy(cpiter_buf, csv_mode ? "" : "-"); strcpy(mhz_buf, csv_mode ? "" : "-"); From jussi.kivilinna at iki.fi Mon Jan 21 22:01:01 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 21 Jan 2019 23:01:01 +0200 Subject: [PATCH] chacha20-amd64-avx2: optimize output xoring Message-ID: <154810446170.15440.12619479282908244271.stgit@localhost.localdomain> * cipher/chacha20-amd64-avx2.S (STACK_TMP2): Remove. (transpose_16byte_2x2, xor_src_dst): New. (BUF_XOR_256_TO_128): Remove. (_gcry_chaha20_amd64_avx2_blocks8) (_gcry_chacha20_poly1305_amd64_avx2_blocks8): Replace BUF_XOR_256_TO_128 with transpose_16byte_2x2/xor_src_dst; Reduce stack usage; Better interleave chacha20 state merging and output xoring. -- Benchmark on Intel i7-4790K: Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.314 ns/B 3035 MiB/s 1.26 c/B 3998 STREAM dec | 0.314 ns/B 3037 MiB/s 1.26 c/B 3998 POLY1305 enc | 0.451 ns/B 2117 MiB/s 1.80 c/B 3998 POLY1305 dec | 0.441 ns/B 2162 MiB/s 1.76 c/B 3998 After: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.309 ns/B 3086 MiB/s 1.24 c/B 3998 STREAM dec | 0.309 ns/B 3083 MiB/s 1.24 c/B 3998 POLY1305 enc | 0.445 ns/B 2141 MiB/s 1.78 c/B 3998 POLY1305 dec | 0.436 ns/B 2188 MiB/s 1.74 c/B 3998 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index ef02c1733..94c8e8cf7 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -50,9 +50,8 @@ #define STACK_VEC_X13 (32 + STACK_VEC_X12) #define STACK_TMP (32 + STACK_VEC_X13) #define STACK_TMP1 (32 + STACK_TMP) -#define STACK_TMP2 (32 + STACK_TMP1) -#define STACK_MAX (32 + STACK_TMP2) +#define STACK_MAX (32 + STACK_TMP1) /* vector registers */ #define X0 %ymm0 @@ -101,11 +100,22 @@ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ - vpunpckhqdq t1, x0, x1; \ - vpunpcklqdq t1, x0, x0; \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ - vpunpcklqdq x2, t2, x2; + vpunpcklqdq x2, t2, x2; + +/* 2x2 128-bit matrix transpose */ +#define transpose_16byte_2x2(x0,x1,t1) \ + vmovdqa x0, t1; \ + vperm2i128 $0x20, x1, x0, x0; \ + vperm2i128 $0x31, x1, t1, x1; + +/* xor register with unaligned src and save to unaligned dst */ +#define xor_src_dst(dst, src, offset, xreg) \ + vpxor offset(src), xreg, xreg; \ + vmovdqu xreg, offset(dst); /********************************************************************** 8-way chacha20 @@ -147,13 +157,6 @@ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1); -#define BUF_XOR_256_TO_128(dst, src, offset_lo, offset_hi, yreg, tmp1) \ - vextracti128 $1, yreg, tmp1##h; \ - vpxor offset_lo(src), yreg##h, yreg##h; \ - vpxor offset_hi(src), tmp1##h, tmp1##h; \ - vmovdqu yreg##h, offset_lo(dst); \ - vmovdqu tmp1##h, offset_hi(dst); - .align 32 chacha20_data: .Lshuf_rol16: @@ -230,6 +233,8 @@ _gcry_chacha20_amd64_avx2_blocks8: sub $2, ROUND; jnz .Lround2; + vmovdqa X8, (STACK_TMP1)(%rsp); + /* tmp := X15 */ vpbroadcastd (0 * 4)(INPUT), X15; PLUS(X0, X15); @@ -247,53 +252,56 @@ _gcry_chacha20_amd64_avx2_blocks8: PLUS(X6, X15); vpbroadcastd (7 * 4)(INPUT), X15; PLUS(X7, X15); - vpbroadcastd (8 * 4)(INPUT), X15; - PLUS(X8, X15); - vpbroadcastd (9 * 4)(INPUT), X15; - PLUS(X9, X15); - vpbroadcastd (10 * 4)(INPUT), X15; - PLUS(X10, X15); - vpbroadcastd (11 * 4)(INPUT), X15; - 
PLUS(X11, X15); - vmovdqa (STACK_VEC_X12)(%rsp), X15; - PLUS(X12, X15); - vmovdqa (STACK_VEC_X13)(%rsp), X15; - PLUS(X13, X15); + transpose_4x4(X0, X1, X2, X3, X8, X15); + transpose_4x4(X4, X5, X6, X7, X8, X15); + vmovdqa (STACK_TMP1)(%rsp), X8; + transpose_16byte_2x2(X0, X4, X15); + transpose_16byte_2x2(X1, X5, X15); + transpose_16byte_2x2(X2, X6, X15); + transpose_16byte_2x2(X3, X7, X15); vmovdqa (STACK_TMP)(%rsp), X15; - vmovdqa X13, (STACK_TMP)(%rsp); - vpbroadcastd (14 * 4)(INPUT), X13; - PLUS(X14, X13); - vmovdqa X14, (STACK_TMP1)(%rsp); - vpbroadcastd (15 * 4)(INPUT), X13; - PLUS(X15, X13); - vmovdqa X15, (STACK_TMP2)(%rsp); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1); + vpbroadcastd (8 * 4)(INPUT), X0; + PLUS(X8, X0); + vpbroadcastd (9 * 4)(INPUT), X0; + PLUS(X9, X0); + vpbroadcastd (10 * 4)(INPUT), X0; + PLUS(X10, X0); + vpbroadcastd (11 * 4)(INPUT), X0; + PLUS(X11, X0); + vmovdqa (STACK_VEC_X12)(%rsp), X0; + PLUS(X12, X0); + vmovdqa (STACK_VEC_X13)(%rsp), X0; + PLUS(X13, X0); + vpbroadcastd (14 * 4)(INPUT), X0; + PLUS(X14, X0); + vpbroadcastd (15 * 4)(INPUT), X0; + PLUS(X15, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3); /* Update counter */ addq $8, (12 * 4)(INPUT); - transpose_4x4(X0, X1, X2, X3, X13, X14); - transpose_4x4(X4, X5, X6, X7, X13, X14); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); - vmovdqa (STACK_TMP)(%rsp), X13; - vmovdqa (STACK_TMP1)(%rsp), X14; - vmovdqa (STACK_TMP2)(%rsp), X15; transpose_4x4(X8, X9, X10, X11, X0, X1); transpose_4x4(X12, X13, X14, X15, X0, X1); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0); + xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4); + xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5); + transpose_16byte_2x2(X8, X12, X0); + transpose_16byte_2x2(X9, X13, X0); + transpose_16byte_2x2(X10, X14, X0); + transpose_16byte_2x2(X11, X15, X0); + xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6); + xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11); + xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12); + xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13); + xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14); + xor_src_dst(DST, SRC, (64 * 7 + 16 * 
2), X15); sub $8, NBLKS; lea (8 * 64)(DST), DST; @@ -306,7 +314,6 @@ _gcry_chacha20_amd64_avx2_blocks8: vmovdqa X0, (STACK_VEC_X13)(%rsp); vmovdqa X0, (STACK_TMP)(%rsp); vmovdqa X0, (STACK_TMP1)(%rsp); - vmovdqa X0, (STACK_TMP2)(%rsp); vzeroall; /* eax zeroed by round loop. */ @@ -646,6 +653,11 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) + movq (STACK_MAX + 5 * 8)(%rsp), SRC; + movq (STACK_MAX + 6 * 8)(%rsp), DST; + + vmovdqa X8, (STACK_TMP1)(%rsp); + /* tmp := X15 */ vpbroadcastd (0 * 4)(INPUT), X15; PLUS(X0, X15); @@ -663,56 +675,56 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: PLUS(X6, X15); vpbroadcastd (7 * 4)(INPUT), X15; PLUS(X7, X15); - vpbroadcastd (8 * 4)(INPUT), X15; - PLUS(X8, X15); - vpbroadcastd (9 * 4)(INPUT), X15; - PLUS(X9, X15); - vpbroadcastd (10 * 4)(INPUT), X15; - PLUS(X10, X15); - vpbroadcastd (11 * 4)(INPUT), X15; - PLUS(X11, X15); - vmovdqa (STACK_VEC_X12)(%rsp), X15; - PLUS(X12, X15); - vmovdqa (STACK_VEC_X13)(%rsp), X15; - PLUS(X13, X15); + transpose_4x4(X0, X1, X2, X3, X8, X15); + transpose_4x4(X4, X5, X6, X7, X8, X15); + vmovdqa (STACK_TMP1)(%rsp), X8; + transpose_16byte_2x2(X0, X4, X15); + transpose_16byte_2x2(X1, X5, X15); + transpose_16byte_2x2(X2, X6, X15); + transpose_16byte_2x2(X3, X7, X15); vmovdqa (STACK_TMP)(%rsp), X15; - vmovdqa X13, (STACK_TMP)(%rsp); - vpbroadcastd (14 * 4)(INPUT), X13; - PLUS(X14, X13); - vmovdqa X14, (STACK_TMP1)(%rsp); - vpbroadcastd (15 * 4)(INPUT), X13; - PLUS(X15, X13); - vmovdqa X15, (STACK_TMP2)(%rsp); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1); + vpbroadcastd (8 * 4)(INPUT), X0; + PLUS(X8, X0); + vpbroadcastd (9 * 4)(INPUT), X0; + PLUS(X9, X0); + vpbroadcastd (10 * 4)(INPUT), X0; + PLUS(X10, X0); + vpbroadcastd (11 * 4)(INPUT), X0; + PLUS(X11, X0); + vmovdqa (STACK_VEC_X12)(%rsp), X0; + PLUS(X12, X0); + vmovdqa (STACK_VEC_X13)(%rsp), X0; + PLUS(X13, X0); + vpbroadcastd (14 * 4)(INPUT), X0; + PLUS(X14, X0); + vpbroadcastd (15 * 4)(INPUT), X0; + PLUS(X15, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3); /* Update counter */ addq $8, (12 * 4)(INPUT); - movq (STACK_MAX + 5 * 8)(%rsp), SRC; - movq (STACK_MAX + 6 * 8)(%rsp), DST; - - transpose_4x4(X0, X1, X2, X3, X13, X14); - transpose_4x4(X4, X5, X6, X7, X13, X14); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); - vmovdqa (STACK_TMP)(%rsp), X13; - vmovdqa (STACK_TMP1)(%rsp), X14; - vmovdqa (STACK_TMP2)(%rsp), X15; transpose_4x4(X8, X9, X10, X11, X0, X1); transpose_4x4(X12, X13, X14, X15, X0, X1); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 
* 4 + 16 * 3), X12, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0); + xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4); + xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5); + transpose_16byte_2x2(X8, X12, X0); + transpose_16byte_2x2(X9, X13, X0); + transpose_16byte_2x2(X10, X14, X0); + transpose_16byte_2x2(X11, X15, X0); + xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6); + xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11); + xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12); + xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13); + xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14); + xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15); subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS @@ -733,7 +745,6 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: vmovdqa X0, (STACK_VEC_X13)(%rsp); vmovdqa X0, (STACK_TMP)(%rsp); vmovdqa X0, (STACK_TMP1)(%rsp); - vmovdqa X0, (STACK_TMP2)(%rsp); vzeroall; movq (STACK_MAX + 0 * 8)(%rsp), %rbx; From jussi.kivilinna at iki.fi Wed Jan 23 22:20:42 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 23 Jan 2019 23:20:42 +0200 Subject: [PATCH] Calculate OCB L-tables when setting key instead of when setting nonce Message-ID: <154827844272.13543.14365324123881852158.stgit@localhost.localdomain> * cipher/cipher-internal.h (gcry_cipher_handle): Mark areas of u_mode.ocb that are and are not cleared by gcry_cipher_reset. (_gcry_cipher_ocb_setkey): New. * cipher/cipher-ocb.c (_gcry_cipher_ocb_set_nonce): Split L-table generation to ... (_gcry_cipher_ocb_setkey): ... this new function. * cipher/cipher.c (cipher_setkey): Add handling for OCB mode. (cipher_reset): Do not clear L-values for OCB mode. -- OCB L-tables do not depend on nonce value, but only on cipher key. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 78f05dbb5..79de140dd 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -316,6 +316,8 @@ struct gcry_cipher_handle /* Mode specific storage for OCB mode. */ struct { + /* --- Following members are not cleared in gcry_cipher_reset --- */ + /* Helper variables and pre-computed table of L values. */ unsigned char L_star[OCB_BLOCK_LEN]; unsigned char L_dollar[OCB_BLOCK_LEN]; @@ -323,6 +325,8 @@ struct gcry_cipher_handle unsigned char L0L1L0[OCB_BLOCK_LEN]; unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN]; + /* --- Following members are cleared in gcry_cipher_reset --- */ + /* The tag is valid if marks.tag has been set. */ unsigned char tag[OCB_BLOCK_LEN]; @@ -571,6 +575,8 @@ gcry_err_code_t _gcry_cipher_ocb_get_tag gcry_err_code_t _gcry_cipher_ocb_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); +void _gcry_cipher_ocb_setkey +/* */ (gcry_cipher_hd_t c); /*-- cipher-xts.c --*/ diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index 58f7be7e6..be6b8dffb 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -123,6 +123,37 @@ ocb_get_L_big (gcry_cipher_hd_t c, u64 n, unsigned char *l_buf) } +/* Called after key has been set. Sets up L table. 
*/ +void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c) +{ + unsigned char ktop[OCB_BLOCK_LEN]; + unsigned int burn = 0; + unsigned int nburn; + int i; + + /* L_star = E(zero_128) */ + memset (ktop, 0, OCB_BLOCK_LEN); + nburn = c->spec->encrypt (&c->context.c, c->u_mode.ocb.L_star, ktop); + burn = nburn > burn ? nburn : burn; + /* L_dollar = double(L_star) */ + double_block_cpy (c->u_mode.ocb.L_dollar, c->u_mode.ocb.L_star); + /* L_0 = double(L_dollar), ... */ + double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); + for (i = 1; i < OCB_L_TABLE_SIZE; i++) + double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); + /* Precalculated offsets L0+L1, L0+L1+L0 */ + cipher_block_xor (c->u_mode.ocb.L0L1, + c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); + cipher_block_xor (c->u_mode.ocb.L0L1L0, + c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); + + /* Cleanup */ + wipememory (ktop, sizeof ktop); + if (burn > 0) + _gcry_burn_stack (burn + 4*sizeof(void*)); +} + + /* Set the nonce for OCB. This requires that the key has been set. Using it again resets start a new encryption cycle using the same key. */ @@ -133,7 +164,6 @@ _gcry_cipher_ocb_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, unsigned char ktop[OCB_BLOCK_LEN]; unsigned char stretch[OCB_BLOCK_LEN + 8]; unsigned int bottom; - int i; unsigned int burn = 0; unsigned int nburn; @@ -159,23 +189,6 @@ _gcry_cipher_ocb_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, if (noncelen > (120/8) || noncelen < (64/8) || noncelen >= OCB_BLOCK_LEN) return GPG_ERR_INV_LENGTH; - /* Set up the L table. */ - /* L_star = E(zero_128) */ - memset (ktop, 0, OCB_BLOCK_LEN); - nburn = c->spec->encrypt (&c->context.c, c->u_mode.ocb.L_star, ktop); - burn = nburn > burn ? nburn : burn; - /* L_dollar = double(L_star) */ - double_block_cpy (c->u_mode.ocb.L_dollar, c->u_mode.ocb.L_star); - /* L_0 = double(L_dollar), ... */ - double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); - for (i = 1; i < OCB_L_TABLE_SIZE; i++) - double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); - /* Precalculated offsets L0+L1, L0+L1+L0 */ - cipher_block_xor (c->u_mode.ocb.L0L1, - c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); - cipher_block_xor (c->u_mode.ocb.L0L1L0, - c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); - /* Prepare the nonce. */ memset (ktop, 0, (OCB_BLOCK_LEN - noncelen)); buf_cpy (ktop + (OCB_BLOCK_LEN - noncelen), nonce, noncelen); diff --git a/cipher/cipher.c b/cipher/cipher.c index 55b991c35..ab3e4240e 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -816,6 +816,10 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen) _gcry_cipher_gcm_setkey (c); break; + case GCRY_CIPHER_MODE_OCB: + _gcry_cipher_ocb_setkey (c); + break; + case GCRY_CIPHER_MODE_POLY1305: _gcry_cipher_poly1305_setkey (c); break; @@ -931,9 +935,18 @@ cipher_reset (gcry_cipher_hd_t c) break; case GCRY_CIPHER_MODE_OCB: - memset (&c->u_mode.ocb, 0, sizeof c->u_mode.ocb); - /* Setup default taglen. */ - c->u_mode.ocb.taglen = 16; + /* Do not clear precalculated L-values */ + { + byte *u_mode_head_pos = (void *)&c->u_mode.ocb; + byte *u_mode_tail_pos = (void *)&c->u_mode.ocb.tag; + size_t u_mode_head_length = u_mode_tail_pos - u_mode_head_pos; + size_t u_mode_tail_length = sizeof(c->u_mode.ocb) - u_mode_head_length; + + memset (u_mode_tail_pos, 0, u_mode_tail_length); + + /* Setup default taglen. 
*/ + c->u_mode.ocb.taglen = 16; + } break; case GCRY_CIPHER_MODE_XTS: From jussi.kivilinna at iki.fi Sun Jan 27 12:01:28 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 27 Jan 2019 13:01:28 +0200 Subject: [PATCH] Do not precalculate OCB offset L0+L1+L0 Message-ID: <154858688785.4028.6266486144765162943.stgit@localhost.localdomain> * cipher/cipher-internal.h (gcry_cipher_handle): Remove OCB L0L1L0. * cipher/cipher-ocb.c (_gcry_cipher_ocb_setkey): Ditto. * cipher/rijndael-aesni.c (aesni_ocb_enc, aesni_ocb_dec) (_gcry_aes_aesni_ocb_auth): Replace L0L1L0 use with L1. -- Patch fixes L0+L1+L0 thinko. This is same as L1 (L0 xor L1 xor L0). --- 0 files changed diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 79de140dd..5ece774e6 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -322,7 +322,6 @@ struct gcry_cipher_handle unsigned char L_star[OCB_BLOCK_LEN]; unsigned char L_dollar[OCB_BLOCK_LEN]; unsigned char L0L1[OCB_BLOCK_LEN]; - unsigned char L0L1L0[OCB_BLOCK_LEN]; unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN]; /* --- Following members are cleared in gcry_cipher_reset --- */ diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index be6b8dffb..308b04952 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -141,11 +141,9 @@ void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c) double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); for (i = 1; i < OCB_L_TABLE_SIZE; i++) double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); - /* Precalculated offsets L0+L1, L0+L1+L0 */ + /* Precalculated offset L0+L1 */ cipher_block_xor (c->u_mode.ocb.L0L1, c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); - cipher_block_xor (c->u_mode.ocb.L0L1L0, - c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); /* Cleanup */ wipememory (ktop, sizeof ktop); diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index ec9f4d4a5..9883861a2 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -2429,11 +2429,11 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); @@ -2561,13 +2561,13 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" @@ -2730,11 +2730,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); @@ -2862,13 +2862,13 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile 
("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" @@ -3028,11 +3028,11 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, asm volatile ("movdqu %[l0], %%xmm7\n\t" "movdqu %[l0l1], %%xmm12\n\t" - "movdqu %[l0l1l0], %%xmm13\n\t" + "movdqu %[l1], %%xmm13\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0) + [l1] "m" (*c->u_mode.ocb.L[1]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) @@ -3138,12 +3138,12 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm7\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" : - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[abuf1], %%xmm2\n\t" From cvs at cvs.gnupg.org Sun Jan 27 12:05:30 2019 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sun, 27 Jan 2019 12:05:30 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-151-gafab94d Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via afab94d222425ecb838eb56cb0723bdaf3e5de36 (commit) from c15409c49993166ab1325d45360b3a8fe72a5556 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit afab94d222425ecb838eb56cb0723bdaf3e5de36 Author: Jussi Kivilinna Date: Sun Jan 27 12:55:22 2019 +0200 Do not precalculate OCB offset L0+L1+L0 * cipher/cipher-internal.h (gcry_cipher_handle): Remove OCB L0L1L0. * cipher/cipher-ocb.c (_gcry_cipher_ocb_setkey): Ditto. * cipher/rijndael-aesni.c (aesni_ocb_enc, aesni_ocb_dec) (_gcry_aes_aesni_ocb_auth): Replace L0L1L0 use with L1. -- Patch fixes L0+L1+L0 thinko. This is same as L1 (L0 xor L1 xor L0). 
Signed-off-by: Jussi Kivilinna diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 79de140..5ece774 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -322,7 +322,6 @@ struct gcry_cipher_handle unsigned char L_star[OCB_BLOCK_LEN]; unsigned char L_dollar[OCB_BLOCK_LEN]; unsigned char L0L1[OCB_BLOCK_LEN]; - unsigned char L0L1L0[OCB_BLOCK_LEN]; unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN]; /* --- Following members are cleared in gcry_cipher_reset --- */ diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index be6b8df..308b049 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -141,11 +141,9 @@ void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c) double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); for (i = 1; i < OCB_L_TABLE_SIZE; i++) double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); - /* Precalculated offsets L0+L1, L0+L1+L0 */ + /* Precalculated offset L0+L1 */ cipher_block_xor (c->u_mode.ocb.L0L1, c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); - cipher_block_xor (c->u_mode.ocb.L0L1L0, - c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); /* Cleanup */ wipememory (ktop, sizeof ktop); diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index ec9f4d4..9883861 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -2429,11 +2429,11 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); @@ -2561,13 +2561,13 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" @@ -2730,11 +2730,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); @@ -2862,13 +2862,13 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" @@ -3028,11 +3028,11 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, asm volatile ("movdqu %[l0], %%xmm7\n\t" "movdqu %[l0l1], %%xmm12\n\t" - "movdqu %[l0l1l0], %%xmm13\n\t" + "movdqu %[l1], %%xmm13\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" 
(*c->u_mode.ocb.L0L1L0) + [l1] "m" (*c->u_mode.ocb.L[1]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) @@ -3138,12 +3138,12 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm7\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" : - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[abuf1], %%xmm2\n\t" ----------------------------------------------------------------------- Summary of changes: cipher/cipher-internal.h | 1 - cipher/cipher-ocb.c | 4 +--- cipher/rijndael-aesni.c | 24 ++++++++++++------------ 3 files changed, 13 insertions(+), 16 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits