From cvs at cvs.gnupg.org Wed Jan 2 20:35:57 2019 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Wed, 02 Jan 2019 20:35:57 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-138-g3ee6588 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 3ee6588de8311b461ef8707c70ff86d2b252966d (commit) via 4871f11745f33c5c5051bfe6f325ac1c10764b04 (commit) via edde61f325e4b345f17c47369f3b6b1400656f04 (commit) from 3028a221d39c1b593ea0c1bcbfccd33959769692 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 3ee6588de8311b461ef8707c70ff86d2b252966d Author: Jussi Kivilinna Date: Wed Jan 2 21:25:44 2019 +0200 Process CCM/EAX/GCM/Poly1305 AEAD cipher modes input in 24 KiB chunks * cipher/cipher-ccm.c (_gcry_cipher_ccm_encrypt) (_gcry_cipher_ccm_decrypt): Process data in 24 KiB chunks. * cipher/cipher-eax.c (_gcry_cipher_eax_encrypt) (_gcry_cipher_eax_decrypt): Ditto. * cipher/cipher-gcm.c (_gcry_cipher_gcm_encrypt) (_gcry_cipher_gcm_decrypt): Ditto. * cipher/cipher-poly1305.c (_gcry_cipher_poly1305_encrypt) (_gcry_cipher_poly1305_decrypt): Ditto. -- Patch changes AEAD modes to process input in 24 KiB chunks to improve cache locality when processing large buffers. Huge buffer test in tests/benchmark shows 0.7% improvement for AES-CCM and AES-EAX, 6% for AES-GCM and 4% for Chacha20-Poly1305 on Intel Core i7-4790K. Signed-off-by: Jussi Kivilinna diff --git a/cipher/cipher-ccm.c b/cipher/cipher-ccm.c index e71c6f1..fd284ca 100644 --- a/cipher/cipher-ccm.c +++ b/cipher/cipher-ccm.c @@ -319,7 +319,9 @@ _gcry_cipher_ccm_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - unsigned int burn; + gcry_err_code_t err = 0; + unsigned int burn = 0; + unsigned int nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; @@ -329,12 +331,32 @@ _gcry_cipher_ccm_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, if (inbuflen > c->u_mode.ccm.encryptlen) return GPG_ERR_INV_LENGTH; - c->u_mode.ccm.encryptlen -= inbuflen; - burn = do_cbc_mac (c, inbuf, inbuflen, 0); + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done before encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for encryption. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + c->u_mode.ccm.encryptlen -= currlen; + nburn = do_cbc_mac (c, inbuf, currlen, 0); + burn = nburn > burn ? 
nburn : burn; + + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); + if (err) + break; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } + if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); - - return _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); + return err; } @@ -343,8 +365,9 @@ _gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - gcry_err_code_t err; - unsigned int burn; + gcry_err_code_t err = 0; + unsigned int burn = 0; + unsigned int nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; @@ -354,14 +377,30 @@ _gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, if (inbuflen > c->u_mode.ccm.encryptlen) return GPG_ERR_INV_LENGTH; - err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); - if (err) - return err; + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done after decryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); + if (err) + break; + + c->u_mode.ccm.encryptlen -= currlen; + nburn = do_cbc_mac (c, outbuf, currlen, 0); + burn = nburn > burn ? nburn : burn; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } - c->u_mode.ccm.encryptlen -= inbuflen; - burn = do_cbc_mac (c, outbuf, inbuflen, 0); if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); - return err; } diff --git a/cipher/cipher-eax.c b/cipher/cipher-eax.c index 3b17bb6..08f815a 100644 --- a/cipher/cipher-eax.c +++ b/cipher/cipher-eax.c @@ -48,11 +48,31 @@ _gcry_cipher_eax_encrypt (gcry_cipher_hd_t c, return err; } - err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); - if (err != 0) - return err; + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done after encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); + if (err != 0) + return err; - return _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, outbuf, inbuflen); + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, outbuf, + currlen); + if (err != 0) + return err; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } + + return 0; } @@ -75,11 +95,31 @@ _gcry_cipher_eax_decrypt (gcry_cipher_hd_t c, return err; } - err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf, inbuflen); - if (err != 0) - return err; + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done before decryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for decryption. 
*/ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf, + currlen); + if (err != 0) + return err; - return _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); + err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); + if (err != 0) + return err; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } + + return 0; } diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 32ec9fa..f9ddbc5 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -666,11 +666,26 @@ _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, inbuflen); - if (err != 0) - return err; + while (inbuflen) + { + size_t currlen = inbuflen; - do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, outbuf, inbuflen, 0); + /* Since checksumming is done after encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen); + if (err != 0) + return err; + + do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, outbuf, currlen, 0); + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } return 0; } @@ -682,6 +697,7 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c, const byte *inbuf, size_t inbuflen) { static const unsigned char zerobuf[MAX_BLOCKSIZE]; + gcry_err_code_t err; if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; @@ -711,9 +727,28 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, inbuf, inbuflen, 0); + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done before decryption, process input in + * 24KiB chunks to keep data loaded in L1 cache for decryption. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; - return gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, inbuflen); + do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, inbuf, currlen, 0); + + err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen); + if (err) + return err; + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } + + return 0; } diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c index 82537aa..607586b 100644 --- a/cipher/cipher-poly1305.c +++ b/cipher/cipher-poly1305.c @@ -164,9 +164,24 @@ _gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, inbuflen); + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done after encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. 
*/ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, currlen); - _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, inbuflen); + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, currlen); + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } return 0; } @@ -202,9 +217,25 @@ _gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } - _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, inbuflen); + while (inbuflen) + { + size_t currlen = inbuflen; + + /* Since checksumming is done before decryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for decryption. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, currlen); + + c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, currlen); + + outbuf += currlen; + inbuf += currlen; + outbuflen -= currlen; + inbuflen -= currlen; + } - c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, inbuflen); return 0; } commit 4871f11745f33c5c5051bfe6f325ac1c10764b04 Author: Jussi Kivilinna Date: Wed Jan 2 21:25:44 2019 +0200 tests/benchmark: add Chacha20-Poly1305 benchmarking * tests/benchmark.c (cipher_bench): Add Chacha20-Poly1305. -- Signed-off-by: Jussi Kivilinna diff --git a/tests/benchmark.c b/tests/benchmark.c index f9974fc..418f929 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -825,7 +825,7 @@ cipher_bench ( const char *algoname ) int doublekey; } modes[] = { { GCRY_CIPHER_MODE_ECB, " ECB/Stream", 1, 0xffffffffU }, - { GCRY_CIPHER_MODE_CBC, " CBC", 1, 0xffffffffU }, + { GCRY_CIPHER_MODE_CBC, " CBC/Poly1305", 1, 0xffffffffU }, { GCRY_CIPHER_MODE_CFB, " CFB", 0, 0xffffffffU }, { GCRY_CIPHER_MODE_OFB, " OFB", 0, 0xffffffffU }, { GCRY_CIPHER_MODE_CTR, " CTR", 0, 0xffffffffU }, @@ -840,6 +840,8 @@ cipher_bench ( const char *algoname ) { GCRY_CIPHER_MODE_EAX, " EAX", 0, 0xffffffffU, NULL, 0, 8, 8 }, { GCRY_CIPHER_MODE_STREAM, "", 0, 0xffffffffU }, + { GCRY_CIPHER_MODE_POLY1305, "", 0, 0xffffffffU, + NULL, 1, 16, 12 }, {0} }; int modeidx; @@ -931,9 +933,14 @@ cipher_bench ( const char *algoname ) for (modeidx=0; modes[modeidx].mode; modeidx++) { size_t modekeylen = keylen * (!!modes[modeidx].doublekey + 1); + int is_stream = modes[modeidx].mode == GCRY_CIPHER_MODE_STREAM + || modes[modeidx].mode == GCRY_CIPHER_MODE_POLY1305; - if ((blklen > 1 && modes[modeidx].mode == GCRY_CIPHER_MODE_STREAM) - || (blklen == 1 && modes[modeidx].mode != GCRY_CIPHER_MODE_STREAM)) + if ((blklen > 1 && is_stream) || (blklen == 1 && !is_stream)) + continue; + + if (modes[modeidx].mode == GCRY_CIPHER_MODE_POLY1305 + && algo != GCRY_CIPHER_CHACHA20) continue; if (modes[modeidx].req_blocksize > 0 commit edde61f325e4b345f17c47369f3b6b1400656f04 Author: Jussi Kivilinna Date: Wed Jan 2 21:25:44 2019 +0200 tests/benchmark: add --huge-buffers option for cipher tests * tests/benchmark.c (huge_buffers, cipher_encrypt, cipher_decrypt): New. (cipher_bench): Add 'max_inlen' to modes structure; add huge buffers mode selection. (main): Add '--huge-buffers'. -- Signed-off-by: Jussi Kivilinna diff --git a/tests/benchmark.c b/tests/benchmark.c index 59ea32c..f9974fc 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -37,9 +37,12 @@ #define PGM "benchmark" #include "t-common.h" -/* Do encryption tests with large buffers. */ +/* Do encryption tests with large buffers (100 KiB). */ static int large_buffers; +/* Do encryption tests with huge buffers (256 MiB). 
*/ +static int huge_buffers; + /* Number of cipher repetitions. */ static int cipher_repetitions; @@ -743,6 +746,60 @@ static void ccm_aead_init(gcry_cipher_hd_t hd, size_t buflen, int authlen) } +static gcry_error_t +cipher_encrypt (gcry_cipher_hd_t h, char *out, size_t outsize, + const char *in, size_t inlen, size_t max_inlen) +{ + gcry_error_t ret; + + while (inlen) + { + size_t currlen = inlen; + + if (currlen > max_inlen) + currlen = max_inlen; + + ret = gcry_cipher_encrypt(h, out, outsize, in, currlen); + if (ret) + return ret; + + out += currlen; + in += currlen; + outsize -= currlen; + inlen -= currlen; + } + + return 0; +} + + +static gcry_error_t +cipher_decrypt (gcry_cipher_hd_t h, char *out, size_t outsize, + const char *in, size_t inlen, size_t max_inlen) +{ + gcry_error_t ret; + + while (inlen) + { + size_t currlen = inlen; + + if (currlen > max_inlen) + currlen = max_inlen; + + ret = gcry_cipher_decrypt(h, out, outsize, in, currlen); + if (ret) + return ret; + + out += currlen; + in += currlen; + outsize -= currlen; + inlen -= currlen; + } + + return 0; +} + + static void cipher_bench ( const char *algoname ) { @@ -760,34 +817,34 @@ cipher_bench ( const char *algoname ) int mode; const char *name; int blocked; + unsigned int max_inlen; void (* const aead_init)(gcry_cipher_hd_t hd, size_t buflen, int authlen); int req_blocksize; int authlen; int noncelen; int doublekey; } modes[] = { - { GCRY_CIPHER_MODE_ECB, " ECB/Stream", 1 }, - { GCRY_CIPHER_MODE_CBC, " CBC", 1 }, - { GCRY_CIPHER_MODE_CFB, " CFB", 0 }, - { GCRY_CIPHER_MODE_OFB, " OFB", 0 }, - { GCRY_CIPHER_MODE_CTR, " CTR", 0 }, - { GCRY_CIPHER_MODE_XTS, " XTS", 0, + { GCRY_CIPHER_MODE_ECB, " ECB/Stream", 1, 0xffffffffU }, + { GCRY_CIPHER_MODE_CBC, " CBC", 1, 0xffffffffU }, + { GCRY_CIPHER_MODE_CFB, " CFB", 0, 0xffffffffU }, + { GCRY_CIPHER_MODE_OFB, " OFB", 0, 0xffffffffU }, + { GCRY_CIPHER_MODE_CTR, " CTR", 0, 0xffffffffU }, + { GCRY_CIPHER_MODE_XTS, " XTS", 0, 16 << 20, NULL, GCRY_XTS_BLOCK_LEN, 0, 0, 1 }, - { GCRY_CIPHER_MODE_CCM, " CCM", 0, - ccm_aead_init, GCRY_CCM_BLOCK_LEN, 8 }, - { GCRY_CIPHER_MODE_GCM, " GCM", 0, + { GCRY_CIPHER_MODE_CCM, " CCM", 0, 0xffffffffU, + ccm_aead_init, GCRY_CCM_BLOCK_LEN, 8, }, + { GCRY_CIPHER_MODE_GCM, " GCM", 0, 0xffffffffU, NULL, GCRY_GCM_BLOCK_LEN, GCRY_GCM_BLOCK_LEN }, - { GCRY_CIPHER_MODE_OCB, " OCB", 1, + { GCRY_CIPHER_MODE_OCB, " OCB", 1, 0xffffffffU, NULL, 16, 16, 15 }, - { GCRY_CIPHER_MODE_EAX, " EAX", 0, + { GCRY_CIPHER_MODE_EAX, " EAX", 0, 0xffffffffU, NULL, 0, 8, 8 }, - { GCRY_CIPHER_MODE_STREAM, "", 0 }, + { GCRY_CIPHER_MODE_STREAM, "", 0, 0xffffffffU }, {0} }; int modeidx; gcry_error_t err = GPG_ERR_NO_ERROR; - if (!algoname) { for (i=1; i < 400; i++) @@ -796,7 +853,12 @@ cipher_bench ( const char *algoname ) return; } - if (large_buffers) + if (huge_buffers) + { + allocated_buflen = 256 * 1024 * 1024; + repetitions = 4; + } + else if (large_buffers) { allocated_buflen = 1024 * 100; repetitions = 10; @@ -945,14 +1007,16 @@ cipher_bench ( const char *algoname ) { (*modes[modeidx].aead_init) (hd, buflen, modes[modeidx].authlen); gcry_cipher_final (hd); - err = gcry_cipher_encrypt (hd, outbuf, buflen, buf, buflen); + err = cipher_encrypt (hd, outbuf, buflen, buf, buflen, + modes[modeidx].max_inlen); if (err) break; err = gcry_cipher_gettag (hd, outbuf, modes[modeidx].authlen); } else { - err = gcry_cipher_encrypt (hd, outbuf, buflen, buf, buflen); + err = cipher_encrypt (hd, outbuf, buflen, buf, buflen, + modes[modeidx].max_inlen); } } stop_timer (); @@ -1024,7 +1088,8 @@ 
cipher_bench ( const char *algoname ) { (*modes[modeidx].aead_init) (hd, buflen, modes[modeidx].authlen); gcry_cipher_final (hd); - err = gcry_cipher_decrypt (hd, outbuf, buflen, buf, buflen); + err = cipher_decrypt (hd, outbuf, buflen, buf, buflen, + modes[modeidx].max_inlen); if (err) break; err = gcry_cipher_checktag (hd, outbuf, modes[modeidx].authlen); @@ -1034,7 +1099,8 @@ cipher_bench ( const char *algoname ) else { gcry_cipher_final (hd); - err = gcry_cipher_decrypt (hd, outbuf, buflen, buf, buflen); + err = cipher_decrypt (hd, outbuf, buflen, buf, buflen, + modes[modeidx].max_inlen); } } stop_timer (); @@ -1741,6 +1807,11 @@ main( int argc, char **argv ) large_buffers = 1; argc--; argv++; } + else if (!strcmp (*argv, "--huge-buffers")) + { + huge_buffers = 1; + argc--; argv++; + } else if (!strcmp (*argv, "--cipher-repetitions")) { argc--; argv++; ----------------------------------------------------------------------- Summary of changes: cipher/cipher-ccm.c | 65 ++++++++++++++++++++----- cipher/cipher-eax.c | 56 ++++++++++++++++++---- cipher/cipher-gcm.c | 47 ++++++++++++++++--- cipher/cipher-poly1305.c | 39 +++++++++++++-- tests/benchmark.c | 120 ++++++++++++++++++++++++++++++++++++++--------- 5 files changed, 275 insertions(+), 52 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Mon Jan 14 21:21:52 2019 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Mon, 14 Jan 2019 21:21:52 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-139-g09c2728 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 09c27280cc09798d15369b3a143036b7ab5ddd69 (commit) from 3ee6588de8311b461ef8707c70ff86d2b252966d (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 09c27280cc09798d15369b3a143036b7ab5ddd69 Author: Jussi Kivilinna Date: Mon Jan 14 22:14:24 2019 +0200 camellia-aarch64: do not export look-up table globally * cipher/camellia-aarch64.S (_gcry_camellia_arm_tables): Remove '.globl' export. 
-- Reported-by: Martin Husemann GnuPG-bug-id: 4317 Signed-off-by: Jussi Kivilinna diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S index b0e9a03..5c6ab02 100644 --- a/cipher/camellia-aarch64.S +++ b/cipher/camellia-aarch64.S @@ -289,7 +289,6 @@ _gcry_camellia_arm_decrypt_block: ELF(.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;) /* Encryption/Decryption tables */ -.globl _gcry_camellia_arm_tables ELF(.type _gcry_camellia_arm_tables, at object;) .balign 32 _gcry_camellia_arm_tables: ----------------------------------------------------------------------- Summary of changes: cipher/camellia-aarch64.S | 1 - 1 file changed, 1 deletion(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From andre at amorim.me Mon Jan 14 21:29:27 2019 From: andre at amorim.me (Andre Amorim) Date: Mon, 14 Jan 2019 20:29:27 +0000 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-139-g09c2728 In-Reply-To: References: Message-ID: I have no idea what camellia-aarch64 means ... On Mon, 14 Jan 2019 at 20:23, by Jussi Kivilinna wrote: > This is an automated email from the git hooks/post-receive script. It was > generated because a ref change was pushed to the repository containing > the project "The GNU crypto library". > > The branch, master has been updated > via 09c27280cc09798d15369b3a143036b7ab5ddd69 (commit) > from 3ee6588de8311b461ef8707c70ff86d2b252966d (commit) > > Those revisions listed above that are new to this repository have > not appeared on any other notification email; so we list those > revisions in full, below. > > - Log ----------------------------------------------------------------- > commit 09c27280cc09798d15369b3a143036b7ab5ddd69 > Author: Jussi Kivilinna > Date: Mon Jan 14 22:14:24 2019 +0200 > > camellia-aarch64: do not export look-up table globally > > * cipher/camellia-aarch64.S (_gcry_camellia_arm_tables): Remove > '.globl' export. > -- > > Reported-by: Martin Husemann > GnuPG-bug-id: 4317 > Signed-off-by: Jussi Kivilinna > > diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S > index b0e9a03..5c6ab02 100644 > --- a/cipher/camellia-aarch64.S > +++ b/cipher/camellia-aarch64.S > @@ -289,7 +289,6 @@ _gcry_camellia_arm_decrypt_block: > ELF(.size > _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;) > > /* Encryption/Decryption tables */ > -.globl _gcry_camellia_arm_tables > ELF(.type _gcry_camellia_arm_tables, at object;) > .balign 32 > _gcry_camellia_arm_tables: > > ----------------------------------------------------------------------- > > Summary of changes: > cipher/camellia-aarch64.S | 1 - > 1 file changed, 1 deletion(-) > > > hooks/post-receive > -- > The GNU crypto library > http://git.gnupg.org > > > _______________________________________________ > Gnupg-commits mailing list > Gnupg-commits at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gnupg-commits > > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > -------------- next part -------------- An HTML attachment was scrubbed... URL: From gniibe at fsij.org Tue Jan 15 02:21:41 2019 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 15 Jan 2019 10:21:41 +0900 Subject: [git] GCRYPT - branch, master, updated. 
libgcrypt-1.8.1-139-g09c2728 In-Reply-To: References: Message-ID: <87r2deyau2.fsf@fsij.org> Andre Amorim wrote: > I have no idea what camellia-aarch64 means ... Camellia is a cipher, comparable to AES. You can see the definition in RFC-3713. AArch64 is the 64-bit architecture of ARM, also known as ARM64. Most likely, you can find a machine as a smartphone. The code in question offers an optimized implementation of Camellia for AArch64. -- From cvs at cvs.gnupg.org Tue Jan 15 05:58:02 2019 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Tue, 15 Jan 2019 05:58:02 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-140-g2677d7d Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 2677d7d482bf2d078c1dce64854747c5b148924b (commit) from 09c27280cc09798d15369b3a143036b7ab5ddd69 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 2677d7d482bf2d078c1dce64854747c5b148924b Author: NIIBE Yutaka Date: Tue Jan 15 13:53:45 2019 +0900 random: Use getentropy when available for not GNU/Linux. * configure.ac: Detect getentropy. * random/rndlinux.c [__linux__] (getentropy): Macro defined. [HAVE_GETENTROPY] (_gcry_rndlinux_gather_random): Use getentropy. -- GnuPG-bug-id: 4288 Reported-by: David Carlier Signed-off-by: NIIBE Yutaka diff --git a/configure.ac b/configure.ac index 5843884..67cf1f7 100644 --- a/configure.ac +++ b/configure.ac @@ -1772,7 +1772,7 @@ AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile) -AC_CHECK_FUNCS(explicit_bzero) +AC_CHECK_FUNCS(explicit_bzero getentropy) GNUPG_CHECK_MLOCK diff --git a/random/rndlinux.c b/random/rndlinux.c index 3d41cd3..d71261c 100644 --- a/random/rndlinux.c +++ b/random/rndlinux.c @@ -32,8 +32,13 @@ #include #include #include -#if defined(__linux__) && defined(HAVE_SYSCALL) +#if defined(__linux__) || !defined(HAVE_GETENTROPY) +#ifdef HAVE_SYSCALL # include +# ifdef __NR_getrandom +# define getentropy(buf,buflen) syscall (__NR_getrandom, buf, buflen, 0) +# endif +#endif #endif #include "types.h" @@ -247,16 +252,14 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, struct timeval tv; int rc; - /* If we have a modern Linux kernel, we first try to use the new - * getrandom syscall. That call guarantees that the kernel's + /* If we have a modern operating system, we first try to use the new + * getentropy function. That call guarantees that the kernel's * RNG has been properly seeded before returning any data. This * is different from /dev/urandom which may, due to its * non-blocking semantics, return data even if the kernel has * not been properly seeded. And it differs from /dev/random by never - * blocking once the kernel is seeded. Unfortunately we need to use a - * syscall and not a new device and thus we are not able to use - * select(2) to have a timeout. */ -#if defined(__linux__) && defined(HAVE_SYSCALL) && defined(__NR_getrandom) + * blocking once the kernel is seeded. 
*/ +#if defined(HAVE_GETENTROPY) || defined(__NR_getrandom) { long ret; size_t nbytes; @@ -267,20 +270,19 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, if (nbytes > 256) nbytes = 256; _gcry_pre_syscall (); - ret = syscall (__NR_getrandom, - (void*)buffer, (size_t)nbytes, (unsigned int)0); + ret = getentropy (buffer, nbytes); _gcry_post_syscall (); } while (ret == -1 && errno == EINTR); if (ret == -1 && errno == ENOSYS) - ; /* The syscall is not supported - fallback to pulling from fd. */ + ; /* getentropy is not supported - fallback to pulling from fd. */ else - { /* The syscall is supported. Some sanity checks. */ + { /* getentropy is supported. Some sanity checks. */ if (ret == -1) - log_fatal ("unexpected error from getrandom: %s\n", + log_fatal ("unexpected error from getentropy: %s\n", strerror (errno)); else if (ret != nbytes) - log_fatal ("getrandom returned only" + log_fatal ("getentropy returned only" " %ld of %zu requested bytes\n", ret, nbytes); (*add)(buffer, nbytes, origin); ----------------------------------------------------------------------- Summary of changes: configure.ac | 2 +- random/rndlinux.c | 28 +++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Tue Jan 15 07:50:33 2019 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Tue, 15 Jan 2019 07:50:33 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-141-g17f246c Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 17f246c7044ab9ed236f6ec73fc126654257f0f9 (commit) from 2677d7d482bf2d078c1dce64854747c5b148924b (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 17f246c7044ab9ed236f6ec73fc126654257f0f9 Author: NIIBE Yutaka Date: Tue Jan 15 15:48:25 2019 +0900 random: Fix previous commit for getentropy function. * random/rndlinux.c [__NR_getrandom] (_gcry_rndlinux_gather_random): Check return value only for use of syscall. -- The function returns 0 on success. 
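For context, the two interfaces involved use different return conventions: getentropy fills the whole buffer and returns 0 on success, whereas the raw getrandom syscall returns the number of bytes it wrote, so the "ret != nbytes" sanity check only makes sense when getentropy is actually the getrandom wrapper macro. A minimal sketch of the getentropy convention, for illustration only (not part of the patch; the helper name is made up):

#include <unistd.h>   /* getentropy(); on glibc also declared in <sys/random.h> */

/* Hypothetical helper: fill BUF with LEN bytes of entropy.
 * getentropy returns 0 on success and -1 on error; it never
 * returns a byte count, unlike the getrandom syscall. */
static int
example_fill_random (void *buf, size_t len)
{
  if (len > 256)   /* getentropy rejects requests larger than 256 bytes. */
    return -1;
  return getentropy (buf, len);   /* 0 == buffer completely filled */
}
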
Signed-off-by: NIIBE Yutaka diff --git a/random/rndlinux.c b/random/rndlinux.c index d71261c..04e2a46 100644 --- a/random/rndlinux.c +++ b/random/rndlinux.c @@ -281,9 +281,11 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, if (ret == -1) log_fatal ("unexpected error from getentropy: %s\n", strerror (errno)); +#ifdef __NR_getrandom else if (ret != nbytes) log_fatal ("getentropy returned only" " %ld of %zu requested bytes\n", ret, nbytes); +#endif (*add)(buffer, nbytes, origin); length -= nbytes; ----------------------------------------------------------------------- Summary of changes: random/rndlinux.c | 2 ++ 1 file changed, 2 insertions(+) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Tue Jan 15 08:18:18 2019 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Tue, 15 Jan 2019 08:18:18 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-142-ge5c2f8a Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via e5c2f8a2cd2b89d90ea30de2dedb0e92498a5f70 (commit) from 17f246c7044ab9ed236f6ec73fc126654257f0f9 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit e5c2f8a2cd2b89d90ea30de2dedb0e92498a5f70 Author: NIIBE Yutaka Date: Tue Jan 15 16:14:51 2019 +0900 build: With LD_LIBRARY_PATH defined, use --disable-new-dtags. * configure.ac (LDADD_FOR_TESTS_KLUDGE): New for --disable-new-dtags. * tests/Makefile.am (LDADD, t_lock_LDADD): Use LDADD_FOR_TESTS_KLUDGE. -- GnuPG-bug-id: 4298 Signed-off-by: NIIBE Yutaka diff --git a/configure.ac b/configure.ac index 67cf1f7..bb3c666 100644 --- a/configure.ac +++ b/configure.ac @@ -146,6 +146,41 @@ AC_PROG_AWK AC_GNU_SOURCE +# Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE +dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH +dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has +dnl the precedence over the run path, so that if a compatible MPFR library +dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested +dnl MPFR library will be this library instead of the MPFR library from the +dnl build tree. Other OS with the same issue might be added later. +dnl +dnl References: +dnl https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732 +dnl http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html +dnl +dnl We need to check whether --disable-new-dtags is supported as alternate +dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc). 
+dnl +case $host in + *-*-linux*) + if test -n "$LD_LIBRARY_PATH"; then + saved_LDFLAGS="$LDFLAGS" + LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags" + LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE" + AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker) + AC_LINK_IFELSE([AC_LANG_SOURCE([[ +int main (void) { return 0; } + ]])], + [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))], + [AC_MSG_RESULT(no) + LDADD_FOR_TESTS_KLUDGE="" + ]) + LDFLAGS="$saved_LDFLAGS" + fi + ;; +esac +AC_SUBST([LDADD_FOR_TESTS_KLUDGE]) + VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \ mym4_minor mym4_micro) AC_SUBST(VERSION_NUMBER) diff --git a/tests/Makefile.am b/tests/Makefile.am index eee24fa..9e11797 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -62,6 +62,6 @@ EXTRA_DIST = README rsa-16k.key cavs_tests.sh cavs_driver.pl \ blake2b.h blake2s.h \ basic-disable-all-hwf.in basic_all_hwfeature_combinations.sh -LDADD = $(standard_ldadd) $(GPG_ERROR_LIBS) -t_lock_LDADD = $(standard_ldadd) $(GPG_ERROR_MT_LIBS) +LDADD = $(standard_ldadd) $(GPG_ERROR_LIBS) @LDADD_FOR_TESTS_KLUDGE@ +t_lock_LDADD = $(standard_ldadd) $(GPG_ERROR_MT_LIBS) @LDADD_FOR_TESTS_KLUDGE@ t_lock_CFLAGS = $(GPG_ERROR_MT_CFLAGS) ----------------------------------------------------------------------- Summary of changes: configure.ac | 35 +++++++++++++++++++++++++++++++++++ tests/Makefile.am | 4 ++-- 2 files changed, 37 insertions(+), 2 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Fri Jan 18 23:35:37 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 19 Jan 2019 00:35:37 +0200 Subject: [PATCH 1/4] tests/basic: check AEAD tags in check_one_cipher test Message-ID: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> * tests/basic.c (get_algo_mode_taglen): New. (check_one_cipher_core_reset): Check that tags are same with AEAD modes. 
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/basic.c b/tests/basic.c index 0afae3047..96af6c743 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -7256,6 +7256,23 @@ get_algo_mode_blklen (int algo, int mode) } +static unsigned int +get_algo_mode_taglen (int algo, int mode) +{ + switch (mode) + { + case GCRY_CIPHER_MODE_CCM: + case GCRY_CIPHER_MODE_GCM: + case GCRY_CIPHER_MODE_POLY1305: + return 16; + case GCRY_CIPHER_MODE_EAX: + return gcry_cipher_get_algo_blklen(algo); + } + + return 0; +} + + static int check_one_cipher_core_reset (gcry_cipher_hd_t hd, int algo, int mode, int pass, int nplain) @@ -7311,14 +7328,18 @@ check_one_cipher_core (int algo, int mode, int flags, gcry_cipher_hd_t hd; unsigned char in_buffer[1040+1], out_buffer[1040+1]; unsigned char enc_result[1040]; + unsigned char tag_result[16]; + unsigned char tag[16]; unsigned char *in, *out; int keylen; gcry_error_t err = 0; unsigned int blklen; unsigned int piecelen; unsigned int pos; + unsigned int taglen; blklen = get_algo_mode_blklen(algo, mode); + taglen = get_algo_mode_taglen(algo, mode); assert (nkey == 64); assert (nplain == 1040); @@ -7402,6 +7423,20 @@ check_one_cipher_core (int algo, int mode, int flags, return -1; } + if (taglen > 0) + { + err = gcry_cipher_gettag (hd, tag, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, gcry_cipher_gettag failed: %s\n", + pass, algo, mode, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + + memcpy(tag_result, tag, taglen); + } + memcpy (enc_result, out, nplain); if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0) @@ -7416,6 +7451,18 @@ check_one_cipher_core (int algo, int mode, int flags, return -1; } + if (taglen > 0) + { + err = gcry_cipher_checktag (hd, tag_result, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, gcry_cipher_checktag failed: %s\n", + pass, algo, mode, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + } + if (memcmp (plain, in, nplain)) fail ("pass %d, algo %d, mode %d, encrypt-decrypt mismatch\n", pass, algo, mode); @@ -7435,6 +7482,23 @@ check_one_cipher_core (int algo, int mode, int flags, return -1; } + if (taglen > 0) + { + err = gcry_cipher_gettag (hd, tag, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, in-place, " + "gcry_cipher_gettag failed: %s\n", + pass, algo, mode, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + + if (memcmp (tag_result, tag, taglen)) + fail ("pass %d, algo %d, mode %d, in-place, tag mismatch\n", + pass, algo, mode); + } + if (memcmp (enc_result, out, nplain)) fail ("pass %d, algo %d, mode %d, in-place, encrypt mismatch\n", pass, algo, mode); @@ -7452,6 +7516,19 @@ check_one_cipher_core (int algo, int mode, int flags, return -1; } + if (taglen > 0) + { + err = gcry_cipher_checktag (hd, tag_result, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, in-place, " + "gcry_cipher_checktag failed: %s\n", + pass, algo, mode, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + } + if (memcmp (plain, out, nplain)) fail ("pass %d, algo %d, mode %d, in-place, encrypt-decrypt mismatch\n", pass, algo, mode); @@ -7482,6 +7559,23 @@ check_one_cipher_core (int algo, int mode, int flags, piecelen = piecelen * 2 - ((piecelen != blklen) ? 
blklen : 0); } + if (taglen > 0) + { + err = gcry_cipher_gettag (hd, tag, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, " + "piecelen: %d), gcry_cipher_gettag failed: %s\n", + pass, algo, mode, pos, piecelen, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + + if (memcmp (tag_result, tag, taglen)) + fail ("pass %d, algo %d, mode %d, in-place, tag mismatch\n", + pass, algo, mode); + } + if (memcmp (enc_result, out, nplain)) fail ("pass %d, algo %d, mode %d, split-buffer, encrypt mismatch\n", pass, algo, mode); @@ -7510,6 +7604,19 @@ check_one_cipher_core (int algo, int mode, int flags, piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0); } + if (taglen > 0) + { + err = gcry_cipher_checktag (hd, tag_result, taglen); + if (err) + { + fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, " + "piecelen: %d), gcry_cipher_checktag failed: %s\n", + pass, algo, mode, pos, piecelen, gpg_strerror (err)); + gcry_cipher_close (hd); + return -1; + } + } + if (memcmp (plain, in, nplain)) fail ("pass %d, algo %d, mode %d, split-buffer, encrypt-decrypt mismatch\n", pass, algo, mode); From jussi.kivilinna at iki.fi Fri Jan 18 23:35:47 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 19 Jan 2019 00:35:47 +0200 Subject: [PATCH 3/4] Add SSSE3 optimized non-parallel ChaCha20 function In-Reply-To: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> References: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> Message-ID: <154785094751.23435.17690493289577003135.stgit@localhost.localdomain> * cipher/chacha20-amd64-ssse3.S (ROTATE_SHUF, ROTATE, WORD_SHUF) (QUARTERROUND4, _gcry_chacha20_amd64_ssse3_blocks1): New. * cipher/chacha20.c (_gcry_chacha20_amd64_ssse3_blocks1): New prototype. (chacha20_blocks): Rename to ... (do_chacha20_blocks): ... this. (chacha20_blocks): New. (chacha20_encrypt_stream): Adjust for new chacha20_blocks function. -- This patch provides SSSE3 optimized version of non-parallel ChaCha20 core block function. On Intel Haswell generic C function runs at 6.9 cycles/byte. New function runs at 5.2 cycles/byte, thus being ~32% faster. 
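For reference, QUARTERROUND4 above is the standard ChaCha20 quarter round (RFC 7539) applied to whole 128-bit rows: the 16- and 8-bit rotations are done with pshufb shuffles, the 12- and 7-bit ones with shifts, and the trailing word shuffles rotate the rows so the same macro serves both column and diagonal rounds. A scalar C sketch of one quarter round, for illustration only (not part of the patch; the names are illustrative):

#include <stdint.h>

#define ROTL32(v, c) (((v) << (c)) | ((v) >> (32 - (c))))

/* One ChaCha20 quarter round (RFC 7539, section 2.1) on four state words. */
static void
chacha20_quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
  *a += *b;  *d ^= *a;  *d = ROTL32 (*d, 16);
  *c += *d;  *b ^= *c;  *b = ROTL32 (*b, 12);
  *a += *b;  *d ^= *a;  *d = ROTL32 (*d, 8);
  *c += *d;  *b ^= *c;  *b = ROTL32 (*b, 7);
}
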
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index f23722814..0e59ff981 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -163,6 +163,8 @@ chacha20_data: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_rol8: .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.Lcounter1: + .long 1,0,0,0 .Linc_counter: .long 0,1,2,3 .Lunsigned_cmp: @@ -221,7 +223,7 @@ _gcry_chacha20_amd64_ssse3_blocks4: movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); -.Lround2: +.Lround2_4: QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; @@ -235,7 +237,7 @@ _gcry_chacha20_amd64_ssse3_blocks4: movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) sub $2, ROUND; - jnz .Lround2; + jnz .Lround2_4; /* tmp := X15 */ movdqa (STACK_TMP)(%rsp), X11; @@ -337,5 +339,111 @@ _gcry_chacha20_amd64_ssse3_blocks4: ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) +/********************************************************************** + 1-way chacha20 + **********************************************************************/ + +#define ROTATE_SHUF(v1,shuf) \ + pshufb shuf, v1; + +#define ROTATE(v1,c,tmp1) \ + movdqa v1, tmp1; \ + psrld $(32 - (c)), v1; \ + pslld $(c), tmp1; \ + paddb tmp1, v1; + +#define WORD_SHUF(v1,shuf) \ + pshufd $shuf, v1, v1; + +#define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\ + shuf_x2,shuf_x3) \ + PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \ + PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \ + PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \ + PLUS(x2, x3); \ + WORD_SHUF(x3, shuf_x3); \ + XOR(x1, x2); \ + WORD_SHUF(x2, shuf_x2); \ + ROTATE(x1, 7, tmp1); \ + WORD_SHUF(x1, shuf_x1); + +.align 8 +.globl _gcry_chacha20_amd64_ssse3_blocks1 +ELF(.type _gcry_chacha20_amd64_ssse3_blocks1, at function;) + +_gcry_chacha20_amd64_ssse3_blocks1: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks + */ + + /* Load constants */ + movdqa .Lcounter1 RIP, X4; + movdqa .Lshuf_rol8 RIP, X5; + movdqa .Lshuf_rol16 RIP, X6; + + /* Load state */ + movdqu (0 * 4)(INPUT), X10; + movdqu (4 * 4)(INPUT), X11; + movdqu (8 * 4)(INPUT), X12; + movdqu (12 * 4)(INPUT), X13; + +.Loop1: + mov $20, ROUND; + + movdqa X10, X0; + movdqa X11, X1; + movdqa X12, X2; + movdqa X13, X3; + +.Lround2_1: + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + sub $2, ROUND; + jnz .Lround2_1; + + PLUS(X0, X10); + PLUS(X1, X11); + PLUS(X2, X12); + PLUS(X3, X13); + + /* Update counter */ + paddq X4, X13; + + xor_src_dst(DST, SRC, 0 * 4, X0, X7); + xor_src_dst(DST, SRC, 4 * 4, X1, X7); + xor_src_dst(DST, SRC, 8 * 4, X2, X7); + xor_src_dst(DST, SRC, 12 * 4, X3, X7); + + lea (64)(DST), DST; + lea (64)(SRC), SRC; + + sub $1, NBLKS; + jnz .Loop1; + + /* Store counter */ + movdqu X13, (12 * 4)(INPUT); + + /* clear the used vector registers */ + clear(X0); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + + /* eax zeroed by round loop. 
*/ + ret; +ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, + .-_gcry_chacha20_amd64_ssse3_blocks1;) + #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 84a9b2b80..f1afd18e0 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -112,6 +112,10 @@ unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; + #endif /* USE_SSSE3 */ #ifdef USE_AVX2 @@ -156,7 +160,7 @@ static const char *selftest (void); buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x)) static unsigned int -chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) +do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; @@ -239,6 +243,21 @@ chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) } +static unsigned int +chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, + size_t nblks) +{ +#ifdef USE_SSSE3 + if (ctx->use_ssse3) + { + return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks); + } +#endif + + return do_chacha20_blocks (ctx->input, dst, src, nblks); +} + + static void chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) @@ -475,7 +494,7 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; - nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks); + nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; @@ -484,7 +503,7 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, if (length > 0) { - nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1); + nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); From jussi.kivilinna at iki.fi Fri Jan 18 23:35:42 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 19 Jan 2019 00:35:42 +0200 Subject: [PATCH 2/4] tests/basic: increase buffer size for check_one_cipher In-Reply-To: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> References: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> Message-ID: <154785094234.23435.17421077587711411597.stgit@localhost.localdomain> * tests/basic.c (check_one_cipher_core) (check_one_cipher): Increase buffer from 1040 to 1904 bytes. -- This is for better test coverage of highly parallel cipher implementations. 
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/basic.c b/tests/basic.c index 96af6c743..3d86e022e 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -7326,8 +7326,8 @@ check_one_cipher_core (int algo, int mode, int flags, int bufshift, int pass) { gcry_cipher_hd_t hd; - unsigned char in_buffer[1040+1], out_buffer[1040+1]; - unsigned char enc_result[1040]; + unsigned char in_buffer[1904+1], out_buffer[1904+1]; + unsigned char enc_result[1904]; unsigned char tag_result[16]; unsigned char tag[16]; unsigned char *in, *out; @@ -7342,7 +7342,7 @@ check_one_cipher_core (int algo, int mode, int flags, taglen = get_algo_mode_taglen(algo, mode); assert (nkey == 64); - assert (nplain == 1040); + assert (nplain == 1904); assert (sizeof(in_buffer) == nplain + 1); assert (sizeof(out_buffer) == sizeof(in_buffer)); assert (blklen > 0); @@ -7692,7 +7692,7 @@ static void check_one_cipher (int algo, int mode, int flags) { char key[64+1]; - unsigned char plain[1040+1]; + unsigned char plain[1904+1]; int bufshift, i; for (bufshift=0; bufshift < 4; bufshift++) @@ -7701,7 +7701,7 @@ check_one_cipher (int algo, int mode, int flags) memcpy (key, "0123456789abcdef.,;/[]{}-=ABCDEF_" "0123456789abcdef.,;/[]{}-=ABCDEF", 64); memcpy (plain, "foobar42FOOBAR17", 16); - for (i = 16; i < 1040; i += 16) + for (i = 16; i < 1904; i += 16) { memcpy (&plain[i], &plain[i-16], 16); if (!++plain[i+7]) @@ -7710,25 +7710,25 @@ check_one_cipher (int algo, int mode, int flags) plain[i+14]++; } - if (check_one_cipher_core (algo, mode, flags, key, 64, plain, 1040, + if (check_one_cipher_core (algo, mode, flags, key, 64, plain, 1904, bufshift, 0+10*bufshift)) return; /* Pass 1: Key not aligned. */ memmove (key+1, key, 64); - if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain, 1040, + if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain, 1904, bufshift, 1+10*bufshift)) return; /* Pass 2: Key not aligned and data not aligned. */ - memmove (plain+1, plain, 1040); - if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain+1, 1040, + memmove (plain+1, plain, 1904); + if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain+1, 1904, bufshift, 2+10*bufshift)) return; /* Pass 3: Key aligned and data not aligned. */ memmove (key, key+1, 64); - if (check_one_cipher_core (algo, mode, flags, key, 64, plain+1, 1040, + if (check_one_cipher_core (algo, mode, flags, key, 64, plain+1, 1904, bufshift, 3+10*bufshift)) return; } From jussi.kivilinna at iki.fi Fri Jan 18 23:35:52 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 19 Jan 2019 00:35:52 +0200 Subject: [PATCH 4/4] Add stitched ChaCha20-Poly1305 SSSE3 and AVX2 implementations In-Reply-To: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> References: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> Message-ID: <154785095267.23435.15952785945759336425.stgit@localhost.localdomain> * cipher/asm-poly1305-amd64.h: New. * cipher/Makefile.am: Add 'asm-poly1305-amd64.h'. * cipher/chacha20-amd64-avx2.S (QUATERROUND2): Add interleave operators. (_gcry_chacha20_poly1305_amd64_avx2_blocks8): New. * cipher/chacha20-amd64-ssse3.S (QUATERROUND2): Add interleave operators. (_gcry_chacha20_poly1305_amd64_ssse3_blocks4) (_gcry_chacha20_poly1305_amd64_ssse3_blocks1): New. * cipher/chacha20.c (_gcry_chacha20_poly1305_amd64_ssse3_blocks4) (_gcry_chacha20_poly1305_amd64_ssse3_blocks1) (_gcry_chacha20_poly1305_amd64_avx2_blocks8): New prototypes. (chacha20_encrypt_stream): Split tail to... 
(do_chacha20_encrypt_stream_tail): ... new function. (_gcry_chacha20_poly1305_encrypt) (_gcry_chacha20_poly1305_decrypt): New. * cipher/cipher-internal.h (_gcry_chacha20_poly1305_encrypt) (_gcry_chacha20_poly1305_decrypt): New prototypes. * cipher/cipher-poly1305.c (_gcry_cipher_poly1305_encrypt): Call '_gcry_chacha20_poly1305_encrypt' if cipher is ChaCha20. (_gcry_cipher_poly1305_decrypt): Call '_gcry_chacha20_poly1305_decrypt' if cipher is ChaCha20. * cipher/poly1305-internal.h (_gcry_cipher_poly1305_update_burn): New prototype. * cipher/poly1305.c (poly1305_blocks): Make static. (_gcry_poly1305_update): Split main function body to ... (_gcry_poly1305_update_burn): ... new function. -- Benchmark on Intel Skylake (i5-6500, 3200 Mhz): Before, 8-way AVX2: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.378 ns/B 2526 MiB/s 1.21 c/B STREAM dec | 0.373 ns/B 2560 MiB/s 1.19 c/B POLY1305 enc | 0.685 ns/B 1392 MiB/s 2.19 c/B POLY1305 dec | 0.686 ns/B 1390 MiB/s 2.20 c/B POLY1305 auth | 0.315 ns/B 3031 MiB/s 1.01 c/B After, 8-way AVX2 (~36% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 0.503 ns/B 1896 MiB/s 1.61 c/B POLY1305 dec | 0.485 ns/B 1965 MiB/s 1.55 c/B Benchmark on Intel Haswell (i7-4790K, 3998 Mhz): Before, 8-way AVX2: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.318 ns/B 2999 MiB/s 1.27 c/B STREAM dec | 0.317 ns/B 3004 MiB/s 1.27 c/B POLY1305 enc | 0.586 ns/B 1627 MiB/s 2.34 c/B POLY1305 dec | 0.586 ns/B 1627 MiB/s 2.34 c/B POLY1305 auth | 0.271 ns/B 3524 MiB/s 1.08 c/B After, 8-way AVX2 (~30% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 0.452 ns/B 2108 MiB/s 1.81 c/B POLY1305 dec | 0.440 ns/B 2167 MiB/s 1.76 c/B Before, 4-way SSSE3: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.627 ns/B 1521 MiB/s 2.51 c/B STREAM dec | 0.626 ns/B 1523 MiB/s 2.50 c/B POLY1305 enc | 0.895 ns/B 1065 MiB/s 3.58 c/B POLY1305 dec | 0.896 ns/B 1064 MiB/s 3.58 c/B POLY1305 auth | 0.271 ns/B 3521 MiB/s 1.08 c/B After, 4-way SSSE3 (~20% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 0.733 ns/B 1301 MiB/s 2.93 c/B POLY1305 dec | 0.726 ns/B 1314 MiB/s 2.90 c/B Before, 1-way SSSE3: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 1.56 ns/B 609.6 MiB/s 6.25 c/B POLY1305 dec | 1.56 ns/B 609.4 MiB/s 6.26 c/B After, 1-way SSSE3 (~18% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 1.31 ns/B 725.4 MiB/s 5.26 c/B POLY1305 dec | 1.31 ns/B 727.3 MiB/s 5.24 c/B For comparison to other libraries (on Intel i7-4790K, 3998 Mhz): bench-slope-openssl: OpenSSL 1.1.1 11 Sep 2018 Cipher: chacha20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.301 ns/B 3166.4 MiB/s 1.20 c/B STREAM dec | 0.300 ns/B 3174.7 MiB/s 1.20 c/B POLY1305 enc | 0.463 ns/B 2060.6 MiB/s 1.85 c/B POLY1305 dec | 0.462 ns/B 2063.8 MiB/s 1.85 c/B POLY1305 auth | 0.162 ns/B 5899.3 MiB/s 0.646 c/B bench-slope-nettle: Nettle 3.4 Cipher: chacha | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 1.65 ns/B 578.2 MiB/s 6.59 c/B STREAM dec | 1.65 ns/B 578.2 MiB/s 6.59 c/B POLY1305 enc | 2.05 ns/B 464.8 MiB/s 8.20 c/B POLY1305 dec | 2.05 ns/B 464.7 MiB/s 8.20 c/B POLY1305 auth | 0.404 ns/B 2359.1 MiB/s 1.62 c/B bench-slope-botan: Botan 2.6.0 Cipher: ChaCha | nanosecs/byte mebibytes/sec cycles/byte STREAM enc/dec | 0.855 ns/B 1116.0 MiB/s 3.42 c/B POLY1305 enc | 1.60 ns/B 595.4 MiB/s 6.40 c/B POLY1305 dec | 1.60 ns/B 595.8 MiB/s 6.40 c/B POLY1305 auth | 0.752 ns/B 1268.3 MiB/s 
3.01 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 98320ca5f..16066bfc6 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -72,6 +72,7 @@ libcipher_la_SOURCES = \ EXTRA_libcipher_la_SOURCES = \ asm-common-amd64.h \ asm-common-aarch64.h \ + asm-poly1305-amd64.h \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ diff --git a/cipher/asm-poly1305-amd64.h b/cipher/asm-poly1305-amd64.h new file mode 100644 index 000000000..3f99ea3e1 --- /dev/null +++ b/cipher/asm-poly1305-amd64.h @@ -0,0 +1,171 @@ +/* asm-common-amd64.h - Poly1305 macros for AMD64 assembly + * + * Copyright (C) 2019 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef GCRY_ASM_POLY1305_AMD64_H +#define GCRY_ASM_POLY1305_AMD64_H + +#include "asm-common-amd64.h" + +/********************************************************************** + poly1305 for stitched chacha20-poly1305 AMD64 implementations + **********************************************************************/ + +#define POLY_RSTATE %r8 +#define POLY_RSRC %r9 + +#define POLY_R_H0 %rbx +#define POLY_R_H1 %rcx +#define POLY_R_H2 %r10 +#define POLY_R_H2d %r10d +#define POLY_R_R0 %r11 +#define POLY_R_R1_MUL5 %r12 +#define POLY_R_X0_HI %r13 +#define POLY_R_X0_LO %r14 +#define POLY_R_X1_HI %r15 +#define POLY_R_X1_LO %rsi + +#define POLY_S_R0 (4 * 4 + 0 * 8)(POLY_RSTATE) +#define POLY_S_R1 (4 * 4 + 1 * 8)(POLY_RSTATE) +#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE) +#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE) +#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE) + +#define POLY1305_LOAD_STATE() \ + movq POLY_S_H0, POLY_R_H0; \ + movq POLY_S_H1, POLY_R_H1; \ + movl POLY_S_H2d, POLY_R_H2d; \ + movq POLY_S_R0, POLY_R_R0; \ + movq POLY_S_R1, POLY_R_R1_MUL5; \ + shrq $2, POLY_R_R1_MUL5; \ + addq POLY_S_R1, POLY_R_R1_MUL5; + +#define POLY1305_STORE_STATE() \ + movq POLY_R_H0, POLY_S_H0; \ + movq POLY_R_H1, POLY_S_H1; \ + movl POLY_R_H2d, POLY_S_H2d; + +/* a = h + m */ +#define POLY1305_BLOCK_PART1(src_offset) \ + addq ((src_offset) + 0 * 8)(POLY_RSRC), POLY_R_H0; \ + adcq ((src_offset) + 1 * 8)(POLY_RSRC), POLY_R_H1; \ + adcl $1, POLY_R_H2d; \ + \ + /* h = a * r (partial mod 2^130-5): */ \ + \ + /* h0 * r1 */ \ + movq POLY_R_H0, %rax; \ + mulq POLY_S_R1; \ + movq %rax, POLY_R_X1_LO; \ + movq %rdx, POLY_R_X1_HI; + +#define POLY1305_BLOCK_PART2() \ + \ + /* h0 * r0 */ \ + movq POLY_R_H0, %rax; \ + mulq POLY_R_R0; \ + movq %rax, POLY_R_X0_LO; \ + movq %rdx, POLY_R_X0_HI; + +#define POLY1305_BLOCK_PART3() \ + \ + /* h1 * r0 */ \ + movq POLY_R_H1, %rax; \ + mulq POLY_R_R0; \ + addq %rax, POLY_R_X1_LO; \ + adcq %rdx, POLY_R_X1_HI; \ + \ + /* h1 * r1 mod 2^130-5 */ \ + movq POLY_R_R1_MUL5, %rax; \ + mulq POLY_R_H1; + +#define POLY1305_BLOCK_PART4() \ + movq POLY_R_H2, POLY_R_H1; \ 
+ imulq POLY_R_R1_MUL5, POLY_R_H1; /* h2 * r1 mod 2^130-5 */ \ + addq %rax, POLY_R_X0_LO; \ + adcq %rdx, POLY_R_X0_HI; \ + imulq POLY_R_R0, POLY_R_H2; /* h2 * r0 */ \ + addq POLY_R_X1_LO, POLY_R_H1; \ + adcq POLY_R_X1_HI, POLY_R_H2; + +#define POLY1305_BLOCK_PART5() \ + \ + /* carry propagation */ \ + movq POLY_R_H2, POLY_R_H0; \ + andl $3, POLY_R_H2d; \ + shrq $2, POLY_R_H0; \ + leaq (POLY_R_H0, POLY_R_H0, 4), POLY_R_H0; \ + addq POLY_R_X0_LO, POLY_R_H0; \ + adcq POLY_R_X0_HI, POLY_R_H1; \ + adcl $0, POLY_R_H2d; + +#ifdef TESTING_POLY1305_ASM +/* for testing only, mixed C/asm poly1305.c is marginally faster (~2%). */ +.align 8 +.globl _gcry_poly1305_amd64_ssse3_blocks1 +ELF(.type _gcry_poly1305_amd64_ssse3_blocks1, at function;) + +_gcry_poly1305_amd64_ssse3_blocks1: + /* input: + * %rdi: poly1305-state + * %rsi: src + * %rdx: nblks + */ + pushq %rbp; + movq %rsp, %rbp; + + subq $(10 * 8), %rsp; + movq %rbx, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + movq %r14, (4 * 8)(%rsp); + movq %r15, (5 * 8)(%rsp); + + movq %rdx, (8 * 8)(%rsp); # NBLKS + + movq %rdi, POLY_RSTATE; + movq %rsi, POLY_RSRC; + + POLY1305_LOAD_STATE(); + +.L_poly1: + POLY1305_BLOCK_PART1(0 * 16); + POLY1305_BLOCK_PART2(); + POLY1305_BLOCK_PART3(); + POLY1305_BLOCK_PART4(); + POLY1305_BLOCK_PART5(); + + subq $1, (8 * 8)(%rsp); # NBLKS + leaq (16)(POLY_RSRC), POLY_RSRC; + jnz .L_poly1; + + POLY1305_STORE_STATE(); + + movq (1 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + movq (4 * 8)(%rsp), %r14; + movq (5 * 8)(%rsp), %r15; + + xorl %eax, %eax; + leave + ret; +#endif + +#endif /* GCRY_ASM_POLY1305_AMD64_H */ diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index dad9e3e96..ef02c1733 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -1,7 +1,6 @@ /* chacha20-amd64-avx2.S - AVX2 implementation of ChaCha20 cipher * - - * Copyright (C) 2017,2018 Jussi Kivilinna + * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -36,17 +35,8 @@ .text -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" +#include "asm-poly1305-amd64.h" /* register macros */ #define INPUT %rdi @@ -139,15 +129,21 @@ #define PLUS(ds,s) \ vpaddd s, ds, ds; -#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1) \ - vbroadcasti128 .Lshuf_rol16 RIP, tmp1; \ +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\ + interleave_op1,interleave_op2,\ + interleave_op3,interleave_op4) \ + vbroadcasti128 .Lshuf_rol16 rRIP, tmp1; \ + interleave_op1; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ + interleave_op2; \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12, tmp1); \ - vbroadcasti128 .Lshuf_rol8 RIP, tmp1; \ + vbroadcasti128 .Lshuf_rol8 rRIP, tmp1; \ + interleave_op3; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ + interleave_op4; \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1); @@ -189,12 +185,12 @@ _gcry_chacha20_amd64_avx2_blocks8: subq $STACK_MAX, %rsp; andq $~31, %rsp; -.Loop4: +.Loop8: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ - vpmovzxbd .Linc_counter RIP, X0; - vpbroadcastd .Lunsigned_cmp RIP, X2; + vpmovzxbd .Linc_counter rRIP, X0; + vpbroadcastd .Lunsigned_cmp rRIP, X2; vpbroadcastd (12 * 4)(INPUT), X12; vpbroadcastd (13 * 4)(INPUT), X13; vpaddd X0, X12, X12; @@ -223,14 +219,14 @@ _gcry_chacha20_amd64_avx2_blocks8: vmovdqa X15, (STACK_TMP)(%rsp); .Lround2: - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15) + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,,,,) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); - QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8) - QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8) + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,,,,) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,,,,) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15) + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,,,,) sub $2, ROUND; jnz .Lround2; @@ -302,7 +298,7 @@ _gcry_chacha20_amd64_avx2_blocks8: sub $8, NBLKS; lea (8 * 64)(DST), DST; lea (8 * 64)(SRC), SRC; - jnz .Loop4; + jnz .Loop8; /* clear the used vector registers and stack */ vpxor X0, X0, X0; @@ -319,5 +315,438 @@ _gcry_chacha20_amd64_avx2_blocks8: ELF(.size _gcry_chacha20_amd64_avx2_blocks8, .-_gcry_chacha20_amd64_avx2_blocks8;) +/********************************************************************** + 8-way stitched chacha20-poly1305 + **********************************************************************/ + +.align 8 +.globl _gcry_chacha20_poly1305_amd64_avx2_blocks8 +ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8, at function;) + +_gcry_chacha20_poly1305_amd64_avx2_blocks8: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 8) + * %r9: poly1305-state + * %r8: poly1305-src + */ + + pushq %rbp; + movq %rsp, %rbp; + + vzeroupper; + + subq $(8 * 8) + STACK_MAX + 32, %rsp; + andq $~31, %rsp; + + movq %rbx, (STACK_MAX + 0 * 8)(%rsp); + movq %r12, (STACK_MAX + 1 * 8)(%rsp); + movq %r13, (STACK_MAX + 2 * 8)(%rsp); + movq %r14, (STACK_MAX + 3 * 8)(%rsp); + movq %r15, (STACK_MAX + 4 * 8)(%rsp); + + movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC + movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST + movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS + + /* Load state */ + POLY1305_LOAD_STATE(); + 
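+	/* Each pass of .Loop_poly8 below generates 8 ChaCha20 blocks (512
+	 * bytes) and interleaves the Poly1305 update for the matching 32
+	 * 16-byte Poly1305 blocks: the five POLY1305_BLOCK_PART steps of
+	 * each block are spread over the interleave slots of the
+	 * QUARTERROUND2 calls (10 double-rounds x 4 calls x 4 slots = 160
+	 * slots = 32 blocks x 5 parts). */
+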
+.Loop_poly8: + + /* Construct counter vectors X12 and X13 */ + vpmovzxbd .Linc_counter rRIP, X0; + vpbroadcastd .Lunsigned_cmp rRIP, X2; + vpbroadcastd (12 * 4)(INPUT), X12; + vpbroadcastd (13 * 4)(INPUT), X13; + vpaddd X0, X12, X12; + vpxor X2, X0, X0; + vpxor X2, X12, X1; + vpcmpgtd X1, X0, X0; + vpsubd X0, X13, X13; + vmovdqa X12, (STACK_VEC_X12)(%rsp); + vmovdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + vpbroadcastd (0 * 4)(INPUT), X0; + vpbroadcastd (1 * 4)(INPUT), X1; + vpbroadcastd (2 * 4)(INPUT), X2; + vpbroadcastd (3 * 4)(INPUT), X3; + vpbroadcastd (4 * 4)(INPUT), X4; + vpbroadcastd (5 * 4)(INPUT), X5; + vpbroadcastd (6 * 4)(INPUT), X6; + vpbroadcastd (7 * 4)(INPUT), X7; + vpbroadcastd (8 * 4)(INPUT), X8; + vpbroadcastd (9 * 4)(INPUT), X9; + vpbroadcastd (10 * 4)(INPUT), X10; + vpbroadcastd (11 * 4)(INPUT), X11; + vpbroadcastd (14 * 4)(INPUT), X14; + vpbroadcastd (15 * 4)(INPUT), X15; + vmovdqa X15, (STACK_TMP)(%rsp); + + # rounds 0,1 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(1 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(2 * 16), + POLY1305_BLOCK_PART2()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(3 * 16)) + + # rounds 2,3 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART1(4 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(5 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(6 * 16), + POLY1305_BLOCK_PART2()) + + # rounds 4,5 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(7 * 16)) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART1(8 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(9 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + + # rounds 6,7 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + 
POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(10 * 16), + POLY1305_BLOCK_PART2()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(11 * 16)) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART1(12 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + + # rounds 8,9 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(13 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(14 * 16), + POLY1305_BLOCK_PART2()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(15 * 16)) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + + # rounds 10,11 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART1(16 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(17 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(18 * 16), + POLY1305_BLOCK_PART2()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(19 * 16)) + + # rounds 12,13 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART1(20 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(21 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(22 * 16), + POLY1305_BLOCK_PART2()) + + # rounds 14,15 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(23 * 16)) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, 
X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART1(24 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(25 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + + # rounds 16,17 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(26 * 16), + POLY1305_BLOCK_PART2()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(27 * 16)) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART1(28 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + + # rounds 18,19 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(29 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(30 * 16), + POLY1305_BLOCK_PART2()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(31 * 16)) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + + /* tmp := X15 */ + vpbroadcastd (0 * 4)(INPUT), X15; + PLUS(X0, X15); + vpbroadcastd (1 * 4)(INPUT), X15; + PLUS(X1, X15); + vpbroadcastd (2 * 4)(INPUT), X15; + PLUS(X2, X15); + vpbroadcastd (3 * 4)(INPUT), X15; + PLUS(X3, X15); + vpbroadcastd (4 * 4)(INPUT), X15; + PLUS(X4, X15); + vpbroadcastd (5 * 4)(INPUT), X15; + PLUS(X5, X15); + vpbroadcastd (6 * 4)(INPUT), X15; + PLUS(X6, X15); + vpbroadcastd (7 * 4)(INPUT), X15; + PLUS(X7, X15); + vpbroadcastd (8 * 4)(INPUT), X15; + PLUS(X8, X15); + vpbroadcastd (9 * 4)(INPUT), X15; + PLUS(X9, X15); + vpbroadcastd (10 * 4)(INPUT), X15; + PLUS(X10, X15); + vpbroadcastd (11 * 4)(INPUT), X15; + PLUS(X11, X15); + vmovdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + vmovdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X13, (STACK_TMP)(%rsp); + vpbroadcastd (14 * 4)(INPUT), X13; + PLUS(X14, X13); + vmovdqa X14, (STACK_TMP1)(%rsp); + vpbroadcastd (15 * 4)(INPUT), X13; + PLUS(X15, X13); + vmovdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $8, (12 * 4)(INPUT); + + movq (STACK_MAX + 5 * 8)(%rsp), SRC; + movq (STACK_MAX + 6 * 8)(%rsp), DST; + + transpose_4x4(X0, X1, X2, X3, X13, X14); + transpose_4x4(X4, X5, X6, X7, X13, X14); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); + 
BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); + vmovdqa (STACK_TMP)(%rsp), X13; + vmovdqa (STACK_TMP1)(%rsp), X14; + vmovdqa (STACK_TMP2)(%rsp), X15; + transpose_4x4(X8, X9, X10, X11, X0, X1); + transpose_4x4(X12, X13, X14, X15, X0, X1); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0); + + subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS + + lea (32 * 16)(POLY_RSRC), POLY_RSRC; + lea (8 * 64)(DST), DST; + lea (8 * 64)(SRC), SRC; + movq SRC, (STACK_MAX + 5 * 8)(%rsp); + movq DST, (STACK_MAX + 6 * 8)(%rsp); + + jnz .Loop_poly8; + + /* Store state */ + POLY1305_STORE_STATE(); + + /* clear the used vector registers and stack */ + vpxor X0, X0, X0; + vmovdqa X0, (STACK_VEC_X12)(%rsp); + vmovdqa X0, (STACK_VEC_X13)(%rsp); + vmovdqa X0, (STACK_TMP)(%rsp); + vmovdqa X0, (STACK_TMP1)(%rsp); + vmovdqa X0, (STACK_TMP2)(%rsp); + vzeroall; + + movq (STACK_MAX + 0 * 8)(%rsp), %rbx; + movq (STACK_MAX + 1 * 8)(%rsp), %r12; + movq (STACK_MAX + 2 * 8)(%rsp), %r13; + movq (STACK_MAX + 3 * 8)(%rsp), %r14; + movq (STACK_MAX + 4 * 8)(%rsp), %r15; + + xorl %eax, %eax; + leave; + ret; +ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8, + .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;) + #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 0e59ff981..d7faf6442 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -1,6 +1,6 @@ /* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher * - * Copyright (C) 2017,2018 Jussi Kivilinna + * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -35,17 +35,8 @@ .text -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" +#include "asm-poly1305-amd64.h" /* register macros */ #define INPUT %rdi @@ -145,13 +136,16 @@ #define PLUS(ds,s) \ paddd s, ds; -#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ - movdqa .Lshuf_rol16 RIP, tmp1; \ +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\ + interleave_op1,interleave_op2) \ + movdqa .Lshuf_rol16 rRIP, tmp1; \ + interleave_op1; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12, tmp1, tmp2); \ - movdqa .Lshuf_rol8 RIP, tmp1; \ + movdqa .Lshuf_rol8 rRIP, tmp1; \ + interleave_op2; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ @@ -192,8 +186,8 @@ _gcry_chacha20_amd64_ssse3_blocks4: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ - movdqa .Linc_counter RIP, X0; - movdqa .Lunsigned_cmp RIP, X2; + movdqa .Linc_counter rRIP, X0; + movdqa .Lunsigned_cmp rRIP, X2; pbroadcastd((12 * 4)(INPUT), X12); pbroadcastd((13 * 4)(INPUT), X13); paddd X0, X12; @@ -224,18 +218,18 @@ _gcry_chacha20_amd64_ssse3_blocks4: movdqa X15, (STACK_TMP1)(%rsp); .Lround2_4: - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,,) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); - QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) - QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,,) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,,) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,,) sub $2, ROUND; jnz .Lround2_4; @@ -380,9 +374,9 @@ _gcry_chacha20_amd64_ssse3_blocks1: */ /* Load constants */ - movdqa .Lcounter1 RIP, X4; - movdqa .Lshuf_rol8 RIP, X5; - movdqa .Lshuf_rol16 RIP, X6; + movdqa .Lcounter1 rRIP, X4; + movdqa .Lshuf_rol8 rRIP, X5; + movdqa .Lshuf_rol16 rRIP, X6; /* Load state */ movdqu (0 * 4)(INPUT), X10; @@ -445,5 +439,570 @@ _gcry_chacha20_amd64_ssse3_blocks1: ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, .-_gcry_chacha20_amd64_ssse3_blocks1;) +/********************************************************************** + 4-way stitched chacha20-poly1305 + **********************************************************************/ + +.align 8 +.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4 +ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4, at function;) + +_gcry_chacha20_poly1305_amd64_ssse3_blocks4: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 4) + * %r9: poly1305-state + * %r8: poly1305-src + */ + + pushq %rbp; + movq %rsp, %rbp; + + subq $(8 * 8) + STACK_MAX + 16, %rsp; + andq $~15, %rsp; + + movq %rbx, (STACK_MAX + 0 * 8)(%rsp); + movq %r12, (STACK_MAX + 1 * 8)(%rsp); + movq %r13, (STACK_MAX + 2 * 8)(%rsp); + movq %r14, (STACK_MAX + 3 * 8)(%rsp); + movq %r15, (STACK_MAX + 4 * 8)(%rsp); + + movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC + movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST + movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS + + /* Load state */ + 
POLY1305_LOAD_STATE(); + +.Loop_poly4: + + /* Construct counter vectors X12 and X13 */ + movdqa .Linc_counter rRIP, X0; + movdqa .Lunsigned_cmp rRIP, X2; + pbroadcastd((12 * 4)(INPUT), X12); + pbroadcastd((13 * 4)(INPUT), X13); + paddd X0, X12; + movdqa X12, X1; + pxor X2, X0; + pxor X2, X1; + pcmpgtd X1, X0; + psubd X0, X13; + movdqa X12, (STACK_VEC_X12)(%rsp); + movdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + pbroadcastd((0 * 4)(INPUT), X0); + pbroadcastd((1 * 4)(INPUT), X1); + pbroadcastd((2 * 4)(INPUT), X2); + pbroadcastd((3 * 4)(INPUT), X3); + pbroadcastd((4 * 4)(INPUT), X4); + pbroadcastd((5 * 4)(INPUT), X5); + pbroadcastd((6 * 4)(INPUT), X6); + pbroadcastd((7 * 4)(INPUT), X7); + pbroadcastd((8 * 4)(INPUT), X8); + pbroadcastd((9 * 4)(INPUT), X9); + pbroadcastd((10 * 4)(INPUT), X10); + pbroadcastd((11 * 4)(INPUT), X11); + pbroadcastd((14 * 4)(INPUT), X14); + pbroadcastd((15 * 4)(INPUT), X15); + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + + /* rounds 0,1 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(1 * 16)) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + + /* rounds 2,3 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART1(2 * 16), + POLY1305_BLOCK_PART2()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(3 * 16)) + + /* rounds 4,5 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART1(4 * 16), + POLY1305_BLOCK_PART2()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + + /* rounds 6,7 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(5 * 16)) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); 
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART1(6 * 16), + POLY1305_BLOCK_PART2()) + + /* rounds 8,9 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(7 * 16)) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + + /* rounds 10,11 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART1(8 * 16), + POLY1305_BLOCK_PART2()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(9 * 16)) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + + /* rounds 12,13 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART1(10 * 16), + POLY1305_BLOCK_PART2()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(11 * 16)) + + /* rounds 14,15 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART1(12 * 16), + POLY1305_BLOCK_PART2()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + + /* 
rounds 16,17 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(13 * 16)) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART1(14 * 16), + POLY1305_BLOCK_PART2()) + + /* rounds 18,19 */ + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART1(15 * 16)) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3()) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5()) + + /* tmp := X15 */ + movdqa (STACK_TMP)(%rsp), X11; + pbroadcastd((0 * 4)(INPUT), X15); + PLUS(X0, X15); + pbroadcastd((1 * 4)(INPUT), X15); + PLUS(X1, X15); + pbroadcastd((2 * 4)(INPUT), X15); + PLUS(X2, X15); + pbroadcastd((3 * 4)(INPUT), X15); + PLUS(X3, X15); + pbroadcastd((4 * 4)(INPUT), X15); + PLUS(X4, X15); + pbroadcastd((5 * 4)(INPUT), X15); + PLUS(X5, X15); + pbroadcastd((6 * 4)(INPUT), X15); + PLUS(X6, X15); + pbroadcastd((7 * 4)(INPUT), X15); + PLUS(X7, X15); + pbroadcastd((8 * 4)(INPUT), X15); + PLUS(X8, X15); + pbroadcastd((9 * 4)(INPUT), X15); + PLUS(X9, X15); + pbroadcastd((10 * 4)(INPUT), X15); + PLUS(X10, X15); + pbroadcastd((11 * 4)(INPUT), X15); + PLUS(X11, X15); + movdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + movdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + movdqa X13, (STACK_TMP)(%rsp); + pbroadcastd((14 * 4)(INPUT), X15); + PLUS(X14, X15); + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X14, (STACK_TMP1)(%rsp); + pbroadcastd((15 * 4)(INPUT), X13); + PLUS(X15, X13); + movdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $4, (12 * 4)(INPUT); + + movq (STACK_MAX + 5 * 8)(%rsp), SRC; + movq (STACK_MAX + 6 * 8)(%rsp), DST; + + transpose_4x4(X0, X1, X2, X3, X13, X14, X15); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); + transpose_4x4(X4, X5, X6, X7, X0, X1, X2); + movdqa (STACK_TMP)(%rsp), X13; + movdqa (STACK_TMP1)(%rsp), X14; + movdqa (STACK_TMP2)(%rsp), X15; + xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); + transpose_4x4(X8, X9, X10, X11, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); + xor_src_dst(DST, SRC, (64 
* 3 + 16 * 2), X11, X0); + transpose_4x4(X12, X13, X14, X15, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); + + subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS + + lea (16 * 16)(POLY_RSRC), POLY_RSRC; + lea (4 * 64)(DST), DST; + lea (4 * 64)(SRC), SRC; + movq SRC, (STACK_MAX + 5 * 8)(%rsp); + movq DST, (STACK_MAX + 6 * 8)(%rsp); + + jnz .Loop_poly4; + + /* Store state */ + POLY1305_STORE_STATE(); + + /* clear the used vector registers and stack */ + clear(X0); + movdqa X0, (STACK_VEC_X12)(%rsp); + movdqa X0, (STACK_VEC_X13)(%rsp); + movdqa X0, (STACK_TMP)(%rsp); + movdqa X0, (STACK_TMP1)(%rsp); + movdqa X0, (STACK_TMP2)(%rsp); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + movq (STACK_MAX + 0 * 8)(%rsp), %rbx; + movq (STACK_MAX + 1 * 8)(%rsp), %r12; + movq (STACK_MAX + 2 * 8)(%rsp), %r13; + movq (STACK_MAX + 3 * 8)(%rsp), %r14; + movq (STACK_MAX + 4 * 8)(%rsp), %r15; + + xorl %eax, %eax; + leave; + ret; +ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, + .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) + +/********************************************************************** + 1-way stitched chacha20-poly1305 + **********************************************************************/ + +.align 8 +.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1 +ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1, at function;) + +_gcry_chacha20_poly1305_amd64_ssse3_blocks1: + /* input: + * %rdi: chacha20-state + * %rsi: dst + * %rdx: src + * %rcx: nblks + * %r9: poly1305-state + * %r8: poly1305-src + */ + pushq %rbp; + movq %rsp, %rbp; + + subq $(8 * 8), %rsp; + movq %rbx, (0 * 8)(%rsp); + movq %r12, (1 * 8)(%rsp); + movq %r13, (2 * 8)(%rsp); + movq %r14, (3 * 8)(%rsp); + movq %r15, (4 * 8)(%rsp); + + movq %rdx, (5 * 8)(%rsp); # SRC + movq %rsi, (6 * 8)(%rsp); # DST + movq %rcx, (7 * 8)(%rsp); # NBLKS + + /* Load constants */ + movdqa .Lcounter1 rRIP, X4; + movdqa .Lshuf_rol8 rRIP, X5; + movdqa .Lshuf_rol16 rRIP, X6; + + /* Load state */ + movdqu (0 * 4)(INPUT), X10; + movdqu (4 * 4)(INPUT), X11; + movdqu (8 * 4)(INPUT), X12; + movdqu (12 * 4)(INPUT), X13; + + POLY1305_LOAD_STATE(); + +.Loop_poly1: + movdqa X10, X0; + movdqa X11, X1; + movdqa X12, X2; + movdqa X13, X3; + + /* Process one ChaCha20 block and four Poly1305 blocks. 
*/ + POLY1305_BLOCK_PART1(0 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(1 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART1(2 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(3 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + + movq (5 * 8)(%rsp), SRC; + movq (6 * 8)(%rsp), DST; + + PLUS(X0, X10); + PLUS(X1, X11); + PLUS(X2, X12); + PLUS(X3, X13); + + /* Update counter */ + paddq X4, X13; + + xor_src_dst(DST, SRC, 0 * 4, X0, X7); + xor_src_dst(DST, SRC, 4 * 4, X1, X7); + xor_src_dst(DST, SRC, 8 * 4, X2, X7); + xor_src_dst(DST, SRC, 12 * 4, X3, X7); + + subq $1, (7 * 8)(%rsp); # NBLKS + lea (64)(POLY_RSRC), POLY_RSRC; + lea (64)(SRC), SRC; + lea (64)(DST), DST; + movq SRC, (5 * 8)(%rsp); + movq DST, (6 * 8)(%rsp); + + jnz .Loop_poly1; + + /* Store state */ + POLY1305_STORE_STATE(); + + movdqu X13, (12 * 4)(INPUT); + + /* clear the used vector registers */ + clear(X0); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + + movq (0 * 8)(%rsp), %rbx; + movq (1 * 8)(%rsp), %r12; + movq (2 * 8)(%rsp), %r13; + movq (3 * 8)(%rsp), %r14; + movq (4 * 8)(%rsp), %r15; + + xorl %eax, %eax; + leave; + ret; +ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1, + .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;) + #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index f1afd18e0..0847c20ea 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -1,5 +1,5 @@ /* chacha20.c - Bernstein's ChaCha20 cipher - * Copyright (C) 2014,2017,2018 Jussi Kivilinna + * Copyright (C) 2014,2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -36,6 +36,7 @@ #include "types.h" #include "g10lib.h" #include "cipher.h" +#include "cipher-internal.h" #include "bufhelp.h" @@ -116,6 +117,14 @@ unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4( + u32 *state, byte *dst, const byte *src, size_t nblks, + void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; + +unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1( + u32 *state, byte *dst, const byte *src, size_t nblks, + void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; + #endif /* USE_SSSE3 */ #ifdef USE_AVX2 @@ -124,6 +133,10 @@ unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8( + u32 *state, byte *dst, const byte *src, size_t nblks, + void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; + #endif /* USE_AVX2 */ #ifdef USE_ARMV7_NEON @@ -402,39 +415,13 @@ chacha20_setkey (void *context, const byte *key, unsigned int keylen, } -static void -chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, - size_t length) +static unsigned int +do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, + const byte *inbuf, size_t length) { static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; - CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; unsigned int nburn, burn = 0; - if (!length) - return; - - if (ctx->unused) - { - unsigned char *p = ctx->pad; - size_t n; - - gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); - - n = ctx->unused; - if (n > length) - n = length; - - buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); - length -= n; - outbuf += n; - inbuf += n; - ctx->unused -= n; - - if (!length) - return; - gcry_assert (!ctx->unused); - } - #ifdef USE_AVX2 if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { @@ -510,7 +497,349 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, ctx->unused = CHACHA20_BLOCK_SIZE - length; } - _gcry_burn_stack (burn); + if (burn) + burn += 5 * sizeof(void *); + + return burn; +} + + +static void +chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, + size_t length) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + unsigned int nburn, burn = 0; + + if (!length) + return; + + if (ctx->unused) + { + unsigned char *p = ctx->pad; + size_t n; + + gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); + + n = ctx->unused; + if (n > length) + n = length; + + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); + length -= n; + outbuf += n; + inbuf += n; + ctx->unused -= n; + + if (!length) + return; + gcry_assert (!ctx->unused); + } + + nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); + burn = nburn > burn ? 
nburn : burn; + + if (burn) + _gcry_burn_stack (burn); +} + + +gcry_err_code_t +_gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, + const byte *inbuf, size_t length) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) &c->context.c; + unsigned int nburn, burn = 0; + byte *authptr = NULL; + + if (!length) + return 0; + + if (ctx->unused) + { + unsigned char *p = ctx->pad; + size_t n; + + gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); + + n = ctx->unused; + if (n > length) + n = length; + + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); + nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n); + burn = nburn > burn ? nburn : burn; + length -= n; + outbuf += n; + inbuf += n; + ctx->unused -= n; + + if (!length) + { + if (burn) + _gcry_burn_stack (burn); + + return 0; + } + gcry_assert (!ctx->unused); + } + + gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); + + if (0) + { } +#ifdef USE_AVX2 + else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) + { + nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 8 * CHACHA20_BLOCK_SIZE; + outbuf += 8 * CHACHA20_BLOCK_SIZE; + inbuf += 8 * CHACHA20_BLOCK_SIZE; + } +#endif +#ifdef USE_SSSE3 + else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) + { + nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 4 * CHACHA20_BLOCK_SIZE; + outbuf += 4 * CHACHA20_BLOCK_SIZE; + inbuf += 4 * CHACHA20_BLOCK_SIZE; + } + else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE) + { + nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 1 * CHACHA20_BLOCK_SIZE; + outbuf += 1 * CHACHA20_BLOCK_SIZE; + inbuf += 1 * CHACHA20_BLOCK_SIZE; + } +#endif + + if (authptr) + { + size_t authoffset = outbuf - authptr; + +#ifdef USE_AVX2 + if (ctx->use_avx2 && + length >= 8 * CHACHA20_BLOCK_SIZE && + authoffset >= 8 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + + nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_SSSE3 + if (ctx->use_ssse3) + { + if (length >= 4 * CHACHA20_BLOCK_SIZE && + authoffset >= 4 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + + nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } + + if (length >= CHACHA20_BLOCK_SIZE && + authoffset >= CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + + nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + burn = nburn > burn ? 
nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } + } +#endif + + if (authoffset > 0) + { + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset); + authptr += authoffset; + authoffset = 0; + } + + gcry_assert(authptr == outbuf); + } + + while (length) + { + size_t currlen = length; + + /* Since checksumming is done after encryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for checksumming. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); + burn = nburn > burn ? nburn : burn; + + nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, + currlen); + burn = nburn > burn ? nburn : burn; + + outbuf += currlen; + inbuf += currlen; + length -= currlen; + } + + if (burn) + _gcry_burn_stack (burn); + + return 0; +} + + +gcry_err_code_t +_gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, + const byte *inbuf, size_t length) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) &c->context.c; + unsigned int nburn, burn = 0; + + if (!length) + return 0; + + if (ctx->unused) + { + unsigned char *p = ctx->pad; + size_t n; + + gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); + + n = ctx->unused; + if (n > length) + n = length; + + nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n); + burn = nburn > burn ? nburn : burn; + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); + length -= n; + outbuf += n; + inbuf += n; + ctx->unused -= n; + + if (!length) + { + if (burn) + _gcry_burn_stack (burn); + + return 0; + } + gcry_assert (!ctx->unused); + } + + gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); + +#ifdef USE_AVX2 + if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + + nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_SSSE3 + if (ctx->use_ssse3) + { + if (length >= 4 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + + nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } + + if (length >= CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + + nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } + } +#endif + + while (length) + { + size_t currlen = length; + + /* Since checksumming is done before decryption, process input in 24KiB + * chunks to keep data loaded in L1 cache for decryption. */ + if (currlen > 24 * 1024) + currlen = 24 * 1024; + + nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, + currlen); + burn = nburn > burn ? 
nburn : burn; + + nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); + burn = nburn > burn ? nburn : burn; + + outbuf += currlen; + inbuf += currlen; + length -= currlen; + } + + if (burn) + _gcry_burn_stack (burn); + + return 0; } diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 898869623..78f05dbb5 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -542,6 +542,15 @@ void _gcry_cipher_poly1305_setkey /* */ (gcry_cipher_hd_t c); +/*-- chacha20.c --*/ +gcry_err_code_t _gcry_chacha20_poly1305_encrypt +/* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, + size_t length); +gcry_err_code_t _gcry_chacha20_poly1305_decrypt +/* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, + size_t length); + + /*-- cipher-ocb.c --*/ gcry_err_code_t _gcry_cipher_ocb_encrypt /* */ (gcry_cipher_hd_t c, diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c index 607586b55..bb475236b 100644 --- a/cipher/cipher-poly1305.c +++ b/cipher/cipher-poly1305.c @@ -164,6 +164,11 @@ _gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } + if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20)) + { + return _gcry_chacha20_poly1305_encrypt (c, outbuf, inbuf, inbuflen); + } + while (inbuflen) { size_t currlen = inbuflen; @@ -217,6 +222,11 @@ _gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; } + if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20)) + { + return _gcry_chacha20_poly1305_decrypt (c, outbuf, inbuf, inbuflen); + } + while (inbuflen) { size_t currlen = inbuflen; diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index 2405a090f..19cee5f6f 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -58,5 +58,7 @@ void _gcry_poly1305_finish (poly1305_context_t *ctx, void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *buf, size_t buflen); +unsigned int _gcry_poly1305_update_burn (poly1305_context_t *ctx, + const byte *m, size_t bytes); #endif /* G10_POLY1305_INTERNAL_H */ diff --git a/cipher/poly1305.c b/cipher/poly1305.c index 571f82862..8de6cd5e6 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -133,7 +133,7 @@ static void poly1305_init (poly1305_context_t *ctx, ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \ } while (0) -unsigned int +static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { @@ -337,7 +337,7 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \ } while (0) -unsigned int +static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { @@ -444,8 +444,9 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, #endif /* USE_MPI_32BIT */ -void -_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) +unsigned int +_gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m, + size_t bytes) { unsigned int burn = 0; @@ -460,7 +461,7 @@ _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) m += want; ctx->leftover += want; if (ctx->leftover < POLY1305_BLOCKSIZE) - return; + return 0; burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1); ctx->leftover = 0; } @@ -481,6 +482,17 @@ _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) ctx->leftover += bytes; } + return burn; +} + + +void +_gcry_poly1305_update 
(poly1305_context_t *ctx, const byte *m, size_t bytes) +{ + unsigned int burn; + + burn = _gcry_poly1305_update_burn (ctx, m, bytes); + if (burn) _gcry_burn_stack (burn); } From jussi.kivilinna at iki.fi Sun Jan 20 00:31:56 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 20 Jan 2019 01:31:56 +0200 Subject: bench-slope for other libraries... (Re: [PATCH 4/4] Add stitched ChaCha20-Poly1305 SSSE3 and AVX2 implementations) In-Reply-To: <154785095267.23435.15952785945759336425.stgit@localhost.localdomain> References: <154785093715.23435.1643032871224097043.stgit@localhost.localdomain> <154785095267.23435.15952785945759336425.stgit@localhost.localdomain> Message-ID: <24d16ceb-a0ef-ce5f-8191-aec458b1321f@iki.fi> Hello, On 19.1.2019 0.35, Jussi Kivilinna wrote: > For comparison to other libraries (on Intel i7-4790K, 3998 Mhz): > > bench-slope-openssl: OpenSSL 1.1.1 11 Sep 2018 > Cipher: > chacha20 | nanosecs/byte mebibytes/sec cycles/byte > STREAM enc | 0.301 ns/B 3166.4 MiB/s 1.20 c/B > STREAM dec | 0.300 ns/B 3174.7 MiB/s 1.20 c/B > POLY1305 enc | 0.463 ns/B 2060.6 MiB/s 1.85 c/B > POLY1305 dec | 0.462 ns/B 2063.8 MiB/s 1.85 c/B > POLY1305 auth | 0.162 ns/B 5899.3 MiB/s 0.646 c/B > > bench-slope-nettle: Nettle 3.4 > Cipher: > chacha | nanosecs/byte mebibytes/sec cycles/byte > STREAM enc | 1.65 ns/B 578.2 MiB/s 6.59 c/B > STREAM dec | 1.65 ns/B 578.2 MiB/s 6.59 c/B > POLY1305 enc | 2.05 ns/B 464.8 MiB/s 8.20 c/B > POLY1305 dec | 2.05 ns/B 464.7 MiB/s 8.20 c/B > POLY1305 auth | 0.404 ns/B 2359.1 MiB/s 1.62 c/B > > bench-slope-botan: Botan 2.6.0 > Cipher: > ChaCha | nanosecs/byte mebibytes/sec cycles/byte > STREAM enc/dec | 0.855 ns/B 1116.0 MiB/s 3.42 c/B > POLY1305 enc | 1.60 ns/B 595.4 MiB/s 6.40 c/B > POLY1305 dec | 1.60 ns/B 595.8 MiB/s 6.40 c/B > POLY1305 auth | 0.752 ns/B 1268.3 MiB/s 3.01 c/B These bench-slope versions are available at: https://github.com/jkivilin/bench-slopes Autoconf/Build system is not very polished but should work at least on latest Ubuntu. Building manually should not be too hard either (you need to compile slope.c + bench-slope-*.c|cpp). -Jussi -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 671 bytes Desc: OpenPGP digital signature URL: From jussi.kivilinna at iki.fi Mon Jan 21 21:59:11 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 21 Jan 2019 22:59:11 +0200 Subject: [PATCH 1/2] tests/bench-slope: add missing cipher context reset Message-ID: <154810435096.14673.10254021544852475355.stgit@localhost.localdomain> * tests/bench-slope.c (bench_encrypt_do_bench) (bench_decrypt_do_bench): Add call to 'gcry_cipher_reset'. -- Some non-AEAD results were negativily affected by missing state reset (~1% for aesni-ctr and chacha20-stream). 
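For illustration, the change amounts to the following per-measurement step (a
minimal sketch against the public libgcrypt API; the helper name is
illustrative and error handling is trimmed -- the real code is in
tests/bench-slope.c and covers the decrypt path the same way):

  #include <gcrypt.h>

  /* Reset the cipher state (IV/counter position) before every timed run so
   * that successive measurements start from identical state instead of a
   * counter position carried over from the previous run. */
  static int
  bench_encrypt_step (gcry_cipher_hd_t hd, void *buf, size_t buflen)
  {
    int err;

    err = gcry_cipher_reset (hd);
    if (!err)
      err = gcry_cipher_encrypt (hd, buf, buflen, buf, buflen);

    return err ? -1 : 0;
  }
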
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 5c64f229d..07282b786 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -859,7 +859,9 @@ bench_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) gcry_cipher_hd_t hd = obj->hd; int err; - err = gcry_cipher_encrypt (hd, buf, buflen, buf, buflen); + err = gcry_cipher_reset (hd); + if (!err) + err = gcry_cipher_encrypt (hd, buf, buflen, buf, buflen); if (err) { fprintf (stderr, PGM ": gcry_cipher_encrypt failed: %s\n", @@ -875,7 +877,9 @@ bench_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) gcry_cipher_hd_t hd = obj->hd; int err; - err = gcry_cipher_decrypt (hd, buf, buflen, buf, buflen); + err = gcry_cipher_reset (hd); + if (!err) + err = gcry_cipher_decrypt (hd, buf, buflen, buf, buflen); if (err) { fprintf (stderr, PGM ": gcry_cipher_encrypt failed: %s\n", From jussi.kivilinna at iki.fi Mon Jan 21 21:59:16 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 21 Jan 2019 22:59:16 +0200 Subject: [PATCH 2/2] tests/bench-slope: prevent auto-mhz detection getting stuck In-Reply-To: <154810435096.14673.10254021544852475355.stgit@localhost.localdomain> References: <154810435096.14673.10254021544852475355.stgit@localhost.localdomain> Message-ID: <154810435612.14673.4924095282245421571.stgit@localhost.localdomain> * cipher/bench-slope.c (bench_ghz, bench_ghz_diff): New static variables. (AUTO_GHZ_TARGET_DIFF): New macro. (do_slope_benchmark): Reduce target auto-mhz accuracy after repeated failures. (bench_print_result_csv, bench_print_result_std): Print auto-ghz different if 1 Mhz or more. (do_slope_benchmark, bench_print_result_csv, bench_print_result_std) (bench_print_result): Remove 'bench_ghz' parameter. (cipher_bench_one, hash_bench_one, mac_bench_one) (kdf_bench_one): Remove 'bench_ghz' variable. -- This patch prevents auto-mhz detection getting stuck on systems with high load or unstable CPU frequency. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 07282b786..2ead3c9e2 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -64,6 +64,13 @@ static char *current_algo_name; static char *current_mode_name; +/* Currently used CPU Ghz (either user input or auto-detected. */ +static double bench_ghz; + +/* Current accuracy of auto-detected CPU Ghz. */ +static double bench_ghz_diff; + + /*************************************** Default parameters for measurements. */ /* Start at small buffer size, to get reasonable timer calibration for fast @@ -82,6 +89,9 @@ static char *current_mode_name; * measurements is selected as data point further analysis. */ #define NUM_MEASUREMENT_REPETITIONS 64 +/* Target accuracy for auto-detected CPU Ghz. */ +#define AUTO_GHZ_TARGET_DIFF (5e-5) + /**************************************************** High-resolution timers. */ /* This benchmarking module needs needs high resolution timer. 
*/ @@ -540,7 +550,7 @@ get_auto_ghz (void) double -do_slope_benchmark (struct bench_obj *obj, double *bench_ghz) +do_slope_benchmark (struct bench_obj *obj) { double ret; @@ -550,14 +560,17 @@ do_slope_benchmark (struct bench_obj *obj, double *bench_ghz) ret = slope_benchmark (obj); - *bench_ghz = cpu_ghz; + bench_ghz = cpu_ghz; + bench_ghz_diff = 0; } else { + double target_diff = AUTO_GHZ_TARGET_DIFF; double cpu_auto_ghz_before; double cpu_auto_ghz_after; double nsecs_per_iteration; double diff; + unsigned int try_count = 0; /* Perform measurement with CPU frequency autodetection. */ @@ -565,6 +578,15 @@ do_slope_benchmark (struct bench_obj *obj, double *bench_ghz) { /* Repeat measurement until CPU turbo frequency has stabilized. */ + if (try_count++ > 4) + { + /* Too much frequency instability on the system, relax target + * accuracy. */ + + try_count = 0; + target_diff *= 2; + } + cpu_auto_ghz_before = get_auto_ghz (); nsecs_per_iteration = slope_benchmark (obj); @@ -574,11 +596,12 @@ do_slope_benchmark (struct bench_obj *obj, double *bench_ghz) diff = 1.0 - (cpu_auto_ghz_before / cpu_auto_ghz_after); diff = diff < 0 ? -diff : diff; } - while (diff > 5e-5); + while (diff > target_diff); ret = nsecs_per_iteration; - *bench_ghz = cpu_auto_ghz_after; + bench_ghz = (cpu_auto_ghz_before + cpu_auto_ghz_after) / 2; + bench_ghz_diff = diff; } return ret; @@ -605,14 +628,16 @@ double_to_str (char *out, size_t outlen, double value) } static void -bench_print_result_csv (double nsecs_per_byte, double bench_ghz) +bench_print_result_csv (double nsecs_per_byte) { double cycles_per_byte, mbytes_per_sec; char nsecpbyte_buf[16]; char mbpsec_buf[16]; char cpbyte_buf[16]; char mhz_buf[16]; + char mhz_diff_buf[32]; + strcpy (mhz_diff_buf, ""); *cpbyte_buf = 0; *mhz_buf = 0; @@ -624,6 +649,11 @@ bench_print_result_csv (double nsecs_per_byte, double bench_ghz) cycles_per_byte = nsecs_per_byte * bench_ghz; double_to_str (cpbyte_buf, sizeof (cpbyte_buf), cycles_per_byte); double_to_str (mhz_buf, sizeof (mhz_buf), bench_ghz * 1000); + if (auto_ghz && bench_ghz_diff * 1000 >= 1) + { + snprintf(mhz_diff_buf, sizeof(mhz_diff_buf), ",%.0f,Mhz-diff", + bench_ghz_diff * 1000); + } } mbytes_per_sec = @@ -633,14 +663,15 @@ bench_print_result_csv (double nsecs_per_byte, double bench_ghz) /* We print two empty fields to allow for future enhancements. */ if (auto_ghz) { - printf ("%s,%s,%s,,,%s,ns/B,%s,MiB/s,%s,c/B,%s,Mhz\n", + printf ("%s,%s,%s,,,%s,ns/B,%s,MiB/s,%s,c/B,%s,Mhz%s\n", current_section_name, current_algo_name? current_algo_name : "", current_mode_name? 
current_mode_name : "", nsecpbyte_buf, mbpsec_buf, cpbyte_buf, - mhz_buf); + mhz_buf, + mhz_diff_buf); } else { @@ -655,13 +686,16 @@ bench_print_result_csv (double nsecs_per_byte, double bench_ghz) } static void -bench_print_result_std (double nsecs_per_byte, double bench_ghz) +bench_print_result_std (double nsecs_per_byte) { double cycles_per_byte, mbytes_per_sec; char nsecpbyte_buf[16]; char mbpsec_buf[16]; char cpbyte_buf[16]; char mhz_buf[16]; + char mhz_diff_buf[32]; + + strcpy (mhz_diff_buf, ""); double_to_str (nsecpbyte_buf, sizeof (nsecpbyte_buf), nsecs_per_byte); @@ -671,6 +705,11 @@ bench_print_result_std (double nsecs_per_byte, double bench_ghz) cycles_per_byte = nsecs_per_byte * bench_ghz; double_to_str (cpbyte_buf, sizeof (cpbyte_buf), cycles_per_byte); double_to_str (mhz_buf, sizeof (mhz_buf), bench_ghz * 1000); + if (auto_ghz && bench_ghz_diff * 1000 >= 0.5) + { + snprintf(mhz_diff_buf, sizeof(mhz_diff_buf), "?%.0f", + bench_ghz_diff * 1000); + } } else { @@ -684,8 +723,8 @@ bench_print_result_std (double nsecs_per_byte, double bench_ghz) if (auto_ghz) { - printf ("%9s ns/B %9s MiB/s %9s c/B %9s\n", - nsecpbyte_buf, mbpsec_buf, cpbyte_buf, mhz_buf); + printf ("%9s ns/B %9s MiB/s %9s c/B %9s%s\n", + nsecpbyte_buf, mbpsec_buf, cpbyte_buf, mhz_buf, mhz_diff_buf); } else { @@ -695,12 +734,12 @@ bench_print_result_std (double nsecs_per_byte, double bench_ghz) } static void -bench_print_result (double nsecs_per_byte, double bench_ghz) +bench_print_result (double nsecs_per_byte) { if (csv_mode) - bench_print_result_csv (nsecs_per_byte, bench_ghz); + bench_print_result_csv (nsecs_per_byte); else - bench_print_result_std (nsecs_per_byte, bench_ghz); + bench_print_result_std (nsecs_per_byte); } static void @@ -1520,7 +1559,6 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) struct bench_cipher_mode mode = *pmode; struct bench_obj obj = { 0 }; double result; - double bench_ghz; unsigned int blklen; mode.algo = algo; @@ -1565,9 +1603,9 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) obj.ops = mode.ops; obj.priv = &mode; - result = do_slope_benchmark (&obj, &bench_ghz); + result = do_slope_benchmark (&obj); - bench_print_result (result, bench_ghz); + bench_print_result (result); } @@ -1685,7 +1723,6 @@ hash_bench_one (int algo, struct bench_hash_mode *pmode) { struct bench_hash_mode mode = *pmode; struct bench_obj obj = { 0 }; - double bench_ghz; double result; mode.algo = algo; @@ -1698,9 +1735,9 @@ hash_bench_one (int algo, struct bench_hash_mode *pmode) obj.ops = mode.ops; obj.priv = &mode; - result = do_slope_benchmark (&obj, &bench_ghz); + result = do_slope_benchmark (&obj); - bench_print_result (result, bench_ghz); + bench_print_result (result); } static void @@ -1852,7 +1889,6 @@ mac_bench_one (int algo, struct bench_mac_mode *pmode) { struct bench_mac_mode mode = *pmode; struct bench_obj obj = { 0 }; - double bench_ghz; double result; mode.algo = algo; @@ -1865,9 +1901,9 @@ mac_bench_one (int algo, struct bench_mac_mode *pmode) obj.ops = mode.ops; obj.priv = &mode; - result = do_slope_benchmark (&obj, &bench_ghz); + result = do_slope_benchmark (&obj); - bench_print_result (result, bench_ghz); + bench_print_result (result); } static void @@ -1970,7 +2006,6 @@ kdf_bench_one (int algo, int subalgo) struct bench_obj obj = { 0 }; double nsecs_per_iteration; double cycles_per_iteration; - double bench_ghz; char algo_name[32]; char nsecpiter_buf[16]; char cpiter_buf[16]; @@ -2008,7 +2043,7 @@ kdf_bench_one (int algo, int subalgo) obj.ops = mode.ops; 
obj.priv = &mode; - nsecs_per_iteration = do_slope_benchmark (&obj, &bench_ghz); + nsecs_per_iteration = do_slope_benchmark (&obj); strcpy(cpiter_buf, csv_mode ? "" : "-"); strcpy(mhz_buf, csv_mode ? "" : "-"); From jussi.kivilinna at iki.fi Mon Jan 21 22:01:01 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 21 Jan 2019 23:01:01 +0200 Subject: [PATCH] chacha20-amd64-avx2: optimize output xoring Message-ID: <154810446170.15440.12619479282908244271.stgit@localhost.localdomain> * cipher/chacha20-amd64-avx2.S (STACK_TMP2): Remove. (transpose_16byte_2x2, xor_src_dst): New. (BUF_XOR_256_TO_128): Remove. (_gcry_chaha20_amd64_avx2_blocks8) (_gcry_chacha20_poly1305_amd64_avx2_blocks8): Replace BUF_XOR_256_TO_128 with transpose_16byte_2x2/xor_src_dst; Reduce stack usage; Better interleave chacha20 state merging and output xoring. -- Benchmark on Intel i7-4790K: Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.314 ns/B 3035 MiB/s 1.26 c/B 3998 STREAM dec | 0.314 ns/B 3037 MiB/s 1.26 c/B 3998 POLY1305 enc | 0.451 ns/B 2117 MiB/s 1.80 c/B 3998 POLY1305 dec | 0.441 ns/B 2162 MiB/s 1.76 c/B 3998 After: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.309 ns/B 3086 MiB/s 1.24 c/B 3998 STREAM dec | 0.309 ns/B 3083 MiB/s 1.24 c/B 3998 POLY1305 enc | 0.445 ns/B 2141 MiB/s 1.78 c/B 3998 POLY1305 dec | 0.436 ns/B 2188 MiB/s 1.74 c/B 3998 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index ef02c1733..94c8e8cf7 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -50,9 +50,8 @@ #define STACK_VEC_X13 (32 + STACK_VEC_X12) #define STACK_TMP (32 + STACK_VEC_X13) #define STACK_TMP1 (32 + STACK_TMP) -#define STACK_TMP2 (32 + STACK_TMP1) -#define STACK_MAX (32 + STACK_TMP2) +#define STACK_MAX (32 + STACK_TMP1) /* vector registers */ #define X0 %ymm0 @@ -101,11 +100,22 @@ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ - vpunpckhqdq t1, x0, x1; \ - vpunpcklqdq t1, x0, x0; \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ - vpunpcklqdq x2, t2, x2; + vpunpcklqdq x2, t2, x2; + +/* 2x2 128-bit matrix transpose */ +#define transpose_16byte_2x2(x0,x1,t1) \ + vmovdqa x0, t1; \ + vperm2i128 $0x20, x1, x0, x0; \ + vperm2i128 $0x31, x1, t1, x1; + +/* xor register with unaligned src and save to unaligned dst */ +#define xor_src_dst(dst, src, offset, xreg) \ + vpxor offset(src), xreg, xreg; \ + vmovdqu xreg, offset(dst); /********************************************************************** 8-way chacha20 @@ -147,13 +157,6 @@ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1); -#define BUF_XOR_256_TO_128(dst, src, offset_lo, offset_hi, yreg, tmp1) \ - vextracti128 $1, yreg, tmp1##h; \ - vpxor offset_lo(src), yreg##h, yreg##h; \ - vpxor offset_hi(src), tmp1##h, tmp1##h; \ - vmovdqu yreg##h, offset_lo(dst); \ - vmovdqu tmp1##h, offset_hi(dst); - .align 32 chacha20_data: .Lshuf_rol16: @@ -230,6 +233,8 @@ _gcry_chacha20_amd64_avx2_blocks8: sub $2, ROUND; jnz .Lround2; + vmovdqa X8, (STACK_TMP1)(%rsp); + /* tmp := X15 */ vpbroadcastd (0 * 4)(INPUT), X15; PLUS(X0, X15); @@ -247,53 +252,56 @@ _gcry_chacha20_amd64_avx2_blocks8: PLUS(X6, X15); vpbroadcastd (7 * 4)(INPUT), X15; PLUS(X7, X15); - vpbroadcastd (8 * 4)(INPUT), X15; - PLUS(X8, X15); - vpbroadcastd (9 * 4)(INPUT), X15; - PLUS(X9, X15); - vpbroadcastd (10 * 4)(INPUT), X15; - PLUS(X10, X15); - vpbroadcastd (11 * 4)(INPUT), X15; - 
PLUS(X11, X15); - vmovdqa (STACK_VEC_X12)(%rsp), X15; - PLUS(X12, X15); - vmovdqa (STACK_VEC_X13)(%rsp), X15; - PLUS(X13, X15); + transpose_4x4(X0, X1, X2, X3, X8, X15); + transpose_4x4(X4, X5, X6, X7, X8, X15); + vmovdqa (STACK_TMP1)(%rsp), X8; + transpose_16byte_2x2(X0, X4, X15); + transpose_16byte_2x2(X1, X5, X15); + transpose_16byte_2x2(X2, X6, X15); + transpose_16byte_2x2(X3, X7, X15); vmovdqa (STACK_TMP)(%rsp), X15; - vmovdqa X13, (STACK_TMP)(%rsp); - vpbroadcastd (14 * 4)(INPUT), X13; - PLUS(X14, X13); - vmovdqa X14, (STACK_TMP1)(%rsp); - vpbroadcastd (15 * 4)(INPUT), X13; - PLUS(X15, X13); - vmovdqa X15, (STACK_TMP2)(%rsp); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1); + vpbroadcastd (8 * 4)(INPUT), X0; + PLUS(X8, X0); + vpbroadcastd (9 * 4)(INPUT), X0; + PLUS(X9, X0); + vpbroadcastd (10 * 4)(INPUT), X0; + PLUS(X10, X0); + vpbroadcastd (11 * 4)(INPUT), X0; + PLUS(X11, X0); + vmovdqa (STACK_VEC_X12)(%rsp), X0; + PLUS(X12, X0); + vmovdqa (STACK_VEC_X13)(%rsp), X0; + PLUS(X13, X0); + vpbroadcastd (14 * 4)(INPUT), X0; + PLUS(X14, X0); + vpbroadcastd (15 * 4)(INPUT), X0; + PLUS(X15, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3); /* Update counter */ addq $8, (12 * 4)(INPUT); - transpose_4x4(X0, X1, X2, X3, X13, X14); - transpose_4x4(X4, X5, X6, X7, X13, X14); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); - vmovdqa (STACK_TMP)(%rsp), X13; - vmovdqa (STACK_TMP1)(%rsp), X14; - vmovdqa (STACK_TMP2)(%rsp), X15; transpose_4x4(X8, X9, X10, X11, X0, X1); transpose_4x4(X12, X13, X14, X15, X0, X1); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0); + xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4); + xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5); + transpose_16byte_2x2(X8, X12, X0); + transpose_16byte_2x2(X9, X13, X0); + transpose_16byte_2x2(X10, X14, X0); + transpose_16byte_2x2(X11, X15, X0); + xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6); + xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11); + xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12); + xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13); + xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14); + xor_src_dst(DST, SRC, (64 * 7 + 16 * 
2), X15); sub $8, NBLKS; lea (8 * 64)(DST), DST; @@ -306,7 +314,6 @@ _gcry_chacha20_amd64_avx2_blocks8: vmovdqa X0, (STACK_VEC_X13)(%rsp); vmovdqa X0, (STACK_TMP)(%rsp); vmovdqa X0, (STACK_TMP1)(%rsp); - vmovdqa X0, (STACK_TMP2)(%rsp); vzeroall; /* eax zeroed by round loop. */ @@ -646,6 +653,11 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) + movq (STACK_MAX + 5 * 8)(%rsp), SRC; + movq (STACK_MAX + 6 * 8)(%rsp), DST; + + vmovdqa X8, (STACK_TMP1)(%rsp); + /* tmp := X15 */ vpbroadcastd (0 * 4)(INPUT), X15; PLUS(X0, X15); @@ -663,56 +675,56 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: PLUS(X6, X15); vpbroadcastd (7 * 4)(INPUT), X15; PLUS(X7, X15); - vpbroadcastd (8 * 4)(INPUT), X15; - PLUS(X8, X15); - vpbroadcastd (9 * 4)(INPUT), X15; - PLUS(X9, X15); - vpbroadcastd (10 * 4)(INPUT), X15; - PLUS(X10, X15); - vpbroadcastd (11 * 4)(INPUT), X15; - PLUS(X11, X15); - vmovdqa (STACK_VEC_X12)(%rsp), X15; - PLUS(X12, X15); - vmovdqa (STACK_VEC_X13)(%rsp), X15; - PLUS(X13, X15); + transpose_4x4(X0, X1, X2, X3, X8, X15); + transpose_4x4(X4, X5, X6, X7, X8, X15); + vmovdqa (STACK_TMP1)(%rsp), X8; + transpose_16byte_2x2(X0, X4, X15); + transpose_16byte_2x2(X1, X5, X15); + transpose_16byte_2x2(X2, X6, X15); + transpose_16byte_2x2(X3, X7, X15); vmovdqa (STACK_TMP)(%rsp), X15; - vmovdqa X13, (STACK_TMP)(%rsp); - vpbroadcastd (14 * 4)(INPUT), X13; - PLUS(X14, X13); - vmovdqa X14, (STACK_TMP1)(%rsp); - vpbroadcastd (15 * 4)(INPUT), X13; - PLUS(X15, X13); - vmovdqa X15, (STACK_TMP2)(%rsp); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1); + vpbroadcastd (8 * 4)(INPUT), X0; + PLUS(X8, X0); + vpbroadcastd (9 * 4)(INPUT), X0; + PLUS(X9, X0); + vpbroadcastd (10 * 4)(INPUT), X0; + PLUS(X10, X0); + vpbroadcastd (11 * 4)(INPUT), X0; + PLUS(X11, X0); + vmovdqa (STACK_VEC_X12)(%rsp), X0; + PLUS(X12, X0); + vmovdqa (STACK_VEC_X13)(%rsp), X0; + PLUS(X13, X0); + vpbroadcastd (14 * 4)(INPUT), X0; + PLUS(X14, X0); + vpbroadcastd (15 * 4)(INPUT), X0; + PLUS(X15, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3); /* Update counter */ addq $8, (12 * 4)(INPUT); - movq (STACK_MAX + 5 * 8)(%rsp), SRC; - movq (STACK_MAX + 6 * 8)(%rsp), DST; - - transpose_4x4(X0, X1, X2, X3, X13, X14); - transpose_4x4(X4, X5, X6, X7, X13, X14); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); - vmovdqa (STACK_TMP)(%rsp), X13; - vmovdqa (STACK_TMP1)(%rsp), X14; - vmovdqa (STACK_TMP2)(%rsp), X15; transpose_4x4(X8, X9, X10, X11, X0, X1); transpose_4x4(X12, X13, X14, X15, X0, X1); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 
* 4 + 16 * 3), X12, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0); - BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0); + xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4); + xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5); + transpose_16byte_2x2(X8, X12, X0); + transpose_16byte_2x2(X9, X13, X0); + transpose_16byte_2x2(X10, X14, X0); + transpose_16byte_2x2(X11, X15, X0); + xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6); + xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11); + xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12); + xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13); + xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14); + xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15); subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS @@ -733,7 +745,6 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: vmovdqa X0, (STACK_VEC_X13)(%rsp); vmovdqa X0, (STACK_TMP)(%rsp); vmovdqa X0, (STACK_TMP1)(%rsp); - vmovdqa X0, (STACK_TMP2)(%rsp); vzeroall; movq (STACK_MAX + 0 * 8)(%rsp), %rbx; From jussi.kivilinna at iki.fi Wed Jan 23 22:20:42 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 23 Jan 2019 23:20:42 +0200 Subject: [PATCH] Calculate OCB L-tables when setting key instead of when setting nonce Message-ID: <154827844272.13543.14365324123881852158.stgit@localhost.localdomain> * cipher/cipher-internal.h (gcry_cipher_handle): Mark areas of u_mode.ocb that are and are not cleared by gcry_cipher_reset. (_gcry_cipher_ocb_setkey): New. * cipher/cipher-ocb.c (_gcry_cipher_ocb_set_nonce): Split L-table generation to ... (_gcry_cipher_ocb_setkey): ... this new function. * cipher/cipher.c (cipher_setkey): Add handling for OCB mode. (cipher_reset): Do not clear L-values for OCB mode. -- OCB L-tables do not depend on nonce value, but only on cipher key. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 78f05dbb5..79de140dd 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -316,6 +316,8 @@ struct gcry_cipher_handle /* Mode specific storage for OCB mode. */ struct { + /* --- Following members are not cleared in gcry_cipher_reset --- */ + /* Helper variables and pre-computed table of L values. */ unsigned char L_star[OCB_BLOCK_LEN]; unsigned char L_dollar[OCB_BLOCK_LEN]; @@ -323,6 +325,8 @@ struct gcry_cipher_handle unsigned char L0L1L0[OCB_BLOCK_LEN]; unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN]; + /* --- Following members are cleared in gcry_cipher_reset --- */ + /* The tag is valid if marks.tag has been set. */ unsigned char tag[OCB_BLOCK_LEN]; @@ -571,6 +575,8 @@ gcry_err_code_t _gcry_cipher_ocb_get_tag gcry_err_code_t _gcry_cipher_ocb_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); +void _gcry_cipher_ocb_setkey +/* */ (gcry_cipher_hd_t c); /*-- cipher-xts.c --*/ diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index 58f7be7e6..be6b8dffb 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -123,6 +123,37 @@ ocb_get_L_big (gcry_cipher_hd_t c, u64 n, unsigned char *l_buf) } +/* Called after key has been set. Sets up L table. 
*/ +void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c) +{ + unsigned char ktop[OCB_BLOCK_LEN]; + unsigned int burn = 0; + unsigned int nburn; + int i; + + /* L_star = E(zero_128) */ + memset (ktop, 0, OCB_BLOCK_LEN); + nburn = c->spec->encrypt (&c->context.c, c->u_mode.ocb.L_star, ktop); + burn = nburn > burn ? nburn : burn; + /* L_dollar = double(L_star) */ + double_block_cpy (c->u_mode.ocb.L_dollar, c->u_mode.ocb.L_star); + /* L_0 = double(L_dollar), ... */ + double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); + for (i = 1; i < OCB_L_TABLE_SIZE; i++) + double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); + /* Precalculated offsets L0+L1, L0+L1+L0 */ + cipher_block_xor (c->u_mode.ocb.L0L1, + c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); + cipher_block_xor (c->u_mode.ocb.L0L1L0, + c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); + + /* Cleanup */ + wipememory (ktop, sizeof ktop); + if (burn > 0) + _gcry_burn_stack (burn + 4*sizeof(void*)); +} + + /* Set the nonce for OCB. This requires that the key has been set. Using it again resets start a new encryption cycle using the same key. */ @@ -133,7 +164,6 @@ _gcry_cipher_ocb_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, unsigned char ktop[OCB_BLOCK_LEN]; unsigned char stretch[OCB_BLOCK_LEN + 8]; unsigned int bottom; - int i; unsigned int burn = 0; unsigned int nburn; @@ -159,23 +189,6 @@ _gcry_cipher_ocb_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, if (noncelen > (120/8) || noncelen < (64/8) || noncelen >= OCB_BLOCK_LEN) return GPG_ERR_INV_LENGTH; - /* Set up the L table. */ - /* L_star = E(zero_128) */ - memset (ktop, 0, OCB_BLOCK_LEN); - nburn = c->spec->encrypt (&c->context.c, c->u_mode.ocb.L_star, ktop); - burn = nburn > burn ? nburn : burn; - /* L_dollar = double(L_star) */ - double_block_cpy (c->u_mode.ocb.L_dollar, c->u_mode.ocb.L_star); - /* L_0 = double(L_dollar), ... */ - double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); - for (i = 1; i < OCB_L_TABLE_SIZE; i++) - double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); - /* Precalculated offsets L0+L1, L0+L1+L0 */ - cipher_block_xor (c->u_mode.ocb.L0L1, - c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); - cipher_block_xor (c->u_mode.ocb.L0L1L0, - c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); - /* Prepare the nonce. */ memset (ktop, 0, (OCB_BLOCK_LEN - noncelen)); buf_cpy (ktop + (OCB_BLOCK_LEN - noncelen), nonce, noncelen); diff --git a/cipher/cipher.c b/cipher/cipher.c index 55b991c35..ab3e4240e 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -816,6 +816,10 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen) _gcry_cipher_gcm_setkey (c); break; + case GCRY_CIPHER_MODE_OCB: + _gcry_cipher_ocb_setkey (c); + break; + case GCRY_CIPHER_MODE_POLY1305: _gcry_cipher_poly1305_setkey (c); break; @@ -931,9 +935,18 @@ cipher_reset (gcry_cipher_hd_t c) break; case GCRY_CIPHER_MODE_OCB: - memset (&c->u_mode.ocb, 0, sizeof c->u_mode.ocb); - /* Setup default taglen. */ - c->u_mode.ocb.taglen = 16; + /* Do not clear precalculated L-values */ + { + byte *u_mode_head_pos = (void *)&c->u_mode.ocb; + byte *u_mode_tail_pos = (void *)&c->u_mode.ocb.tag; + size_t u_mode_head_length = u_mode_tail_pos - u_mode_head_pos; + size_t u_mode_tail_length = sizeof(c->u_mode.ocb) - u_mode_head_length; + + memset (u_mode_tail_pos, 0, u_mode_tail_length); + + /* Setup default taglen. 
*/ + c->u_mode.ocb.taglen = 16; + } break; case GCRY_CIPHER_MODE_XTS: From jussi.kivilinna at iki.fi Sun Jan 27 12:01:28 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 27 Jan 2019 13:01:28 +0200 Subject: [PATCH] Do not precalculate OCB offset L0+L1+L0 Message-ID: <154858688785.4028.6266486144765162943.stgit@localhost.localdomain> * cipher/cipher-internal.h (gcry_cipher_handle): Remove OCB L0L1L0. * cipher/cipher-ocb.c (_gcry_cipher_ocb_setkey): Ditto. * cipher/rijndael-aesni.c (aesni_ocb_enc, aesni_ocb_dec) (_gcry_aes_aesni_ocb_auth): Replace L0L1L0 use with L1. -- Patch fixes L0+L1+L0 thinko. This is same as L1 (L0 xor L1 xor L0). --- 0 files changed diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 79de140dd..5ece774e6 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -322,7 +322,6 @@ struct gcry_cipher_handle unsigned char L_star[OCB_BLOCK_LEN]; unsigned char L_dollar[OCB_BLOCK_LEN]; unsigned char L0L1[OCB_BLOCK_LEN]; - unsigned char L0L1L0[OCB_BLOCK_LEN]; unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN]; /* --- Following members are cleared in gcry_cipher_reset --- */ diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index be6b8dffb..308b04952 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -141,11 +141,9 @@ void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c) double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); for (i = 1; i < OCB_L_TABLE_SIZE; i++) double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); - /* Precalculated offsets L0+L1, L0+L1+L0 */ + /* Precalculated offset L0+L1 */ cipher_block_xor (c->u_mode.ocb.L0L1, c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); - cipher_block_xor (c->u_mode.ocb.L0L1L0, - c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); /* Cleanup */ wipememory (ktop, sizeof ktop); diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index ec9f4d4a5..9883861a2 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -2429,11 +2429,11 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); @@ -2561,13 +2561,13 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" @@ -2730,11 +2730,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); @@ -2862,13 +2862,13 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile 
("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" @@ -3028,11 +3028,11 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, asm volatile ("movdqu %[l0], %%xmm7\n\t" "movdqu %[l0l1], %%xmm12\n\t" - "movdqu %[l0l1l0], %%xmm13\n\t" + "movdqu %[l1], %%xmm13\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0) + [l1] "m" (*c->u_mode.ocb.L[1]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) @@ -3138,12 +3138,12 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm7\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" : - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[abuf1], %%xmm2\n\t" From cvs at cvs.gnupg.org Sun Jan 27 12:05:30 2019 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sun, 27 Jan 2019 12:05:30 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-151-gafab94d Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via afab94d222425ecb838eb56cb0723bdaf3e5de36 (commit) from c15409c49993166ab1325d45360b3a8fe72a5556 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit afab94d222425ecb838eb56cb0723bdaf3e5de36 Author: Jussi Kivilinna Date: Sun Jan 27 12:55:22 2019 +0200 Do not precalculate OCB offset L0+L1+L0 * cipher/cipher-internal.h (gcry_cipher_handle): Remove OCB L0L1L0. * cipher/cipher-ocb.c (_gcry_cipher_ocb_setkey): Ditto. * cipher/rijndael-aesni.c (aesni_ocb_enc, aesni_ocb_dec) (_gcry_aes_aesni_ocb_auth): Replace L0L1L0 use with L1. -- Patch fixes L0+L1+L0 thinko. This is same as L1 (L0 xor L1 xor L0). 
Signed-off-by: Jussi Kivilinna diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 79de140..5ece774 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -322,7 +322,6 @@ struct gcry_cipher_handle unsigned char L_star[OCB_BLOCK_LEN]; unsigned char L_dollar[OCB_BLOCK_LEN]; unsigned char L0L1[OCB_BLOCK_LEN]; - unsigned char L0L1L0[OCB_BLOCK_LEN]; unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN]; /* --- Following members are cleared in gcry_cipher_reset --- */ diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index be6b8df..308b049 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -141,11 +141,9 @@ void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c) double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); for (i = 1; i < OCB_L_TABLE_SIZE; i++) double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); - /* Precalculated offsets L0+L1, L0+L1+L0 */ + /* Precalculated offset L0+L1 */ cipher_block_xor (c->u_mode.ocb.L0L1, c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); - cipher_block_xor (c->u_mode.ocb.L0L1L0, - c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); /* Cleanup */ wipememory (ktop, sizeof ktop); diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index ec9f4d4..9883861 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -2429,11 +2429,11 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); @@ -2561,13 +2561,13 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" @@ -2730,11 +2730,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); @@ -2862,13 +2862,13 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" @@ -3028,11 +3028,11 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, asm volatile ("movdqu %[l0], %%xmm7\n\t" "movdqu %[l0l1], %%xmm12\n\t" - "movdqu %[l0l1l0], %%xmm13\n\t" + "movdqu %[l1], %%xmm13\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" 
(*c->u_mode.ocb.L0L1L0) + [l1] "m" (*c->u_mode.ocb.L[1]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) @@ -3138,12 +3138,12 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, [l0l1] "m" (*c->u_mode.ocb.L0L1), [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm7\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" : - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[abuf1], %%xmm2\n\t" ----------------------------------------------------------------------- Summary of changes: cipher/cipher-internal.h | 1 - cipher/cipher-ocb.c | 4 +--- cipher/rijndael-aesni.c | 24 ++++++++++++------------ 3 files changed, 13 insertions(+), 16 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits