From jussi.kivilinna at iki.fi Tue Feb 12 21:41:54 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 12 Feb 2019 22:41:54 +0200 Subject: [PATCH] Add 2-way path for SSSE3 version of ChaCha20 Message-ID: <155000411449.18494.3734175305017827013.stgit@localhost.localdomain> * cipher/chacha20-amd64-ssse3.S (_gcry_chacha20_amd64_ssse3_blocks1) (_gcry_chacha20_poly1305_amd64_ssse3_blocks1): Add 2-way code paths. * cipher/chacha20.c (_gcry_chacha20_poly1305_encrypt): Add preprosessing of 2 blocks with SSSE3. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index d7faf6442..1657f7712 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -334,7 +334,7 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) /********************************************************************** - 1-way chacha20 + 2-way && 1-way chacha20 **********************************************************************/ #define ROTATE_SHUF(v1,shuf) \ @@ -384,6 +384,66 @@ _gcry_chacha20_amd64_ssse3_blocks1: movdqu (8 * 4)(INPUT), X12; movdqu (12 * 4)(INPUT), X13; + cmp $2, NBLKS; + jb .Loop1; + + mov $20, ROUND; + + movdqa X10, X0; + movdqa X11, X1; + movdqa X12, X2; + movdqa X13, X3; + + movdqa X10, X8; + movdqa X11, X9; + movdqa X12, X14; + movdqa X13, X15; + paddq X4, X15; + +.Lround2_2: + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + sub $2, ROUND; + jnz .Lround2_2; + + PLUS(X0, X10); + PLUS(X1, X11); + PLUS(X2, X12); + PLUS(X3, X13); + + /* Update counter */ + paddq X4, X13; + + PLUS(X8, X10); + PLUS(X9, X11); + PLUS(X14, X12); + PLUS(X15, X13); + + /* Update counter */ + paddq X4, X13; + + xor_src_dst(DST, SRC, 0 * 4, X0, X7); + 
xor_src_dst(DST, SRC, 4 * 4, X1, X7); + xor_src_dst(DST, SRC, 8 * 4, X2, X7); + xor_src_dst(DST, SRC, 12 * 4, X3, X7); + xor_src_dst(DST, SRC, 16 * 4, X8, X7); + xor_src_dst(DST, SRC, 20 * 4, X9, X7); + xor_src_dst(DST, SRC, 24 * 4, X14, X7); + xor_src_dst(DST, SRC, 28 * 4, X15, X7); + + lea (2 * 64)(DST), DST; + lea (2 * 64)(SRC), SRC; + + clear(X8); + clear(X9); + clear(X14); + clear(X15); + + sub $2, NBLKS; + jz .Ldone1; + .Loop1: mov $20, ROUND; @@ -417,6 +477,7 @@ _gcry_chacha20_amd64_ssse3_blocks1: sub $1, NBLKS; jnz .Loop1; +.Ldone1: /* Store counter */ movdqu X13, (12 * 4)(INPUT); @@ -848,7 +909,7 @@ ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) /********************************************************************** - 1-way stitched chacha20-poly1305 + 2-way && 1-way stitched chacha20-poly1305 **********************************************************************/ .align 8 @@ -891,6 +952,153 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: POLY1305_LOAD_STATE(); + cmpq $2, (7 * 8)(%rsp); #NBLKS + jb .Loop_poly1; + + movdqa X10, X0; + movdqa X11, X1; + movdqa X12, X2; + movdqa X13, X3; + + movdqa X10, X8; + movdqa X11, X9; + movdqa X12, X14; + movdqa X13, X15; + paddq X4, X15; + + /* Process two ChaCha20 blocks and eight Poly1305 blocks. 
*/ + + POLY1305_BLOCK_PART1(0 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(1 * 16); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(2 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART1(3 * 16); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART1(4 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X8, X9, X14, X15, X5, 
X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(5 * 16); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(6 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART1(7 * 16); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + movq (5 * 8)(%rsp), SRC; + movq (6 * 8)(%rsp), DST; + + PLUS(X0, X10); + PLUS(X1, X11); + PLUS(X2, X12); + PLUS(X3, X13); + + /* Update counter */ + paddq X4, X13; + + PLUS(X8, X10); + PLUS(X9, X11); + PLUS(X14, X12); + PLUS(X15, X13); + + /* Update counter */ + paddq X4, X13; + + xor_src_dst(DST, SRC, 0 * 4, X0, X7); + 
xor_src_dst(DST, SRC, 4 * 4, X1, X7); + xor_src_dst(DST, SRC, 8 * 4, X2, X7); + xor_src_dst(DST, SRC, 12 * 4, X3, X7); + xor_src_dst(DST, SRC, 16 * 4, X8, X7); + xor_src_dst(DST, SRC, 20 * 4, X9, X7); + xor_src_dst(DST, SRC, 24 * 4, X14, X7); + xor_src_dst(DST, SRC, 28 * 4, X15, X7); + + clear(X8); + clear(X9); + clear(X14); + clear(X15); + + subq $2, (7 * 8)(%rsp); # NBLKS + lea (2 * 64)(POLY_RSRC), POLY_RSRC; + lea (2 * 64)(SRC), SRC; + lea (2 * 64)(DST), DST; + movq SRC, (5 * 8)(%rsp); + movq DST, (6 * 8)(%rsp); + jz .Ldone_poly1; + .Loop_poly1: movdqa X10, X0; movdqa X11, X1; @@ -973,6 +1181,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: jnz .Loop_poly1; +.Ldone_poly1: /* Store state */ POLY1305_STORE_STATE(); diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 3e6327da9..eae4979cc 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -611,6 +611,16 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } + else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2) + { + nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 2 * CHACHA20_BLOCK_SIZE; + outbuf += 2 * CHACHA20_BLOCK_SIZE; + inbuf += 2 * CHACHA20_BLOCK_SIZE; + } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1); From smueller at chronox.de Fri Feb 15 09:40:00 2019 From: smueller at chronox.de (Stephan Mueller) Date: Fri, 15 Feb 2019 09:40:00 +0100 Subject: _gcry_mpi_gcd calculation wrong Message-ID: <1854053.1I4kuoCFSc@tauon.chronox.de> Hi, I am trying to check whether the following P and Q values lead to a proper RSA key or not (i.e. I want to check whether P and Q would be the right candidates for an RSA key). 
e = a6db5b p = bdb4a50991c2d6cf2aeaef86068a026f1a45463697c23f7567c0cbfc5da5bc7b0b70d6e44da33df2e6bca8152292a3c6b776ea2e9f6528ea5d3e74afc19ee271ca940c2bcde6f18bf20c068bb973387d681b12d3689606825987d7bfc241cea0741a1be3a253f83e1654062db92b85287be8b385488a0eae13a4fe497d4fe751d588d0839086d1b935bf70bf715c34f87ed54cba51300aaaf53bdea5288726c7527a028dc2acf8962826a99ede37fad7b7310a77afb2bb8d9306350dc758930f q = c23121afc2530f01528bdf680d6d718f4719792d6137ef4500ea7bf993209c6d324999d668359953c71f8b320ea02af9d4b0f5199c2fef7ccda71f507cafd83d02183fd1575815d41eca6a2cec39104e9209ccbe0800a8c277077a27e726d73c2a0b6834313d0dc7a749c036d1edaafb48dd2a80ec191446b8958ba5e42d2b6424203ea26dc60e6c8397e605398c1e7da441c0ab142a29601bda839e8d69fe037115a2c712910a56beb9b19b938215cecf4e339f05b76059041568016fe64851 I convert the input data into the following SEXP: (genkey(rsa(nbits 4:3072)(test-parms(e 8:10935131)(p %b)(q %b)))) Where %b is replaced with P and Q respectively during gcry_sexp_build. This is followed by a gcry_pk_genkey. libgcrypt generates an error during gcry_pk_genkey indicating that the values are not prime. After debugging the issue, I found that for the P value, _gcry_mpi_gcd indicates that the common divisor for P-1 and E is not 1 (in fact, it calculates the common divisor to be 7). I.e. the following lines fail in generate_fips: if (mpi_gcd (g, p1, e)) ... else if (testparms) goto err; The issue now is that the NIST reference implementation I use to verify the results says that P and Q are prime and that the RSA key should be generated. Also OpenSSL accepts the P and Q values and generates the RSA key using RSA_generate_key_ex where the RSA struct contains P and Q already. 
Ciao Stephan From smueller at chronox.de Fri Feb 15 11:03:43 2019 From: smueller at chronox.de (Stephan Mueller) Date: Fri, 15 Feb 2019 11:03:43 +0100 Subject: _gcry_mpi_gcd calculation wrong In-Reply-To: <1854053.1I4kuoCFSc@tauon.chronox.de> References: <1854053.1I4kuoCFSc@tauon.chronox.de> Message-ID: <15526056.GFTjTZgObC@tauon.chronox.de> Am Freitag, 15. Februar 2019, 09:40:00 CET schrieb Stephan Mueller: Hi, > Hi, > > I am trying to check whether the following P and Q values lead to a proper > RSA key or not (i.e. I want to check whether P and Q would be the right > candidates for an RSA key). > > e = a6db5b > > p = > bdb4a50991c2d6cf2aeaef86068a026f1a45463697c23f7567c0cbfc5da5bc7b0b70d6e44da3 > 3df2e6bca8152292a3c6b776ea2e9f6528ea5d3e74afc19ee271ca940c2bcde6f18bf20c068b > b973387d681b12d3689606825987d7bfc241cea0741a1be3a253f83e1654062db92b85287be8 > b385488a0eae13a4fe497d4fe751d588d0839086d1b935bf70bf715c34f87ed54cba51300aaa > f53bdea5288726c7527a028dc2acf8962826a99ede37fad7b7310a77afb2bb8d9306350dc758 > 930f > > q = > c23121afc2530f01528bdf680d6d718f4719792d6137ef4500ea7bf993209c6d324999d66835 > 9953c71f8b320ea02af9d4b0f5199c2fef7ccda71f507cafd83d02183fd1575815d41eca6a2c > ec39104e9209ccbe0800a8c277077a27e726d73c2a0b6834313d0dc7a749c036d1edaafb48dd > 2a80ec191446b8958ba5e42d2b6424203ea26dc60e6c8397e605398c1e7da441c0ab142a2960 > 1bda839e8d69fe037115a2c712910a56beb9b19b938215cecf4e339f05b76059041568016fe6 > 4851 > > I convert the input data into the following SEXP: > > (genkey(rsa(nbits 4:3072)(test-parms(e 8:10935131)(p %b)(q %b)))) Apologies for the noise. The error is on my side. e must be provided as binary string and not as an integer. Please disregard the email. Ciao Stephan From cvs at cvs.gnupg.org Sun Feb 17 21:44:53 2019 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sun, 17 Feb 2019 21:44:53 +0100 Subject: [git] GCRYPT - branch, master, updated. 
libgcrypt-1.8.1-152-gd455068 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via d455068988e5779b0200c51415ddab6b51e12dc4 (commit) from afab94d222425ecb838eb56cb0723bdaf3e5de36 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit d455068988e5779b0200c51415ddab6b51e12dc4 Author: Jussi Kivilinna Date: Thu Feb 7 20:50:02 2019 +0200 Add 2-way path for SSSE3 version of ChaCha20 * cipher/chacha20-amd64-ssse3.S (_gcry_chacha20_amd64_ssse3_blocks1) (_gcry_chacha20_poly1305_amd64_ssse3_blocks1): Add 2-way code paths. * cipher/chacha20.c (_gcry_chacha20_poly1305_encrypt): Add preprosessing of 2 blocks with SSSE3. -- Signed-off-by: Jussi Kivilinna diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index d7faf64..1657f77 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -334,7 +334,7 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) /********************************************************************** - 1-way chacha20 + 2-way && 1-way chacha20 **********************************************************************/ #define ROTATE_SHUF(v1,shuf) \ @@ -384,6 +384,66 @@ _gcry_chacha20_amd64_ssse3_blocks1: movdqu (8 * 4)(INPUT), X12; movdqu (12 * 4)(INPUT), X13; + cmp $2, NBLKS; + jb .Loop1; + + mov $20, ROUND; + + movdqa X10, X0; + movdqa X11, X1; + movdqa X12, X2; + movdqa X13, X3; + + movdqa X10, X8; + movdqa X11, X9; + movdqa X12, X14; + movdqa X13, X15; + paddq X4, X15; + +.Lround2_2: + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + 
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + sub $2, ROUND; + jnz .Lround2_2; + + PLUS(X0, X10); + PLUS(X1, X11); + PLUS(X2, X12); + PLUS(X3, X13); + + /* Update counter */ + paddq X4, X13; + + PLUS(X8, X10); + PLUS(X9, X11); + PLUS(X14, X12); + PLUS(X15, X13); + + /* Update counter */ + paddq X4, X13; + + xor_src_dst(DST, SRC, 0 * 4, X0, X7); + xor_src_dst(DST, SRC, 4 * 4, X1, X7); + xor_src_dst(DST, SRC, 8 * 4, X2, X7); + xor_src_dst(DST, SRC, 12 * 4, X3, X7); + xor_src_dst(DST, SRC, 16 * 4, X8, X7); + xor_src_dst(DST, SRC, 20 * 4, X9, X7); + xor_src_dst(DST, SRC, 24 * 4, X14, X7); + xor_src_dst(DST, SRC, 28 * 4, X15, X7); + + lea (2 * 64)(DST), DST; + lea (2 * 64)(SRC), SRC; + + clear(X8); + clear(X9); + clear(X14); + clear(X15); + + sub $2, NBLKS; + jz .Ldone1; + .Loop1: mov $20, ROUND; @@ -417,6 +477,7 @@ _gcry_chacha20_amd64_ssse3_blocks1: sub $1, NBLKS; jnz .Loop1; +.Ldone1: /* Store counter */ movdqu X13, (12 * 4)(INPUT); @@ -848,7 +909,7 @@ ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) /********************************************************************** - 1-way stitched chacha20-poly1305 + 2-way && 1-way stitched chacha20-poly1305 **********************************************************************/ .align 8 @@ -891,6 +952,153 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: POLY1305_LOAD_STATE(); + cmpq $2, (7 * 8)(%rsp); #NBLKS + jb .Loop_poly1; + + movdqa X10, X0; + movdqa X11, X1; + movdqa X12, X2; + movdqa X13, X3; + + movdqa X10, X8; + movdqa X11, X9; + movdqa X12, X14; + movdqa X13, X15; + paddq X4, X15; + + /* Process two ChaCha20 blocks and eight Poly1305 blocks. 
*/ + + POLY1305_BLOCK_PART1(0 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(1 * 16); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(2 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART1(3 * 16); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART1(4 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X8, X9, X14, X15, X5, 
X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(5 * 16); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART1(6 * 16); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART1(7 * 16); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + POLY1305_BLOCK_PART2(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART3(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); + POLY1305_BLOCK_PART4(); + QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); + POLY1305_BLOCK_PART5(); + QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); + + movq (5 * 8)(%rsp), SRC; + movq (6 * 8)(%rsp), DST; + + PLUS(X0, X10); + PLUS(X1, X11); + PLUS(X2, X12); + PLUS(X3, X13); + + /* Update counter */ + paddq X4, X13; + + PLUS(X8, X10); + PLUS(X9, X11); + PLUS(X14, X12); + PLUS(X15, X13); + + /* Update counter */ + paddq X4, X13; + + xor_src_dst(DST, SRC, 0 * 4, X0, X7); + 
xor_src_dst(DST, SRC, 4 * 4, X1, X7); + xor_src_dst(DST, SRC, 8 * 4, X2, X7); + xor_src_dst(DST, SRC, 12 * 4, X3, X7); + xor_src_dst(DST, SRC, 16 * 4, X8, X7); + xor_src_dst(DST, SRC, 20 * 4, X9, X7); + xor_src_dst(DST, SRC, 24 * 4, X14, X7); + xor_src_dst(DST, SRC, 28 * 4, X15, X7); + + clear(X8); + clear(X9); + clear(X14); + clear(X15); + + subq $2, (7 * 8)(%rsp); # NBLKS + lea (2 * 64)(POLY_RSRC), POLY_RSRC; + lea (2 * 64)(SRC), SRC; + lea (2 * 64)(DST), DST; + movq SRC, (5 * 8)(%rsp); + movq DST, (6 * 8)(%rsp); + jz .Ldone_poly1; + .Loop_poly1: movdqa X10, X0; movdqa X11, X1; @@ -973,6 +1181,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: jnz .Loop_poly1; +.Ldone_poly1: /* Store state */ POLY1305_STORE_STATE(); diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 3e6327d..eae4979 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -611,6 +611,16 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } + else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2) + { + nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2); + burn = nburn > burn ? 
nburn : burn; + + authptr = outbuf; + length -= 2 * CHACHA20_BLOCK_SIZE; + outbuf += 2 * CHACHA20_BLOCK_SIZE; + inbuf += 2 * CHACHA20_BLOCK_SIZE; + } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1); ----------------------------------------------------------------------- Summary of changes: cipher/chacha20-amd64-ssse3.S | 213 +++++++++++++++++++++++++++++++++++++++++- cipher/chacha20.c | 10 ++ 2 files changed, 221 insertions(+), 2 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Mon Feb 25 01:07:19 2019 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Mon, 25 Feb 2019 01:07:19 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-153-gad133fc Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via ad133fc79757236359252e92244fe16e9adb45a3 (commit) from d455068988e5779b0200c51415ddab6b51e12dc4 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit ad133fc79757236359252e92244fe16e9adb45a3 Author: NIIBE Yutaka Date: Mon Feb 25 09:02:59 2019 +0900 fips: Only test check_binary_integrity when fips_mode is enabled. * src/fips.c (_gcry_fips_run_selftests): Check the status of fips_mode before calling check_binary_integrity. 
-- GnuPG-bug-id: 4274 Reported-by: Pedro Monreal Signed-off-by: NIIBE Yutaka diff --git a/src/fips.c b/src/fips.c index 36358bf..1ac7f47 100644 --- a/src/fips.c +++ b/src/fips.c @@ -689,10 +689,13 @@ _gcry_fips_run_selftests (int extended) if (run_pubkey_selftests (extended)) goto leave; - /* Now check the integrity of the binary. We do this this after - having checked the HMAC code. */ - if (check_binary_integrity ()) - goto leave; + if (fips_mode ()) + { + /* Now check the integrity of the binary. We do this this after + having checked the HMAC code. */ + if (check_binary_integrity ()) + goto leave; + } /* All selftests passed. */ result = STATE_OPERATIONAL; ----------------------------------------------------------------------- Summary of changes: src/fips.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits