From jussi.kivilinna at iki.fi Sat May 3 19:25:59 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 03 May 2014 20:25:59 +0300 Subject: [PATCH 1/3] Add ChaCha20 stream cipher Message-ID: <20140503172559.23791.31470.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'chacha20.c'. * cipher/chacha20.c: New. * cipher/cipher.c (cipher_list): Add ChaCha20. * configure.ac: Add ChaCha20. * src/cipher.h (_gcry_cipher_spec_chacha20): New. * src/gcrypt.h.in (GCRY_CIPHER_CHACHA20): Add new algo. * tests/basic.c (MAX_DATA_LEN): Increase to 128 from 100. (check_stream_cipher): Add ChaCha20 test-vectors. (check_ciphers): Add ChaCha20. -- Patch adds Bernstein's ChaCha20 cipher to libgcrypt. Implementation is based on public domain implementations. Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 cipher/chacha20.c | 498 ++++++++++++++++++++++++++++++++++++++++++++++++++++ cipher/cipher.c | 3 configure.ac | 8 + src/cipher.h | 1 src/gcrypt.h.in | 3 tests/basic.c | 330 ++++++++++++++++++++++++++++++++++ 7 files changed, 840 insertions(+), 4 deletions(-) create mode 100644 cipher/chacha20.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 3c20d3c..bc7959a 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -59,6 +59,7 @@ EXTRA_libcipher_la_SOURCES = \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ +chacha20.c \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20.c b/cipher/chacha20.c new file mode 100644 index 0000000..f250789 --- /dev/null +++ b/cipher/chacha20.c @@ -0,0 +1,498 @@ +/* chacha20.c - Bernstein's ChaCha20 cipher + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * For a description of the algorithm, see: + * http://cr.yp.to/chacha.html + */ + +/* The code is based on salsa20.c and public-domain ChaCha implementations: + * chacha-ref.c version 20080118 + * D. J. Bernstein + * Public domain. + * and + * Andrew Moon + * https://github.com/floodyberry/chacha-opt + */ + + +#include +#include +#include +#include +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" + + +#define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */ +#define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */ +#define CHACHA20_BLOCK_SIZE 64 /* Bytes. */ +#define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */ +#define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ +#define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4) + + +struct CHACHA20_context_s; + + +typedef unsigned int (*chacha20_core_t)(u32 *dst, + struct CHACHA20_context_s *ctx); + +typedef struct CHACHA20_context_s +{ + u32 input[CHACHA20_INPUT_LENGTH]; + u32 pad[CHACHA20_INPUT_LENGTH]; + unsigned int unused; /* bytes in the pad. 
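+                                 Counts keystream bytes of the current
+                                 block not yet consumed; the next call
+                                 resumes at offset
+                                 CHACHA20_BLOCK_SIZE - unused in PAD.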
*/ + chacha20_core_t core; +} CHACHA20_context_t; + + +static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); +static const char *selftest (void); + + + +#define QROUND(a,b,c,d) \ + do { \ + a += b; d = rol(d ^ a, 16); \ + c += d; b = rol(b ^ c, 12); \ + a += b; d = rol(d ^ a, 8); \ + c += d; b = rol(b ^ c, 7); \ + } while (0) + +static unsigned int +chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes) +{ + u32 pad[CHACHA20_INPUT_LENGTH]; + u32 inp[CHACHA20_INPUT_LENGTH]; + unsigned int i; + + /* Note: 'bytes' must be multiple of 64 and not zero. */ + + inp[0] = state[0]; + inp[1] = state[1]; + inp[2] = state[2]; + inp[3] = state[3]; + inp[4] = state[4]; + inp[5] = state[5]; + inp[6] = state[6]; + inp[7] = state[7]; + inp[8] = state[8]; + inp[9] = state[9]; + inp[10] = state[10]; + inp[11] = state[11]; + inp[12] = state[12]; + inp[13] = state[13]; + inp[14] = state[14]; + inp[15] = state[15]; + + do + { + /* First round. */ + pad[0] = inp[0]; + pad[4] = inp[4]; + pad[8] = inp[8]; + pad[12] = inp[12]; + QROUND (pad[0], pad[4], pad[8], pad[12]); + pad[1] = inp[1]; + pad[5] = inp[5]; + pad[9] = inp[9]; + pad[13] = inp[13]; + QROUND (pad[1], pad[5], pad[9], pad[13]); + pad[2] = inp[2]; + pad[6] = inp[6]; + pad[10] = inp[10]; + pad[14] = inp[14]; + QROUND (pad[2], pad[6], pad[10], pad[14]); + pad[3] = inp[3]; + pad[7] = inp[7]; + pad[11] = inp[11]; + pad[15] = inp[15]; + QROUND (pad[3], pad[7], pad[11], pad[15]); + + for (i = 1; i < 20 - 1; i += 2) + { + QROUND (pad[0], pad[5], pad[10], pad[15]); + QROUND (pad[1], pad[6], pad[11], pad[12]); + QROUND (pad[2], pad[7], pad[8], pad[13]); + QROUND (pad[3], pad[4], pad[9], pad[14]); + + QROUND (pad[0], pad[4], pad[8], pad[12]); + QROUND (pad[1], pad[5], pad[9], pad[13]); + QROUND (pad[2], pad[6], pad[10], pad[14]); + QROUND (pad[3], pad[7], pad[11], pad[15]); + } + + /* Last round. */ + QROUND (pad[0], pad[5], pad[10], pad[15]); + pad[0] = le_bswap32 (pad[0] + inp[0]); + pad[5] = le_bswap32 (pad[5] + inp[5]); + pad[10] = le_bswap32 (pad[10] + inp[10]); + pad[15] = le_bswap32 (pad[15] + inp[15]); + QROUND (pad[1], pad[6], pad[11], pad[12]); + pad[1] = le_bswap32 (pad[1] + inp[1]); + pad[6] = le_bswap32 (pad[6] + inp[6]); + pad[11] = le_bswap32 (pad[11] + inp[11]); + pad[12] = le_bswap32 (pad[12] + inp[12]); + QROUND (pad[2], pad[7], pad[8], pad[13]); + pad[2] = le_bswap32 (pad[2] + inp[2]); + pad[7] = le_bswap32 (pad[7] + inp[7]); + pad[8] = le_bswap32 (pad[8] + inp[8]); + pad[13] = le_bswap32 (pad[13] + inp[13]); + QROUND (pad[3], pad[4], pad[9], pad[14]); + pad[3] = le_bswap32 (pad[3] + inp[3]); + pad[4] = le_bswap32 (pad[4] + inp[4]); + pad[9] = le_bswap32 (pad[9] + inp[9]); + pad[14] = le_bswap32 (pad[14] + inp[14]); + + if (src) + buf_xor(dst, pad, src, CHACHA20_BLOCK_SIZE); + else + buf_cpy(dst, pad, CHACHA20_BLOCK_SIZE); + + /* Update counter. 
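+         Words 12 and 13 form a 64-bit little-endian block counter;
+         the expression below adds the carry into the high word
+         whenever the low word wraps around to zero.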
*/ + inp[13] += (!++inp[12]); + + bytes -= CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + if (src) + src += CHACHA20_BLOCK_SIZE; + } + while (bytes >= CHACHA20_BLOCK_SIZE); + + state[12] = inp[12]; + state[13] = inp[13]; + + /* burn_stack */ + return (2 * CHACHA20_INPUT_LENGTH * sizeof(u32) + 6 * sizeof(void *)); +} + +#undef QROUND + + +static unsigned int +chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx) +{ + return chacha20_blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE); +} + + +static void +chacha20_keysetup (CHACHA20_context_t * ctx, const byte * key, + unsigned int keylen) +{ + /* These constants are the little endian encoding of the string + "expand 32-byte k". For the 128 bit variant, the "32" in that + string will be fixed up to "16". */ + ctx->input[0] = 0x61707865; /* "apxe" */ + ctx->input[1] = 0x3320646e; /* "3 dn" */ + ctx->input[2] = 0x79622d32; /* "yb-2" */ + ctx->input[3] = 0x6b206574; /* "k et" */ + + ctx->input[4] = buf_get_le32 (key + 0); + ctx->input[5] = buf_get_le32 (key + 4); + ctx->input[6] = buf_get_le32 (key + 8); + ctx->input[7] = buf_get_le32 (key + 12); + + if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ + { + ctx->input[8] = buf_get_le32 (key + 16); + ctx->input[9] = buf_get_le32 (key + 20); + ctx->input[10] = buf_get_le32 (key + 24); + ctx->input[11] = buf_get_le32 (key + 28); + } + else /* 128 bits */ + { + ctx->input[8] = ctx->input[4]; + ctx->input[9] = ctx->input[5]; + ctx->input[10] = ctx->input[6]; + ctx->input[11] = ctx->input[7]; + + ctx->input[1] -= 0x02000000; /* Change to "1 dn". */ + ctx->input[2] += 0x00000004; /* Change to "yb-6". */ + } +} + + +static void +chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen) +{ + ctx->input[12] = 0; + + if (ivlen == CHACHA20_MAX_IV_SIZE) + { + ctx->input[13] = buf_get_le32 (iv + 0); + ctx->input[14] = buf_get_le32 (iv + 4); + ctx->input[15] = buf_get_le32 (iv + 8); + } + else if (ivlen == CHACHA20_MIN_IV_SIZE) + { + ctx->input[13] = 0; + ctx->input[14] = buf_get_le32 (iv + 0); + ctx->input[15] = buf_get_le32 (iv + 4); + } + else + { + ctx->input[13] = 0; + ctx->input[14] = 0; + ctx->input[15] = 0; + } +} + + +static gcry_err_code_t +chacha20_do_setkey (CHACHA20_context_t * ctx, + const byte * key, unsigned int keylen) +{ + static int initialized; + static const char *selftest_failed; + + if (!initialized) + { + initialized = 1; + selftest_failed = selftest (); + if (selftest_failed) + log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed); + } + if (selftest_failed) + return GPG_ERR_SELFTEST_FAILED; + + if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) + return GPG_ERR_INV_KEYLEN; + + ctx->core = chacha20_core; + + chacha20_keysetup (ctx, key, keylen); + + /* We default to a zero nonce. */ + chacha20_setiv (ctx, NULL, 0); + + return 0; +} + + +static gcry_err_code_t +chacha20_setkey (void *context, const byte * key, unsigned int keylen) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); + _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); + return rc; +} + + +static void +chacha20_setiv (void *context, const byte * iv, size_t ivlen) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + + /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. 
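+     A 96-bit nonce leaves input[12] as a 32-bit block counter; a
+     64-bit nonce keeps the original 64-bit counter in input[12] and
+     input[13].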
*/ + if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE) + log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); + + if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE)) + chacha20_ivsetup (ctx, iv, ivlen); + else + chacha20_ivsetup (ctx, NULL, 0); + + /* Reset the unused pad bytes counter. */ + ctx->unused = 0; +} + + + +/* Note: This function requires LENGTH > 0. */ +static void +chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, + byte * outbuf, const byte * inbuf, size_t length) +{ + unsigned int nburn, burn = 0; + + if (ctx->unused) + { + unsigned char *p = (void *) ctx->pad; + size_t n; + + gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); + + n = ctx->unused; + if (n > length) + n = length; + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); + length -= n; + outbuf += n; + inbuf += n; + ctx->unused -= n; + if (!length) + return; + gcry_assert (!ctx->unused); + } + + if (length >= CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + size_t bytes = nblocks * CHACHA20_BLOCK_SIZE; + burn = chacha20_blocks(ctx->input, inbuf, outbuf, bytes); + length -= bytes; + outbuf += bytes; + inbuf += bytes; + } + + while (length > 0) + { + /* Create the next pad and bump the block counter. Note that it + is the user's duty to change to another nonce not later than + after 2^70 processed bytes. */ + nburn = ctx->core (ctx->pad, ctx); + burn = nburn > burn ? nburn : burn; + + if (length <= CHACHA20_BLOCK_SIZE) + { + buf_xor (outbuf, inbuf, ctx->pad, length); + ctx->unused = CHACHA20_BLOCK_SIZE - length; + break; + } + buf_xor (outbuf, inbuf, ctx->pad, CHACHA20_BLOCK_SIZE); + length -= CHACHA20_BLOCK_SIZE; + outbuf += CHACHA20_BLOCK_SIZE; + inbuf += CHACHA20_BLOCK_SIZE; + } + + _gcry_burn_stack (burn); +} + + +static void +chacha20_encrypt_stream (void *context, byte * outbuf, const byte * inbuf, + size_t length) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + + if (length) + chacha20_do_encrypt_stream (ctx, outbuf, inbuf, length); +} + + +static const char * +selftest (void) +{ + CHACHA20_context_t ctx; + byte scratch[127 + 1]; + byte buf[256 + 64 + 4]; + int i; + + /* From draft-strombergson-chacha-test-vectors */ + static byte key_1[] = { + 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78, + 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35, + 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb, + 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d + }; + static const byte nonce_1[] = + { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 }; + static const byte plaintext_1[127] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + static const byte ciphertext_1[127] = { + 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9, + 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 
0x06, + 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00, + 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf, + 0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd, + 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f, + 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f, + 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92, + 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9, + 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36, + 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1, + 0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38, + 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea, + 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0, + 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27, + 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33 + }; + + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + scratch[sizeof (scratch) - 1] = 0; + chacha20_encrypt_stream (&ctx, scratch, plaintext_1, sizeof plaintext_1); + if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) + return "ChaCha20 encryption test 1 failed."; + if (scratch[sizeof (scratch) - 1]) + return "ChaCha20 wrote too much."; + chacha20_setkey (&ctx, key_1, sizeof (key_1)); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + chacha20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1); + if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) + return "ChaCha20 decryption test 1 failed."; + + for (i = 0; i < sizeof buf; i++) + buf[i] = i; + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + /*encrypt */ + chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf); + /*decrypt */ + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + chacha20_encrypt_stream (&ctx, buf, buf, 1); + chacha20_encrypt_stream (&ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1); + chacha20_encrypt_stream (&ctx, buf + (sizeof buf) - 1, + buf + (sizeof buf) - 1, 1); + for (i = 0; i < sizeof buf; i++) + if (buf[i] != (byte) i) + return "ChaCha20 encryption test 2 failed."; + + return NULL; +} + + +gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = { + GCRY_CIPHER_CHACHA20, + {0, 0}, /* flags */ + "CHACHA20", /* name */ + NULL, /* aliases */ + NULL, /* oids */ + 1, /* blocksize in bytes. */ + CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */ + sizeof (CHACHA20_context_t), + chacha20_setkey, + NULL, + NULL, + chacha20_encrypt_stream, + chacha20_encrypt_stream, + NULL, + NULL, + chacha20_setiv +}; diff --git a/cipher/cipher.c b/cipher/cipher.c index 6552ed3..4751302 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -83,6 +83,9 @@ static gcry_cipher_spec_t *cipher_list[] = #if USE_GOST28147 &_gcry_cipher_spec_gost28147, #endif +#if USE_CHACHA20 + &_gcry_cipher_spec_chacha20, +#endif NULL }; diff --git a/configure.ac b/configure.ac index 79f79ef..724c4ee 100644 --- a/configure.ac +++ b/configure.ac @@ -186,7 +186,7 @@ LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" -available_ciphers="$available_ciphers camellia idea salsa20 gost28147" +available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" enabled_ciphers="" # Definitions for public-key ciphers. 
@@ -1806,6 +1806,12 @@ if test "$found" = "1" ; then AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included]) fi +LIST_MEMBER(chacha20, $enabled_ciphers) +if test "$found" = "1" ; then + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" + AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) +fi + LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" diff --git a/src/cipher.h b/src/cipher.h index 5d1b5f6..ed57d3c 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -251,6 +251,7 @@ extern gcry_cipher_spec_t _gcry_cipher_spec_idea; extern gcry_cipher_spec_t _gcry_cipher_spec_salsa20; extern gcry_cipher_spec_t _gcry_cipher_spec_salsa20r12; extern gcry_cipher_spec_t _gcry_cipher_spec_gost28147; +extern gcry_cipher_spec_t _gcry_cipher_spec_chacha20; /* Declarations for the digest specifications. */ extern gcry_md_spec_t _gcry_digest_spec_crc32; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index c84a3f7..d4e9bb2 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -880,7 +880,8 @@ enum gcry_cipher_algos GCRY_CIPHER_CAMELLIA256 = 312, GCRY_CIPHER_SALSA20 = 313, GCRY_CIPHER_SALSA20R12 = 314, - GCRY_CIPHER_GOST28147 = 315 + GCRY_CIPHER_GOST28147 = 315, + GCRY_CIPHER_CHACHA20 = 316 }; /* The Rijndael algorithm is basically AES, so provide some macros. */ diff --git a/tests/basic.c b/tests/basic.c index 5c6c51c..406d82d 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -195,7 +195,7 @@ show_mac_not_available (int algo) -#define MAX_DATA_LEN 100 +#define MAX_DATA_LEN 128 void progress_handler (void *cb_data, const char *what, int printchar, @@ -2583,8 +2583,331 @@ check_stream_cipher (void) "\x44\xC9\x70\x0A\x0F\x21\x38\xE8\xC1\xA2\x86\xFB\x8C\x1F\xBF\xA0" } } - } + }, #endif /*USE_SALSA20*/ +#ifdef USE_CHACHA20 + /* From draft-strombergson-chacha-test-vectors-01 */ + { + "ChaCha20 128 bit, TC1", + GCRY_CIPHER_CHACHA20, 16, 8, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + { + { 8, + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x89\x67\x09\x52\x60\x83\x64\xfd" + }, + { 112, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x89\x67\x09\x52\x60\x83\x64\xfd\x00\xb2\xf9\x09\x36\xf0\x31\xc8" + "\xe7\x56\xe1\x5d\xba\x04\xb8\x49\x3d\x00\x42\x92\x59\xb2\x0f\x46" + "\xcc\x04\xf1\x11\x24\x6b\x6c\x2c\xe0\x66\xbe\x3b\xfb\x32\xd9\xaa" + "\x0f\xdd\xfb\xc1\x21\x23\xd4\xb9\xe4\x4f\x34\xdc\xa0\x5a\x10\x3f" + "\x6c\xd1\x35\xc2\x87\x8c\x83\x2b\x58\x96\xb1\x34\xf6\x14\x2a\x9d" + "\x4d\x8d\x0d\x8f\x10\x26\xd2\x0a\x0a\x81\x51\x2c\xbc\xe6\xe9\x75" + "\x8a\x71\x43\xd0\x21\x97\x80\x22\xa3\x84\x14\x1a\x80\xce\xa3\x06" + }, + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x89\x67\x09\x52\x60\x83\x64\xfd\x00\xb2\xf9\x09\x36\xf0\x31\xc8" + "\xe7\x56\xe1\x5d\xba\x04\xb8\x49\x3d\x00\x42\x92\x59\xb2\x0f\x46" + "\xcc\x04\xf1\x11\x24\x6b\x6c\x2c\xe0\x66\xbe\x3b\xfb\x32\xd9\xaa" + "\x0f\xdd\xfb\xc1\x21\x23\xd4\xb9\xe4\x4f\x34\xdc\xa0\x5a\x10\x3f" + "\x6c\xd1\x35\xc2\x87\x8c\x83\x2b\x58\x96\xb1\x34\xf6\x14\x2a\x9d" + "\x4d\x8d\x0d\x8f\x10\x26\xd2\x0a\x0a\x81\x51\x2c\xbc\xe6\xe9\x75" + "\x8a\x71\x43\xd0\x21\x97\x80\x22\xa3\x84\x14\x1a\x80\xce\xa3\x06" + "\x2f\x41\xf6\x7a\x75\x2e\x66\xad\x34\x11\x98\x4c\x78\x7e\x30\xad" + } + } + }, + { + "ChaCha20 256 bit, TC1", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + { + { 8, + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90" + }, + { 112, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90\x40\x5d\x6a\xe5\x53\x86\xbd\x28" + "\xbd\xd2\x19\xb8\xa0\x8d\xed\x1a\xa8\x36\xef\xcc\x8b\x77\x0d\xc7" + "\xda\x41\x59\x7c\x51\x57\x48\x8d\x77\x24\xe0\x3f\xb8\xd8\x4a\x37" + "\x6a\x43\xb8\xf4\x15\x18\xa1\x1c\xc3\x87\xb6\x69\xb2\xee\x65\x86" + "\x9f\x07\xe7\xbe\x55\x51\x38\x7a\x98\xba\x97\x7c\x73\x2d\x08\x0d" + "\xcb\x0f\x29\xa0\x48\xe3\x65\x69\x12\xc6\x53\x3e\x32\xee\x7a\xed" + "\x29\xb7\x21\x76\x9c\xe6\x4e\x43\xd5\x71\x33\xb0\x74\xd8\x39\xd5" + }, + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90\x40\x5d\x6a\xe5\x53\x86\xbd\x28" + "\xbd\xd2\x19\xb8\xa0\x8d\xed\x1a\xa8\x36\xef\xcc\x8b\x77\x0d\xc7" + "\xda\x41\x59\x7c\x51\x57\x48\x8d\x77\x24\xe0\x3f\xb8\xd8\x4a\x37" + "\x6a\x43\xb8\xf4\x15\x18\xa1\x1c\xc3\x87\xb6\x69\xb2\xee\x65\x86" + "\x9f\x07\xe7\xbe\x55\x51\x38\x7a\x98\xba\x97\x7c\x73\x2d\x08\x0d" + "\xcb\x0f\x29\xa0\x48\xe3\x65\x69\x12\xc6\x53\x3e\x32\xee\x7a\xed" + "\x29\xb7\x21\x76\x9c\xe6\x4e\x43\xd5\x71\x33\xb0\x74\xd8\x39\xd5" + "\x31\xed\x1f\x28\x51\x0a\xfb\x45\xac\xe1\x0a\x1f\x4b\x79\x4d\x6f" + } + } + }, + { + "ChaCha20 256 bit, TC2", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xc5\xd3\x0a\x7c\xe1\xec\x11\x93\x78\xc8\x4f\x48\x7d\x77\x5a\x85" + "\x42\xf1\x3e\xce\x23\x8a\x94\x55\xe8\x22\x9e\x88\x8d\xe8\x5b\xbd" + "\x29\xeb\x63\xd0\xa1\x7a\x5b\x99\x9b\x52\xda\x22\xbe\x40\x23\xeb" + "\x07\x62\x0a\x54\xf6\xfa\x6a\xd8\x73\x7b\x71\xeb\x04\x64\xda\xc0" + "\x10\xf6\x56\xe6\xd1\xfd\x55\x05\x3e\x50\xc4\x87\x5c\x99\x30\xa3" + "\x3f\x6d\x02\x63\xbd\x14\xdf\xd6\xab\x8c\x70\x52\x1c\x19\x33\x8b" + "\x23\x08\xb9\x5c\xf8\xd0\xbb\x7d\x20\x2d\x21\x02\x78\x0e\xa3\x52" + "\x8f\x1c\xb4\x85\x60\xf7\x6b\x20\xf3\x82\xb9\x42\x50\x0f\xce\xac" + } + } + }, + { + "ChaCha20 256 bit, TC3", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x01\x00\x00\x00\x00\x00\x00\x00", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xef\x3f\xdf\xd6\xc6\x15\x78\xfb\xf5\xcf\x35\xbd\x3d\xd3\x3b\x80" + "\x09\x63\x16\x34\xd2\x1e\x42\xac\x33\x96\x0b\xd1\x38\xe5\x0d\x32" + "\x11\x1e\x4c\xaf\x23\x7e\xe5\x3c\xa8\xad\x64\x26\x19\x4a\x88\x54" + "\x5d\xdc\x49\x7a\x0b\x46\x6e\x7d\x6b\xbd\xb0\x04\x1b\x2f\x58\x6b" + "\x53\x05\xe5\xe4\x4a\xff\x19\xb2\x35\x93\x61\x44\x67\x5e\xfb\xe4" + "\x40\x9e\xb7\xe8\xe5\xf1\x43\x0f\x5f\x58\x36\xae\xb4\x9b\xb5\x32" + "\x8b\x01\x7c\x4b\x9d\xc1\x1f\x8a\x03\x86\x3f\xa8\x03\xdc\x71\xd5" + "\x72\x6b\x2b\x6b\x31\xaa\x32\x70\x8a\xfe\x5a\xf1\xd6\xb6\x90\x58" + } + } + }, + { + "ChaCha20 256 bit, TC4", + GCRY_CIPHER_CHACHA20, 32, 8, + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff", + "\xff\xff\xff\xff\xff\xff\xff\xff", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xd9\xbf\x3f\x6b\xce\x6e\xd0\xb5\x42\x54\x55\x77\x67\xfb\x57\x44" + "\x3d\xd4\x77\x89\x11\xb6\x06\x05\x5c\x39\xcc\x25\xe6\x74\xb8\x36" + "\x3f\xea\xbc\x57\xfd\xe5\x4f\x79\x0c\x52\xc8\xae\x43\x24\x0b\x79" + "\xd4\x90\x42\xb7\x77\xbf\xd6\xcb\x80\xe9\x31\x27\x0b\x7f\x50\xeb" + "\x5b\xac\x2a\xcd\x86\xa8\x36\xc5\xdc\x98\xc1\x16\xc1\x21\x7e\xc3" + 
"\x1d\x3a\x63\xa9\x45\x13\x19\xf0\x97\xf3\xb4\xd6\xda\xb0\x77\x87" + "\x19\x47\x7d\x24\xd2\x4b\x40\x3a\x12\x24\x1d\x7c\xca\x06\x4f\x79" + "\x0f\x1d\x51\xcc\xaf\xf6\xb1\x66\x7d\x4b\xbc\xa1\x95\x8c\x43\x06" + } + } + }, + { + "ChaCha20 256 bit, TC5", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55" + "\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55", + "\x55\x55\x55\x55\x55\x55\x55\x55", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xbe\xa9\x41\x1a\xa4\x53\xc5\x43\x4a\x5a\xe8\xc9\x28\x62\xf5\x64" + "\x39\x68\x55\xa9\xea\x6e\x22\xd6\xd3\xb5\x0a\xe1\xb3\x66\x33\x11" + "\xa4\xa3\x60\x6c\x67\x1d\x60\x5c\xe1\x6c\x3a\xec\xe8\xe6\x1e\xa1" + "\x45\xc5\x97\x75\x01\x7b\xee\x2f\xa6\xf8\x8a\xfc\x75\x80\x69\xf7" + "\xe0\xb8\xf6\x76\xe6\x44\x21\x6f\x4d\x2a\x34\x22\xd7\xfa\x36\xc6" + "\xc4\x93\x1a\xca\x95\x0e\x9d\xa4\x27\x88\xe6\xd0\xb6\xd1\xcd\x83" + "\x8e\xf6\x52\xe9\x7b\x14\x5b\x14\x87\x1e\xae\x6c\x68\x04\xc7\x00" + "\x4d\xb5\xac\x2f\xce\x4c\x68\xc7\x26\xd0\x04\xb1\x0f\xca\xba\x86" + } + } + }, + { + "ChaCha20 256 bit, TC6", + GCRY_CIPHER_CHACHA20, 32, 8, + "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa" + "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa", + "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x9a\xa2\xa9\xf6\x56\xef\xde\x5a\xa7\x59\x1c\x5f\xed\x4b\x35\xae" + "\xa2\x89\x5d\xec\x7c\xb4\x54\x3b\x9e\x9f\x21\xf5\xe7\xbc\xbc\xf3" + "\xc4\x3c\x74\x8a\x97\x08\x88\xf8\x24\x83\x93\xa0\x9d\x43\xe0\xb7" + "\xe1\x64\xbc\x4d\x0b\x0f\xb2\x40\xa2\xd7\x21\x15\xc4\x80\x89\x06" + "\x72\x18\x44\x89\x44\x05\x45\xd0\x21\xd9\x7e\xf6\xb6\x93\xdf\xe5" + "\xb2\xc1\x32\xd4\x7e\x6f\x04\x1c\x90\x63\x65\x1f\x96\xb6\x23\xe6" + "\x2a\x11\x99\x9a\x23\xb6\xf7\xc4\x61\xb2\x15\x30\x26\xad\x5e\x86" + "\x6a\x2e\x59\x7e\xd0\x7b\x84\x01\xde\xc6\x3a\x09\x34\xc6\xb2\xa9" + } + } + }, + { + "ChaCha20 256 bit, TC7", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff" + "\xff\xee\xdd\xcc\xbb\xaa\x99\x88\x77\x66\x55\x44\x33\x22\x11\x00", + "\x0f\x1e\x2d\x3c\x4b\x5a\x69\x78", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x9f\xad\xf4\x09\xc0\x08\x11\xd0\x04\x31\xd6\x7e\xfb\xd8\x8f\xba" + "\x59\x21\x8d\x5d\x67\x08\xb1\xd6\x85\x86\x3f\xab\xbb\x0e\x96\x1e" + "\xea\x48\x0f\xd6\xfb\x53\x2b\xfd\x49\x4b\x21\x51\x01\x50\x57\x42" + "\x3a\xb6\x0a\x63\xfe\x4f\x55\xf7\xa2\x12\xe2\x16\x7c\xca\xb9\x31" + "\xfb\xfd\x29\xcf\x7b\xc1\xd2\x79\xed\xdf\x25\xdd\x31\x6b\xb8\x84" + "\x3d\x6e\xde\xe0\xbd\x1e\xf1\x21\xd1\x2f\xa1\x7c\xbc\x2c\x57\x4c" + "\xcc\xab\x5e\x27\x51\x67\xb0\x8b\xd6\x86\xf8\xa0\x9d\xf8\x7e\xc3" + "\xff\xb3\x53\x61\xb9\x4e\xbf\xa1\x3f\xec\x0e\x48\x89\xd1\x8d\xa5" + } + } + }, + { + "ChaCha20 256 bit, TC8", + GCRY_CIPHER_CHACHA20, 32, 8, + "\xc4\x6e\xc1\xb1\x8c\xe8\xa8\x78\x72\x5a\x37\xe7\x80\xdf\xb7\x35" + "\x1f\x68\xed\x2e\x19\x4c\x79\xfb\xc6\xae\xbe\xe1\xa6\x67\x97\x5d", + "\x1a\xda\x31\xd5\xcf\x68\x82\x21", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xf6\x3a\x89\xb7\x5c\x22\x71\xf9\x36\x88\x16\x54\x2b\xa5\x2f\x06" + "\xed\x49\x24\x17\x92\x30\x2b\x00\xb5\xe8\xf8\x0a\xe9\xa4\x73\xaf" + "\xc2\x5b\x21\x8f\x51\x9a\xf0\xfd\xd4\x06\x36\x2e\x8d\x69\xde\x7f" + "\x54\xc6\x04\xa6\xe0\x0f\x35\x3f\x11\x0f\x77\x1b\xdc\xa8\xab\x92" + "\xe5\xfb\xc3\x4e\x60\xa1\xd9\xa9\xdb\x17\x34\x5b\x0a\x40\x27\x36" + "\x85\x3b\xf9\x10\xb0\x60\xbd\xf1\xf8\x97\xb6\x29\x0f\x01\xd1\x38" + "\xae\x2c\x4c\x90\x22\x5b\xa9\xea\x14\xd5\x18\xf5\x59\x29\xde\xa0" + "\x98\xca\x7a\x6c\xcf\xe6\x12\x27\x05\x3c\x84\xe4\x9a\x4a\x33\x32" + }, + { 127, + "\xf6\x3a\x89\xb7\x5c\x22\x71\xf9\x36\x88\x16\x54\x2b\xa5\x2f\x06" + "\xed\x49\x24\x17\x92\x30\x2b\x00\xb5\xe8\xf8\x0a\xe9\xa4\x73\xaf" + "\xc2\x5b\x21\x8f\x51\x9a\xf0\xfd\xd4\x06\x36\x2e\x8d\x69\xde\x7f" + "\x54\xc6\x04\xa6\xe0\x0f\x35\x3f\x11\x0f\x77\x1b\xdc\xa8\xab\x92" + "\xe5\xfb\xc3\x4e\x60\xa1\xd9\xa9\xdb\x17\x34\x5b\x0a\x40\x27\x36" + "\x85\x3b\xf9\x10\xb0\x60\xbd\xf1\xf8\x97\xb6\x29\x0f\x01\xd1\x38" + "\xae\x2c\x4c\x90\x22\x5b\xa9\xea\x14\xd5\x18\xf5\x59\x29\xde\xa0" + "\x98\xca\x7a\x6c\xcf\xe6\x12\x27\x05\x3c\x84\xe4\x9a\x4a\x33", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + } + } + }, + /* from draft-nir-cfrg-chacha20-poly1305-02 */ + { + "ChaCha20 256 bit, IV96-bit", + GCRY_CIPHER_CHACHA20, 32, 12, + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", + 
"\x07\x00\x00\x00\x40\x41\x42\x43\x44\x45\x46\x47", + { + { 64, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x7b\xac\x2b\x25\x2d\xb4\x47\xaf\x09\xb6\x7a\x55\xa4\xe9\x55\x84" + "\x0a\xe1\xd6\x73\x10\x75\xd9\xeb\x2a\x93\x75\x78\x3e\xd5\x53\xff" + "\xa2\x7e\xcc\xde\xad\xdb\x4d\xb4\xd1\x17\x9c\xe4\xc9\x0b\x43\xd8" + "\xbc\xb7\x94\x8c\x4b\x4b\x7d\x8b\x7d\xf6\x27\x39\x32\xa4\x69\x16" + }, + }, + }, +#endif /*USE_CHACHA20*/ }; gcry_cipher_hd_t hde, hdd; @@ -3649,6 +3972,9 @@ check_ciphers (void) GCRY_CIPHER_SALSA20, GCRY_CIPHER_SALSA20R12, #endif +#if USE_CHACHA20 + GCRY_CIPHER_CHACHA20, +#endif 0 }; int i; From jussi.kivilinna at iki.fi Sat May 3 19:26:09 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 03 May 2014 20:26:09 +0300 Subject: [PATCH 3/3] chacha20: add AVX2/AMD64 assembly implementation In-Reply-To: <20140503172559.23791.31470.stgit@localhost6.localdomain6> References: <20140503172559.23791.31470.stgit@localhost6.localdomain6> Message-ID: <20140503172609.23791.23566.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'chacha20-avx2-amd64.S'. * cipher/chacha20-avx2-amd64.S: New. * cipher/chacha20.c (USE_AVX2): New macro. (CHACHA20_context_s): Add 'use_avx2'. [USE_AVX2] (_gcry_chacha20_amd64_avx2_blocks, chacha20_core_avx2): New. (chacha20_do_setkey): Check HW features for AVX2. (chacha20_do_encrypt_stream) [USE_AVX2]: Use AVX2 optimized implementation, if enabled. (selftest): Increase size of buf by 256. * configure.ac [host=x86-64]: Add 'chacha20-avx2-amd64.lo'. -- Add AVX2 optimized implementation for ChaCha20. Based on implementation by Andrew Moon. SSSE3 (Intel Haswell): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.742 ns/B 1284.8 MiB/s 2.38 c/B STREAM dec | 0.741 ns/B 1286.5 MiB/s 2.37 c/B AVX2: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.393 ns/B 2428.0 MiB/s 1.26 c/B STREAM dec | 0.392 ns/B 2433.6 MiB/s 1.25 c/B Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/chacha20-avx2-amd64.S | 950 ++++++++++++++++++++++++++++++++++++++++++ cipher/chacha20.c | 38 ++ configure.ac | 1 4 files changed, 989 insertions(+), 2 deletions(-) create mode 100644 cipher/chacha20-avx2-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 27ca7ac..26d13d2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -59,7 +59,7 @@ EXTRA_libcipher_la_SOURCES = \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ -chacha20.c chacha20-ssse3-amd64.S \ +chacha20.c chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S new file mode 100644 index 0000000..8a08725 --- /dev/null +++ b/cipher/chacha20-avx2-amd64.S @@ -0,0 +1,950 @@ +/* chacha20-avx2-amd64.S - AMD64/AVX2 implementation of ChaCha20 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/chacha-opt + */ + +#ifdef __x86_64__ +#include + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && USE_CHACHA20 + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +.text + +.align 8 +.globl _gcry_chacha20_amd64_avx2_blocks +.type _gcry_chacha20_amd64_avx2_blocks, at function; +_gcry_chacha20_amd64_avx2_blocks: +.Lchacha_blocks_avx2_local: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq %rsp, %rbp + andq $~63, %rsp + subq $512, %rsp + leaq .LC RIP, %rax + vmovdqu 0(%rax), %xmm6 + vmovdqu 16(%rax), %xmm7 + vmovdqu 0(%rdi), %xmm8 + vmovdqu 16(%rdi), %xmm9 + vmovdqu 32(%rdi), %xmm10 + vmovdqu 48(%rdi), %xmm11 + movl $20, %eax + movq $1, %r9 + vmovdqa %xmm8, 0(%rsp) + vmovdqa %xmm9, 16(%rsp) + vmovdqa %xmm10, 32(%rsp) + vmovdqa %xmm11, 48(%rsp) + movq %rax, 64(%rsp) + vmovdqa %xmm6, 448(%rsp) + vmovdqa %xmm6, 464(%rsp) + vmovdqa %xmm7, 480(%rsp) + vmovdqa %xmm7, 496(%rsp) + cmpq $512, %rcx + jae .Lchacha_blocks_avx2_atleast512 + cmp $256, %rcx + jae .Lchacha_blocks_avx2_atleast256 + jmp .Lchacha_blocks_avx2_below256 + .p2align 6,,63 +.Lchacha_blocks_avx2_atleast512: + movq 48(%rsp), %rax + leaq 1(%rax), %r8 + leaq 2(%rax), %r9 + leaq 3(%rax), %r10 + leaq 4(%rax), %rbx + leaq 5(%rax), %r11 + leaq 6(%rax), %r12 + leaq 7(%rax), %r13 + leaq 8(%rax), %r14 + movl %eax, 128(%rsp) + movl %r8d, 4+128(%rsp) + movl %r9d, 8+128(%rsp) + movl %r10d, 12+128(%rsp) + movl %ebx, 16+128(%rsp) + movl %r11d, 20+128(%rsp) + movl %r12d, 24+128(%rsp) + movl %r13d, 28+128(%rsp) + shrq $32, %rax + shrq $32, %r8 + shrq $32, %r9 + shrq $32, %r10 + shrq $32, %rbx + shrq $32, %r11 + shrq $32, %r12 + shrq $32, %r13 + movl %eax, 160(%rsp) + movl %r8d, 4+160(%rsp) + movl %r9d, 8+160(%rsp) + movl %r10d, 12+160(%rsp) + movl %ebx, 16+160(%rsp) + movl %r11d, 20+160(%rsp) + movl %r12d, 24+160(%rsp) + movl %r13d, 28+160(%rsp) + movq %r14, 48(%rsp) + movq 64(%rsp), %rax + vpbroadcastd 0(%rsp), %ymm0 + vpbroadcastd 4+0(%rsp), %ymm1 + vpbroadcastd 8+0(%rsp), %ymm2 + vpbroadcastd 12+0(%rsp), %ymm3 + vpbroadcastd 16(%rsp), %ymm4 + vpbroadcastd 4+16(%rsp), %ymm5 + vpbroadcastd 8+16(%rsp), %ymm6 + vpbroadcastd 12+16(%rsp), %ymm7 + vpbroadcastd 32(%rsp), %ymm8 + vpbroadcastd 4+32(%rsp), %ymm9 + vpbroadcastd 8+32(%rsp), %ymm10 + vpbroadcastd 12+32(%rsp), %ymm11 + vpbroadcastd 8+48(%rsp), %ymm14 + vpbroadcastd 12+48(%rsp), %ymm15 + vmovdqa 128(%rsp), %ymm12 + vmovdqa 160(%rsp), %ymm13 +.Lchacha_blocks_avx2_mainloop1: + vpaddd %ymm0, %ymm4, %ymm0 + vpaddd %ymm1, %ymm5, %ymm1 + vpxor %ymm12, %ymm0, %ymm12 + vpxor %ymm13, %ymm1, %ymm13 + vpaddd %ymm2, %ymm6, %ymm2 + vpaddd %ymm3, %ymm7, %ymm3 + vpxor %ymm14, %ymm2, %ymm14 + vpxor %ymm15, %ymm3, %ymm15 + vpshufb 448(%rsp), %ymm12, %ymm12 + vpshufb 448(%rsp), %ymm13, %ymm13 + vpaddd %ymm8, %ymm12, %ymm8 + vpaddd %ymm9, %ymm13, %ymm9 + vpshufb 448(%rsp), %ymm14, %ymm14 + vpshufb 448(%rsp), %ymm15, %ymm15 + vpaddd %ymm10, %ymm14, %ymm10 + vpaddd %ymm11, %ymm15, %ymm11 + vmovdqa %ymm12, 96(%rsp) + vpxor %ymm4, %ymm8, %ymm4 + vpxor 
%ymm5, %ymm9, %ymm5 + vpslld $ 12, %ymm4, %ymm12 + vpsrld $20, %ymm4, %ymm4 + vpxor %ymm4, %ymm12, %ymm4 + vpslld $ 12, %ymm5, %ymm12 + vpsrld $20, %ymm5, %ymm5 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm6, %ymm10, %ymm6 + vpxor %ymm7, %ymm11, %ymm7 + vpslld $ 12, %ymm6, %ymm12 + vpsrld $20, %ymm6, %ymm6 + vpxor %ymm6, %ymm12, %ymm6 + vpslld $ 12, %ymm7, %ymm12 + vpsrld $20, %ymm7, %ymm7 + vpxor %ymm7, %ymm12, %ymm7 + vpaddd %ymm0, %ymm4, %ymm0 + vpaddd %ymm1, %ymm5, %ymm1 + vpxor 96(%rsp), %ymm0, %ymm12 + vpxor %ymm13, %ymm1, %ymm13 + vpaddd %ymm2, %ymm6, %ymm2 + vpaddd %ymm3, %ymm7, %ymm3 + vpxor %ymm14, %ymm2, %ymm14 + vpxor %ymm15, %ymm3, %ymm15 + vpshufb 480(%rsp), %ymm12, %ymm12 + vpshufb 480(%rsp), %ymm13, %ymm13 + vpaddd %ymm8, %ymm12, %ymm8 + vpaddd %ymm9, %ymm13, %ymm9 + vpshufb 480(%rsp), %ymm14, %ymm14 + vpshufb 480(%rsp), %ymm15, %ymm15 + vpaddd %ymm10, %ymm14, %ymm10 + vpaddd %ymm11, %ymm15, %ymm11 + vmovdqa %ymm12, 96(%rsp) + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm5, %ymm9, %ymm5 + vpslld $ 7, %ymm4, %ymm12 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm4, %ymm12, %ymm4 + vpslld $ 7, %ymm5, %ymm12 + vpsrld $25, %ymm5, %ymm5 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm6, %ymm10, %ymm6 + vpxor %ymm7, %ymm11, %ymm7 + vpslld $ 7, %ymm6, %ymm12 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm6, %ymm12, %ymm6 + vpslld $ 7, %ymm7, %ymm12 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm7, %ymm12, %ymm7 + vpaddd %ymm0, %ymm5, %ymm0 + vpaddd %ymm1, %ymm6, %ymm1 + vpxor %ymm15, %ymm0, %ymm15 + vpxor 96(%rsp), %ymm1, %ymm12 + vpaddd %ymm2, %ymm7, %ymm2 + vpaddd %ymm3, %ymm4, %ymm3 + vpxor %ymm13, %ymm2, %ymm13 + vpxor %ymm14, %ymm3, %ymm14 + vpshufb 448(%rsp), %ymm15, %ymm15 + vpshufb 448(%rsp), %ymm12, %ymm12 + vpaddd %ymm10, %ymm15, %ymm10 + vpaddd %ymm11, %ymm12, %ymm11 + vpshufb 448(%rsp), %ymm13, %ymm13 + vpshufb 448(%rsp), %ymm14, %ymm14 + vpaddd %ymm8, %ymm13, %ymm8 + vpaddd %ymm9, %ymm14, %ymm9 + vmovdqa %ymm15, 96(%rsp) + vpxor %ymm5, %ymm10, %ymm5 + vpxor %ymm6, %ymm11, %ymm6 + vpslld $ 12, %ymm5, %ymm15 + vpsrld $20, %ymm5, %ymm5 + vpxor %ymm5, %ymm15, %ymm5 + vpslld $ 12, %ymm6, %ymm15 + vpsrld $20, %ymm6, %ymm6 + vpxor %ymm6, %ymm15, %ymm6 + vpxor %ymm7, %ymm8, %ymm7 + vpxor %ymm4, %ymm9, %ymm4 + vpslld $ 12, %ymm7, %ymm15 + vpsrld $20, %ymm7, %ymm7 + vpxor %ymm7, %ymm15, %ymm7 + vpslld $ 12, %ymm4, %ymm15 + vpsrld $20, %ymm4, %ymm4 + vpxor %ymm4, %ymm15, %ymm4 + vpaddd %ymm0, %ymm5, %ymm0 + vpaddd %ymm1, %ymm6, %ymm1 + vpxor 96(%rsp), %ymm0, %ymm15 + vpxor %ymm12, %ymm1, %ymm12 + vpaddd %ymm2, %ymm7, %ymm2 + vpaddd %ymm3, %ymm4, %ymm3 + vpxor %ymm13, %ymm2, %ymm13 + vpxor %ymm14, %ymm3, %ymm14 + vpshufb 480(%rsp), %ymm15, %ymm15 + vpshufb 480(%rsp), %ymm12, %ymm12 + vpaddd %ymm10, %ymm15, %ymm10 + vpaddd %ymm11, %ymm12, %ymm11 + vpshufb 480(%rsp), %ymm13, %ymm13 + vpshufb 480(%rsp), %ymm14, %ymm14 + vpaddd %ymm8, %ymm13, %ymm8 + vpaddd %ymm9, %ymm14, %ymm9 + vmovdqa %ymm15, 96(%rsp) + vpxor %ymm5, %ymm10, %ymm5 + vpxor %ymm6, %ymm11, %ymm6 + vpslld $ 7, %ymm5, %ymm15 + vpsrld $25, %ymm5, %ymm5 + vpxor %ymm5, %ymm15, %ymm5 + vpslld $ 7, %ymm6, %ymm15 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm6, %ymm15, %ymm6 + vpxor %ymm7, %ymm8, %ymm7 + vpxor %ymm4, %ymm9, %ymm4 + vpslld $ 7, %ymm7, %ymm15 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm7, %ymm15, %ymm7 + vpslld $ 7, %ymm4, %ymm15 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm4, %ymm15, %ymm4 + vmovdqa 96(%rsp), %ymm15 + subq $2, %rax + jnz .Lchacha_blocks_avx2_mainloop1 + vmovdqa %ymm8, 192(%rsp) + vmovdqa %ymm9, 224(%rsp) + vmovdqa %ymm10, 256(%rsp) + vmovdqa %ymm11, 
288(%rsp) + vmovdqa %ymm12, 320(%rsp) + vmovdqa %ymm13, 352(%rsp) + vmovdqa %ymm14, 384(%rsp) + vmovdqa %ymm15, 416(%rsp) + vpbroadcastd 0(%rsp), %ymm8 + vpbroadcastd 4+0(%rsp), %ymm9 + vpbroadcastd 8+0(%rsp), %ymm10 + vpbroadcastd 12+0(%rsp), %ymm11 + vpbroadcastd 16(%rsp), %ymm12 + vpbroadcastd 4+16(%rsp), %ymm13 + vpbroadcastd 8+16(%rsp), %ymm14 + vpbroadcastd 12+16(%rsp), %ymm15 + vpaddd %ymm8, %ymm0, %ymm0 + vpaddd %ymm9, %ymm1, %ymm1 + vpaddd %ymm10, %ymm2, %ymm2 + vpaddd %ymm11, %ymm3, %ymm3 + vpaddd %ymm12, %ymm4, %ymm4 + vpaddd %ymm13, %ymm5, %ymm5 + vpaddd %ymm14, %ymm6, %ymm6 + vpaddd %ymm15, %ymm7, %ymm7 + vpunpckldq %ymm1, %ymm0, %ymm8 + vpunpckldq %ymm3, %ymm2, %ymm9 + vpunpckhdq %ymm1, %ymm0, %ymm12 + vpunpckhdq %ymm3, %ymm2, %ymm13 + vpunpckldq %ymm5, %ymm4, %ymm10 + vpunpckldq %ymm7, %ymm6, %ymm11 + vpunpckhdq %ymm5, %ymm4, %ymm14 + vpunpckhdq %ymm7, %ymm6, %ymm15 + vpunpcklqdq %ymm9, %ymm8, %ymm0 + vpunpcklqdq %ymm11, %ymm10, %ymm1 + vpunpckhqdq %ymm9, %ymm8, %ymm2 + vpunpckhqdq %ymm11, %ymm10, %ymm3 + vpunpcklqdq %ymm13, %ymm12, %ymm4 + vpunpcklqdq %ymm15, %ymm14, %ymm5 + vpunpckhqdq %ymm13, %ymm12, %ymm6 + vpunpckhqdq %ymm15, %ymm14, %ymm7 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 + vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 + vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 + vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 + vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 + andq %rsi, %rsi + jz .Lchacha_blocks_avx2_noinput1 + vpxor 0(%rsi), %ymm8, %ymm8 + vpxor 64(%rsi), %ymm9, %ymm9 + vpxor 128(%rsi), %ymm10, %ymm10 + vpxor 192(%rsi), %ymm11, %ymm11 + vpxor 256(%rsi), %ymm12, %ymm12 + vpxor 320(%rsi), %ymm13, %ymm13 + vpxor 384(%rsi), %ymm14, %ymm14 + vpxor 448(%rsi), %ymm15, %ymm15 + vmovdqu %ymm8, 0(%rdx) + vmovdqu %ymm9, 64(%rdx) + vmovdqu %ymm10, 128(%rdx) + vmovdqu %ymm11, 192(%rdx) + vmovdqu %ymm12, 256(%rdx) + vmovdqu %ymm13, 320(%rdx) + vmovdqu %ymm14, 384(%rdx) + vmovdqu %ymm15, 448(%rdx) + vmovdqa 192(%rsp), %ymm0 + vmovdqa 224(%rsp), %ymm1 + vmovdqa 256(%rsp), %ymm2 + vmovdqa 288(%rsp), %ymm3 + vmovdqa 320(%rsp), %ymm4 + vmovdqa 352(%rsp), %ymm5 + vmovdqa 384(%rsp), %ymm6 + vmovdqa 416(%rsp), %ymm7 + vpbroadcastd 32(%rsp), %ymm8 + vpbroadcastd 4+32(%rsp), %ymm9 + vpbroadcastd 8+32(%rsp), %ymm10 + vpbroadcastd 12+32(%rsp), %ymm11 + vmovdqa 128(%rsp), %ymm12 + vmovdqa 160(%rsp), %ymm13 + vpbroadcastd 8+48(%rsp), %ymm14 + vpbroadcastd 12+48(%rsp), %ymm15 + vpaddd %ymm8, %ymm0, %ymm0 + vpaddd %ymm9, %ymm1, %ymm1 + vpaddd %ymm10, %ymm2, %ymm2 + vpaddd %ymm11, %ymm3, %ymm3 + vpaddd %ymm12, %ymm4, %ymm4 + vpaddd %ymm13, %ymm5, %ymm5 + vpaddd %ymm14, %ymm6, %ymm6 + vpaddd %ymm15, %ymm7, %ymm7 + vpunpckldq %ymm1, %ymm0, %ymm8 + vpunpckldq %ymm3, %ymm2, %ymm9 + vpunpckhdq %ymm1, %ymm0, %ymm12 + vpunpckhdq %ymm3, %ymm2, %ymm13 + vpunpckldq %ymm5, %ymm4, %ymm10 + vpunpckldq %ymm7, %ymm6, %ymm11 + vpunpckhdq %ymm5, %ymm4, %ymm14 + vpunpckhdq %ymm7, %ymm6, %ymm15 + vpunpcklqdq %ymm9, %ymm8, %ymm0 + vpunpcklqdq %ymm11, %ymm10, %ymm1 + vpunpckhqdq %ymm9, %ymm8, %ymm2 + vpunpckhqdq %ymm11, %ymm10, %ymm3 + vpunpcklqdq %ymm13, %ymm12, %ymm4 + vpunpcklqdq %ymm15, %ymm14, %ymm5 + vpunpckhqdq %ymm13, %ymm12, %ymm6 + vpunpckhqdq %ymm15, %ymm14, %ymm7 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 + vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 + vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 + 
vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 + vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 + vpxor 32(%rsi), %ymm8, %ymm8 + vpxor 96(%rsi), %ymm9, %ymm9 + vpxor 160(%rsi), %ymm10, %ymm10 + vpxor 224(%rsi), %ymm11, %ymm11 + vpxor 288(%rsi), %ymm12, %ymm12 + vpxor 352(%rsi), %ymm13, %ymm13 + vpxor 416(%rsi), %ymm14, %ymm14 + vpxor 480(%rsi), %ymm15, %ymm15 + vmovdqu %ymm8, 32(%rdx) + vmovdqu %ymm9, 96(%rdx) + vmovdqu %ymm10, 160(%rdx) + vmovdqu %ymm11, 224(%rdx) + vmovdqu %ymm12, 288(%rdx) + vmovdqu %ymm13, 352(%rdx) + vmovdqu %ymm14, 416(%rdx) + vmovdqu %ymm15, 480(%rdx) + addq $512, %rsi + jmp .Lchacha_blocks_avx2_mainloop1_cont +.Lchacha_blocks_avx2_noinput1: + vmovdqu %ymm8, 0(%rdx) + vmovdqu %ymm9, 64(%rdx) + vmovdqu %ymm10, 128(%rdx) + vmovdqu %ymm11, 192(%rdx) + vmovdqu %ymm12, 256(%rdx) + vmovdqu %ymm13, 320(%rdx) + vmovdqu %ymm14, 384(%rdx) + vmovdqu %ymm15, 448(%rdx) + vmovdqa 192(%rsp), %ymm0 + vmovdqa 224(%rsp), %ymm1 + vmovdqa 256(%rsp), %ymm2 + vmovdqa 288(%rsp), %ymm3 + vmovdqa 320(%rsp), %ymm4 + vmovdqa 352(%rsp), %ymm5 + vmovdqa 384(%rsp), %ymm6 + vmovdqa 416(%rsp), %ymm7 + vpbroadcastd 32(%rsp), %ymm8 + vpbroadcastd 4+32(%rsp), %ymm9 + vpbroadcastd 8+32(%rsp), %ymm10 + vpbroadcastd 12+32(%rsp), %ymm11 + vmovdqa 128(%rsp), %ymm12 + vmovdqa 160(%rsp), %ymm13 + vpbroadcastd 8+48(%rsp), %ymm14 + vpbroadcastd 12+48(%rsp), %ymm15 + vpaddd %ymm8, %ymm0, %ymm0 + vpaddd %ymm9, %ymm1, %ymm1 + vpaddd %ymm10, %ymm2, %ymm2 + vpaddd %ymm11, %ymm3, %ymm3 + vpaddd %ymm12, %ymm4, %ymm4 + vpaddd %ymm13, %ymm5, %ymm5 + vpaddd %ymm14, %ymm6, %ymm6 + vpaddd %ymm15, %ymm7, %ymm7 + vpunpckldq %ymm1, %ymm0, %ymm8 + vpunpckldq %ymm3, %ymm2, %ymm9 + vpunpckhdq %ymm1, %ymm0, %ymm12 + vpunpckhdq %ymm3, %ymm2, %ymm13 + vpunpckldq %ymm5, %ymm4, %ymm10 + vpunpckldq %ymm7, %ymm6, %ymm11 + vpunpckhdq %ymm5, %ymm4, %ymm14 + vpunpckhdq %ymm7, %ymm6, %ymm15 + vpunpcklqdq %ymm9, %ymm8, %ymm0 + vpunpcklqdq %ymm11, %ymm10, %ymm1 + vpunpckhqdq %ymm9, %ymm8, %ymm2 + vpunpckhqdq %ymm11, %ymm10, %ymm3 + vpunpcklqdq %ymm13, %ymm12, %ymm4 + vpunpcklqdq %ymm15, %ymm14, %ymm5 + vpunpckhqdq %ymm13, %ymm12, %ymm6 + vpunpckhqdq %ymm15, %ymm14, %ymm7 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 + vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 + vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 + vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 + vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 + vmovdqu %ymm8, 32(%rdx) + vmovdqu %ymm9, 96(%rdx) + vmovdqu %ymm10, 160(%rdx) + vmovdqu %ymm11, 224(%rdx) + vmovdqu %ymm12, 288(%rdx) + vmovdqu %ymm13, 352(%rdx) + vmovdqu %ymm14, 416(%rdx) + vmovdqu %ymm15, 480(%rdx) +.Lchacha_blocks_avx2_mainloop1_cont: + addq $512, %rdx + subq $512, %rcx + cmp $512, %rcx + jae .Lchacha_blocks_avx2_atleast512 + cmp $256, %rcx + jb .Lchacha_blocks_avx2_below256_fixup +.Lchacha_blocks_avx2_atleast256: + movq 48(%rsp), %rax + leaq 1(%rax), %r8 + leaq 2(%rax), %r9 + leaq 3(%rax), %r10 + leaq 4(%rax), %rbx + movl %eax, 128(%rsp) + movl %r8d, 4+128(%rsp) + movl %r9d, 8+128(%rsp) + movl %r10d, 12+128(%rsp) + shrq $32, %rax + shrq $32, %r8 + shrq $32, %r9 + shrq $32, %r10 + movl %eax, 160(%rsp) + movl %r8d, 4+160(%rsp) + movl %r9d, 8+160(%rsp) + movl %r10d, 12+160(%rsp) + movq %rbx, 48(%rsp) + movq 64(%rsp), %rax + vpbroadcastd 0(%rsp), %xmm0 + vpbroadcastd 4+0(%rsp), %xmm1 + vpbroadcastd 8+0(%rsp), %xmm2 + vpbroadcastd 12+0(%rsp), %xmm3 + vpbroadcastd 16(%rsp), %xmm4 + vpbroadcastd 4+16(%rsp), %xmm5 + vpbroadcastd 8+16(%rsp), %xmm6 + 
vpbroadcastd 12+16(%rsp), %xmm7 + vpbroadcastd 32(%rsp), %xmm8 + vpbroadcastd 4+32(%rsp), %xmm9 + vpbroadcastd 8+32(%rsp), %xmm10 + vpbroadcastd 12+32(%rsp), %xmm11 + vmovdqa 128(%rsp), %xmm12 + vmovdqa 160(%rsp), %xmm13 + vpbroadcastd 8+48(%rsp), %xmm14 + vpbroadcastd 12+48(%rsp), %xmm15 +.Lchacha_blocks_avx2_mainloop2: + vpaddd %xmm0, %xmm4, %xmm0 + vpaddd %xmm1, %xmm5, %xmm1 + vpxor %xmm12, %xmm0, %xmm12 + vpxor %xmm13, %xmm1, %xmm13 + vpaddd %xmm2, %xmm6, %xmm2 + vpaddd %xmm3, %xmm7, %xmm3 + vpxor %xmm14, %xmm2, %xmm14 + vpxor %xmm15, %xmm3, %xmm15 + vpshufb 448(%rsp), %xmm12, %xmm12 + vpshufb 448(%rsp), %xmm13, %xmm13 + vpaddd %xmm8, %xmm12, %xmm8 + vpaddd %xmm9, %xmm13, %xmm9 + vpshufb 448(%rsp), %xmm14, %xmm14 + vpshufb 448(%rsp), %xmm15, %xmm15 + vpaddd %xmm10, %xmm14, %xmm10 + vpaddd %xmm11, %xmm15, %xmm11 + vmovdqa %xmm12, 96(%rsp) + vpxor %xmm4, %xmm8, %xmm4 + vpxor %xmm5, %xmm9, %xmm5 + vpslld $ 12, %xmm4, %xmm12 + vpsrld $20, %xmm4, %xmm4 + vpxor %xmm4, %xmm12, %xmm4 + vpslld $ 12, %xmm5, %xmm12 + vpsrld $20, %xmm5, %xmm5 + vpxor %xmm5, %xmm12, %xmm5 + vpxor %xmm6, %xmm10, %xmm6 + vpxor %xmm7, %xmm11, %xmm7 + vpslld $ 12, %xmm6, %xmm12 + vpsrld $20, %xmm6, %xmm6 + vpxor %xmm6, %xmm12, %xmm6 + vpslld $ 12, %xmm7, %xmm12 + vpsrld $20, %xmm7, %xmm7 + vpxor %xmm7, %xmm12, %xmm7 + vpaddd %xmm0, %xmm4, %xmm0 + vpaddd %xmm1, %xmm5, %xmm1 + vpxor 96(%rsp), %xmm0, %xmm12 + vpxor %xmm13, %xmm1, %xmm13 + vpaddd %xmm2, %xmm6, %xmm2 + vpaddd %xmm3, %xmm7, %xmm3 + vpxor %xmm14, %xmm2, %xmm14 + vpxor %xmm15, %xmm3, %xmm15 + vpshufb 480(%rsp), %xmm12, %xmm12 + vpshufb 480(%rsp), %xmm13, %xmm13 + vpaddd %xmm8, %xmm12, %xmm8 + vpaddd %xmm9, %xmm13, %xmm9 + vpshufb 480(%rsp), %xmm14, %xmm14 + vpshufb 480(%rsp), %xmm15, %xmm15 + vpaddd %xmm10, %xmm14, %xmm10 + vpaddd %xmm11, %xmm15, %xmm11 + vmovdqa %xmm12, 96(%rsp) + vpxor %xmm4, %xmm8, %xmm4 + vpxor %xmm5, %xmm9, %xmm5 + vpslld $ 7, %xmm4, %xmm12 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm4, %xmm12, %xmm4 + vpslld $ 7, %xmm5, %xmm12 + vpsrld $25, %xmm5, %xmm5 + vpxor %xmm5, %xmm12, %xmm5 + vpxor %xmm6, %xmm10, %xmm6 + vpxor %xmm7, %xmm11, %xmm7 + vpslld $ 7, %xmm6, %xmm12 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm6, %xmm12, %xmm6 + vpslld $ 7, %xmm7, %xmm12 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm7, %xmm12, %xmm7 + vpaddd %xmm0, %xmm5, %xmm0 + vpaddd %xmm1, %xmm6, %xmm1 + vpxor %xmm15, %xmm0, %xmm15 + vpxor 96(%rsp), %xmm1, %xmm12 + vpaddd %xmm2, %xmm7, %xmm2 + vpaddd %xmm3, %xmm4, %xmm3 + vpxor %xmm13, %xmm2, %xmm13 + vpxor %xmm14, %xmm3, %xmm14 + vpshufb 448(%rsp), %xmm15, %xmm15 + vpshufb 448(%rsp), %xmm12, %xmm12 + vpaddd %xmm10, %xmm15, %xmm10 + vpaddd %xmm11, %xmm12, %xmm11 + vpshufb 448(%rsp), %xmm13, %xmm13 + vpshufb 448(%rsp), %xmm14, %xmm14 + vpaddd %xmm8, %xmm13, %xmm8 + vpaddd %xmm9, %xmm14, %xmm9 + vmovdqa %xmm15, 96(%rsp) + vpxor %xmm5, %xmm10, %xmm5 + vpxor %xmm6, %xmm11, %xmm6 + vpslld $ 12, %xmm5, %xmm15 + vpsrld $20, %xmm5, %xmm5 + vpxor %xmm5, %xmm15, %xmm5 + vpslld $ 12, %xmm6, %xmm15 + vpsrld $20, %xmm6, %xmm6 + vpxor %xmm6, %xmm15, %xmm6 + vpxor %xmm7, %xmm8, %xmm7 + vpxor %xmm4, %xmm9, %xmm4 + vpslld $ 12, %xmm7, %xmm15 + vpsrld $20, %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm7 + vpslld $ 12, %xmm4, %xmm15 + vpsrld $20, %xmm4, %xmm4 + vpxor %xmm4, %xmm15, %xmm4 + vpaddd %xmm0, %xmm5, %xmm0 + vpaddd %xmm1, %xmm6, %xmm1 + vpxor 96(%rsp), %xmm0, %xmm15 + vpxor %xmm12, %xmm1, %xmm12 + vpaddd %xmm2, %xmm7, %xmm2 + vpaddd %xmm3, %xmm4, %xmm3 + vpxor %xmm13, %xmm2, %xmm13 + vpxor %xmm14, %xmm3, %xmm14 + vpshufb 480(%rsp), %xmm15, %xmm15 + 
vpshufb 480(%rsp), %xmm12, %xmm12 + vpaddd %xmm10, %xmm15, %xmm10 + vpaddd %xmm11, %xmm12, %xmm11 + vpshufb 480(%rsp), %xmm13, %xmm13 + vpshufb 480(%rsp), %xmm14, %xmm14 + vpaddd %xmm8, %xmm13, %xmm8 + vpaddd %xmm9, %xmm14, %xmm9 + vmovdqa %xmm15, 96(%rsp) + vpxor %xmm5, %xmm10, %xmm5 + vpxor %xmm6, %xmm11, %xmm6 + vpslld $ 7, %xmm5, %xmm15 + vpsrld $25, %xmm5, %xmm5 + vpxor %xmm5, %xmm15, %xmm5 + vpslld $ 7, %xmm6, %xmm15 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm6, %xmm15, %xmm6 + vpxor %xmm7, %xmm8, %xmm7 + vpxor %xmm4, %xmm9, %xmm4 + vpslld $ 7, %xmm7, %xmm15 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm7 + vpslld $ 7, %xmm4, %xmm15 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm4, %xmm15, %xmm4 + vmovdqa 96(%rsp), %xmm15 + subq $2, %rax + jnz .Lchacha_blocks_avx2_mainloop2 + vmovdqa %xmm8, 192(%rsp) + vmovdqa %xmm9, 208(%rsp) + vmovdqa %xmm10, 224(%rsp) + vmovdqa %xmm11, 240(%rsp) + vmovdqa %xmm12, 256(%rsp) + vmovdqa %xmm13, 272(%rsp) + vmovdqa %xmm14, 288(%rsp) + vmovdqa %xmm15, 304(%rsp) + vpbroadcastd 0(%rsp), %xmm8 + vpbroadcastd 4+0(%rsp), %xmm9 + vpbroadcastd 8+0(%rsp), %xmm10 + vpbroadcastd 12+0(%rsp), %xmm11 + vpbroadcastd 16(%rsp), %xmm12 + vpbroadcastd 4+16(%rsp), %xmm13 + vpbroadcastd 8+16(%rsp), %xmm14 + vpbroadcastd 12+16(%rsp), %xmm15 + vpaddd %xmm8, %xmm0, %xmm0 + vpaddd %xmm9, %xmm1, %xmm1 + vpaddd %xmm10, %xmm2, %xmm2 + vpaddd %xmm11, %xmm3, %xmm3 + vpaddd %xmm12, %xmm4, %xmm4 + vpaddd %xmm13, %xmm5, %xmm5 + vpaddd %xmm14, %xmm6, %xmm6 + vpaddd %xmm15, %xmm7, %xmm7 + vpunpckldq %xmm1, %xmm0, %xmm8 + vpunpckldq %xmm3, %xmm2, %xmm9 + vpunpckhdq %xmm1, %xmm0, %xmm12 + vpunpckhdq %xmm3, %xmm2, %xmm13 + vpunpckldq %xmm5, %xmm4, %xmm10 + vpunpckldq %xmm7, %xmm6, %xmm11 + vpunpckhdq %xmm5, %xmm4, %xmm14 + vpunpckhdq %xmm7, %xmm6, %xmm15 + vpunpcklqdq %xmm9, %xmm8, %xmm0 + vpunpcklqdq %xmm11, %xmm10, %xmm1 + vpunpckhqdq %xmm9, %xmm8, %xmm2 + vpunpckhqdq %xmm11, %xmm10, %xmm3 + vpunpcklqdq %xmm13, %xmm12, %xmm4 + vpunpcklqdq %xmm15, %xmm14, %xmm5 + vpunpckhqdq %xmm13, %xmm12, %xmm6 + vpunpckhqdq %xmm15, %xmm14, %xmm7 + andq %rsi, %rsi + jz .Lchacha_blocks_avx2_noinput2 + vpxor 0(%rsi), %xmm0, %xmm0 + vpxor 16(%rsi), %xmm1, %xmm1 + vpxor 64(%rsi), %xmm2, %xmm2 + vpxor 80(%rsi), %xmm3, %xmm3 + vpxor 128(%rsi), %xmm4, %xmm4 + vpxor 144(%rsi), %xmm5, %xmm5 + vpxor 192(%rsi), %xmm6, %xmm6 + vpxor 208(%rsi), %xmm7, %xmm7 + vmovdqu %xmm0, 0(%rdx) + vmovdqu %xmm1, 16(%rdx) + vmovdqu %xmm2, 64(%rdx) + vmovdqu %xmm3, 80(%rdx) + vmovdqu %xmm4, 128(%rdx) + vmovdqu %xmm5, 144(%rdx) + vmovdqu %xmm6, 192(%rdx) + vmovdqu %xmm7, 208(%rdx) + vmovdqa 192(%rsp), %xmm0 + vmovdqa 208(%rsp), %xmm1 + vmovdqa 224(%rsp), %xmm2 + vmovdqa 240(%rsp), %xmm3 + vmovdqa 256(%rsp), %xmm4 + vmovdqa 272(%rsp), %xmm5 + vmovdqa 288(%rsp), %xmm6 + vmovdqa 304(%rsp), %xmm7 + vpbroadcastd 32(%rsp), %xmm8 + vpbroadcastd 4+32(%rsp), %xmm9 + vpbroadcastd 8+32(%rsp), %xmm10 + vpbroadcastd 12+32(%rsp), %xmm11 + vmovdqa 128(%rsp), %xmm12 + vmovdqa 160(%rsp), %xmm13 + vpbroadcastd 8+48(%rsp), %xmm14 + vpbroadcastd 12+48(%rsp), %xmm15 + vpaddd %xmm8, %xmm0, %xmm0 + vpaddd %xmm9, %xmm1, %xmm1 + vpaddd %xmm10, %xmm2, %xmm2 + vpaddd %xmm11, %xmm3, %xmm3 + vpaddd %xmm12, %xmm4, %xmm4 + vpaddd %xmm13, %xmm5, %xmm5 + vpaddd %xmm14, %xmm6, %xmm6 + vpaddd %xmm15, %xmm7, %xmm7 + vpunpckldq %xmm1, %xmm0, %xmm8 + vpunpckldq %xmm3, %xmm2, %xmm9 + vpunpckhdq %xmm1, %xmm0, %xmm12 + vpunpckhdq %xmm3, %xmm2, %xmm13 + vpunpckldq %xmm5, %xmm4, %xmm10 + vpunpckldq %xmm7, %xmm6, %xmm11 + vpunpckhdq %xmm5, %xmm4, %xmm14 + vpunpckhdq %xmm7, %xmm6, 
%xmm15 + vpunpcklqdq %xmm9, %xmm8, %xmm0 + vpunpcklqdq %xmm11, %xmm10, %xmm1 + vpunpckhqdq %xmm9, %xmm8, %xmm2 + vpunpckhqdq %xmm11, %xmm10, %xmm3 + vpunpcklqdq %xmm13, %xmm12, %xmm4 + vpunpcklqdq %xmm15, %xmm14, %xmm5 + vpunpckhqdq %xmm13, %xmm12, %xmm6 + vpunpckhqdq %xmm15, %xmm14, %xmm7 + vpxor 32(%rsi), %xmm0, %xmm0 + vpxor 48(%rsi), %xmm1, %xmm1 + vpxor 96(%rsi), %xmm2, %xmm2 + vpxor 112(%rsi), %xmm3, %xmm3 + vpxor 160(%rsi), %xmm4, %xmm4 + vpxor 176(%rsi), %xmm5, %xmm5 + vpxor 224(%rsi), %xmm6, %xmm6 + vpxor 240(%rsi), %xmm7, %xmm7 + vmovdqu %xmm0, 32(%rdx) + vmovdqu %xmm1, 48(%rdx) + vmovdqu %xmm2, 96(%rdx) + vmovdqu %xmm3, 112(%rdx) + vmovdqu %xmm4, 160(%rdx) + vmovdqu %xmm5, 176(%rdx) + vmovdqu %xmm6, 224(%rdx) + vmovdqu %xmm7, 240(%rdx) + addq $256, %rsi + jmp .Lchacha_blocks_avx2_mainloop2_cont +.Lchacha_blocks_avx2_noinput2: + vmovdqu %xmm0, 0(%rdx) + vmovdqu %xmm1, 16(%rdx) + vmovdqu %xmm2, 64(%rdx) + vmovdqu %xmm3, 80(%rdx) + vmovdqu %xmm4, 128(%rdx) + vmovdqu %xmm5, 144(%rdx) + vmovdqu %xmm6, 192(%rdx) + vmovdqu %xmm7, 208(%rdx) + vmovdqa 192(%rsp), %xmm0 + vmovdqa 208(%rsp), %xmm1 + vmovdqa 224(%rsp), %xmm2 + vmovdqa 240(%rsp), %xmm3 + vmovdqa 256(%rsp), %xmm4 + vmovdqa 272(%rsp), %xmm5 + vmovdqa 288(%rsp), %xmm6 + vmovdqa 304(%rsp), %xmm7 + vpbroadcastd 32(%rsp), %xmm8 + vpbroadcastd 4+32(%rsp), %xmm9 + vpbroadcastd 8+32(%rsp), %xmm10 + vpbroadcastd 12+32(%rsp), %xmm11 + vmovdqa 128(%rsp), %xmm12 + vmovdqa 160(%rsp), %xmm13 + vpbroadcastd 8+48(%rsp), %xmm14 + vpbroadcastd 12+48(%rsp), %xmm15 + vpaddd %xmm8, %xmm0, %xmm0 + vpaddd %xmm9, %xmm1, %xmm1 + vpaddd %xmm10, %xmm2, %xmm2 + vpaddd %xmm11, %xmm3, %xmm3 + vpaddd %xmm12, %xmm4, %xmm4 + vpaddd %xmm13, %xmm5, %xmm5 + vpaddd %xmm14, %xmm6, %xmm6 + vpaddd %xmm15, %xmm7, %xmm7 + vpunpckldq %xmm1, %xmm0, %xmm8 + vpunpckldq %xmm3, %xmm2, %xmm9 + vpunpckhdq %xmm1, %xmm0, %xmm12 + vpunpckhdq %xmm3, %xmm2, %xmm13 + vpunpckldq %xmm5, %xmm4, %xmm10 + vpunpckldq %xmm7, %xmm6, %xmm11 + vpunpckhdq %xmm5, %xmm4, %xmm14 + vpunpckhdq %xmm7, %xmm6, %xmm15 + vpunpcklqdq %xmm9, %xmm8, %xmm0 + vpunpcklqdq %xmm11, %xmm10, %xmm1 + vpunpckhqdq %xmm9, %xmm8, %xmm2 + vpunpckhqdq %xmm11, %xmm10, %xmm3 + vpunpcklqdq %xmm13, %xmm12, %xmm4 + vpunpcklqdq %xmm15, %xmm14, %xmm5 + vpunpckhqdq %xmm13, %xmm12, %xmm6 + vpunpckhqdq %xmm15, %xmm14, %xmm7 + vmovdqu %xmm0, 32(%rdx) + vmovdqu %xmm1, 48(%rdx) + vmovdqu %xmm2, 96(%rdx) + vmovdqu %xmm3, 112(%rdx) + vmovdqu %xmm4, 160(%rdx) + vmovdqu %xmm5, 176(%rdx) + vmovdqu %xmm6, 224(%rdx) + vmovdqu %xmm7, 240(%rdx) +.Lchacha_blocks_avx2_mainloop2_cont: + addq $256, %rdx + subq $256, %rcx + cmp $256, %rcx + jae .Lchacha_blocks_avx2_atleast256 +.Lchacha_blocks_avx2_below256_fixup: + vmovdqa 448(%rsp), %xmm6 + vmovdqa 480(%rsp), %xmm7 + vmovdqa 0(%rsp), %xmm8 + vmovdqa 16(%rsp), %xmm9 + vmovdqa 32(%rsp), %xmm10 + vmovdqa 48(%rsp), %xmm11 + movq $1, %r9 +.Lchacha_blocks_avx2_below256: + vmovq %r9, %xmm5 + andq %rcx, %rcx + jz .Lchacha_blocks_avx2_done + cmpq $64, %rcx + jae .Lchacha_blocks_avx2_above63 + movq %rdx, %r9 + andq %rsi, %rsi + jz .Lchacha_blocks_avx2_noinput3 + movq %rcx, %r10 + movq %rsp, %rdx + addq %r10, %rsi + addq %r10, %rdx + negq %r10 +.Lchacha_blocks_avx2_copyinput: + movb (%rsi, %r10), %al + movb %al, (%rdx, %r10) + incq %r10 + jnz .Lchacha_blocks_avx2_copyinput + movq %rsp, %rsi +.Lchacha_blocks_avx2_noinput3: + movq %rsp, %rdx +.Lchacha_blocks_avx2_above63: + vmovdqa %xmm8, %xmm0 + vmovdqa %xmm9, %xmm1 + vmovdqa %xmm10, %xmm2 + vmovdqa %xmm11, %xmm3 + movq 64(%rsp), %rax 
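+	/* (Annotation added in editing, not part of the original patch.)
+	 * Tail path: %xmm0..%xmm3 hold a single ChaCha state, one row per
+	 * register, and %rax holds the round count (20) just loaded from
+	 * 64(%rsp).  Each iteration of the loop below performs one column
+	 * round and one diagonal round, hence the subq $2. */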
+.Lchacha_blocks_avx2_mainloop3: + vpaddd %xmm0, %xmm1, %xmm0 + vpxor %xmm3, %xmm0, %xmm3 + vpshufb %xmm6, %xmm3, %xmm3 + vpaddd %xmm2, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm1 + vpslld $12, %xmm1, %xmm4 + vpsrld $20, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm1 + vpaddd %xmm0, %xmm1, %xmm0 + vpxor %xmm3, %xmm0, %xmm3 + vpshufb %xmm7, %xmm3, %xmm3 + vpshufd $0x93, %xmm0, %xmm0 + vpaddd %xmm2, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm1 + vpshufd $0x39, %xmm2, %xmm2 + vpslld $7, %xmm1, %xmm4 + vpsrld $25, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm1 + vpaddd %xmm0, %xmm1, %xmm0 + vpxor %xmm3, %xmm0, %xmm3 + vpshufb %xmm6, %xmm3, %xmm3 + vpaddd %xmm2, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm1 + vpslld $12, %xmm1, %xmm4 + vpsrld $20, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm1 + vpaddd %xmm0, %xmm1, %xmm0 + vpxor %xmm3, %xmm0, %xmm3 + vpshufb %xmm7, %xmm3, %xmm3 + vpshufd $0x39, %xmm0, %xmm0 + vpaddd %xmm2, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm1 + vpshufd $0x93, %xmm2, %xmm2 + vpslld $7, %xmm1, %xmm4 + vpsrld $25, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm1 + subq $2, %rax + jnz .Lchacha_blocks_avx2_mainloop3 + vpaddd %xmm0, %xmm8, %xmm0 + vpaddd %xmm1, %xmm9, %xmm1 + vpaddd %xmm2, %xmm10, %xmm2 + vpaddd %xmm3, %xmm11, %xmm3 + andq %rsi, %rsi + jz .Lchacha_blocks_avx2_noinput4 + vpxor 0(%rsi), %xmm0, %xmm0 + vpxor 16(%rsi), %xmm1, %xmm1 + vpxor 32(%rsi), %xmm2, %xmm2 + vpxor 48(%rsi), %xmm3, %xmm3 + addq $64, %rsi +.Lchacha_blocks_avx2_noinput4: + vmovdqu %xmm0, 0(%rdx) + vmovdqu %xmm1, 16(%rdx) + vmovdqu %xmm2, 32(%rdx) + vmovdqu %xmm3, 48(%rdx) + vpaddq %xmm11, %xmm5, %xmm11 + cmpq $64, %rcx + jbe .Lchacha_blocks_avx2_mainloop3_finishup + addq $64, %rdx + subq $64, %rcx + jmp .Lchacha_blocks_avx2_below256 +.Lchacha_blocks_avx2_mainloop3_finishup: + cmpq $64, %rcx + je .Lchacha_blocks_avx2_done + addq %rcx, %r9 + addq %rcx, %rdx + negq %rcx +.Lchacha_blocks_avx2_copyoutput: + movb (%rdx, %rcx), %al + movb %al, (%r9, %rcx) + incq %rcx + jnz .Lchacha_blocks_avx2_copyoutput +.Lchacha_blocks_avx2_done: + vmovdqu %xmm11, 48(%rdi) + movq %rbp, %rsp + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + vzeroall + movl $(63 + 512), %eax + ret +.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks; + +.data +.align 16 +.LC: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ + +#endif /*defined(USE_CHACHA20)*/ +#endif /*__x86_64*/ + diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 1ffa458..7d85719 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -47,6 +47,13 @@ # define USE_SSSE3 1 #endif +/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ +#undef USE_AVX2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) +# define USE_AVX2 1 +#endif + #define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */ #define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. 
*/ @@ -70,6 +77,9 @@ typedef struct CHACHA20_context_s #ifdef USE_SSSE3 unsigned int use_ssse3:1; #endif +#ifdef USE_AVX2 + unsigned int use_avx2:1; +#endif chacha20_core_t core; } CHACHA20_context_t; @@ -89,6 +99,21 @@ chacha20_core_ssse3 (u32 * dst, CHACHA20_context_t * ctx) #endif /* USE_SSSE3 */ +#ifdef USE_AVX2 + +unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, + const byte *in, + byte *out, + size_t bytes); + +static unsigned int +chacha20_core_avx2 (u32 * dst, CHACHA20_context_t * ctx) +{ + return _gcry_chacha20_amd64_avx2_blocks(ctx->input, NULL, (byte *)dst, 64); +} + +#endif /* USE_AVX2 */ + static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); @@ -312,6 +337,12 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, ctx->core = chacha20_core_ssse3; #endif +#ifdef USE_AVX2 + ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0; + if (ctx->use_avx2) + ctx->core = chacha20_core_avx2; +#endif + (void)features; chacha20_keysetup (ctx, key, keylen); @@ -387,6 +418,11 @@ chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, if (0) { } +#ifdef USE_AVX2 + else if (ctx->use_avx2) + burn = _gcry_chacha20_amd64_avx2_blocks(ctx->input, inbuf, outbuf, + bytes); +#endif #ifdef USE_SSSE3 else if (ctx->use_ssse3) burn = _gcry_chacha20_amd64_ssse3_blocks(ctx->input, inbuf, outbuf, @@ -440,7 +476,7 @@ selftest (void) { CHACHA20_context_t ctx; byte scratch[127 + 1]; - byte buf[256 + 64 + 4]; + byte buf[512 + 64 + 4]; int i; /* From draft-strombergson-chacha-test-vectors */ diff --git a/configure.ac b/configure.ac index 7d0dc88..1da37dd 100644 --- a/configure.ac +++ b/configure.ac @@ -1815,6 +1815,7 @@ if test "$found" = "1" ; then x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo" ;; esac fi From jussi.kivilinna at iki.fi Sat May 3 19:26:04 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 03 May 2014 20:26:04 +0300 Subject: [PATCH 2/3] chacha20: add SSSE3 assembly implementation In-Reply-To: <20140503172559.23791.31470.stgit@localhost6.localdomain6> References: <20140503172559.23791.31470.stgit@localhost6.localdomain6> Message-ID: <20140503172604.23791.53356.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'chacha20-ssse3-amd64.S'. * cipher/chacha20-ssse3-amd64.S: New. * cipher/chacha20.c (USE_SSSE3): New macro. (CHACHA20_context_s): Add 'use_ssse3'. [USE_SSSE3] (_gcry_chacha20_amd64_ssse3_blocks) (chacha20_core_ssse3): New. (chacha20_do_setkey): Check HW features for SSSE3. (chacha20_do_encrypt_stream) [USE_SSSE3]: Use SSSE3 optimized implementation, if enabled. * configure.ac [host=x86-64]: Add 'chacha20-ssse3-amd64.lo'. -- Add SSSE3 optimized implementation for ChaCha20. Based on implementation by Andrew Moon. 
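For orientation (an editorial sketch, not part of the patch): once the
series is applied, ChaCha20 is reached through the ordinary libgcrypt
cipher API, and the SSSE3/AVX2 fast paths are selected transparently
from the hardware-feature flags checked in chacha20_do_setkey.
Library initialization and error handling are omitted; key and nonce
values are placeholders.

  #include <gcrypt.h>

  void
  chacha20_demo (void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[32] = { 0 };   /* 256-bit key (placeholder) */
    unsigned char nonce[8] = { 0 };  /* 64-bit IV; a 96-bit IV is also accepted */
    unsigned char buf[64] = { 0 };   /* plaintext; encrypted in place */

    gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_STREAM, 0);
    gcry_cipher_setkey (hd, key, sizeof key);
    gcry_cipher_setiv (hd, nonce, sizeof nonce);
    gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
    gcry_cipher_close (hd);
  }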
Before (Intel Haswell):

 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.16 ns/B     441.4 MiB/s      6.91 c/B
     STREAM dec |      2.16 ns/B     441.4 MiB/s      6.91 c/B

After:

 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.742 ns/B    1284.8 MiB/s      2.38 c/B
     STREAM dec |     0.741 ns/B    1286.5 MiB/s      2.37 c/B

Signed-off-by: Jussi Kivilinna
---
 cipher/Makefile.am            |    2
 cipher/chacha20-ssse3-amd64.S |  610 +++++++++++++++++++++++++++++++++++++++++
 cipher/chacha20.c             |   48 +++
 configure.ac                  |    7
 4 files changed, 665 insertions(+), 2 deletions(-)
 create mode 100644 cipher/chacha20-ssse3-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index bc7959a..27ca7ac 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -59,7 +59,7 @@ EXTRA_libcipher_la_SOURCES = \
 arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
-chacha20.c \
+chacha20.c chacha20-ssse3-amd64.S \
 crc.c \
 des.c des-amd64.S \
 dsa.c \
diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S
new file mode 100644
index 0000000..aaa7e5b
--- /dev/null
+++ b/cipher/chacha20-ssse3-amd64.S
@@ -0,0 +1,610 @@
+/* chacha20-ssse3-amd64.S - AMD64/SSSE3 implementation of ChaCha20
+ *
+ * Copyright (C) 2014 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/chacha-opt + */ + +#ifdef __x86_64__ +#include + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20 + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +.text + +.align 8 +.globl _gcry_chacha20_amd64_ssse3_blocks +.type _gcry_chacha20_amd64_ssse3_blocks, at function; +_gcry_chacha20_amd64_ssse3_blocks: +.Lchacha_blocks_ssse3_local: + pushq %rbx + pushq %rbp + movq %rsp, %rbp + andq $~63, %rsp + subq $512, %rsp + leaq .LC RIP, %rax + movdqa 0(%rax), %xmm6 + movdqa 16(%rax), %xmm7 + movdqu 0(%rdi), %xmm8 + movdqu 16(%rdi), %xmm9 + movdqu 32(%rdi), %xmm10 + movdqu 48(%rdi), %xmm11 + movl $20, %eax + movq $1, %r9 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movdqa %xmm6, 80(%rsp) + movdqa %xmm7, 96(%rsp) + movq %rax, 64(%rsp) + cmpq $256, %rcx + jb .Lchacha_blocks_ssse3_below256 + pshufd $0x00, %xmm8, %xmm0 + pshufd $0x55, %xmm8, %xmm1 + pshufd $0xaa, %xmm8, %xmm2 + pshufd $0xff, %xmm8, %xmm3 + movdqa %xmm0, 128(%rsp) + movdqa %xmm1, 144(%rsp) + movdqa %xmm2, 160(%rsp) + movdqa %xmm3, 176(%rsp) + pshufd $0x00, %xmm9, %xmm0 + pshufd $0x55, %xmm9, %xmm1 + pshufd $0xaa, %xmm9, %xmm2 + pshufd $0xff, %xmm9, %xmm3 + movdqa %xmm0, 192(%rsp) + movdqa %xmm1, 208(%rsp) + movdqa %xmm2, 224(%rsp) + movdqa %xmm3, 240(%rsp) + pshufd $0x00, %xmm10, %xmm0 + pshufd $0x55, %xmm10, %xmm1 + pshufd $0xaa, %xmm10, %xmm2 + pshufd $0xff, %xmm10, %xmm3 + movdqa %xmm0, 256(%rsp) + movdqa %xmm1, 272(%rsp) + movdqa %xmm2, 288(%rsp) + movdqa %xmm3, 304(%rsp) + pshufd $0xaa, %xmm11, %xmm0 + pshufd $0xff, %xmm11, %xmm1 + movdqa %xmm0, 352(%rsp) + movdqa %xmm1, 368(%rsp) + jmp .Lchacha_blocks_ssse3_atleast256 +.p2align 6,,63 + # align to 4 mod 64 + nop;nop;nop;nop; +.Lchacha_blocks_ssse3_atleast256: + movq 48(%rsp), %rax + leaq 1(%rax), %r8 + leaq 2(%rax), %r9 + leaq 3(%rax), %r10 + leaq 4(%rax), %rbx + movl %eax, 320(%rsp) + movl %r8d, 4+320(%rsp) + movl %r9d, 8+320(%rsp) + movl %r10d, 12+320(%rsp) + shrq $32, %rax + shrq $32, %r8 + shrq $32, %r9 + shrq $32, %r10 + movl %eax, 336(%rsp) + movl %r8d, 4+336(%rsp) + movl %r9d, 8+336(%rsp) + movl %r10d, 12+336(%rsp) + movq %rbx, 48(%rsp) + movq 64(%rsp), %rax + movdqa 128(%rsp), %xmm0 + movdqa 144(%rsp), %xmm1 + movdqa 160(%rsp), %xmm2 + movdqa 176(%rsp), %xmm3 + movdqa 192(%rsp), %xmm4 + movdqa 208(%rsp), %xmm5 + movdqa 224(%rsp), %xmm6 + movdqa 240(%rsp), %xmm7 + movdqa 256(%rsp), %xmm8 + movdqa 272(%rsp), %xmm9 + movdqa 288(%rsp), %xmm10 + movdqa 304(%rsp), %xmm11 + movdqa 320(%rsp), %xmm12 + movdqa 336(%rsp), %xmm13 + movdqa 352(%rsp), %xmm14 + movdqa 368(%rsp), %xmm15 +.Lchacha_blocks_ssse3_mainloop1: + paddd %xmm4, %xmm0 + paddd %xmm5, %xmm1 + pxor %xmm0, %xmm12 + pxor %xmm1, %xmm13 + paddd %xmm6, %xmm2 + paddd %xmm7, %xmm3 + pxor %xmm2, %xmm14 + pxor %xmm3, %xmm15 + pshufb 80(%rsp), %xmm12 + pshufb 80(%rsp), %xmm13 + paddd %xmm12, %xmm8 + paddd %xmm13, %xmm9 + pshufb 80(%rsp), %xmm14 + pshufb 80(%rsp), %xmm15 + paddd %xmm14, %xmm10 + paddd %xmm15, %xmm11 + movdqa %xmm12, 112(%rsp) + pxor %xmm8, %xmm4 + pxor %xmm9, %xmm5 + movdqa %xmm4, %xmm12 + pslld $ 12, %xmm4 + psrld $20, %xmm12 + pxor %xmm12, %xmm4 + movdqa %xmm5, %xmm12 + pslld $ 12, %xmm5 + psrld $20, %xmm12 + pxor %xmm12, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + movdqa %xmm6, %xmm12 + pslld $ 12, %xmm6 + psrld $20, %xmm12 + pxor %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + 
pslld $ 12, %xmm7 + psrld $20, %xmm12 + pxor %xmm12, %xmm7 + movdqa 112(%rsp), %xmm12 + paddd %xmm4, %xmm0 + paddd %xmm5, %xmm1 + pxor %xmm0, %xmm12 + pxor %xmm1, %xmm13 + paddd %xmm6, %xmm2 + paddd %xmm7, %xmm3 + pxor %xmm2, %xmm14 + pxor %xmm3, %xmm15 + pshufb 96(%rsp), %xmm12 + pshufb 96(%rsp), %xmm13 + paddd %xmm12, %xmm8 + paddd %xmm13, %xmm9 + pshufb 96(%rsp), %xmm14 + pshufb 96(%rsp), %xmm15 + paddd %xmm14, %xmm10 + paddd %xmm15, %xmm11 + movdqa %xmm12, 112(%rsp) + pxor %xmm8, %xmm4 + pxor %xmm9, %xmm5 + movdqa %xmm4, %xmm12 + pslld $ 7, %xmm4 + psrld $25, %xmm12 + pxor %xmm12, %xmm4 + movdqa %xmm5, %xmm12 + pslld $ 7, %xmm5 + psrld $25, %xmm12 + pxor %xmm12, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + movdqa %xmm6, %xmm12 + pslld $ 7, %xmm6 + psrld $25, %xmm12 + pxor %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + pslld $ 7, %xmm7 + psrld $25, %xmm12 + pxor %xmm12, %xmm7 + movdqa 112(%rsp), %xmm12 + paddd %xmm5, %xmm0 + paddd %xmm6, %xmm1 + pxor %xmm0, %xmm15 + pxor %xmm1, %xmm12 + paddd %xmm7, %xmm2 + paddd %xmm4, %xmm3 + pxor %xmm2, %xmm13 + pxor %xmm3, %xmm14 + pshufb 80(%rsp), %xmm15 + pshufb 80(%rsp), %xmm12 + paddd %xmm15, %xmm10 + paddd %xmm12, %xmm11 + pshufb 80(%rsp), %xmm13 + pshufb 80(%rsp), %xmm14 + paddd %xmm13, %xmm8 + paddd %xmm14, %xmm9 + movdqa %xmm15, 112(%rsp) + pxor %xmm10, %xmm5 + pxor %xmm11, %xmm6 + movdqa %xmm5, %xmm15 + pslld $ 12, %xmm5 + psrld $20, %xmm15 + pxor %xmm15, %xmm5 + movdqa %xmm6, %xmm15 + pslld $ 12, %xmm6 + psrld $20, %xmm15 + pxor %xmm15, %xmm6 + pxor %xmm8, %xmm7 + pxor %xmm9, %xmm4 + movdqa %xmm7, %xmm15 + pslld $ 12, %xmm7 + psrld $20, %xmm15 + pxor %xmm15, %xmm7 + movdqa %xmm4, %xmm15 + pslld $ 12, %xmm4 + psrld $20, %xmm15 + pxor %xmm15, %xmm4 + movdqa 112(%rsp), %xmm15 + paddd %xmm5, %xmm0 + paddd %xmm6, %xmm1 + pxor %xmm0, %xmm15 + pxor %xmm1, %xmm12 + paddd %xmm7, %xmm2 + paddd %xmm4, %xmm3 + pxor %xmm2, %xmm13 + pxor %xmm3, %xmm14 + pshufb 96(%rsp), %xmm15 + pshufb 96(%rsp), %xmm12 + paddd %xmm15, %xmm10 + paddd %xmm12, %xmm11 + pshufb 96(%rsp), %xmm13 + pshufb 96(%rsp), %xmm14 + paddd %xmm13, %xmm8 + paddd %xmm14, %xmm9 + movdqa %xmm15, 112(%rsp) + pxor %xmm10, %xmm5 + pxor %xmm11, %xmm6 + movdqa %xmm5, %xmm15 + pslld $ 7, %xmm5 + psrld $25, %xmm15 + pxor %xmm15, %xmm5 + movdqa %xmm6, %xmm15 + pslld $ 7, %xmm6 + psrld $25, %xmm15 + pxor %xmm15, %xmm6 + pxor %xmm8, %xmm7 + pxor %xmm9, %xmm4 + movdqa %xmm7, %xmm15 + pslld $ 7, %xmm7 + psrld $25, %xmm15 + pxor %xmm15, %xmm7 + movdqa %xmm4, %xmm15 + pslld $ 7, %xmm4 + psrld $25, %xmm15 + pxor %xmm15, %xmm4 + subq $2, %rax + movdqa 112(%rsp), %xmm15 + jnz .Lchacha_blocks_ssse3_mainloop1 + paddd 128(%rsp), %xmm0 + paddd 144(%rsp), %xmm1 + paddd 160(%rsp), %xmm2 + paddd 176(%rsp), %xmm3 + paddd 192(%rsp), %xmm4 + paddd 208(%rsp), %xmm5 + paddd 224(%rsp), %xmm6 + paddd 240(%rsp), %xmm7 + paddd 256(%rsp), %xmm8 + paddd 272(%rsp), %xmm9 + paddd 288(%rsp), %xmm10 + paddd 304(%rsp), %xmm11 + paddd 320(%rsp), %xmm12 + paddd 336(%rsp), %xmm13 + paddd 352(%rsp), %xmm14 + paddd 368(%rsp), %xmm15 + movdqa %xmm8, 384(%rsp) + movdqa %xmm9, 400(%rsp) + movdqa %xmm10, 416(%rsp) + movdqa %xmm11, 432(%rsp) + movdqa %xmm12, 448(%rsp) + movdqa %xmm13, 464(%rsp) + movdqa %xmm14, 480(%rsp) + movdqa %xmm15, 496(%rsp) + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + movdqa 
%xmm0, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm8, %xmm5 + movdqa %xmm10, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm1 + punpcklqdq %xmm6, %xmm3 + punpcklqdq %xmm9, %xmm5 + punpcklqdq %xmm11, %xmm7 + andq %rsi, %rsi + jz .Lchacha_blocks_ssse3_noinput1 + movdqu 0(%rsi), %xmm2 + movdqu 16(%rsi), %xmm6 + movdqu 64(%rsi), %xmm9 + movdqu 80(%rsi), %xmm11 + movdqu 128(%rsi), %xmm12 + movdqu 144(%rsi), %xmm13 + movdqu 192(%rsi), %xmm14 + movdqu 208(%rsi), %xmm15 + pxor %xmm2, %xmm5 + pxor %xmm6, %xmm7 + pxor %xmm9, %xmm8 + pxor %xmm11, %xmm10 + pxor %xmm12, %xmm1 + pxor %xmm13, %xmm3 + pxor %xmm14, %xmm0 + pxor %xmm15, %xmm4 + movdqu %xmm5, 0(%rdx) + movdqu %xmm7, 16(%rdx) + movdqu %xmm8, 64(%rdx) + movdqu %xmm10, 80(%rdx) + movdqu %xmm1, 128(%rdx) + movdqu %xmm3, 144(%rdx) + movdqu %xmm0, 192(%rdx) + movdqu %xmm4, 208(%rdx) + movdqa 384(%rsp), %xmm0 + movdqa 400(%rsp), %xmm1 + movdqa 416(%rsp), %xmm2 + movdqa 432(%rsp), %xmm3 + movdqa 448(%rsp), %xmm4 + movdqa 464(%rsp), %xmm5 + movdqa 480(%rsp), %xmm6 + movdqa 496(%rsp), %xmm7 + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + movdqa %xmm8, %xmm1 + movdqa %xmm0, %xmm3 + movdqa %xmm10, %xmm5 + movdqa %xmm4, %xmm7 + punpcklqdq %xmm9, %xmm1 + punpcklqdq %xmm11, %xmm5 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm3 + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm6 + movdqu 96(%rsi), %xmm9 + movdqu 112(%rsi), %xmm11 + movdqu 160(%rsi), %xmm12 + movdqu 176(%rsi), %xmm13 + movdqu 224(%rsi), %xmm14 + movdqu 240(%rsi), %xmm15 + pxor %xmm2, %xmm1 + pxor %xmm6, %xmm5 + pxor %xmm9, %xmm8 + pxor %xmm11, %xmm10 + pxor %xmm12, %xmm3 + pxor %xmm13, %xmm7 + pxor %xmm14, %xmm0 + pxor %xmm15, %xmm4 + movdqu %xmm1, 32(%rdx) + movdqu %xmm5, 48(%rdx) + movdqu %xmm8, 96(%rdx) + movdqu %xmm10, 112(%rdx) + movdqu %xmm3, 160(%rdx) + movdqu %xmm7, 176(%rdx) + movdqu %xmm0, 224(%rdx) + movdqu %xmm4, 240(%rdx) + addq $256, %rsi + jmp .Lchacha_blocks_ssse3_mainloop_cont +.Lchacha_blocks_ssse3_noinput1: + movdqu %xmm5, 0(%rdx) + movdqu %xmm7, 16(%rdx) + movdqu %xmm8, 64(%rdx) + movdqu %xmm10, 80(%rdx) + movdqu %xmm1, 128(%rdx) + movdqu %xmm3, 144(%rdx) + movdqu %xmm0, 192(%rdx) + movdqu %xmm4, 208(%rdx) + movdqa 384(%rsp), %xmm0 + movdqa 400(%rsp), %xmm1 + movdqa 416(%rsp), %xmm2 + movdqa 432(%rsp), %xmm3 + movdqa 448(%rsp), %xmm4 + movdqa 464(%rsp), %xmm5 + movdqa 480(%rsp), %xmm6 + movdqa 496(%rsp), %xmm7 + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + movdqa %xmm8, %xmm1 + movdqa %xmm0, %xmm3 + movdqa %xmm10, %xmm5 + movdqa %xmm4, %xmm7 + punpcklqdq %xmm9, %xmm1 + punpcklqdq %xmm11, %xmm5 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm3 + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + movdqu %xmm1, 32(%rdx) + movdqu %xmm5, 48(%rdx) + movdqu %xmm8, 96(%rdx) + movdqu %xmm10, 112(%rdx) + movdqu %xmm3, 160(%rdx) + movdqu %xmm7, 
176(%rdx) + movdqu %xmm0, 224(%rdx) + movdqu %xmm4, 240(%rdx) +.Lchacha_blocks_ssse3_mainloop_cont: + addq $256, %rdx + subq $256, %rcx + cmp $256, %rcx + jae .Lchacha_blocks_ssse3_atleast256 + movdqa 80(%rsp), %xmm6 + movdqa 96(%rsp), %xmm7 + movdqa 0(%rsp), %xmm8 + movdqa 16(%rsp), %xmm9 + movdqa 32(%rsp), %xmm10 + movdqa 48(%rsp), %xmm11 + movq $1, %r9 +.Lchacha_blocks_ssse3_below256: + movq %r9, %xmm5 + andq %rcx, %rcx + jz .Lchacha_blocks_ssse3_done + cmpq $64, %rcx + jae .Lchacha_blocks_ssse3_above63 + movq %rdx, %r9 + andq %rsi, %rsi + jz .Lchacha_blocks_ssse3_noinput2 + movq %rcx, %r10 + movq %rsp, %rdx + addq %r10, %rsi + addq %r10, %rdx + negq %r10 +.Lchacha_blocks_ssse3_copyinput: + movb (%rsi, %r10), %al + movb %al, (%rdx, %r10) + incq %r10 + jnz .Lchacha_blocks_ssse3_copyinput + movq %rsp, %rsi +.Lchacha_blocks_ssse3_noinput2: + movq %rsp, %rdx +.Lchacha_blocks_ssse3_above63: + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + movq 64(%rsp), %rax +.Lchacha_blocks_ssse3_mainloop2: + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshufb %xmm6, %xmm3 + paddd %xmm3, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm1, %xmm4 + pslld $12, %xmm4 + psrld $20, %xmm1 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshufb %xmm7, %xmm3 + pshufd $0x93, %xmm0, %xmm0 + paddd %xmm3, %xmm2 + pshufd $0x4e, %xmm3, %xmm3 + pxor %xmm2, %xmm1 + pshufd $0x39, %xmm2, %xmm2 + movdqa %xmm1, %xmm4 + pslld $7, %xmm4 + psrld $25, %xmm1 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshufb %xmm6, %xmm3 + paddd %xmm3, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm1, %xmm4 + pslld $12, %xmm4 + psrld $20, %xmm1 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshufb %xmm7, %xmm3 + pshufd $0x39, %xmm0, %xmm0 + paddd %xmm3, %xmm2 + pshufd $0x4e, %xmm3, %xmm3 + pxor %xmm2, %xmm1 + pshufd $0x93, %xmm2, %xmm2 + movdqa %xmm1, %xmm4 + pslld $7, %xmm4 + psrld $25, %xmm1 + pxor %xmm4, %xmm1 + subq $2, %rax + jnz .Lchacha_blocks_ssse3_mainloop2 + paddd %xmm8, %xmm0 + paddd %xmm9, %xmm1 + paddd %xmm10, %xmm2 + paddd %xmm11, %xmm3 + andq %rsi, %rsi + jz .Lchacha_blocks_ssse3_noinput3 + movdqu 0(%rsi), %xmm12 + movdqu 16(%rsi), %xmm13 + movdqu 32(%rsi), %xmm14 + movdqu 48(%rsi), %xmm15 + pxor %xmm12, %xmm0 + pxor %xmm13, %xmm1 + pxor %xmm14, %xmm2 + pxor %xmm15, %xmm3 + addq $64, %rsi +.Lchacha_blocks_ssse3_noinput3: + movdqu %xmm0, 0(%rdx) + movdqu %xmm1, 16(%rdx) + movdqu %xmm2, 32(%rdx) + movdqu %xmm3, 48(%rdx) + paddq %xmm5, %xmm11 + cmpq $64, %rcx + jbe .Lchacha_blocks_ssse3_mainloop2_finishup + addq $64, %rdx + subq $64, %rcx + jmp .Lchacha_blocks_ssse3_below256 +.Lchacha_blocks_ssse3_mainloop2_finishup: + cmpq $64, %rcx + je .Lchacha_blocks_ssse3_done + addq %rcx, %r9 + addq %rcx, %rdx + negq %rcx +.Lchacha_blocks_ssse3_copyoutput: + movb (%rdx, %rcx), %al + movb %al, (%r9, %rcx) + incq %rcx + jnz .Lchacha_blocks_ssse3_copyoutput +.Lchacha_blocks_ssse3_done: + movdqu %xmm11, 48(%rdi) + movq %rbp, %rsp + popq %rbp + popq %rbx + movl $(63 + 512 + 16), %eax + ret +.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks; + +.data +.align 16; +.LC: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ + +#endif /*defined(USE_CHACHA20)*/ +#endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index f250789..1ffa458 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -40,6 +40,14 @@ #include "bufhelp.h" +/* USE_SSSE3 indicates whether 
to compile with Intel SSSE3 code. */ +#undef USE_SSSE3 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) +# define USE_SSSE3 1 +#endif + + #define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */ #define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */ #define CHACHA20_BLOCK_SIZE 64 /* Bytes. */ @@ -59,10 +67,29 @@ typedef struct CHACHA20_context_s u32 input[CHACHA20_INPUT_LENGTH]; u32 pad[CHACHA20_INPUT_LENGTH]; unsigned int unused; /* bytes in the pad. */ +#ifdef USE_SSSE3 + unsigned int use_ssse3:1; +#endif chacha20_core_t core; } CHACHA20_context_t; +#ifdef USE_SSSE3 + +unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, + const byte *in, + byte *out, + size_t bytes); + +static unsigned int +chacha20_core_ssse3 (u32 * dst, CHACHA20_context_t * ctx) +{ + return _gcry_chacha20_amd64_ssse3_blocks(ctx->input, NULL, (byte *)dst, 64); +} + +#endif /* USE_SSSE3 */ + + static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); @@ -262,6 +289,7 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, { static int initialized; static const char *selftest_failed; + unsigned int features = _gcry_get_hw_features (); if (!initialized) { @@ -278,6 +306,14 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, ctx->core = chacha20_core; +#ifdef USE_SSSE3 + ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if (ctx->use_ssse3) + ctx->core = chacha20_core_ssse3; +#endif + + (void)features; + chacha20_keysetup (ctx, key, keylen); /* We default to a zero nonce. */ @@ -348,7 +384,17 @@ chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, { size_t nblocks = length / CHACHA20_BLOCK_SIZE; size_t bytes = nblocks * CHACHA20_BLOCK_SIZE; - burn = chacha20_blocks(ctx->input, inbuf, outbuf, bytes); + + if (0) + { } +#ifdef USE_SSSE3 + else if (ctx->use_ssse3) + burn = _gcry_chacha20_amd64_ssse3_blocks(ctx->input, inbuf, outbuf, + bytes); +#endif + else + burn = chacha20_blocks(ctx->input, inbuf, outbuf, bytes); + length -= bytes; outbuf += bytes; inbuf += bytes; diff --git a/configure.ac b/configure.ac index 724c4ee..7d0dc88 100644 --- a/configure.ac +++ b/configure.ac @@ -1810,6 +1810,13 @@ LIST_MEMBER(chacha20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo" + ;; + esac fi LIST_MEMBER(dsa, $enabled_pubkey_ciphers) From cvs at cvs.gnupg.org Wed May 7 13:28:27 2014 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Wed, 07 May 2014 13:28:27 +0200 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-65-gfc6ff6f Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via fc6ff6f73a51bcbbbb3757dc1386da40aa3ae75d (commit) from a79c4ad7c56ee4410f17beb73eeb58b0dd36bfc6 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit fc6ff6f73a51bcbbbb3757dc1386da40aa3ae75d Author: Werner Koch Date: Wed May 7 11:05:36 2014 +0200 Bump LT version. * configure.ac: Bumb LT version to C21/A1/R0. 
--
This is to avoid conflicts with the 1.6 series.  Note that if we add
a new interface to 1.6 we would need to bump age again.

diff --git a/configure.ac b/configure.ac
index 79f79ef..6539a96 100644
--- a/configure.ac
+++ b/configure.ac
@@ -51,11 +51,12 @@ m4_define([mym4_full_version],[mym4_version[]mym4_betastring])
 AC_INIT([libgcrypt],[mym4_full_version],[http://bugs.gnupg.org])

 # LT Version numbers, remember to change them just *before* a release.
+#   ====== NOTE: Already bumped to 21/1/0 for the 1.7.0 release. ====
 # (Interfaces removed:    CURRENT++, AGE=0, REVISION=0)
 # (Interfaces added:      CURRENT++, AGE++, REVISION=0)
 # (No interfaces changed:                   REVISION++)
-LIBGCRYPT_LT_CURRENT=20
-LIBGCRYPT_LT_AGE=0
+LIBGCRYPT_LT_CURRENT=21
+LIBGCRYPT_LT_AGE=1
 LIBGCRYPT_LT_REVISION=0

-----------------------------------------------------------------------

Summary of changes:
 configure.ac | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

hooks/post-receive
--
The GNU crypto library
http://git.gnupg.org

From gniibe at fsij.org Wed May 7 13:31:19 2014
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Wed, 07 May 2014 20:31:19 +0900
Subject: [PATCH] ECC: (flags param) is only for key generation
In-Reply-To: <1398393442.1511.7.camel@cfw2.gniibe.org>
References: <1398386360.1511.3.camel@cfw2.gniibe.org>
	<1398393442.1511.7.camel@cfw2.gniibe.org>
Message-ID: <1399462279.3518.1.camel@latx1.gniibe.org>

On 2014-04-25 at 11:37 +0900, NIIBE Yutaka wrote:
> On 2014-04-25 at 09:39 +0900, NIIBE Yutaka wrote:
> > But, I think that "(flags param)" only makes sense for key generation.
>
> This is revised patch, full version.

I checked the history of the change.

    commit ed45fd2e60c88e2f005282e6eadd018b59dcf65b
    Author: Werner Koch
    Date:   Fri Nov 8 09:53:32 2013 +0100

        ecc: Make "noparam" the default and replace by "param".

    commit ba892a0a874c8b2a83dbf0940608cd7e2911ce01
    Author: Werner Koch
    Date:   Fri Oct 25 15:44:03 2013 +0200

        ecc: Add flags "noparam" and "comp".

It seems that it's intentional.  Let me confirm that

    (flags param)

is required for key when we don't use curvename.  Then,

(1) We need to update the code which uses ECC to add (flags param)
    or use curvename.

(2) We need to modify the code of libgcrypt itself to get flags of
    param from KEYPARMS.  Specifically, ecc_sign and ecc_verify
    should be modified.  Currently, ecc_sign and ecc_verify don't get
    flags from KEYPARMS, but only from DATA.
--

From wk at gnupg.org Wed May 7 17:39:42 2014
From: wk at gnupg.org (Werner Koch)
Date: Wed, 07 May 2014 17:39:42 +0200
Subject: [PATCH] ECC sign & verify
In-Reply-To: <1398386360.1511.3.camel@cfw2.gniibe.org> (NIIBE Yutaka's message of "Fri, 25 Apr 2014 09:39:20 +0900")
References: <1398386360.1511.3.camel@cfw2.gniibe.org>
Message-ID: <87ppjp4myp.fsf@vigenere.g10code.de>

On Fri, 25 Apr 2014 02:39, gniibe at fsij.org said:

> But, I think that "(flags param)" only makes sense for key generation.

No.  It must also be used with the other functions if you want to
override parameters which are derived from the curve name.  Without
that flag the code does not try to parse extra parameters:

  @item param
  @cindex param
  For ECC key generation also return the domain parameters.  For ECC
  signing and verification override default parameters by provided
  domain parameters of the public or private key.

Thus for those functions the use of "param" requires a few more CPU
cycles.
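Concretely (an illustrative sketch, not from the original mail; hex
values elided, parameter names as used by Libgcrypt ECC
S-expressions): a key given by curve name carries no domain
parameters,

  (public-key
    (ecc
      (curve "NIST P-256")
      (q #04...#)))

while a key that is to override the built-in parameters must carry
them explicitly and announce that with the flag:

  (public-key
    (ecc
      (flags param)
      (p #...#) (a #...#) (b #...#)
      (g #04...#) (n #...#)
      (q #04...#)))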
Shalom-Salam,

   Werner

--
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.

From wk at gnupg.org Wed May 7 17:44:31 2014
From: wk at gnupg.org (Werner Koch)
Date: Wed, 07 May 2014 17:44:31 +0200
Subject: [PATCH] ECC: (flags param) is only for key generation
In-Reply-To: <1399462279.3518.1.camel@latx1.gniibe.org> (NIIBE Yutaka's message of "Wed, 07 May 2014 20:31:19 +0900")
References: <1398386360.1511.3.camel@cfw2.gniibe.org>
	<1398393442.1511.7.camel@cfw2.gniibe.org>
	<1399462279.3518.1.camel@latx1.gniibe.org>
Message-ID: <87lhud4mqo.fsf@vigenere.g10code.de>

On Wed, 7 May 2014 13:31, gniibe at fsij.org said:

> It seems that it's intentional.

Right.  I looked at code using ECC functions of Libgcrypt and found
that this change won't break too much.  And if it breaks something it
will help to clean up that code.

> Let me confirm that
>
>     (flags param)
>
> is required for key when we don't use curvename.

Yes.  But you should use curvename - it is much more efficient.

> (1) We need to update the code which uses ECC to add (flags param)
>     or use curvename.

Right.  I fixed GnuPG today.

> (2) We need to modify the code of libgcrypt itself to get flags of
>     param from KEYPARMS.  Specifically, ecc_sign and ecc_verify
>     should be modified.  Currently, ecc_sign and ecc_verify don't
>     get flags from KEYPARMS, but only from DATA.

That is indeed a bug.  The param flag does not make sense in DATA.

Salam-Shalom,

   Werner

--
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.

From cvs at cvs.gnupg.org Fri May 9 13:22:36 2014
From: cvs at cvs.gnupg.org (by Werner Koch)
Date: Fri, 09 May 2014 13:22:36 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-67-g246b7aa
Message-ID:

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  246b7aaae1ee459f440260bbc4ec2c01c5dc3362 (commit)
       via  42f097486eb1ab92f30fdab56865eec4af685def (commit)
      from  fc6ff6f73a51bcbbbb3757dc1386da40aa3ae75d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 246b7aaae1ee459f440260bbc4ec2c01c5dc3362
Author: Werner Koch
Date:   Fri May 9 12:35:15 2014 +0200

    mpi: Fix a subtle bug setting spurious bits within mpi_set_bit.

    * mpi/mpi-bit.c (_gcry_mpi_set_bit, _gcry_mpi_set_highbit): Clear
    allocated but not used bits before resizing.
    * tests/t-mpi-bit.c (set_bit_with_resize): New.

    --
    Reported-by: Martin Sewelies.

    This bug is probably with us for many years.  Probably due to
    different memory allocation patterns, it first revealed itself with
    1.6.  It could be the reason for other heisenbugs.
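For illustration (not part of the commit; it mirrors the
set_bit_with_resize test added below), the reported failure mode at
the API level was roughly:

  gcry_mpi_t a = gcry_mpi_new (1536);  /* allocates spare limbs */
  gcry_mpi_set_bit (a, 1536);          /* grows nlimbs into the spare,
                                          possibly stale, limbs */
  /* Before this fix, bits below 1536 could spuriously test as set. */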
Signed-off-by: Werner Koch diff --git a/mpi/mpi-bit.c b/mpi/mpi-bit.c index fcafda0..e217040 100644 --- a/mpi/mpi-bit.c +++ b/mpi/mpi-bit.c @@ -116,7 +116,7 @@ _gcry_mpi_test_bit( gcry_mpi_t a, unsigned int n ) void _gcry_mpi_set_bit( gcry_mpi_t a, unsigned int n ) { - unsigned int limbno, bitno; + unsigned int i, limbno, bitno; if (mpi_is_immutable (a)) { @@ -129,6 +129,8 @@ _gcry_mpi_set_bit( gcry_mpi_t a, unsigned int n ) if ( limbno >= a->nlimbs ) { + for (i=a->nlimbs; i < a->alloced; i++) + a->d[i] = 0; mpi_resize (a, limbno+1 ); a->nlimbs = limbno+1; } @@ -141,7 +143,7 @@ _gcry_mpi_set_bit( gcry_mpi_t a, unsigned int n ) void _gcry_mpi_set_highbit( gcry_mpi_t a, unsigned int n ) { - unsigned int limbno, bitno; + unsigned int i, limbno, bitno; if (mpi_is_immutable (a)) { @@ -154,6 +156,8 @@ _gcry_mpi_set_highbit( gcry_mpi_t a, unsigned int n ) if ( limbno >= a->nlimbs ) { + for (i=a->nlimbs; i < a->alloced; i++) + a->d[i] = 0; mpi_resize (a, limbno+1 ); a->nlimbs = limbno+1; } diff --git a/tests/t-mpi-bit.c b/tests/t-mpi-bit.c index b1c999e..3d7b793 100644 --- a/tests/t-mpi-bit.c +++ b/tests/t-mpi-bit.c @@ -327,6 +327,58 @@ test_lshift (int pass) } +/* Bug fixed on 2014-05-09: + a = gcry_mpi_new (1523); + gcry_mpi_set_bit (a, 1536); + didn't initialized all limbs in A. */ +static void +set_bit_with_resize (void) +{ + gcry_mpi_t a; + int i; + + wherestr = "set_bit_with_resize"; + show ("checking that set_bit initializes all limbs\n"); + + a = gcry_mpi_new (1536); + gcry_mpi_set_bit (a, 1536); + + if (!gcry_mpi_test_bit (a, 1536)) + fail ("failed to set a bit\n"); + for (i=0; i < 1536; i++) + { + if (gcry_mpi_test_bit (a, i)) + { + fail ("spurious bit detected\n"); + break; + } + } + if (gcry_mpi_test_bit (a, 1537)) + fail ("more bits set than expected\n"); + gcry_mpi_release (a); + + wherestr = "set_highbit_with_resize"; + show ("checking that set_highbit initializes all limbs\n"); + + a = gcry_mpi_new (1536); + gcry_mpi_set_highbit (a, 1536); + + if (!gcry_mpi_test_bit (a, 1536)) + fail ("failed to set a bit\n"); + for (i=0; i < 1536; i++) + { + if (gcry_mpi_test_bit (a, i)) + { + fail ("spurious bit detected\n"); + break; + } + } + if (gcry_mpi_test_bit (a, 1537)) + fail ("more bits set than expected\n"); + gcry_mpi_release (a); +} + + int main (int argc, char **argv) { @@ -356,6 +408,8 @@ main (int argc, char **argv) for (i=0; i < 5; i++) test_lshift (i); /* Run several times due to random initializations. */ + set_bit_with_resize (); + show ("All tests completed. Errors: %d\n", error_count); return error_count ? 1 : 0; } commit 42f097486eb1ab92f30fdab56865eec4af685def Author: Werner Koch Date: Fri May 9 12:11:30 2014 +0200 Comment typo fix -- diff --git a/mpi/mpiutil.c b/mpi/mpiutil.c index 1f1754a..fdce578 100644 --- a/mpi/mpiutil.c +++ b/mpi/mpiutil.c @@ -28,7 +28,7 @@ #include "mpi-internal.h" #include "mod-source-info.h" -/* Constatns allocated right away at strtartup. */ +/* Constants allocated right away at startup. 
 */
 static gcry_mpi_t constants[MPI_NUMBER_OF_CONSTANTS];

-----------------------------------------------------------------------

Summary of changes:
 mpi/mpi-bit.c     |  8 ++++++--
 mpi/mpiutil.c     |  2 +-
 tests/t-mpi-bit.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 3 deletions(-)

hooks/post-receive
--
The GNU crypto library
http://git.gnupg.org

From funman at videolan.org Fri May 9 19:06:24 2014
From: funman at videolan.org (Rafaël Carré)
Date: Fri, 9 May 2014 19:06:24 +0200
Subject: [PATCH 1/2] mpi: fix code path selection for x86_64 on Darwin.
In-Reply-To: <1399655185-29990-1-git-send-email-funman@videolan.org>
References: <1399655185-29990-1-git-send-email-funman@videolan.org>
Message-ID: <1399655185-29990-2-git-send-email-funman@videolan.org>

From: Konstantin Pavlov

This patch fixes build error observed on current Darwin platform with
asm enabled:

  gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src -g -O2 -MT
  mpih-mul1-asm.lo -MD -MP -MF .deps/mpih-mul1-asm.Tpo -c
  mpih-mul1-asm.S -fno-common -DPIC -o .libs/mpih-mul1-asm.o
  mpih-mul1-asm.S:43:9: error: invalid alignment value

Signed-off-by: Rafaël Carré
---
 mpi/config.links | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mpi/config.links b/mpi/config.links
index 0217d35..595e879 100644
--- a/mpi/config.links
+++ b/mpi/config.links
@@ -113,6 +113,12 @@ case "${host}" in
       path="i586 i386"
       mpi_cpu_arch="x86"
       ;;
+   x86_64-apple-darwin*)
+      echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h
+      cat  $srcdir/mpi/i386/syntax.h      >>./mpi/asm-syntax.h
+      path="amd64"
+      mpi_cpu_arch="x86"
+      ;;
    i[3467]86*-*-*)
       echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
       cat  $srcdir/mpi/i386/syntax.h      >>./mpi/asm-syntax.h
--
1.9.2

From funman at videolan.org Fri May 9 19:06:23 2014
From: funman at videolan.org (Rafaël Carré)
Date: Fri, 9 May 2014 19:06:23 +0200
Subject: [PATCH 0/2]
Message-ID: <1399655185-29990-1-git-send-email-funman@videolan.org>

[PATCH 1/2] mpi: fix code path selection for x86_64 on Darwin.
Resending after http://article.gmane.org/gmane.comp.encryption.gpg.libgcrypt.devel/3643

[PATCH 2/2] _GCRY_GCC_ATTR_SENTINEL: Fix typo

Typo spotted by -Wundef

From funman at videolan.org Fri May 9 19:06:25 2014
From: funman at videolan.org (Rafaël Carré)
Date: Fri, 9 May 2014 19:06:25 +0200
Subject: [PATCH 2/2] _GCRY_GCC_ATTR_SENTINEL: Fix typo
In-Reply-To: <1399655185-29990-1-git-send-email-funman@videolan.org>
References: <1399655185-29990-1-git-send-email-funman@videolan.org>
Message-ID: <1399655185-29990-3-git-send-email-funman@videolan.org>

---
 src/gcrypt.h.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in
index c84a3f7..e2d3245 100644
--- a/src/gcrypt.h.in
+++ b/src/gcrypt.h.in
@@ -102,7 +102,7 @@ extern "C" {

 #define _GCRY_GCC_ATTR_PRINTF(f,a)  __attribute__ ((format (printf,f,a)))

-#if _GCRT_GCC_VERSION >= 40000
+#if _GCRY_GCC_VERSION >= 40000
 #define _GCRY_GCC_ATTR_SENTINEL(a) __attribute__ ((sentinel(a)))
 #endif
--
1.9.2

From jussi.kivilinna at iki.fi Sat May 10 23:56:33 2014
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 11 May 2014 00:56:33 +0300
Subject: [PATCH 1/3] Add Poly1305 MAC
Message-ID: <20140510215633.11373.59196.stgit@localhost6.localdomain6>

* cipher/Makefile.am: Add 'mac-poly1305.c', 'poly1305.c' and
  'poly1305-internal.h'.
* cipher/mac-internal.h (poly1305mac_context_s): New.
  (gcry_mac_handle): Add 'u.poly1305mac'.
  (_gcry_mac_type_spec_poly1305mac): New.
* cipher/mac-poly1305.c: New.
* cipher/mac.c (mac_list): Add Poly1305.
* cipher/poly1305-internal.h: New.
* cipher/poly1305.c: New.
* src/gcrypt.h.in: Add 'GCRY_MAC_POLY1305'.
* tests/basic.c (check_mac): Add Poly1305 test vectors; Allow
  overriding lengths of data and key buffers.
* tests/bench-slope.c (mac_bench): Increase max algo number from 500
  to 600.
* tests/benchmark.c (mac_bench): Ditto.
--

Patch adds Bernstein's Poly1305 message authentication code to
libgcrypt.  Implementation is based on Andrew Moon's public domain
implementation from:
  https://github.com/floodyberry/poly1305-opt

The algorithm added by this patch is the plain Poly1305 without AES
and takes a 256-bit (32-byte) key that must not be reused.

Signed-off-by: Jussi Kivilinna
---
 cipher/Makefile.am         |    3
 cipher/mac-internal.h      |   17 +
 cipher/mac-poly1305.c      |  213 ++++++++++++
 cipher/mac.c               |    1
 cipher/poly1305-internal.h |   90 +++++
 cipher/poly1305.c          |  780 ++++++++++++++++++++++++++++++++++++++++++++
 src/gcrypt.h.in            |    4
 tests/basic.c              |  120 ++++++-
 tests/bench-slope.c        |    2
 tests/benchmark.c          |    2
 10 files changed, 1221 insertions(+), 11 deletions(-)
 create mode 100644 cipher/mac-poly1305.c
 create mode 100644 cipher/poly1305-internal.h
 create mode 100644 cipher/poly1305.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 26d13d2..a8b86e6 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -45,7 +45,8 @@ cipher-selftest.c cipher-selftest.h \
 pubkey.c pubkey-internal.h pubkey-util.c \
 md.c \
 mac.c mac-internal.h \
-mac-hmac.c mac-cmac.c mac-gmac.c \
+mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
+poly1305.c poly1305-internal.h \
 kdf.c kdf-internal.h \
 hmac-tests.c \
 bithelp.h \
diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h
index 9895a54..81b6185 100644
--- a/cipher/mac-internal.h
+++ b/cipher/mac-internal.h
@@ -17,9 +17,17 @@
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */

+#include
+
+#include "g10lib.h"
+
+
 /* The data object used to hold a handle to an encryption object.
*/ struct gcry_mac_handle; +/* The data object used to hold poly1305-mac context. */ +struct poly1305mac_context_s; + /* * @@ -84,7 +92,6 @@ typedef struct gcry_mac_spec } gcry_mac_spec_t; - /* The handle structure. */ struct gcry_mac_handle { @@ -106,6 +113,9 @@ struct gcry_mac_handle gcry_cipher_hd_t ctx; int cipher_algo; } gmac; + struct { + struct poly1305mac_context_s *ctx; + } poly1305mac; } u; }; @@ -202,3 +212,8 @@ extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_seed; #if USE_CAMELLIA extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia; #endif + +/* + * The Poly1305 MAC algorithm specifications (mac-poly1305.c). + */ +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac; diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c new file mode 100644 index 0000000..e265b64 --- /dev/null +++ b/cipher/mac-poly1305.c @@ -0,0 +1,213 @@ +/* mac-poly1305.c - Poly1305 based MACs + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include +#include +#include +#include +#include + +#include "g10lib.h" +#include "mac-internal.h" +#include "poly1305-internal.h" + + +struct poly1305mac_context_s { + poly1305_context_t ctx; + struct { + unsigned int key_set:1; + unsigned int tag:1; + } marks; + byte tag[POLY1305_TAGLEN]; + byte key[POLY1305_KEYLEN]; +}; + + +static gcry_err_code_t +poly1305mac_open (gcry_mac_hd_t h) +{ + struct poly1305mac_context_s *mac_ctx; + int secure = (h->magic == CTX_MAGIC_SECURE); + + if (secure) + mac_ctx = xtrycalloc_secure (1, sizeof(*mac_ctx)); + else + mac_ctx = xtrycalloc (1, sizeof(*mac_ctx)); + + if (!mac_ctx) + return gpg_err_code_from_syserror (); + + h->u.poly1305mac.ctx = mac_ctx; + + return 0; +} + + +static void +poly1305mac_close (gcry_mac_hd_t h) +{ + xfree(h->u.poly1305mac.ctx); +} + + +static gcry_err_code_t +poly1305mac_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + gcry_err_code_t err; + + memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); + memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag)); + memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); + + mac_ctx->marks.key_set = 0; + mac_ctx->marks.tag = 0; + + if (keylen != POLY1305_KEYLEN) + return GPG_ERR_INV_KEYLEN; + + memcpy(mac_ctx->key, key, POLY1305_KEYLEN); + + err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); + if (err) + { + memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); + return err; + } + + mac_ctx->marks.key_set = 1; + + return 0; +} + + +static gcry_err_code_t +poly1305mac_reset (gcry_mac_hd_t h) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + + if (!mac_ctx->marks.key_set) + return GPG_ERR_INV_STATE; + + memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); + memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag)); + + mac_ctx->marks.key_set = 1; + mac_ctx->marks.tag = 0; + + return _gcry_poly1305_init 
(&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); +} + + +static gcry_err_code_t +poly1305mac_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + + if (!mac_ctx->marks.key_set || mac_ctx->marks.tag) + return GPG_ERR_INV_STATE; + + _gcry_poly1305_update (&mac_ctx->ctx, buf, buflen); + return 0; +} + + +static gcry_err_code_t +poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + + if (!mac_ctx->marks.key_set) + return GPG_ERR_INV_STATE; + + if (!mac_ctx->marks.tag) + { + _gcry_poly1305_finish(&mac_ctx->ctx, mac_ctx->tag); + + memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); + mac_ctx->marks.tag = 1; + } + + if (*outlen <= POLY1305_TAGLEN) + buf_cpy (outbuf, mac_ctx->tag, *outlen); + else + { + buf_cpy (outbuf, mac_ctx->tag, POLY1305_TAGLEN); + *outlen = POLY1305_TAGLEN; + } + + return 0; +} + + +static gcry_err_code_t +poly1305mac_verify (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + gcry_err_code_t err; + size_t outlen = 0; + + /* Check and finalize tag. */ + err = poly1305mac_read(h, NULL, &outlen); + if (err) + return err; + + if (buflen > POLY1305_TAGLEN) + return GPG_ERR_INV_LENGTH; + + return buf_eq_const (buf, mac_ctx->tag, buflen) ? 0 : GPG_ERR_CHECKSUM; +} + + +static unsigned int +poly1305mac_get_maclen (int algo) +{ + (void)algo; + + return POLY1305_TAGLEN; +} + + +static unsigned int +poly1305mac_get_keylen (int algo) +{ + (void)algo; + + return POLY1305_KEYLEN; +} + + +static gcry_mac_spec_ops_t poly1305mac_ops = { + poly1305mac_open, + poly1305mac_close, + poly1305mac_setkey, + NULL, + poly1305mac_reset, + poly1305mac_write, + poly1305mac_read, + poly1305mac_verify, + poly1305mac_get_maclen, + poly1305mac_get_keylen +}; + + +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac = { + GCRY_MAC_POLY1305, {0, 0}, "POLY1305", + &poly1305mac_ops +}; diff --git a/cipher/mac.c b/cipher/mac.c index 7805467..e583369 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -101,6 +101,7 @@ static gcry_mac_spec_t *mac_list[] = { #if USE_GOST28147 &_gcry_mac_type_spec_cmac_gost28147, #endif + &_gcry_mac_type_spec_poly1305mac, NULL, }; diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h new file mode 100644 index 0000000..d89db16 --- /dev/null +++ b/cipher/poly1305-internal.h @@ -0,0 +1,90 @@ +/* poly1305-internal.h - Poly1305 internals + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifndef G10_POLY1305_INTERNAL_H +#define G10_POLY1305_INTERNAL_H + +#include +#include +#include +#include +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" + + +#define POLY1305_TAGLEN 16 +#define POLY1305_KEYLEN 32 + + +/* Block-size used in default implementation. */ +#define POLY1305_REF_BLOCKSIZE 16 + +/* State size of default implementation. */ +#define POLY1305_REF_STATESIZE 64 + + +/* Largest block-size used in any implementation (optimized implementations + * might use block-size multiple of 16). */ +#define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE + +/* Largest state-size used in any implementation. */ +#define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE + +/* Minimum alignment for state pointer passed to implementations. */ +#define POLY1305_STATE_ALIGNMENT 32 + + +typedef struct poly1305_key_s +{ + byte b[POLY1305_KEYLEN]; +} poly1305_key_t; + + +typedef struct poly1305_ops_s +{ + unsigned int (*block_size) (void); + void (*init_ext) (void *ctx, const poly1305_key_t * key); + unsigned int (*blocks) (void *ctx, const byte * m, size_t bytes); + unsigned int (*finish_ext) (void *ctx, const byte * m, size_t remaining, + byte mac[POLY1305_TAGLEN]); +} poly1305_ops_t; + + +typedef struct poly1305_context_s +{ + byte state[POLY1305_LARGEST_STATESIZE + POLY1305_STATE_ALIGNMENT]; + const poly1305_ops_t *ops; + unsigned int leftover, block_size; + byte buffer[POLY1305_LARGEST_BLOCKSIZE]; +} poly1305_context_t; + + +gcry_err_code_t _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, + size_t keylen); + +void _gcry_poly1305_finish (poly1305_context_t * ctx, + byte mac[POLY1305_TAGLEN]); + +void _gcry_poly1305_update (poly1305_context_t * ctx, const byte * buf, + size_t buflen); + + +#endif /* G10_POLY1305_INTERNAL_H */ diff --git a/cipher/poly1305.c b/cipher/poly1305.c new file mode 100644 index 0000000..00ffea8 --- /dev/null +++ b/cipher/poly1305.c @@ -0,0 +1,780 @@ +/* poly1305.c - Poly1305 internals and generic implementation + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* The code is based on public-domain Poly1305 implementation by + * Andrew Moon at + * https://github.com/floodyberry/poly1305-opt + */ + +#include +#include +#include +#include + +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "poly1305-internal.h" + + +static const char *selftest (void); + + + +#ifdef HAVE_U64_TYPEDEF + +/* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit + * multiplication and 64 bit addition. 
+ */ + +typedef struct poly1305_state_ref32_s +{ + u32 r[5]; + u32 h[5]; + u32 pad[4]; + byte final; +} poly1305_state_ref32_t; + + +static unsigned int +poly1305_block_size_ref32 (void) +{ + return POLY1305_REF_BLOCKSIZE; +} + + +static void +poly1305_init_ext_ref32 (void *state, const poly1305_key_t * key) +{ + poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; + + gcry_assert (sizeof (*st) + POLY1305_STATE_ALIGNMENT <= + sizeof (((poly1305_context_t *) 0)->state)); + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = (buf_get_le32 (&key->b[0])) & 0x3ffffff; + st->r[1] = (buf_get_le32 (&key->b[3]) >> 2) & 0x3ffff03; + st->r[2] = (buf_get_le32 (&key->b[6]) >> 4) & 0x3ffc0ff; + st->r[3] = (buf_get_le32 (&key->b[9]) >> 6) & 0x3f03fff; + st->r[4] = (buf_get_le32 (&key->b[12]) >> 8) & 0x00fffff; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + + /* save pad for later */ + st->pad[0] = buf_get_le32 (&key->b[16]); + st->pad[1] = buf_get_le32 (&key->b[20]); + st->pad[2] = buf_get_le32 (&key->b[24]); + st->pad[3] = buf_get_le32 (&key->b[28]); + + st->final = 0; +} + + +static unsigned int +poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes) +{ + poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; + const u32 hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */ + u32 r0, r1, r2, r3, r4; + u32 s1, s2, s3, s4; + u32 h0, h1, h2, h3, h4; + u64 d0, d1, d2, d3, d4; + u32 c; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + r4 = st->r[4]; + + s1 = r1 * 5; + s2 = r2 * 5; + s3 = r3 * 5; + s4 = r4 * 5; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (bytes >= POLY1305_REF_BLOCKSIZE) + { + /* h += m[i] */ + h0 += (buf_get_le32 (m + 0)) & 0x3ffffff; + h1 += (buf_get_le32 (m + 3) >> 2) & 0x3ffffff; + h2 += (buf_get_le32 (m + 6) >> 4) & 0x3ffffff; + h3 += (buf_get_le32 (m + 9) >> 6) & 0x3ffffff; + h4 += (buf_get_le32 (m + 12) >> 8) | hibit; + + /* h *= r */ + d0 = + ((u64) h0 * r0) + ((u64) h1 * s4) + + ((u64) h2 * s3) + ((u64) h3 * s2) + ((u64) h4 * s1); + d1 = + ((u64) h0 * r1) + ((u64) h1 * r0) + + ((u64) h2 * s4) + ((u64) h3 * s3) + ((u64) h4 * s2); + d2 = + ((u64) h0 * r2) + ((u64) h1 * r1) + + ((u64) h2 * r0) + ((u64) h3 * s4) + ((u64) h4 * s3); + d3 = + ((u64) h0 * r3) + ((u64) h1 * r2) + + ((u64) h2 * r1) + ((u64) h3 * r0) + ((u64) h4 * s4); + d4 = + ((u64) h0 * r4) + ((u64) h1 * r3) + + ((u64) h2 * r2) + ((u64) h3 * r1) + ((u64) h4 * r0); + + /* (partial) h %= p */ + c = (u32) (d0 >> 26); + h0 = (u32) d0 & 0x3ffffff; + d1 += c; + c = (u32) (d1 >> 26); + h1 = (u32) d1 & 0x3ffffff; + d2 += c; + c = (u32) (d2 >> 26); + h2 = (u32) d2 & 0x3ffffff; + d3 += c; + c = (u32) (d3 >> 26); + h3 = (u32) d3 & 0x3ffffff; + d4 += c; + c = (u32) (d4 >> 26); + h4 = (u32) d4 & 0x3ffffff; + h0 += c * 5; + c = (h0 >> 26); + h0 = h0 & 0x3ffffff; + h1 += c; + + m += POLY1305_REF_BLOCKSIZE; + bytes -= POLY1305_REF_BLOCKSIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; + + return (16 * sizeof (u32) + 5 * sizeof (u64) + 5 * sizeof (void *)); +} + + +static unsigned int +poly1305_finish_ext_ref32 (void *state, const byte * m, + size_t remaining, byte mac[POLY1305_TAGLEN]) +{ + poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; + u32 h0, h1, h2, h3, h4, c; + u32 g0, g1, g2, g3, g4; + u64 f; + u32 mask; + unsigned int burn = 0; + + /* process the remaining block */ + if (remaining) + { + byte 
final[POLY1305_REF_BLOCKSIZE] = { 0 }; + size_t i; + for (i = 0; i < remaining; i++) + final[i] = m[i]; + final[remaining] = 1; + st->final = 1; + burn = poly1305_blocks_ref32 (st, final, POLY1305_REF_BLOCKSIZE); + } + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + c = h1 >> 26; + h1 = h1 & 0x3ffffff; + h2 += c; + c = h2 >> 26; + h2 = h2 & 0x3ffffff; + h3 += c; + c = h3 >> 26; + h3 = h3 & 0x3ffffff; + h4 += c; + c = h4 >> 26; + h4 = h4 & 0x3ffffff; + h0 += c * 5; + c = h0 >> 26; + h0 = h0 & 0x3ffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; + c = g0 >> 26; + g0 &= 0x3ffffff; + g1 = h1 + c; + c = g1 >> 26; + g1 &= 0x3ffffff; + g2 = h2 + c; + c = g2 >> 26; + g2 &= 0x3ffffff; + g3 = h3 + c; + c = g3 >> 26; + g3 &= 0x3ffffff; + g4 = h4 + c - (1 << 26); + + /* select h if h < p, or h + -p if h >= p */ + mask = (g4 >> ((sizeof (u32) * 8) - 1)) - 1; + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + /* h = h % (2^128) */ + h0 = ((h0) | (h1 << 26)) & 0xffffffff; + h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; + h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; + h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; + + /* mac = (h + pad) % (2^128) */ + f = (u64) h0 + st->pad[0]; + h0 = (u32) f; + f = (u64) h1 + st->pad[1] + (f >> 32); + h1 = (u32) f; + f = (u64) h2 + st->pad[2] + (f >> 32); + h2 = (u32) f; + f = (u64) h3 + st->pad[3] + (f >> 32); + h3 = (u32) f; + + buf_put_le32 (mac + 0, h0); + buf_put_le32 (mac + 4, h1); + buf_put_le32 (mac + 8, h2); + buf_put_le32 (mac + 12, h3); + + /* zero out the state */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + st->r[0] = 0; + st->r[1] = 0; + st->r[2] = 0; + st->r[3] = 0; + st->r[4] = 0; + st->pad[0] = 0; + st->pad[1] = 0; + st->pad[2] = 0; + st->pad[3] = 0; + + /* burn_stack */ + return (13 * sizeof (u32) + sizeof (u64) + + POLY1305_REF_BLOCKSIZE + 6 * sizeof (void *)) + burn; +} + + +static const poly1305_ops_t poly1305_default_ops = { + poly1305_block_size_ref32, + poly1305_init_ext_ref32, + poly1305_blocks_ref32, + poly1305_finish_ext_ref32 +}; + +#else /* !HAVE_U64_TYPEDEF */ + +/* Reference unoptimized poly1305 implementation using 8 bit * 8 bit = 16 bit + * multiplication and 16 bit addition, used when we don't have 'u64'. 
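+ *
+ * Here the accumulator is kept as 17 radix-2^8 limbs.  In the multiply
+ * loop, a product that wraps past limb 17 picks up a factor of 2^136,
+ * and 2^136 = 2^6 * 2^130 == 5 * 2^6 (mod 2^130 - 5), hence the
+ * 'v *= (5 << 6)' scaling of the wrapped terms.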
+ */ + +typedef struct poly1305_state_ref8_t +{ + byte h[17]; + byte r[17]; + byte pad[17]; + byte final; +} poly1305_state_ref8_t; + + +static unsigned int +poly1305_block_size_ref8 (void) +{ + return POLY1305_REF_BLOCKSIZE; +} + + +static void +poly1305_init_ext_ref8 (void *state, const poly1305_key_t * key) +{ + poly1305_state_ref8_t *st = (poly1305_state_ref8_t *) state; + size_t i; + + /* h = 0 */ + for (i = 0; i < 17; i++) + st->h[i] = 0; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = key->b[0]; + st->r[1] = key->b[1]; + st->r[2] = key->b[2]; + st->r[3] = key->b[3] & 0x0f; + st->r[4] = key->b[4] & 0xfc; + st->r[5] = key->b[5]; + st->r[6] = key->b[6]; + st->r[7] = key->b[7] & 0x0f; + st->r[8] = key->b[8] & 0xfc; + st->r[9] = key->b[9]; + st->r[10] = key->b[10]; + st->r[11] = key->b[11] & 0x0f; + st->r[12] = key->b[12] & 0xfc; + st->r[13] = key->b[13]; + st->r[14] = key->b[14]; + st->r[15] = key->b[15] & 0x0f; + st->r[16] = 0; + + /* save pad for later */ + for (i = 0; i < 16; i++) + st->pad[i] = key->b[i + 16]; + st->pad[16] = 0; + + st->final = 0; +} + + +static void +poly1305_add_ref8 (byte h[17], const byte c[17]) +{ + u16 u; + unsigned int i; + for (u = 0, i = 0; i < 17; i++) + { + u += (u16) h[i] + (u16) c[i]; + h[i] = (byte) u & 0xff; + u >>= 8; + } +} + + +static void +poly1305_squeeze_ref8 (byte h[17], u32 hr[17]) +{ + u32 u; + unsigned int i; + u = 0; + for (i = 0; i < 16; i++) + { + u += hr[i]; + h[i] = (byte) u & 0xff; + u >>= 8; + } + u += hr[16]; + h[16] = (byte) u & 0x03; + u >>= 2; + u += (u << 2); /* u *= 5; */ + for (i = 0; i < 16; i++) + { + u += h[i]; + h[i] = (byte) u & 0xff; + u >>= 8; + } + h[16] += (byte) u; +} + + +static void +poly1305_freeze_ref8 (byte h[17]) +{ + static const byte minusp[17] = { + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfc + }; + byte horig[17], negative; + unsigned int i; + + /* compute h + -p */ + for (i = 0; i < 17; i++) + horig[i] = h[i]; + poly1305_add_ref8 (h, minusp); + + /* select h if h < p, or h + -p if h >= p */ + negative = -(h[16] >> 7); + for (i = 0; i < 17; i++) + h[i] ^= negative & (horig[i] ^ h[i]); +} + + +static unsigned int +poly1305_blocks_ref8 (void *state, const byte * m, size_t bytes) +{ + poly1305_state_ref8_t *st = (poly1305_state_ref8_t *) state; + const byte hibit = st->final ^ 1; /* 1 << 128 */ + + while (bytes >= POLY1305_REF_BLOCKSIZE) + { + u32 hr[17], u; + byte c[17]; + unsigned int i, j; + + /* h += m */ + for (i = 0; i < 16; i++) + c[i] = m[i]; + c[16] = hibit; + poly1305_add_ref8 (st->h, c); + + /* h *= r */ + for (i = 0; i < 17; i++) + { + u = 0; + for (j = 0; j <= i; j++) + { + u += (u16) st->h[j] * st->r[i - j]; + } + for (j = i + 1; j < 17; j++) + { + u32 v = (u16) st->h[j] * st->r[i + 17 - j]; + v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */ + u += v; + } + hr[i] = u; + } + + /* (partial) h %= p */ + poly1305_squeeze_ref8 (st->h, hr); + + m += POLY1305_REF_BLOCKSIZE; + bytes -= POLY1305_REF_BLOCKSIZE; + } + + /* burn_stack */ + return (18 + 2) * sizeof (u32) + 18 + 6 * sizeof (void *) + + 6 * sizeof (void *); +} + + +static unsigned int +poly1305_finish_ext_ref8 (void *state, const byte * m, size_t remaining, + byte mac[POLY1305_TAGLEN]) +{ + poly1305_state_ref8_t *st = (poly1305_state_ref8_t *) state; + size_t i; + unsigned int burn = 0; + + /* process the remaining block */ + if (remaining) + { + byte final[POLY1305_REF_BLOCKSIZE] = { 0 }; + for (i = 0; i < remaining; i++) + final[i] = m[i]; + final[remaining] = 1; + 
st->final = 1; + burn = poly1305_blocks_ref8 (st, final, POLY1305_REF_BLOCKSIZE); + } + + /* fully reduce h */ + poly1305_freeze_ref8 (st->h); + + /* h = (h + pad) % (1 << 128) */ + poly1305_add_ref8 (st->h, st->pad); + for (i = 0; i < 16; i++) + mac[i] = st->h[i]; + + /* zero out the state */ + for (i = 0; i < 17; i++) + st->h[i] = 0; + for (i = 0; i < 17; i++) + st->r[i] = 0; + for (i = 0; i < 17; i++) + st->pad[i] = 0; + + /* burn_stack */ + return POLY1305_REF_BLOCKSIZE + 18 + 16 * sizeof (void *) + burn; +} + + +static const poly1305_ops_t poly1305_default_ops = { + poly1305_block_size_ref8, + poly1305_init_ext_ref8, + poly1305_blocks_ref8, + poly1305_finish_ext_ref8 +}; + +#endif /* HAVE_U64_TYPEDEF */ + + + +static inline void * +poly1305_get_state (poly1305_context_t * ctx) +{ + byte *c = ctx->state; + c += POLY1305_STATE_ALIGNMENT - 1; + c -= (uintptr_t) c & (POLY1305_STATE_ALIGNMENT - 1); + return c; +} + + +static void +poly1305_init (poly1305_context_t * ctx, const poly1305_key_t * key) +{ + void *state = poly1305_get_state (ctx); + + ctx->block_size = ctx->ops->block_size (); + ctx->leftover = 0; + + ctx->ops->init_ext (state, key); +} + + +void +_gcry_poly1305_update (poly1305_context_t * ctx, const byte * m, size_t bytes) +{ + void *state = poly1305_get_state (ctx); + unsigned int burn = 0; + + /* handle leftover */ + if (ctx->leftover) + { + size_t want = (ctx->block_size - ctx->leftover); + if (want > bytes) + want = bytes; + buf_cpy (ctx->buffer + ctx->leftover, m, want); + bytes -= want; + m += want; + ctx->leftover += want; + if (ctx->leftover < ctx->block_size) + return; + burn = ctx->ops->blocks (state, ctx->buffer, ctx->block_size); + ctx->leftover = 0; + } + + /* process full blocks */ + if (bytes >= ctx->block_size) + { + size_t want = (bytes & ~(ctx->block_size - 1)); + burn = ctx->ops->blocks (state, m, want); + m += want; + bytes -= want; + } + + /* store leftover */ + if (bytes) + { + buf_cpy (ctx->buffer + ctx->leftover, m, bytes); + ctx->leftover += bytes; + } + + if (burn) + _gcry_burn_stack (burn); +} + + +void +_gcry_poly1305_finish (poly1305_context_t * ctx, byte mac[POLY1305_TAGLEN]) +{ + void *state = poly1305_get_state (ctx); + unsigned int burn; + + burn = ctx->ops->finish_ext (state, ctx->buffer, ctx->leftover, mac); + + _gcry_burn_stack (burn); +} + + +gcry_err_code_t +_gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, + size_t keylen) +{ + static int initialized; + static const char *selftest_failed; + poly1305_key_t keytmp; + + if (!initialized) + { + initialized = 1; + selftest_failed = selftest (); + if (selftest_failed) + log_error ("Poly1305 selftest failed (%s)\n", selftest_failed); + } + + if (keylen != POLY1305_KEYLEN) + return GPG_ERR_INV_KEYLEN; + + if (selftest_failed) + return GPG_ERR_SELFTEST_FAILED; + + ctx->ops = &poly1305_default_ops; + + buf_cpy (keytmp.b, key, POLY1305_KEYLEN); + poly1305_init (ctx, &keytmp); + + wipememory (&keytmp, sizeof (keytmp)); + + return 0; +} + + +static void +poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes, + const byte * key) +{ + poly1305_context_t ctx; + + memset (&ctx, 0, sizeof (ctx)); + + _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN); + _gcry_poly1305_update (&ctx, m, bytes); + _gcry_poly1305_finish (&ctx, mac); + + wipememory (&ctx, sizeof (ctx)); +} + + +static const char * +selftest (void) +{ + /* example from nacl */ + static const byte nacl_key[POLY1305_KEYLEN] = { + 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91, + 0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 
0x3c, 0x25, + 0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65, + 0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80, + }; + + static const byte nacl_msg[131] = { + 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73, + 0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce, + 0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4, + 0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a, + 0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b, + 0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72, + 0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2, + 0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38, + 0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a, + 0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae, + 0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea, + 0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda, + 0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde, + 0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3, + 0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6, + 0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74, + 0xe3, 0x55, 0xa5 + }; + + static const byte nacl_mac[16] = { + 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5, + 0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9 + }; + + /* generates a final value of (2^130 - 2) == 3 */ + static const byte wrap_key[POLY1305_KEYLEN] = { + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + static const byte wrap_msg[16] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + + static const byte wrap_mac[16] = { + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + /* mac of the macs of messages of length 0 to 256, where the key and messages + * have all their values set to the length + */ + static const byte total_key[POLY1305_KEYLEN] = { + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + + static const byte total_mac[16] = { + 0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd, + 0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39 + }; + + poly1305_context_t ctx; + poly1305_context_t total_ctx; + byte all_key[POLY1305_KEYLEN]; + byte all_msg[256]; + byte mac[16]; + size_t i, j; + + memset (&ctx, 0, sizeof (ctx)); + memset (&total_ctx, 0, sizeof (total_ctx)); + + memset (mac, 0, sizeof (mac)); + poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key); + if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) + return "Poly1305 test 1 failed."; + + /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so + * make sure everything still works varying between them */ + memset (mac, 0, sizeof (mac)); + _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN); + _gcry_poly1305_update (&ctx, nacl_msg + 0, 32); + _gcry_poly1305_update (&ctx, nacl_msg + 32, 64); + _gcry_poly1305_update (&ctx, nacl_msg + 96, 16); + _gcry_poly1305_update (&ctx, nacl_msg + 112, 8); + _gcry_poly1305_update (&ctx, nacl_msg + 120, 4); + _gcry_poly1305_update (&ctx, nacl_msg + 124, 2); + _gcry_poly1305_update (&ctx, nacl_msg + 126, 1); + _gcry_poly1305_update (&ctx, nacl_msg + 127, 1); + _gcry_poly1305_update (&ctx, nacl_msg + 128, 1); + _gcry_poly1305_update (&ctx, nacl_msg + 129, 1); + _gcry_poly1305_update (&ctx, nacl_msg + 130, 1); + _gcry_poly1305_finish (&ctx, mac); + if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) + return "Poly1305 test 2 
failed."; + + memset (mac, 0, sizeof (mac)); + poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key); + if (memcmp (wrap_mac, mac, sizeof (nacl_mac)) != 0) + return "Poly1305 test 3 failed."; + + _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN); + for (i = 0; i < 256; i++) + { + /* set key and message to 'i,i,i..' */ + for (j = 0; j < sizeof (all_key); j++) + all_key[j] = i; + for (j = 0; j < i; j++) + all_msg[j] = i; + poly1305_auth (mac, all_msg, i, all_key); + _gcry_poly1305_update (&total_ctx, mac, 16); + } + _gcry_poly1305_finish (&total_ctx, mac); + if (memcmp (total_mac, mac, sizeof (total_mac)) != 0) + return "Poly1305 test 4 failed."; + + return NULL; +} diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index d4e9bb2..8648e96 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1341,7 +1341,9 @@ enum gcry_mac_algos GCRY_MAC_GMAC_CAMELLIA = 402, GCRY_MAC_GMAC_TWOFISH = 403, GCRY_MAC_GMAC_SERPENT = 404, - GCRY_MAC_GMAC_SEED = 405 + GCRY_MAC_GMAC_SEED = 405, + + GCRY_MAC_POLY1305 = 501 }; /* Flags used with the open function. */ diff --git a/tests/basic.c b/tests/basic.c index 406d82d..cc0f4c1 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -5041,6 +5041,8 @@ check_mac (void) const char *key; const char *expect; const char *iv; + unsigned int dlen; + unsigned int klen; } algos[] = { { GCRY_MAC_HMAC_MD5, "what do ya want for nothing?", "Jefe", @@ -5491,6 +5493,109 @@ check_mac (void) "\xc9\xfc\xa7\x29\xab\x60\xad\xa0", "\x20\x4b\xdb\x1b\xd6\x21\x54\xbf\x08\x92\x2a\xaa\x54\xee\xd7\x05", "\x05\xad\x13\xa5\xe2\xc2\xab\x66\x7e\x1a\x6f\xbc" }, + /* from NaCl */ + { GCRY_MAC_POLY1305, + "\x8e\x99\x3b\x9f\x48\x68\x12\x73\xc2\x96\x50\xba\x32\xfc\x76\xce" + "\x48\x33\x2e\xa7\x16\x4d\x96\xa4\x47\x6f\xb8\xc5\x31\xa1\x18\x6a" + "\xc0\xdf\xc1\x7c\x98\xdc\xe8\x7b\x4d\xa7\xf0\x11\xec\x48\xc9\x72" + "\x71\xd2\xc2\x0f\x9b\x92\x8f\xe2\x27\x0d\x6f\xb8\x63\xd5\x17\x38" + "\xb4\x8e\xee\xe3\x14\xa7\xcc\x8a\xb9\x32\x16\x45\x48\xe5\x26\xae" + "\x90\x22\x43\x68\x51\x7a\xcf\xea\xbd\x6b\xb3\x73\x2b\xc0\xe9\xda" + "\x99\x83\x2b\x61\xca\x01\xb6\xde\x56\x24\x4a\x9e\x88\xd5\xf9\xb3" + "\x79\x73\xf6\x22\xa4\x3d\x14\xa6\x59\x9b\x1f\x65\x4c\xb4\x5a\x74" + "\xe3\x55\xa5", + "\xee\xa6\xa7\x25\x1c\x1e\x72\x91\x6d\x11\xc2\xcb\x21\x4d\x3c\x25" + "\x25\x39\x12\x1d\x8e\x23\x4e\x65\x2d\x65\x1f\xa4\xc8\xcf\xf8\x80", + "\xf3\xff\xc7\x70\x3f\x94\x00\xe5\x2a\x7d\xfb\x4b\x3d\x33\x05\xd9" }, + /* from draft-nir-cfrg-chacha20-poly1305-03 */ + { GCRY_MAC_POLY1305, + "Cryptographic Forum Research Group", + "\x85\xd6\xbe\x78\x57\x55\x6d\x33\x7f\x44\x52\xfe\x42\xd5\x06\xa8" + "\x01\x03\x80\x8a\xfb\x0d\xb2\xfd\x4a\xbf\xf6\xaf\x41\x49\xf5\x1b", + "\xa8\x06\x1d\xc1\x30\x51\x36\xc6\xc2\x2b\x8b\xaf\x0c\x01\x27\xa9" }, + { GCRY_MAC_POLY1305, + "'Twas brillig, and the slithy toves\n" + "Did gyre and gimble in the wabe:\n" + "All mimsy were the borogoves,\n" + "And the mome raths outgrabe.", + "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a\xf3\x33\x88\x86\x04\xf6\xb5\xf0" + "\x47\x39\x17\xc1\x40\x2b\x80\x09\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + "\x45\x41\x66\x9a\x7e\xaa\xee\x61\xe7\x08\xdc\x7c\xbc\xc5\xeb\x62" }, + { GCRY_MAC_POLY1305, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + NULL, + 191, 32 }, + { GCRY_MAC_POLY1305, + "Any submission to the IETF intended by the Contributor for " + "publication as all or part of an IETF Internet-Draft or RFC and " + "any statement made within the context of an IETF activity is " + "considered an \"IETF Contribution\". Such statements include " + "oral statements in IETF sessions, as well as written and " + "electronic communications made at any time or place, which are " + "addressed to", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x36\xe5\xf6\xb5\xc5\xe0\x60\x70\xf0\xef\xca\x96\x22\x7a\x86\x3e", + "\x36\xe5\xf6\xb5\xc5\xe0\x60\x70\xf0\xef\xca\x96\x22\x7a\x86\x3e", + NULL, + 0, 32 }, + { GCRY_MAC_POLY1305, + "Any submission to the IETF intended by the Contributor for " + "publication as all or part of an IETF Internet-Draft or RFC and " + "any statement made within the context of an IETF activity is " + "considered an \"IETF Contribution\". Such statements include " + "oral statements in IETF sessions, as well as written and " + "electronic communications made at any time or place, which are " + "addressed to", + "\x36\xe5\xf6\xb5\xc5\xe0\x60\x70\xf0\xef\xca\x96\x22\x7a\x86\x3e" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xf3\x47\x7e\x7c\xd9\x54\x17\xaf\x89\xa6\xb8\x79\x4c\x31\x0c\xf0", + NULL, + 0, 32 }, + /* from http://cr.yp.to/mac/poly1305-20050329.pdf */ + { GCRY_MAC_POLY1305, + "\xf3\xf6", + "\x85\x1f\xc4\x0c\x34\x67\xac\x0b\xe0\x5c\xc2\x04\x04\xf3\xf7\x00" + "\x58\x0b\x3b\x0f\x94\x47\xbb\x1e\x69\xd0\x95\xb5\x92\x8b\x6d\xbc", + "\xf4\xc6\x33\xc3\x04\x4f\xc1\x45\xf8\x4f\x33\x5c\xb8\x19\x53\xde", + NULL, + 0, 32 }, + { GCRY_MAC_POLY1305, + "", + "\xa0\xf3\x08\x00\x00\xf4\x64\x00\xd0\xc7\xe9\x07\x6c\x83\x44\x03" + "\xdd\x3f\xab\x22\x51\xf1\x1a\xc7\x59\xf0\x88\x71\x29\xcc\x2e\xe7", + "\xdd\x3f\xab\x22\x51\xf1\x1a\xc7\x59\xf0\x88\x71\x29\xcc\x2e\xe7", + NULL, + 0, 32 }, + { GCRY_MAC_POLY1305, + "\x66\x3c\xea\x19\x0f\xfb\x83\xd8\x95\x93\xf3\xf4\x76\xb6\xbc\x24" + "\xd7\xe6\x79\x10\x7e\xa2\x6a\xdb\x8c\xaf\x66\x52\xd0\x65\x61\x36", + "\x48\x44\x3d\x0b\xb0\xd2\x11\x09\xc8\x9a\x10\x0b\x5c\xe2\xc2\x08" + "\x83\x14\x9c\x69\xb5\x61\xdd\x88\x29\x8a\x17\x98\xb1\x07\x16\xef", + "\x0e\xe1\xc1\x6b\xb7\x3f\x0f\x4f\xd1\x98\x81\x75\x3c\x01\xcd\xbe", + NULL, + 0, 32 }, + { GCRY_MAC_POLY1305, + "\xab\x08\x12\x72\x4a\x7f\x1e\x34\x27\x42\xcb\xed\x37\x4d\x94\xd1" + "\x36\xc6\xb8\x79\x5d\x45\xb3\x81\x98\x30\xf2\xc0\x44\x91\xfa\xf0" + "\x99\x0c\x62\xe4\x8b\x80\x18\xb2\xc3\xe4\xa0\xfa\x31\x34\xcb\x67" + "\xfa\x83\xe1\x58\xc9\x94\xd9\x61\xc4\xcb\x21\x09\x5c\x1b\xf9", + "\x12\x97\x6a\x08\xc4\x42\x6d\x0c\xe8\xa8\x24\x07\xc4\xf4\x82\x07" + "\x80\xf8\xc2\x0a\xa7\x12\x02\xd1\xe2\x91\x79\xcb\xcb\x55\x5a\x57", + "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b" }, { 0 }, }; int i; @@ -5500,6 +5605,8 @@ check_mac (void) for (i = 0; algos[i].algo; i++) { + size_t klen, dlen; 
+ if (gcry_mac_test_algo (algos[i].algo)) { show_mac_not_available (algos[i].algo); @@ -5520,13 +5627,14 @@ check_mac (void) algos[i].algo, (int)strlen(algos[i].key), (int)strlen(algos[i].data)); - check_one_mac (algos[i].algo, algos[i].data, strlen (algos[i].data), - algos[i].key, strlen(algos[i].key), algos[i].iv, - algos[i].iv ? strlen(algos[i].iv) : 0, + klen = algos[i].klen ? algos[i].klen : strlen(algos[i].key); + dlen = algos[i].dlen ? algos[i].dlen : strlen (algos[i].data); + + check_one_mac (algos[i].algo, algos[i].data, dlen, algos[i].key, klen, + algos[i].iv, algos[i].iv ? strlen(algos[i].iv) : 0, algos[i].expect, 0); - check_one_mac (algos[i].algo, algos[i].data, strlen (algos[i].data), - algos[i].key, strlen(algos[i].key), algos[i].iv, - algos[i].iv ? strlen(algos[i].iv) : 0, + check_one_mac (algos[i].algo, algos[i].data, dlen, algos[i].key, klen, + algos[i].iv, algos[i].iv ? strlen(algos[i].iv) : 0, algos[i].expect, 1); } diff --git a/tests/bench-slope.c b/tests/bench-slope.c index bd05064..3d8ae37 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -1405,7 +1405,7 @@ mac_bench (char **argv, int argc) } else { - for (i = 1; i < 500; i++) + for (i = 1; i < 600; i++) if (!gcry_mac_test_algo (i)) _mac_bench (i); } diff --git a/tests/benchmark.c b/tests/benchmark.c index 5efc083..9fd716d 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -461,7 +461,7 @@ mac_bench ( const char *algoname ) if (!algoname) { - for (i=1; i < 500; i++) + for (i=1; i < 600; i++) if (in_fips_mode && i == GCRY_MAC_HMAC_MD5) ; /* Don't use MD5 in fips mode. */ else if ( !gcry_mac_test_algo (i) ) From jussi.kivilinna at iki.fi Sat May 10 23:56:43 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 11 May 2014 00:56:43 +0300 Subject: [PATCH 3/3] Add Poly1305 based cipher AEAD mode In-Reply-To: <20140510215633.11373.59196.stgit@localhost6.localdomain6> References: <20140510215633.11373.59196.stgit@localhost6.localdomain6> Message-ID: <20140510215643.11373.29319.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'cipher-poly1305.c'. * cipher/cipher-internal.h (gcry_cipher_handle): Add 'u_mode.poly1305'. (_gcry_cipher_poly1305_encrypt, _gcry_cipher_poly1305_decrypt) (_gcry_cipher_poly1305_setiv, _gcry_cipher_poly1305_authenticate) (_gcry_cipher_poly1305_get_tag, _gcry_cipher_poly1305_check_tag): New. * cipher/cipher-poly1305.c: New. * cipher/cipher.c (_gcry_cipher_open_internal, cipher_setkey) (cipher_reset, cipher_encrypt, cipher_decrypt, _gcry_cipher_setiv) (_gcry_cipher_authenticate, _gcry_cipher_gettag) (_gcry_cipher_checktag): Handle 'GCRY_CIPHER_MODE_POLY1305'. (cipher_setiv): Move handling of 'GCRY_CIPHER_MODE_GCM' to ... (_gcry_cipher_setiv): ... here, as with other modes. * src/gcrypt.h.in: Add 'GCRY_CIPHER_MODE_POLY1305'. * tests/basic.c (_check_poly1305_cipher, check_poly1305_cipher): New. (check_ciphers): Add Poly1305 check. (check_cipher_modes): Call 'check_poly1305_cipher'. * tests/bench-slope.c (bench_gcm_encrypt_do_bench): Rename to bench_aead_... and take nonce as argument. (bench_gcm_decrypt_do_bench, bench_gcm_authenticate_do_bench): Ditto. (bench_gcm_encrypt_do_bench, bench_gcm_decrypt_do_bench) (bench_gcm_authenticate_do_bench, bench_poly1305_encrypt_do_bench) (bench_poly1305_decrypt_do_bench) (bench_poly1305_authenticate_do_bench, poly1305_encrypt_ops) (poly1305_decrypt_ops, poly1305_authenticate_ops): New. (cipher_modes): Add Poly1305. (cipher_bench_one): Add special handling for Poly1305. -- Patch adds Poly1305 based AEAD cipher mode to libgcrypt. 
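From the caller's point of view the new mode behaves like the existing
AEAD modes (CCM/GCM).  A minimal sketch of the call sequence, mirroring
the new tests in tests/basic.c (buffer names and contents here are
placeholders, and error checking is omitted):

  static const unsigned char key[32] = { 1 };   /* stream cipher key */
  static const unsigned char nonce[8] = { 2 };  /* 8 or 12 byte nonce */
  static const unsigned char aad[12] = { 3 };   /* additional data */
  unsigned char pt[64] = { 0 }, ct[64], tag[16];
  gcry_cipher_hd_t hd;

  gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_POLY1305, 0);
  gcry_cipher_setkey (hd, key, sizeof key);
  gcry_cipher_setiv (hd, nonce, sizeof nonce);
  gcry_cipher_authenticate (hd, aad, sizeof aad);   /* AAD before data */
  gcry_cipher_encrypt (hd, ct, sizeof ct, pt, sizeof pt);
  gcry_cipher_gettag (hd, tag, sizeof tag);         /* 16-byte tag */
  gcry_cipher_close (hd);

Decryption is symmetric, ending with gcry_cipher_checktag instead of
gcry_cipher_gettag.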
The ChaCha20 variant of this mode has been proposed for use in TLS and
IPsec:
 https://tools.ietf.org/html/draft-agl-tls-chacha20poly1305-04
 http://tools.ietf.org/html/draft-nir-ipsecme-chacha20-poly1305-02

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am       |    2 
 cipher/cipher-internal.h |   38 +++++
 cipher/cipher-poly1305.c |  296 +++++++++++++++++++++++++++++++++++++++
 cipher/cipher.c          |   51 +++++++
 src/gcrypt.h.in          |   21 ++-
 tests/basic.c            |  319 ++++++++++++++++++++++++++++++++++++++++
 tests/bench-slope.c      |  108 +++++++++++++---
 7 files changed, 805 insertions(+), 30 deletions(-)
 create mode 100644 cipher/cipher-poly1305.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index a8b86e6..4468647 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -40,7 +40,7 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES)
 libcipher_la_SOURCES = \
 cipher.c cipher-internal.h \
 cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \
-cipher-ccm.c cipher-cmac.c cipher-gcm.c \
+cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-poly1305.c \
 cipher-selftest.c cipher-selftest.h \
 pubkey.c pubkey-internal.h pubkey-util.c \
 md.c \
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index cdac445..f6bda66 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -20,6 +20,9 @@
 #ifndef G10_CIPHER_INTERNAL_H
 #define G10_CIPHER_INTERNAL_H

+#include "./poly1305-internal.h"
+
+
 /* The maximum supported size of a block in bytes. */
 #define MAX_BLOCKSIZE 16

@@ -154,6 +157,17 @@ struct gcry_cipher_handle
     } ccm;
 #endif

+    /* Mode specific storage for Poly1305 mode. */
+    struct {
+      /* byte counter for AAD and data. */
+      u32 bytecount[2];
+
+      unsigned int aad_finalized:1;
+      unsigned int bytecount_over_limits:1;
+
+      poly1305_context_t ctx;
+    } poly1305;
+
     /* Mode specific storage for CMAC mode. */
     struct {
       unsigned int tag:1; /* Set to 1 if tag has been finalized. */
@@ -319,4 +333,28 @@ void _gcry_cipher_gcm_setkey
/* */ (gcry_cipher_hd_t c);

+/*-- cipher-poly1305.c --*/
+gcry_err_code_t _gcry_cipher_poly1305_encrypt
+/* */ (gcry_cipher_hd_t c,
+       unsigned char *outbuf, size_t outbuflen,
+       const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_decrypt
+/* */ (gcry_cipher_hd_t c,
+       unsigned char *outbuf, size_t outbuflen,
+       const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_setiv
+/* */ (gcry_cipher_hd_t c,
+       const unsigned char *iv, size_t ivlen);
+gcry_err_code_t _gcry_cipher_poly1305_authenticate
+/* */ (gcry_cipher_hd_t c,
+       const unsigned char *aadbuf, size_t aadbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_get_tag
+/* */ (gcry_cipher_hd_t c,
+       unsigned char *outtag, size_t taglen);
+gcry_err_code_t _gcry_cipher_poly1305_check_tag
+/* */ (gcry_cipher_hd_t c,
+       const unsigned char *intag, size_t taglen);
+void _gcry_cipher_poly1305_setkey
+/* */ (gcry_cipher_hd_t c);
+
 #endif /*G10_CIPHER_INTERNAL_H*/
diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c
new file mode 100644
index 0000000..a22ffa3
--- /dev/null
+++ b/cipher/cipher-poly1305.c
@@ -0,0 +1,296 @@
+/* cipher-poly1305.c - Poly1305 based AEAD cipher mode
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+#include "./poly1305-internal.h"
+
+
+static inline int
+poly1305_bytecounter_add (u32 ctr[2], size_t add)
+{
+  int overflow = 0;
+
+  if (sizeof(add) > sizeof(u32))
+    {
+      u32 high_add = ((add >> 31) >> 1) & 0xffffffff;
+      ctr[1] += high_add;
+      if (ctr[1] < high_add)
+        overflow = 1;
+    }
+
+  ctr[0] += add;
+  if (ctr[0] >= add)
+    return overflow;
+
+  ctr[1] += 1;
+  return (ctr[1] < 1) || overflow;
+}
+
+
+static void
+poly1305_fill_bytecount (gcry_cipher_hd_t c)
+{
+  u32 lenbuf[2];
+
+  lenbuf[0] = le_bswap32(c->u_mode.poly1305.bytecount[0]);
+  lenbuf[1] = le_bswap32(c->u_mode.poly1305.bytecount[1]);
+  _gcry_poly1305_update (&c->u_mode.poly1305.ctx, (byte*)lenbuf,
+                         sizeof(lenbuf));
+
+  wipememory(lenbuf, sizeof(lenbuf));
+}
+
+
+static void
+poly1305_aad_finish (gcry_cipher_hd_t c)
+{
+  /* Start of encryption marks end of AAD stream. */
+  poly1305_fill_bytecount(c);
+
+  c->u_mode.poly1305.aad_finalized = 1;
+
+  c->u_mode.poly1305.bytecount[0] = 0;
+  c->u_mode.poly1305.bytecount[1] = 0;
+}
+
+
+static gcry_err_code_t
+poly1305_set_zeroiv (gcry_cipher_hd_t c)
+{
+  byte zero[8] = { 0, };
+
+  return _gcry_cipher_poly1305_setiv (c, zero, sizeof(zero));
+}
+
+
+gcry_err_code_t
+_gcry_cipher_poly1305_authenticate (gcry_cipher_hd_t c,
+                                    const byte * aadbuf, size_t aadbuflen)
+{
+  if (c->u_mode.poly1305.bytecount_over_limits)
+    return GPG_ERR_INV_LENGTH;
+  if (c->u_mode.poly1305.aad_finalized)
+    return GPG_ERR_INV_STATE;
+  if (c->marks.tag)
+    return GPG_ERR_INV_STATE;
+
+  if (!c->marks.iv)
+    poly1305_set_zeroiv(c);
+
+  if (poly1305_bytecounter_add(c->u_mode.poly1305.bytecount, aadbuflen))
+    {
+      c->u_mode.poly1305.bytecount_over_limits = 1;
+      return GPG_ERR_INV_LENGTH;
+    }
+
+  _gcry_poly1305_update (&c->u_mode.poly1305.ctx, aadbuf, aadbuflen);
+
+  return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c,
+                               byte *outbuf, size_t outbuflen,
+                               const byte *inbuf, size_t inbuflen)
+{
+  gcry_err_code_t err;
+
+  if (outbuflen < inbuflen)
+    return GPG_ERR_BUFFER_TOO_SHORT;
+  if (c->marks.tag)
+    return GPG_ERR_INV_STATE;
+  if (c->u_mode.poly1305.bytecount_over_limits)
+    return GPG_ERR_INV_LENGTH;
+
+  if (!c->marks.iv)
+    {
+      err = poly1305_set_zeroiv(c);
+      if (err)
+        return err;
+    }
+
+  if (!c->u_mode.poly1305.aad_finalized)
+    poly1305_aad_finish(c);
+
+  if (poly1305_bytecounter_add(c->u_mode.poly1305.bytecount, inbuflen))
+    {
+      c->u_mode.poly1305.bytecount_over_limits = 1;
+      return GPG_ERR_INV_LENGTH;
+    }
+
+  c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, inbuflen);
+
+  _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, inbuflen);
+
+  return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c,
+                               byte *outbuf, size_t outbuflen,
+                               const byte *inbuf, size_t inbuflen)
+{
+  gcry_err_code_t err;
+
+  if (outbuflen < inbuflen)
+    return GPG_ERR_BUFFER_TOO_SHORT;
+  if (c->marks.tag)
+    return GPG_ERR_INV_STATE;
+  if (c->u_mode.poly1305.bytecount_over_limits)
+    return GPG_ERR_INV_LENGTH;
+
+  if (!c->marks.iv)
+    {
+      err =
poly1305_set_zeroiv(c); + if (err) + return err; + } + + if (!c->u_mode.poly1305.aad_finalized) + poly1305_aad_finish(c); + + if (poly1305_bytecounter_add(c->u_mode.poly1305.bytecount, inbuflen)) + { + c->u_mode.poly1305.bytecount_over_limits = 1; + return GPG_ERR_INV_LENGTH; + } + + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, inbuflen); + + c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, inbuflen); + return 0; +} + + +static gcry_err_code_t +_gcry_cipher_poly1305_tag (gcry_cipher_hd_t c, + byte * outbuf, size_t outbuflen, int check) +{ + gcry_err_code_t err; + + if (outbuflen < GCRY_GCM_BLOCK_LEN) + return GPG_ERR_BUFFER_TOO_SHORT; + if (c->u_mode.poly1305.bytecount_over_limits) + return GPG_ERR_INV_LENGTH; + + if (!c->marks.iv) + { + err = poly1305_set_zeroiv(c); + if (err) + return err; + } + + if (!c->u_mode.poly1305.aad_finalized) + poly1305_aad_finish(c); + + if (!c->marks.tag) + { + /* Write data-length to poly1305. */ + poly1305_fill_bytecount(c); + + _gcry_poly1305_finish(&c->u_mode.poly1305.ctx, c->u_iv.iv); + + c->marks.tag = 1; + } + + if (check) + return buf_eq_const(outbuf, c->u_iv.iv, outbuflen) ? + GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM; + + memcpy (outbuf, c->u_iv.iv, outbuflen); + return GPG_ERR_NO_ERROR; +} + + +gcry_err_code_t +_gcry_cipher_poly1305_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, + size_t taglen) +{ + return _gcry_cipher_poly1305_tag (c, outtag, taglen, 0); +} + +gcry_err_code_t +_gcry_cipher_poly1305_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, + size_t taglen) +{ + return _gcry_cipher_poly1305_tag (c, (unsigned char *) intag, taglen, 1); +} + + +void +_gcry_cipher_poly1305_setkey (gcry_cipher_hd_t c) +{ + c->u_mode.poly1305.bytecount[0] = 0; + c->u_mode.poly1305.bytecount[1] = 0; + + c->u_mode.poly1305.bytecount_over_limits = 0; + c->u_mode.poly1305.aad_finalized = 0; + c->marks.tag = 0; + c->marks.iv = 0; +} + + +gcry_err_code_t +_gcry_cipher_poly1305_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen) +{ + byte tmpbuf[64]; /* size of ChaCha20/Salsa20 block */ + gcry_err_code_t err; + + if (!iv && ivlen > 0) + return GPG_ERR_INV_ARG; + + memset(&c->u_mode.poly1305.ctx, 0, sizeof(c->u_mode.poly1305.ctx)); + + c->u_mode.poly1305.bytecount[0] = 0; + c->u_mode.poly1305.bytecount[1] = 0; + + c->u_mode.poly1305.bytecount_over_limits = 0; + c->u_mode.poly1305.aad_finalized = 0; + c->marks.tag = 0; + c->marks.iv = 0; + + /* Set up IV for stream cipher. */ + c->spec->setiv (&c->context.c, iv, ivlen); + + /* Get the first block from ChaCha20/Salsa20. */ + memset(tmpbuf, 0, sizeof(tmpbuf)); + c->spec->stencrypt(&c->context.c, tmpbuf, tmpbuf, sizeof(tmpbuf)); + + /* Use the first 32-bytes as Poly1305 key. 
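+   * As in the ChaCha20-Poly1305 AEAD drafts referenced in the commit
+   * message, keystream block 0 provides the one-time Poly1305 key; the
+   * remaining 32 bytes of that block are discarded, so payload
+   * processing starts at keystream block 1.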
*/ + err = _gcry_poly1305_init (&c->u_mode.poly1305.ctx, tmpbuf, POLY1305_KEYLEN); + + wipememory(tmpbuf, sizeof(tmpbuf)); + + if (err) + return err; + + c->marks.iv = 1; + return 0; +} diff --git a/cipher/cipher.c b/cipher/cipher.c index 4751302..da59061 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -418,6 +418,15 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, err = GPG_ERR_INV_CIPHER_MODE; break; + case GCRY_CIPHER_MODE_POLY1305: + if (!spec->stencrypt || !spec->stdecrypt || !spec->setiv) + err = GPG_ERR_INV_CIPHER_MODE; + else if (spec->algo != GCRY_CIPHER_SALSA20 && + spec->algo != GCRY_CIPHER_SALSA20R12 && + spec->algo != GCRY_CIPHER_CHACHA20) + err = GPG_ERR_INV_CIPHER_MODE; + break; + case GCRY_CIPHER_MODE_STREAM: if (!spec->stencrypt || !spec->stdecrypt) err = GPG_ERR_INV_CIPHER_MODE; @@ -611,6 +620,10 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen) _gcry_cipher_gcm_setkey (c); break; + case GCRY_CIPHER_MODE_POLY1305: + _gcry_cipher_poly1305_setkey (c); + break; + default: break; }; @@ -627,10 +640,6 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen) static gcry_err_code_t cipher_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen) { - /* GCM has its own IV handler */ - if (c->mode == GCRY_CIPHER_MODE_GCM) - return _gcry_cipher_gcm_setiv (c, iv, ivlen); - /* If the cipher has its own IV handler, we use only this one. This is currently used for stream ciphers requiring a nonce. */ if (c->spec->setiv) @@ -699,6 +708,10 @@ cipher_reset (gcry_cipher_hd_t c) } break; + case GCRY_CIPHER_MODE_POLY1305: + memset (&c->u_mode.poly1305, 0, sizeof c->u_mode.poly1305); + break; + #ifdef HAVE_U64_TYPEDEF case GCRY_CIPHER_MODE_CCM: memset (&c->u_mode.ccm, 0, sizeof c->u_mode.ccm); @@ -811,6 +824,11 @@ cipher_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, rc = _gcry_cipher_gcm_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_encrypt (c, outbuf, outbuflen, + inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_STREAM: c->spec->stencrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf, inbuflen); @@ -919,6 +937,11 @@ cipher_decrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, rc = _gcry_cipher_gcm_decrypt (c, outbuf, outbuflen, inbuf, inbuflen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_decrypt (c, outbuf, outbuflen, + inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_STREAM: c->spec->stdecrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf, inbuflen); @@ -1000,6 +1023,14 @@ _gcry_cipher_setiv (gcry_cipher_hd_t hd, const void *iv, size_t ivlen) rc = _gcry_cipher_ccm_set_nonce (hd, iv, ivlen); break; + case GCRY_CIPHER_MODE_GCM: + rc = _gcry_cipher_gcm_setiv (hd, iv, ivlen); + break; + + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_setiv (hd, iv, ivlen); + break; + default: rc = cipher_setiv (hd, iv, ivlen); break; @@ -1050,6 +1081,10 @@ _gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf, rc = _gcry_cipher_gcm_authenticate (hd, abuf, abuflen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_authenticate (hd, abuf, abuflen); + break; + default: log_error ("gcry_cipher_authenticate: invalid mode %d\n", hd->mode); rc = GPG_ERR_INV_CIPHER_MODE; @@ -1079,6 +1114,10 @@ _gcry_cipher_gettag (gcry_cipher_hd_t hd, void *outtag, size_t taglen) rc = _gcry_cipher_gcm_get_tag (hd, outtag, taglen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_get_tag (hd, outtag, taglen); + 
break; + default: log_error ("gcry_cipher_gettag: invalid mode %d\n", hd->mode); rc = GPG_ERR_INV_CIPHER_MODE; @@ -1108,6 +1147,10 @@ _gcry_cipher_checktag (gcry_cipher_hd_t hd, const void *intag, size_t taglen) rc = _gcry_cipher_gcm_check_tag (hd, intag, taglen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_check_tag (hd, intag, taglen); + break; + default: log_error ("gcry_cipher_checktag: invalid mode %d\n", hd->mode); rc = GPG_ERR_INV_CIPHER_MODE; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 3145020..bd38a24 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -895,16 +895,17 @@ enum gcry_cipher_algos supported for each algorithm. */ enum gcry_cipher_modes { - GCRY_CIPHER_MODE_NONE = 0, /* Not yet specified. */ - GCRY_CIPHER_MODE_ECB = 1, /* Electronic codebook. */ - GCRY_CIPHER_MODE_CFB = 2, /* Cipher feedback. */ - GCRY_CIPHER_MODE_CBC = 3, /* Cipher block chaining. */ - GCRY_CIPHER_MODE_STREAM = 4, /* Used with stream ciphers. */ - GCRY_CIPHER_MODE_OFB = 5, /* Outer feedback. */ - GCRY_CIPHER_MODE_CTR = 6, /* Counter. */ - GCRY_CIPHER_MODE_AESWRAP= 7, /* AES-WRAP algorithm. */ - GCRY_CIPHER_MODE_CCM = 8, /* Counter with CBC-MAC. */ - GCRY_CIPHER_MODE_GCM = 9 /* Galois Counter Mode. */ + GCRY_CIPHER_MODE_NONE = 0, /* Not yet specified. */ + GCRY_CIPHER_MODE_ECB = 1, /* Electronic codebook. */ + GCRY_CIPHER_MODE_CFB = 2, /* Cipher feedback. */ + GCRY_CIPHER_MODE_CBC = 3, /* Cipher block chaining. */ + GCRY_CIPHER_MODE_STREAM = 4, /* Used with stream ciphers. */ + GCRY_CIPHER_MODE_OFB = 5, /* Outer feedback. */ + GCRY_CIPHER_MODE_CTR = 6, /* Counter. */ + GCRY_CIPHER_MODE_AESWRAP = 7, /* AES-WRAP algorithm. */ + GCRY_CIPHER_MODE_CCM = 8, /* Counter with CBC-MAC. */ + GCRY_CIPHER_MODE_GCM = 9, /* Galois Counter Mode. */ + GCRY_CIPHER_MODE_POLY1305 = 10, /* Poly1305 based AEAD mode. */ }; /* Flags used with the open function. 
*/ diff --git a/tests/basic.c b/tests/basic.c index de10617..9740919 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -1620,6 +1620,320 @@ check_gcm_cipher (void) static void +_check_poly1305_cipher (unsigned int step) +{ + struct tv + { + int algo; + char key[MAX_DATA_LEN]; + char iv[MAX_DATA_LEN]; + int ivlen; + unsigned char aad[MAX_DATA_LEN]; + int aadlen; + unsigned char plaintext[MAX_DATA_LEN]; + int inlen; + char out[MAX_DATA_LEN]; + char tag[MAX_DATA_LEN]; + } tv[] = + { + /* draft-agl-tls-chacha20poly1305-04 */ + { GCRY_CIPHER_CHACHA20, + "\x42\x90\xbc\xb1\x54\x17\x35\x31\xf3\x14\xaf\x57\xf3\xbe\x3b\x50" + "\x06\xda\x37\x1e\xce\x27\x2a\xfa\x1b\x5d\xbd\xd1\x10\x0a\x10\x07", + "\xcd\x7c\xf6\x7b\xe3\x9c\x79\x4a", 8, + "\x87\xe2\x29\xd4\x50\x08\x45\xa0\x79\xc0", 10, + "\x86\xd0\x99\x74\x84\x0b\xde\xd2\xa5\xca", 10, + "\xe3\xe4\x46\xf7\xed\xe9\xa1\x9b\x62\xa4", + "\x67\x7d\xab\xf4\xe3\xd2\x4b\x87\x6b\xb2\x84\x75\x38\x96\xe1\xd6" }, + /* draft-nir-cfrg-chacha20-poly1305-03 */ + { GCRY_CIPHER_CHACHA20, + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", + "\x07\x00\x00\x00\x40\x41\x42\x43\x44\x45\x46\x47", 12, + "\x50\x51\x52\x53\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7", 12, + "Ladies and Gentlemen of the class of '99: If I could offer you " + "only one tip for the future, sunscreen would be it.", 114, + "\xd3\x1a\x8d\x34\x64\x8e\x60\xdb\x7b\x86\xaf\xbc\x53\xef\x7e\xc2" + "\xa4\xad\xed\x51\x29\x6e\x08\xfe\xa9\xe2\xb5\xa7\x36\xee\x62\xd6" + "\x3d\xbe\xa4\x5e\x8c\xa9\x67\x12\x82\xfa\xfb\x69\xda\x92\x72\x8b" + "\x1a\x71\xde\x0a\x9e\x06\x0b\x29\x05\xd6\xa5\xb6\x7e\xcd\x3b\x36" + "\x92\xdd\xbd\x7f\x2d\x77\x8b\x8c\x98\x03\xae\xe3\x28\x09\x1b\x58" + "\xfa\xb3\x24\xe4\xfa\xd6\x75\x94\x55\x85\x80\x8b\x48\x31\xd7\xbc" + "\x3f\xf4\xde\xf0\x8e\x4b\x7a\x9d\xe5\x76\xd2\x65\x86\xce\xc6\x4b" + "\x61\x16", + "\x18\xfb\x11\xa5\x03\x1a\xd1\x3a\x7e\x3b\x03\xd4\x6e\xe3\xa6\xa7" } + }; + + gcry_cipher_hd_t hde, hdd; + unsigned char out[MAX_DATA_LEN]; + unsigned char tag[16]; + int i, keylen; + gcry_error_t err = 0; + size_t pos, poslen; + int byteNum; + + if (verbose) + fprintf (stderr, " Starting POLY1305 checks.\n"); + + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) + { + if (verbose) + fprintf (stderr, " checking POLY1305 mode for %s [%i]\n", + gcry_cipher_algo_name (tv[i].algo), + tv[i].algo); + err = gcry_cipher_open (&hde, tv[i].algo, GCRY_CIPHER_MODE_POLY1305, 0); + if (!err) + err = gcry_cipher_open (&hdd, tv[i].algo, GCRY_CIPHER_MODE_POLY1305, 0); + if (err) + { + fail ("poly1305, gcry_cipher_open failed: %s\n", gpg_strerror (err)); + return; + } + + keylen = gcry_cipher_get_algo_keylen(tv[i].algo); + if (!keylen) + { + fail ("poly1305, gcry_cipher_get_algo_keylen failed\n"); + return; + } + + err = gcry_cipher_setkey (hde, tv[i].key, keylen); + if (!err) + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); + if (err) + { + fail ("poly1305, gcry_cipher_setkey failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_setiv (hde, tv[i].iv, tv[i].ivlen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].iv, tv[i].ivlen); + if (err) + { + fail ("poly1305, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + for (pos = 0; pos < tv[i].aadlen; pos += step) + { + poslen = (pos + step < tv[i].aadlen) ? 
step : tv[i].aadlen - pos; + + err = gcry_cipher_authenticate(hde, tv[i].aad + pos, poslen); + if (err) + { + fail ("poly1305, gcry_cipher_authenticate (%d) (%d:%d) failed: " + "%s\n", i, pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + err = gcry_cipher_authenticate(hdd, tv[i].aad + pos, poslen); + if (err) + { + fail ("poly1305, de gcry_cipher_authenticate (%d) (%d:%d) failed: " + "%s\n", i, pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + for (pos = 0; pos < tv[i].inlen; pos += step) + { + poslen = (pos + step < tv[i].inlen) ? step : tv[i].inlen - pos; + + err = gcry_cipher_encrypt (hde, out + pos, poslen, + tv[i].plaintext + pos, poslen); + if (err) + { + fail ("poly1305, gcry_cipher_encrypt (%d) (%d:%d) failed: %s\n", + i, pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("poly1305, encrypt mismatch entry %d (step %d)\n", i, step); + + for (pos = 0; pos < tv[i].inlen; pos += step) + { + poslen = (pos + step < tv[i].inlen) ? step : tv[i].inlen - pos; + + err = gcry_cipher_decrypt (hdd, out + pos, poslen, NULL, 0); + if (err) + { + fail ("poly1305, gcry_cipher_decrypt (%d) (%d:%d) failed: %s\n", + i, pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("poly1305, decrypt mismatch entry %d (step %d)\n", i, step); + + err = gcry_cipher_gettag (hde, out, 16); + if (err) + { + fail ("poly1305, gcry_cipher_gettag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].tag, out, 16)) + fail ("poly1305, encrypt tag mismatch entry %d\n", i); + + + err = gcry_cipher_checktag (hdd, out, 16); + if (err) + { + fail ("poly1305, gcry_cipher_checktag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_reset(hde); + if (!err) + err = gcry_cipher_reset(hdd); + if (err) + { + fail ("poly1305, gcry_cipher_reset (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* gcry_cipher_reset clears the IV */ + err = gcry_cipher_setiv (hde, tv[i].iv, tv[i].ivlen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].iv, tv[i].ivlen); + if (err) + { + fail ("poly1305, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* this time we authenticate, encrypt and decrypt one byte at a time */ + for (byteNum = 0; byteNum < tv[i].aadlen; ++byteNum) + { + err = gcry_cipher_authenticate(hde, tv[i].aad + byteNum, 1); + if (err) + { + fail ("poly1305, gcry_cipher_authenticate (%d) (byte-buf) failed: " + "%s\n", i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + err = gcry_cipher_authenticate(hdd, tv[i].aad + byteNum, 1); + if (err) + { + fail ("poly1305, de gcry_cipher_authenticate (%d) (byte-buf) " + "failed: %s\n", i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_encrypt (hde, out+byteNum, 1, + (tv[i].plaintext) + byteNum, + 1); + if (err) + { + fail ("poly1305, gcry_cipher_encrypt (%d) (byte-buf) failed: %s\n", + i, gpg_strerror 
(err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("poly1305, encrypt mismatch entry %d, (byte-buf)\n", i); + + err = gcry_cipher_gettag (hde, tag, 16); + if (err) + { + fail ("poly1305, gcry_cipher_gettag(%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].tag, tag, 16)) + fail ("poly1305, encrypt tag mismatch entry %d, (byte-buf)\n", i); + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_decrypt (hdd, out+byteNum, 1, NULL, 0); + if (err) + { + fail ("poly1305, gcry_cipher_decrypt (%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("poly1305, decrypt mismatch entry %d\n", i); + + err = gcry_cipher_checktag (hdd, tag, 16); + if (err) + { + fail ("poly1305, gcry_cipher_checktag(%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + } + if (verbose) + fprintf (stderr, " Completed POLY1305 checks.\n"); +} + + +static void +check_poly1305_cipher (void) +{ + /* Large buffers, no splitting. */ + _check_poly1305_cipher(0xffffffff); + /* Split input to one byte buffers. */ + _check_poly1305_cipher(1); + /* Split input to 7 byte buffers. */ + _check_poly1305_cipher(7); + /* Split input to 16 byte buffers. */ + _check_poly1305_cipher(16); +} + + +static void check_ccm_cipher (void) { #ifdef HAVE_U64_TYPEDEF @@ -4019,6 +4333,10 @@ check_ciphers (void) gcry_cipher_algo_name (algos2[i])); check_one_cipher (algos2[i], GCRY_CIPHER_MODE_STREAM, 0); + if (algos2[i] == GCRY_CIPHER_CHACHA20 || + algos2[i] == GCRY_CIPHER_SALSA20 || + algos2[i] == GCRY_CIPHER_SALSA20R12) + check_one_cipher (algos2[i], GCRY_CIPHER_MODE_POLY1305, 0); } /* we have now run all cipher's selftests */ @@ -4042,6 +4360,7 @@ check_cipher_modes(void) check_ofb_cipher (); check_ccm_cipher (); check_gcm_cipher (); + check_poly1305_cipher (); check_stream_cipher (); check_stream_cipher_large_block (); diff --git a/tests/bench-slope.c b/tests/bench-slope.c index a911ef8..7bf587f 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -907,15 +907,14 @@ static struct bench_ops ccm_authenticate_ops = { static void -bench_gcm_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) +bench_aead_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen, + const char *nonce, size_t noncelen) { gcry_cipher_hd_t hd = obj->priv; int err; char tag[16]; - char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, - 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; - gcry_cipher_setiv (hd, nonce, sizeof (nonce)); + gcry_cipher_setiv (hd, nonce, noncelen); err = gcry_cipher_encrypt (hd, buf, buflen, buf, buflen); if (err) @@ -937,15 +936,14 @@ bench_gcm_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) } static void -bench_gcm_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) +bench_aead_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen, + const char *nonce, size_t noncelen) { gcry_cipher_hd_t hd = obj->priv; int err; char tag[16] = { 0, }; - char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, - 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; - gcry_cipher_setiv (hd, nonce, sizeof (nonce)); + gcry_cipher_setiv (hd, nonce, noncelen); err = 
gcry_cipher_decrypt (hd, buf, buflen, buf, buflen); if (err) @@ -969,17 +967,16 @@ bench_gcm_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) } static void -bench_gcm_authenticate_do_bench (struct bench_obj *obj, void *buf, - size_t buflen) +bench_aead_authenticate_do_bench (struct bench_obj *obj, void *buf, + size_t buflen, const char *nonce, + size_t noncelen) { gcry_cipher_hd_t hd = obj->priv; int err; char tag[16] = { 0, }; - char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, - 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; char data = 0xff; - gcry_cipher_setiv (hd, nonce, sizeof (nonce)); + gcry_cipher_setiv (hd, nonce, noncelen); err = gcry_cipher_authenticate (hd, buf, buflen); if (err) @@ -1009,6 +1006,34 @@ bench_gcm_authenticate_do_bench (struct bench_obj *obj, void *buf, } } + +static void +bench_gcm_encrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; + bench_aead_encrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_gcm_decrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; + bench_aead_decrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_gcm_authenticate_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; + bench_aead_authenticate_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + static struct bench_ops gcm_encrypt_ops = { &bench_encrypt_init, &bench_encrypt_free, @@ -1028,6 +1053,49 @@ static struct bench_ops gcm_authenticate_ops = { }; +static void +bench_poly1305_encrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[8] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + bench_aead_encrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_poly1305_decrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[8] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + bench_aead_decrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_poly1305_authenticate_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[8] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + bench_aead_authenticate_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static struct bench_ops poly1305_encrypt_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_poly1305_encrypt_do_bench +}; + +static struct bench_ops poly1305_decrypt_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_poly1305_decrypt_do_bench +}; + +static struct bench_ops poly1305_authenticate_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_poly1305_authenticate_do_bench +}; + + static struct bench_cipher_mode cipher_modes[] = { {GCRY_CIPHER_MODE_ECB, "ECB enc", &encrypt_ops}, {GCRY_CIPHER_MODE_ECB, "ECB dec", &decrypt_ops}, @@ -1047,6 +1115,9 @@ static struct bench_cipher_mode cipher_modes[] = { {GCRY_CIPHER_MODE_GCM, "GCM enc", &gcm_encrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM dec", &gcm_decrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM auth", &gcm_authenticate_ops}, + {GCRY_CIPHER_MODE_POLY1305, "POLY1305 enc", &poly1305_encrypt_ops}, + {GCRY_CIPHER_MODE_POLY1305, "POLY1305 dec", &poly1305_decrypt_ops}, + {GCRY_CIPHER_MODE_POLY1305, "POLY1305 auth", 
&poly1305_authenticate_ops}, {0}, }; @@ -1066,8 +1137,9 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) if (!blklen) return; - /* Stream cipher? Only test with ECB. */ - if (blklen == 1 && mode.mode != GCRY_CIPHER_MODE_ECB) + /* Stream cipher? Only test with "ECB" and POLY1305. */ + if (blklen == 1 && (mode.mode != GCRY_CIPHER_MODE_ECB && + mode.mode != GCRY_CIPHER_MODE_POLY1305)) return; if (blklen == 1 && mode.mode == GCRY_CIPHER_MODE_ECB) { @@ -1075,6 +1147,12 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) mode.name = mode.ops == &encrypt_ops ? "STREAM enc" : "STREAM dec"; } + /* Poly1305 has restrictions for cipher algorithm */ + if (mode.mode == GCRY_CIPHER_MODE_POLY1305 && + (algo != GCRY_CIPHER_SALSA20 && algo != GCRY_CIPHER_SALSA20R12 && + algo != GCRY_CIPHER_CHACHA20)) + return; + /* CCM has restrictions for block-size */ if (mode.mode == GCRY_CIPHER_MODE_CCM && blklen != GCRY_CCM_BLOCK_LEN) return; From jussi.kivilinna at iki.fi Sat May 10 23:56:38 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 11 May 2014 00:56:38 +0300 Subject: [PATCH 2/3] Add Poly1305-AES (-Camellia, etc) MACs In-Reply-To: <20140510215633.11373.59196.stgit@localhost6.localdomain6> References: <20140510215633.11373.59196.stgit@localhost6.localdomain6> Message-ID: <20140510215638.11373.12993.stgit@localhost6.localdomain6> * cipher/mac-internal.h (_gcry_mac_type_spec_poly1305_aes) (_gcry_mac_type_spec_poly1305_camellia) (_gcry_mac_type_spec_poly1305_twofish) (_gcry_mac_type_spec_poly1305_serpent) (_gcry_mac_type_spec_poly1305_seed): New. * cipher/mac-poly1305.c (poly1305mac_context_s): Add 'hd' and 'nonce_set'. (poly1305mac_open, poly1305mac_close, poly1305mac_setkey): Add handling for Poly1305-*** MACs. (poly1305mac_prepare_key, poly1305mac_setiv): New. (poly1305mac_reset, poly1305mac_write, poly1305mac_read): Add handling for 'nonce_set'. (poly1305mac_ops): Add 'poly1305mac_setiv'. (_gcry_mac_type_spec_poly1305_aes) (_gcry_mac_type_spec_poly1305_camellia) (_gcry_mac_type_spec_poly1305_twofish) (_gcry_mac_type_spec_poly1305_serpent) (_gcry_mac_type_spec_poly1305_seed): New. * cipher/mac.c (mac_list): Add Poly1305-AES, Poly1305-Twofish, Poly1305-Serpent, Poly1305-SEED and Poly1305-Camellia. * src/gcrypt.h.in: Add 'GCRY_MAC_POLY1305_AES', 'GCRY_MAC_POLY1305_CAMELLIA', 'GCRY_MAC_POLY1305_TWOFISH', 'GCRY_MAC_POLY1305_SERPENT' and 'GCRY_MAC_POLY1305_SEED'. * tests/basic.c (check_mac): Add Poly1305-AES test vectors. * tests/bench-slope.c (bench_mac_init): Set IV for Poly1305-*** MACs. * tests/bench-slope.c (mac_bench): Set IV for Poly1305-*** MACs. -- Patch adds Bernstein's Poly1305-AES message authentication code to libgcrypt and other variants of Poly1305-<128-bit block cipher>. Signed-off-by: Jussi Kivilinna --- cipher/mac-internal.h | 15 ++++ cipher/mac-poly1305.c | 174 +++++++++++++++++++++++++++++++++++++++++++++---- cipher/mac.c | 5 + src/gcrypt.h.in | 7 ++ tests/basic.c | 33 +++++++++ tests/bench-slope.c | 13 ++++ tests/benchmark.c | 12 +++ 7 files changed, 244 insertions(+), 15 deletions(-) diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h index 81b6185..f65a8ae 100644 --- a/cipher/mac-internal.h +++ b/cipher/mac-internal.h @@ -217,3 +217,18 @@ extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia; * The Poly1305 MAC algorithm specifications (mac-poly1305.c). 
*/ extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac; +#if USE_AES +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aes; +#endif +#if USE_CAMELLIA +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_camellia; +#endif +#if USE_TWOFISH +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_twofish; +#endif +#if USE_SERPENT +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_serpent; +#endif +#if USE_SEED +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed; +#endif diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c index e265b64..76b369a 100644 --- a/cipher/mac-poly1305.c +++ b/cipher/mac-poly1305.c @@ -30,8 +30,10 @@ struct poly1305mac_context_s { poly1305_context_t ctx; + gcry_cipher_hd_t hd; struct { unsigned int key_set:1; + unsigned int nonce_set:1; unsigned int tag:1; } marks; byte tag[POLY1305_TAGLEN]; @@ -44,6 +46,9 @@ poly1305mac_open (gcry_mac_hd_t h) { struct poly1305mac_context_s *mac_ctx; int secure = (h->magic == CTX_MAGIC_SECURE); + unsigned int flags = (secure ? GCRY_CIPHER_SECURE : 0); + gcry_err_code_t err; + int cipher_algo; if (secure) mac_ctx = xtrycalloc_secure (1, sizeof(*mac_ctx)); @@ -55,14 +60,71 @@ poly1305mac_open (gcry_mac_hd_t h) h->u.poly1305mac.ctx = mac_ctx; + switch (h->spec->algo) + { + default: + /* already checked. */ + case GCRY_MAC_POLY1305: + /* plain Poly1305. */ + cipher_algo = -1; + return 0; + case GCRY_MAC_POLY1305_AES: + cipher_algo = GCRY_CIPHER_AES; + break; + case GCRY_MAC_POLY1305_CAMELLIA: + cipher_algo = GCRY_CIPHER_CAMELLIA128; + break; + case GCRY_MAC_POLY1305_TWOFISH: + cipher_algo = GCRY_CIPHER_TWOFISH; + break; + case GCRY_MAC_POLY1305_SERPENT: + cipher_algo = GCRY_CIPHER_SERPENT128; + break; + case GCRY_MAC_POLY1305_SEED: + cipher_algo = GCRY_CIPHER_SEED; + break; + } + + err = _gcry_cipher_open_internal (&mac_ctx->hd, cipher_algo, + GCRY_CIPHER_MODE_ECB, flags); + if (err) + goto err_free; + return 0; + +err_free: + xfree(h->u.poly1305mac.ctx); + return err; } static void poly1305mac_close (gcry_mac_hd_t h) { - xfree(h->u.poly1305mac.ctx); + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + + if (h->spec->algo != GCRY_MAC_POLY1305) + _gcry_cipher_close (mac_ctx->hd); + + xfree(mac_ctx); +} + + +static gcry_err_code_t +poly1305mac_prepare_key (gcry_mac_hd_t h, const unsigned char *key, size_t keylen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + size_t block_keylen = keylen - 16; + + /* Need at least 16 + 1 byte key. */ + if (keylen <= 16) + return GPG_ERR_INV_KEYLEN; + + /* For Poly1305-AES, first part of key is passed to Poly1305 as is. */ + memcpy (mac_ctx->key, key + block_keylen, 16); + + /* Remaining part is used as key for the block cipher. */ + return _gcry_cipher_setkey (mac_ctx->hd, key, block_keylen); } @@ -77,22 +139,74 @@ poly1305mac_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen) memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); mac_ctx->marks.key_set = 0; + mac_ctx->marks.nonce_set = 0; mac_ctx->marks.tag = 0; - if (keylen != POLY1305_KEYLEN) - return GPG_ERR_INV_KEYLEN; - - memcpy(mac_ctx->key, key, POLY1305_KEYLEN); + if (h->spec->algo != GCRY_MAC_POLY1305) + { + err = poly1305mac_prepare_key (h, key, keylen); + if (err) + return err; - err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); - if (err) + /* Poly1305-AES/etc also need nonce. 
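+       * It is supplied later via _gcry_mac_setiv; until then
+       * 'nonce_set' stays zero and reset/write/read fail with
+       * GPG_ERR_INV_STATE.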
*/ + mac_ctx->marks.key_set = 1; + mac_ctx->marks.nonce_set = 0; + } + else { - memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); - return err; + /* For plain Poly1305, key is the nonce and setup is complete now. */ + + if (keylen != POLY1305_KEYLEN) + return GPG_ERR_INV_KEYLEN; + + memcpy (mac_ctx->key, key, keylen); + + err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); + if (err) + { + memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); + return err; + } + + mac_ctx->marks.key_set = 1; + mac_ctx->marks.nonce_set = 1; } - mac_ctx->marks.key_set = 1; + return 0; +} + + +static gcry_err_code_t +poly1305mac_setiv (gcry_mac_hd_t h, const unsigned char *iv, size_t ivlen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + gcry_err_code_t err; + + if (h->spec->algo == GCRY_MAC_POLY1305) + return GPG_ERR_INV_ARG; + + if (ivlen != 16) + return GPG_ERR_INV_ARG; + if (!mac_ctx->marks.key_set) + return 0; + + memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); + memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag)); + mac_ctx->marks.nonce_set = 0; + mac_ctx->marks.tag = 0; + + /* Prepare second part of the poly1305 key. */ + + err = _gcry_cipher_encrypt (mac_ctx->hd, mac_ctx->key + 16, 16, iv, 16); + if (err) + return err; + + err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); + if (err) + return err; + + mac_ctx->marks.nonce_set = 1; return 0; } @@ -102,13 +216,14 @@ poly1305mac_reset (gcry_mac_hd_t h) { struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; - if (!mac_ctx->marks.key_set) + if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set) return GPG_ERR_INV_STATE; memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag)); mac_ctx->marks.key_set = 1; + mac_ctx->marks.nonce_set = 1; mac_ctx->marks.tag = 0; return _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); @@ -120,7 +235,8 @@ poly1305mac_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen) { struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; - if (!mac_ctx->marks.key_set || mac_ctx->marks.tag) + if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set || + mac_ctx->marks.tag) return GPG_ERR_INV_STATE; _gcry_poly1305_update (&mac_ctx->ctx, buf, buflen); @@ -133,7 +249,7 @@ poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen) { struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; - if (!mac_ctx->marks.key_set) + if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set) return GPG_ERR_INV_STATE; if (!mac_ctx->marks.tag) @@ -197,7 +313,7 @@ static gcry_mac_spec_ops_t poly1305mac_ops = { poly1305mac_open, poly1305mac_close, poly1305mac_setkey, - NULL, + poly1305mac_setiv, poly1305mac_reset, poly1305mac_write, poly1305mac_read, @@ -211,3 +327,33 @@ gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac = { GCRY_MAC_POLY1305, {0, 0}, "POLY1305", &poly1305mac_ops }; +#if USE_AES +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aes = { + GCRY_MAC_POLY1305_AES, {0, 0}, "POLY1305_AES", + &poly1305mac_ops +}; +#endif +#if USE_CAMELLIA +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_camellia = { + GCRY_MAC_POLY1305_CAMELLIA, {0, 0}, "POLY1305_CAMELLIA", + &poly1305mac_ops +}; +#endif +#if USE_TWOFISH +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_twofish = { + GCRY_MAC_POLY1305_TWOFISH, {0, 0}, "POLY1305_TWOFISH", + &poly1305mac_ops +}; +#endif +#if USE_SERPENT +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_serpent = { + GCRY_MAC_POLY1305_SERPENT, {0, 0}, "POLY1305_SERPENT", + 
&poly1305mac_ops +}; +#endif +#if USE_SEED +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed = { + GCRY_MAC_POLY1305_SEED, {0, 0}, "POLY1305_SEED", + &poly1305mac_ops +}; +#endif diff --git a/cipher/mac.c b/cipher/mac.c index e583369..30117b9 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -75,14 +75,17 @@ static gcry_mac_spec_t *mac_list[] = { #if USE_AES &_gcry_mac_type_spec_cmac_aes, &_gcry_mac_type_spec_gmac_aes, + &_gcry_mac_type_spec_poly1305mac_aes, #endif #if USE_TWOFISH &_gcry_mac_type_spec_cmac_twofish, &_gcry_mac_type_spec_gmac_twofish, + &_gcry_mac_type_spec_poly1305mac_twofish, #endif #if USE_SERPENT &_gcry_mac_type_spec_cmac_serpent, &_gcry_mac_type_spec_gmac_serpent, + &_gcry_mac_type_spec_poly1305mac_serpent, #endif #if USE_RFC2268 &_gcry_mac_type_spec_cmac_rfc2268, @@ -90,10 +93,12 @@ static gcry_mac_spec_t *mac_list[] = { #if USE_SEED &_gcry_mac_type_spec_cmac_seed, &_gcry_mac_type_spec_gmac_seed, + &_gcry_mac_type_spec_poly1305mac_seed, #endif #if USE_CAMELLIA &_gcry_mac_type_spec_cmac_camellia, &_gcry_mac_type_spec_gmac_camellia, + &_gcry_mac_type_spec_poly1305mac_camellia, #endif #ifdef USE_IDEA &_gcry_mac_type_spec_cmac_idea, diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 8648e96..3145020 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1343,7 +1343,12 @@ enum gcry_mac_algos GCRY_MAC_GMAC_SERPENT = 404, GCRY_MAC_GMAC_SEED = 405, - GCRY_MAC_POLY1305 = 501 + GCRY_MAC_POLY1305 = 501, + GCRY_MAC_POLY1305_AES = 502, + GCRY_MAC_POLY1305_CAMELLIA = 503, + GCRY_MAC_POLY1305_TWOFISH = 504, + GCRY_MAC_POLY1305_SERPENT = 505, + GCRY_MAC_POLY1305_SEED = 506 }; /* Flags used with the open function. */ diff --git a/tests/basic.c b/tests/basic.c index cc0f4c1..de10617 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -5596,6 +5596,39 @@ check_mac (void) "\x12\x97\x6a\x08\xc4\x42\x6d\x0c\xe8\xa8\x24\x07\xc4\xf4\x82\x07" "\x80\xf8\xc2\x0a\xa7\x12\x02\xd1\xe2\x91\x79\xcb\xcb\x55\x5a\x57", "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b" }, + /* from http://cr.yp.to/mac/poly1305-20050329.pdf */ + { GCRY_MAC_POLY1305_AES, + "\xf3\xf6", + "\xec\x07\x4c\x83\x55\x80\x74\x17\x01\x42\x5b\x62\x32\x35\xad\xd6" + "\x85\x1f\xc4\x0c\x34\x67\xac\x0b\xe0\x5c\xc2\x04\x04\xf3\xf7\x00", + "\xf4\xc6\x33\xc3\x04\x4f\xc1\x45\xf8\x4f\x33\x5c\xb8\x19\x53\xde", + "\xfb\x44\x73\x50\xc4\xe8\x68\xc5\x2a\xc3\x27\x5c\xf9\xd4\x32\x7e", + 0, 32 }, + { GCRY_MAC_POLY1305_AES, + "", + "\x75\xde\xaa\x25\xc0\x9f\x20\x8e\x1d\xc4\xce\x6b\x5c\xad\x3f\xbf" + "\xa0\xf3\x08\x00\x00\xf4\x64\x00\xd0\xc7\xe9\x07\x6c\x83\x44\x03", + "\xdd\x3f\xab\x22\x51\xf1\x1a\xc7\x59\xf0\x88\x71\x29\xcc\x2e\xe7", + "\x61\xee\x09\x21\x8d\x29\xb0\xaa\xed\x7e\x15\x4a\x2c\x55\x09\xcc", + 0, 32 }, + { GCRY_MAC_POLY1305_AES, + "\x66\x3c\xea\x19\x0f\xfb\x83\xd8\x95\x93\xf3\xf4\x76\xb6\xbc\x24" + "\xd7\xe6\x79\x10\x7e\xa2\x6a\xdb\x8c\xaf\x66\x52\xd0\x65\x61\x36", + "\x6a\xcb\x5f\x61\xa7\x17\x6d\xd3\x20\xc5\xc1\xeb\x2e\xdc\xdc\x74" + "\x48\x44\x3d\x0b\xb0\xd2\x11\x09\xc8\x9a\x10\x0b\x5c\xe2\xc2\x08", + "\x0e\xe1\xc1\x6b\xb7\x3f\x0f\x4f\xd1\x98\x81\x75\x3c\x01\xcd\xbe", + "\xae\x21\x2a\x55\x39\x97\x29\x59\x5d\xea\x45\x8b\xc6\x21\xff\x0e", + 0, 32 }, + { GCRY_MAC_POLY1305_AES, + "\xab\x08\x12\x72\x4a\x7f\x1e\x34\x27\x42\xcb\xed\x37\x4d\x94\xd1" + "\x36\xc6\xb8\x79\x5d\x45\xb3\x81\x98\x30\xf2\xc0\x44\x91\xfa\xf0" + "\x99\x0c\x62\xe4\x8b\x80\x18\xb2\xc3\xe4\xa0\xfa\x31\x34\xcb\x67" + "\xfa\x83\xe1\x58\xc9\x94\xd9\x61\xc4\xcb\x21\x09\x5c\x1b\xf9", + "\xe1\xa5\x66\x8a\x4d\x5b\x66\xa5\xf6\x8c\xc5\x42\x4e\xd5\x98\x2d" + 
"\x12\x97\x6a\x08\xc4\x42\x6d\x0c\xe8\xa8\x24\x07\xc4\xf4\x82\x07", + "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b", + "\x9a\xe8\x31\xe7\x43\x97\x8d\x3a\x23\x52\x7c\x71\x28\x14\x9e\x3a", + 0, 32 }, { 0 }, }; int i; diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 3d8ae37..a911ef8 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -1316,6 +1316,19 @@ bench_mac_init (struct bench_obj *obj) exit (1); } + switch (mode->algo) + { + default: + break; + case GCRY_MAC_POLY1305_AES: + case GCRY_MAC_POLY1305_CAMELLIA: + case GCRY_MAC_POLY1305_TWOFISH: + case GCRY_MAC_POLY1305_SERPENT: + case GCRY_MAC_POLY1305_SEED: + gcry_mac_setiv (hd, key, 16); + break; + } + obj->priv = hd; return 0; diff --git a/tests/benchmark.c b/tests/benchmark.c index 9fd716d..042e721 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -509,6 +509,18 @@ mac_bench ( const char *algoname ) for (i=0; i < bufsize; i++) buf[i] = i; + if (algo >= GCRY_MAC_POLY1305_AES && algo <= GCRY_MAC_POLY1305_SEED) + { + static const char iv[16] = { 1, 2, 3, 4, }; + err = gcry_mac_setiv(hd, iv, sizeof(iv)); + if (err) + { + fprintf (stderr, PGM ": error setting nonce for mac algorithm `%s': %s\n", + algoname, gpg_strerror (err)); + exit (1); + } + } + printf ("%-20s", gcry_mac_algo_name (algo)); start_timer (); From cvs at cvs.gnupg.org Sun May 11 11:06:09 2014 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sun, 11 May 2014 11:06:09 +0200 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-70-ga39ee75 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via a39ee7555691d18cae97560f130aaf952bfbd278 (commit) via def7d4cad386271c6d4e2f10aabe0cb4abd871e4 (commit) via 23f33d57c9b6f2295a8ddfc9a8eee5a2c30cf406 (commit) from 246b7aaae1ee459f440260bbc4ec2c01c5dc3362 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit a39ee7555691d18cae97560f130aaf952bfbd278 Author: Jussi Kivilinna Date: Sun May 11 12:00:19 2014 +0300 chacha20: add AVX2/AMD64 assembly implementation * cipher/Makefile.am: Add 'chacha20-avx2-amd64.S'. * cipher/chacha20-avx2-amd64.S: New. * cipher/chacha20.c (USE_AVX2): New macro. [USE_AVX2] (_gcry_chacha20_amd64_avx2_blocks): New. (chacha20_do_setkey): Select AVX2 implementation if there is HW support. (selftest): Increase size of buf by 256. * configure.ac [host=x86-64]: Add 'chacha20-avx2-amd64.lo'. -- Add AVX2 optimized implementation for ChaCha20. Based on implementation by Andrew Moon. 
SSSE3 (Intel Haswell): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.742 ns/B 1284.8 MiB/s 2.38 c/B STREAM dec | 0.741 ns/B 1286.5 MiB/s 2.37 c/B AVX2: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.393 ns/B 2428.0 MiB/s 1.26 c/B STREAM dec | 0.392 ns/B 2433.6 MiB/s 1.25 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 27ca7ac..26d13d2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -59,7 +59,7 @@ EXTRA_libcipher_la_SOURCES = \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ -chacha20.c chacha20-ssse3-amd64.S \ +chacha20.c chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S new file mode 100644 index 0000000..c50a0c0 --- /dev/null +++ b/cipher/chacha20-avx2-amd64.S @@ -0,0 +1,949 @@ +/* chacha20-avx2-amd64.S - AMD64/AVX2 implementation of ChaCha20 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/chacha-opt + */ + +#ifdef __x86_64__ +#include + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && USE_CHACHA20 + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +.text + +.align 8 +.globl _gcry_chacha20_amd64_avx2_blocks +.type _gcry_chacha20_amd64_avx2_blocks, at function; +_gcry_chacha20_amd64_avx2_blocks: +.Lchacha_blocks_avx2_local: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq %rsp, %rbp + andq $~63, %rsp + subq $512, %rsp + leaq .LC RIP, %rax + vmovdqu 0(%rax), %xmm6 + vmovdqu 16(%rax), %xmm7 + vmovdqu 0(%rdi), %xmm8 + vmovdqu 16(%rdi), %xmm9 + vmovdqu 32(%rdi), %xmm10 + vmovdqu 48(%rdi), %xmm11 + movl $20, %eax + movq $1, %r9 + vmovdqa %xmm8, 0(%rsp) + vmovdqa %xmm9, 16(%rsp) + vmovdqa %xmm10, 32(%rsp) + vmovdqa %xmm11, 48(%rsp) + movq %rax, 64(%rsp) + vmovdqa %xmm6, 448(%rsp) + vmovdqa %xmm6, 464(%rsp) + vmovdqa %xmm7, 480(%rsp) + vmovdqa %xmm7, 496(%rsp) + cmpq $512, %rcx + jae .Lchacha_blocks_avx2_atleast512 + cmp $256, %rcx + jae .Lchacha_blocks_avx2_atleast256 + jmp .Lchacha_blocks_avx2_below256 + .p2align 6,,63 +.Lchacha_blocks_avx2_atleast512: + movq 48(%rsp), %rax + leaq 1(%rax), %r8 + leaq 2(%rax), %r9 + leaq 3(%rax), %r10 + leaq 4(%rax), %rbx + leaq 5(%rax), %r11 + leaq 6(%rax), %r12 + leaq 7(%rax), %r13 + leaq 8(%rax), %r14 + movl %eax, 128(%rsp) + movl %r8d, 4+128(%rsp) + movl %r9d, 8+128(%rsp) + movl %r10d, 12+128(%rsp) + movl %ebx, 16+128(%rsp) + movl %r11d, 20+128(%rsp) + movl %r12d, 24+128(%rsp) + movl %r13d, 28+128(%rsp) + shrq $32, %rax + shrq $32, %r8 + shrq $32, %r9 + shrq $32, %r10 + shrq $32, %rbx + shrq $32, %r11 + shrq $32, %r12 + shrq $32, %r13 + 
movl %eax, 160(%rsp) + movl %r8d, 4+160(%rsp) + movl %r9d, 8+160(%rsp) + movl %r10d, 12+160(%rsp) + movl %ebx, 16+160(%rsp) + movl %r11d, 20+160(%rsp) + movl %r12d, 24+160(%rsp) + movl %r13d, 28+160(%rsp) + movq %r14, 48(%rsp) + movq 64(%rsp), %rax + vpbroadcastd 0(%rsp), %ymm0 + vpbroadcastd 4+0(%rsp), %ymm1 + vpbroadcastd 8+0(%rsp), %ymm2 + vpbroadcastd 12+0(%rsp), %ymm3 + vpbroadcastd 16(%rsp), %ymm4 + vpbroadcastd 4+16(%rsp), %ymm5 + vpbroadcastd 8+16(%rsp), %ymm6 + vpbroadcastd 12+16(%rsp), %ymm7 + vpbroadcastd 32(%rsp), %ymm8 + vpbroadcastd 4+32(%rsp), %ymm9 + vpbroadcastd 8+32(%rsp), %ymm10 + vpbroadcastd 12+32(%rsp), %ymm11 + vpbroadcastd 8+48(%rsp), %ymm14 + vpbroadcastd 12+48(%rsp), %ymm15 + vmovdqa 128(%rsp), %ymm12 + vmovdqa 160(%rsp), %ymm13 +.Lchacha_blocks_avx2_mainloop1: + vpaddd %ymm0, %ymm4, %ymm0 + vpaddd %ymm1, %ymm5, %ymm1 + vpxor %ymm12, %ymm0, %ymm12 + vpxor %ymm13, %ymm1, %ymm13 + vpaddd %ymm2, %ymm6, %ymm2 + vpaddd %ymm3, %ymm7, %ymm3 + vpxor %ymm14, %ymm2, %ymm14 + vpxor %ymm15, %ymm3, %ymm15 + vpshufb 448(%rsp), %ymm12, %ymm12 + vpshufb 448(%rsp), %ymm13, %ymm13 + vpaddd %ymm8, %ymm12, %ymm8 + vpaddd %ymm9, %ymm13, %ymm9 + vpshufb 448(%rsp), %ymm14, %ymm14 + vpshufb 448(%rsp), %ymm15, %ymm15 + vpaddd %ymm10, %ymm14, %ymm10 + vpaddd %ymm11, %ymm15, %ymm11 + vmovdqa %ymm12, 96(%rsp) + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm5, %ymm9, %ymm5 + vpslld $ 12, %ymm4, %ymm12 + vpsrld $20, %ymm4, %ymm4 + vpxor %ymm4, %ymm12, %ymm4 + vpslld $ 12, %ymm5, %ymm12 + vpsrld $20, %ymm5, %ymm5 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm6, %ymm10, %ymm6 + vpxor %ymm7, %ymm11, %ymm7 + vpslld $ 12, %ymm6, %ymm12 + vpsrld $20, %ymm6, %ymm6 + vpxor %ymm6, %ymm12, %ymm6 + vpslld $ 12, %ymm7, %ymm12 + vpsrld $20, %ymm7, %ymm7 + vpxor %ymm7, %ymm12, %ymm7 + vpaddd %ymm0, %ymm4, %ymm0 + vpaddd %ymm1, %ymm5, %ymm1 + vpxor 96(%rsp), %ymm0, %ymm12 + vpxor %ymm13, %ymm1, %ymm13 + vpaddd %ymm2, %ymm6, %ymm2 + vpaddd %ymm3, %ymm7, %ymm3 + vpxor %ymm14, %ymm2, %ymm14 + vpxor %ymm15, %ymm3, %ymm15 + vpshufb 480(%rsp), %ymm12, %ymm12 + vpshufb 480(%rsp), %ymm13, %ymm13 + vpaddd %ymm8, %ymm12, %ymm8 + vpaddd %ymm9, %ymm13, %ymm9 + vpshufb 480(%rsp), %ymm14, %ymm14 + vpshufb 480(%rsp), %ymm15, %ymm15 + vpaddd %ymm10, %ymm14, %ymm10 + vpaddd %ymm11, %ymm15, %ymm11 + vmovdqa %ymm12, 96(%rsp) + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm5, %ymm9, %ymm5 + vpslld $ 7, %ymm4, %ymm12 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm4, %ymm12, %ymm4 + vpslld $ 7, %ymm5, %ymm12 + vpsrld $25, %ymm5, %ymm5 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm6, %ymm10, %ymm6 + vpxor %ymm7, %ymm11, %ymm7 + vpslld $ 7, %ymm6, %ymm12 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm6, %ymm12, %ymm6 + vpslld $ 7, %ymm7, %ymm12 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm7, %ymm12, %ymm7 + vpaddd %ymm0, %ymm5, %ymm0 + vpaddd %ymm1, %ymm6, %ymm1 + vpxor %ymm15, %ymm0, %ymm15 + vpxor 96(%rsp), %ymm1, %ymm12 + vpaddd %ymm2, %ymm7, %ymm2 + vpaddd %ymm3, %ymm4, %ymm3 + vpxor %ymm13, %ymm2, %ymm13 + vpxor %ymm14, %ymm3, %ymm14 + vpshufb 448(%rsp), %ymm15, %ymm15 + vpshufb 448(%rsp), %ymm12, %ymm12 + vpaddd %ymm10, %ymm15, %ymm10 + vpaddd %ymm11, %ymm12, %ymm11 + vpshufb 448(%rsp), %ymm13, %ymm13 + vpshufb 448(%rsp), %ymm14, %ymm14 + vpaddd %ymm8, %ymm13, %ymm8 + vpaddd %ymm9, %ymm14, %ymm9 + vmovdqa %ymm15, 96(%rsp) + vpxor %ymm5, %ymm10, %ymm5 + vpxor %ymm6, %ymm11, %ymm6 + vpslld $ 12, %ymm5, %ymm15 + vpsrld $20, %ymm5, %ymm5 + vpxor %ymm5, %ymm15, %ymm5 + vpslld $ 12, %ymm6, %ymm15 + vpsrld $20, %ymm6, %ymm6 + vpxor %ymm6, %ymm15, %ymm6 + vpxor %ymm7, %ymm8, %ymm7 
+ vpxor %ymm4, %ymm9, %ymm4 + vpslld $ 12, %ymm7, %ymm15 + vpsrld $20, %ymm7, %ymm7 + vpxor %ymm7, %ymm15, %ymm7 + vpslld $ 12, %ymm4, %ymm15 + vpsrld $20, %ymm4, %ymm4 + vpxor %ymm4, %ymm15, %ymm4 + vpaddd %ymm0, %ymm5, %ymm0 + vpaddd %ymm1, %ymm6, %ymm1 + vpxor 96(%rsp), %ymm0, %ymm15 + vpxor %ymm12, %ymm1, %ymm12 + vpaddd %ymm2, %ymm7, %ymm2 + vpaddd %ymm3, %ymm4, %ymm3 + vpxor %ymm13, %ymm2, %ymm13 + vpxor %ymm14, %ymm3, %ymm14 + vpshufb 480(%rsp), %ymm15, %ymm15 + vpshufb 480(%rsp), %ymm12, %ymm12 + vpaddd %ymm10, %ymm15, %ymm10 + vpaddd %ymm11, %ymm12, %ymm11 + vpshufb 480(%rsp), %ymm13, %ymm13 + vpshufb 480(%rsp), %ymm14, %ymm14 + vpaddd %ymm8, %ymm13, %ymm8 + vpaddd %ymm9, %ymm14, %ymm9 + vmovdqa %ymm15, 96(%rsp) + vpxor %ymm5, %ymm10, %ymm5 + vpxor %ymm6, %ymm11, %ymm6 + vpslld $ 7, %ymm5, %ymm15 + vpsrld $25, %ymm5, %ymm5 + vpxor %ymm5, %ymm15, %ymm5 + vpslld $ 7, %ymm6, %ymm15 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm6, %ymm15, %ymm6 + vpxor %ymm7, %ymm8, %ymm7 + vpxor %ymm4, %ymm9, %ymm4 + vpslld $ 7, %ymm7, %ymm15 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm7, %ymm15, %ymm7 + vpslld $ 7, %ymm4, %ymm15 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm4, %ymm15, %ymm4 + vmovdqa 96(%rsp), %ymm15 + subq $2, %rax + jnz .Lchacha_blocks_avx2_mainloop1 + vmovdqa %ymm8, 192(%rsp) + vmovdqa %ymm9, 224(%rsp) + vmovdqa %ymm10, 256(%rsp) + vmovdqa %ymm11, 288(%rsp) + vmovdqa %ymm12, 320(%rsp) + vmovdqa %ymm13, 352(%rsp) + vmovdqa %ymm14, 384(%rsp) + vmovdqa %ymm15, 416(%rsp) + vpbroadcastd 0(%rsp), %ymm8 + vpbroadcastd 4+0(%rsp), %ymm9 + vpbroadcastd 8+0(%rsp), %ymm10 + vpbroadcastd 12+0(%rsp), %ymm11 + vpbroadcastd 16(%rsp), %ymm12 + vpbroadcastd 4+16(%rsp), %ymm13 + vpbroadcastd 8+16(%rsp), %ymm14 + vpbroadcastd 12+16(%rsp), %ymm15 + vpaddd %ymm8, %ymm0, %ymm0 + vpaddd %ymm9, %ymm1, %ymm1 + vpaddd %ymm10, %ymm2, %ymm2 + vpaddd %ymm11, %ymm3, %ymm3 + vpaddd %ymm12, %ymm4, %ymm4 + vpaddd %ymm13, %ymm5, %ymm5 + vpaddd %ymm14, %ymm6, %ymm6 + vpaddd %ymm15, %ymm7, %ymm7 + vpunpckldq %ymm1, %ymm0, %ymm8 + vpunpckldq %ymm3, %ymm2, %ymm9 + vpunpckhdq %ymm1, %ymm0, %ymm12 + vpunpckhdq %ymm3, %ymm2, %ymm13 + vpunpckldq %ymm5, %ymm4, %ymm10 + vpunpckldq %ymm7, %ymm6, %ymm11 + vpunpckhdq %ymm5, %ymm4, %ymm14 + vpunpckhdq %ymm7, %ymm6, %ymm15 + vpunpcklqdq %ymm9, %ymm8, %ymm0 + vpunpcklqdq %ymm11, %ymm10, %ymm1 + vpunpckhqdq %ymm9, %ymm8, %ymm2 + vpunpckhqdq %ymm11, %ymm10, %ymm3 + vpunpcklqdq %ymm13, %ymm12, %ymm4 + vpunpcklqdq %ymm15, %ymm14, %ymm5 + vpunpckhqdq %ymm13, %ymm12, %ymm6 + vpunpckhqdq %ymm15, %ymm14, %ymm7 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 + vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 + vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 + vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 + vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 + andq %rsi, %rsi + jz .Lchacha_blocks_avx2_noinput1 + vpxor 0(%rsi), %ymm8, %ymm8 + vpxor 64(%rsi), %ymm9, %ymm9 + vpxor 128(%rsi), %ymm10, %ymm10 + vpxor 192(%rsi), %ymm11, %ymm11 + vpxor 256(%rsi), %ymm12, %ymm12 + vpxor 320(%rsi), %ymm13, %ymm13 + vpxor 384(%rsi), %ymm14, %ymm14 + vpxor 448(%rsi), %ymm15, %ymm15 + vmovdqu %ymm8, 0(%rdx) + vmovdqu %ymm9, 64(%rdx) + vmovdqu %ymm10, 128(%rdx) + vmovdqu %ymm11, 192(%rdx) + vmovdqu %ymm12, 256(%rdx) + vmovdqu %ymm13, 320(%rdx) + vmovdqu %ymm14, 384(%rdx) + vmovdqu %ymm15, 448(%rdx) + vmovdqa 192(%rsp), %ymm0 + vmovdqa 224(%rsp), %ymm1 + vmovdqa 256(%rsp), %ymm2 + vmovdqa 288(%rsp), %ymm3 + vmovdqa 320(%rsp), %ymm4 + vmovdqa 
352(%rsp), %ymm5 + vmovdqa 384(%rsp), %ymm6 + vmovdqa 416(%rsp), %ymm7 + vpbroadcastd 32(%rsp), %ymm8 + vpbroadcastd 4+32(%rsp), %ymm9 + vpbroadcastd 8+32(%rsp), %ymm10 + vpbroadcastd 12+32(%rsp), %ymm11 + vmovdqa 128(%rsp), %ymm12 + vmovdqa 160(%rsp), %ymm13 + vpbroadcastd 8+48(%rsp), %ymm14 + vpbroadcastd 12+48(%rsp), %ymm15 + vpaddd %ymm8, %ymm0, %ymm0 + vpaddd %ymm9, %ymm1, %ymm1 + vpaddd %ymm10, %ymm2, %ymm2 + vpaddd %ymm11, %ymm3, %ymm3 + vpaddd %ymm12, %ymm4, %ymm4 + vpaddd %ymm13, %ymm5, %ymm5 + vpaddd %ymm14, %ymm6, %ymm6 + vpaddd %ymm15, %ymm7, %ymm7 + vpunpckldq %ymm1, %ymm0, %ymm8 + vpunpckldq %ymm3, %ymm2, %ymm9 + vpunpckhdq %ymm1, %ymm0, %ymm12 + vpunpckhdq %ymm3, %ymm2, %ymm13 + vpunpckldq %ymm5, %ymm4, %ymm10 + vpunpckldq %ymm7, %ymm6, %ymm11 + vpunpckhdq %ymm5, %ymm4, %ymm14 + vpunpckhdq %ymm7, %ymm6, %ymm15 + vpunpcklqdq %ymm9, %ymm8, %ymm0 + vpunpcklqdq %ymm11, %ymm10, %ymm1 + vpunpckhqdq %ymm9, %ymm8, %ymm2 + vpunpckhqdq %ymm11, %ymm10, %ymm3 + vpunpcklqdq %ymm13, %ymm12, %ymm4 + vpunpcklqdq %ymm15, %ymm14, %ymm5 + vpunpckhqdq %ymm13, %ymm12, %ymm6 + vpunpckhqdq %ymm15, %ymm14, %ymm7 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 + vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 + vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 + vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 + vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 + vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 + vpxor 32(%rsi), %ymm8, %ymm8 + vpxor 96(%rsi), %ymm9, %ymm9 + vpxor 160(%rsi), %ymm10, %ymm10 + vpxor 224(%rsi), %ymm11, %ymm11 + vpxor 288(%rsi), %ymm12, %ymm12 + vpxor 352(%rsi), %ymm13, %ymm13 + vpxor 416(%rsi), %ymm14, %ymm14 + vpxor 480(%rsi), %ymm15, %ymm15 + vmovdqu %ymm8, 32(%rdx) + vmovdqu %ymm9, 96(%rdx) + vmovdqu %ymm10, 160(%rdx) + vmovdqu %ymm11, 224(%rdx) + vmovdqu %ymm12, 288(%rdx) + vmovdqu %ymm13, 352(%rdx) + vmovdqu %ymm14, 416(%rdx) + vmovdqu %ymm15, 480(%rdx) + addq $512, %rsi + jmp .Lchacha_blocks_avx2_mainloop1_cont +.Lchacha_blocks_avx2_noinput1: + vmovdqu %ymm8, 0(%rdx) + vmovdqu %ymm9, 64(%rdx) + vmovdqu %ymm10, 128(%rdx) + vmovdqu %ymm11, 192(%rdx) + vmovdqu %ymm12, 256(%rdx) + vmovdqu %ymm13, 320(%rdx) + vmovdqu %ymm14, 384(%rdx) + vmovdqu %ymm15, 448(%rdx) + vmovdqa 192(%rsp), %ymm0 + vmovdqa 224(%rsp), %ymm1 + vmovdqa 256(%rsp), %ymm2 + vmovdqa 288(%rsp), %ymm3 + vmovdqa 320(%rsp), %ymm4 + vmovdqa 352(%rsp), %ymm5 + vmovdqa 384(%rsp), %ymm6 + vmovdqa 416(%rsp), %ymm7 + vpbroadcastd 32(%rsp), %ymm8 + vpbroadcastd 4+32(%rsp), %ymm9 + vpbroadcastd 8+32(%rsp), %ymm10 + vpbroadcastd 12+32(%rsp), %ymm11 + vmovdqa 128(%rsp), %ymm12 + vmovdqa 160(%rsp), %ymm13 + vpbroadcastd 8+48(%rsp), %ymm14 + vpbroadcastd 12+48(%rsp), %ymm15 + vpaddd %ymm8, %ymm0, %ymm0 + vpaddd %ymm9, %ymm1, %ymm1 + vpaddd %ymm10, %ymm2, %ymm2 + vpaddd %ymm11, %ymm3, %ymm3 + vpaddd %ymm12, %ymm4, %ymm4 + vpaddd %ymm13, %ymm5, %ymm5 + vpaddd %ymm14, %ymm6, %ymm6 + vpaddd %ymm15, %ymm7, %ymm7 + vpunpckldq %ymm1, %ymm0, %ymm8 + vpunpckldq %ymm3, %ymm2, %ymm9 + vpunpckhdq %ymm1, %ymm0, %ymm12 + vpunpckhdq %ymm3, %ymm2, %ymm13 + vpunpckldq %ymm5, %ymm4, %ymm10 + vpunpckldq %ymm7, %ymm6, %ymm11 + vpunpckhdq %ymm5, %ymm4, %ymm14 + vpunpckhdq %ymm7, %ymm6, %ymm15 + vpunpcklqdq %ymm9, %ymm8, %ymm0 + vpunpcklqdq %ymm11, %ymm10, %ymm1 + vpunpckhqdq %ymm9, %ymm8, %ymm2 + vpunpckhqdq %ymm11, %ymm10, %ymm3 + vpunpcklqdq %ymm13, %ymm12, %ymm4 + vpunpcklqdq %ymm15, %ymm14, %ymm5 + vpunpckhqdq %ymm13, %ymm12, %ymm6 + vpunpckhqdq %ymm15, %ymm14, %ymm7 + vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 + 
vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 + vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 + vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 + vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 + vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 + vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 + vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 + vmovdqu %ymm8, 32(%rdx) + vmovdqu %ymm9, 96(%rdx) + vmovdqu %ymm10, 160(%rdx) + vmovdqu %ymm11, 224(%rdx) + vmovdqu %ymm12, 288(%rdx) + vmovdqu %ymm13, 352(%rdx) + vmovdqu %ymm14, 416(%rdx) + vmovdqu %ymm15, 480(%rdx) +.Lchacha_blocks_avx2_mainloop1_cont: + addq $512, %rdx + subq $512, %rcx + cmp $512, %rcx + jae .Lchacha_blocks_avx2_atleast512 + cmp $256, %rcx + jb .Lchacha_blocks_avx2_below256_fixup +.Lchacha_blocks_avx2_atleast256: + movq 48(%rsp), %rax + leaq 1(%rax), %r8 + leaq 2(%rax), %r9 + leaq 3(%rax), %r10 + leaq 4(%rax), %rbx + movl %eax, 128(%rsp) + movl %r8d, 4+128(%rsp) + movl %r9d, 8+128(%rsp) + movl %r10d, 12+128(%rsp) + shrq $32, %rax + shrq $32, %r8 + shrq $32, %r9 + shrq $32, %r10 + movl %eax, 160(%rsp) + movl %r8d, 4+160(%rsp) + movl %r9d, 8+160(%rsp) + movl %r10d, 12+160(%rsp) + movq %rbx, 48(%rsp) + movq 64(%rsp), %rax + vpbroadcastd 0(%rsp), %xmm0 + vpbroadcastd 4+0(%rsp), %xmm1 + vpbroadcastd 8+0(%rsp), %xmm2 + vpbroadcastd 12+0(%rsp), %xmm3 + vpbroadcastd 16(%rsp), %xmm4 + vpbroadcastd 4+16(%rsp), %xmm5 + vpbroadcastd 8+16(%rsp), %xmm6 + vpbroadcastd 12+16(%rsp), %xmm7 + vpbroadcastd 32(%rsp), %xmm8 + vpbroadcastd 4+32(%rsp), %xmm9 + vpbroadcastd 8+32(%rsp), %xmm10 + vpbroadcastd 12+32(%rsp), %xmm11 + vmovdqa 128(%rsp), %xmm12 + vmovdqa 160(%rsp), %xmm13 + vpbroadcastd 8+48(%rsp), %xmm14 + vpbroadcastd 12+48(%rsp), %xmm15 +.Lchacha_blocks_avx2_mainloop2: + vpaddd %xmm0, %xmm4, %xmm0 + vpaddd %xmm1, %xmm5, %xmm1 + vpxor %xmm12, %xmm0, %xmm12 + vpxor %xmm13, %xmm1, %xmm13 + vpaddd %xmm2, %xmm6, %xmm2 + vpaddd %xmm3, %xmm7, %xmm3 + vpxor %xmm14, %xmm2, %xmm14 + vpxor %xmm15, %xmm3, %xmm15 + vpshufb 448(%rsp), %xmm12, %xmm12 + vpshufb 448(%rsp), %xmm13, %xmm13 + vpaddd %xmm8, %xmm12, %xmm8 + vpaddd %xmm9, %xmm13, %xmm9 + vpshufb 448(%rsp), %xmm14, %xmm14 + vpshufb 448(%rsp), %xmm15, %xmm15 + vpaddd %xmm10, %xmm14, %xmm10 + vpaddd %xmm11, %xmm15, %xmm11 + vmovdqa %xmm12, 96(%rsp) + vpxor %xmm4, %xmm8, %xmm4 + vpxor %xmm5, %xmm9, %xmm5 + vpslld $ 12, %xmm4, %xmm12 + vpsrld $20, %xmm4, %xmm4 + vpxor %xmm4, %xmm12, %xmm4 + vpslld $ 12, %xmm5, %xmm12 + vpsrld $20, %xmm5, %xmm5 + vpxor %xmm5, %xmm12, %xmm5 + vpxor %xmm6, %xmm10, %xmm6 + vpxor %xmm7, %xmm11, %xmm7 + vpslld $ 12, %xmm6, %xmm12 + vpsrld $20, %xmm6, %xmm6 + vpxor %xmm6, %xmm12, %xmm6 + vpslld $ 12, %xmm7, %xmm12 + vpsrld $20, %xmm7, %xmm7 + vpxor %xmm7, %xmm12, %xmm7 + vpaddd %xmm0, %xmm4, %xmm0 + vpaddd %xmm1, %xmm5, %xmm1 + vpxor 96(%rsp), %xmm0, %xmm12 + vpxor %xmm13, %xmm1, %xmm13 + vpaddd %xmm2, %xmm6, %xmm2 + vpaddd %xmm3, %xmm7, %xmm3 + vpxor %xmm14, %xmm2, %xmm14 + vpxor %xmm15, %xmm3, %xmm15 + vpshufb 480(%rsp), %xmm12, %xmm12 + vpshufb 480(%rsp), %xmm13, %xmm13 + vpaddd %xmm8, %xmm12, %xmm8 + vpaddd %xmm9, %xmm13, %xmm9 + vpshufb 480(%rsp), %xmm14, %xmm14 + vpshufb 480(%rsp), %xmm15, %xmm15 + vpaddd %xmm10, %xmm14, %xmm10 + vpaddd %xmm11, %xmm15, %xmm11 + vmovdqa %xmm12, 96(%rsp) + vpxor %xmm4, %xmm8, %xmm4 + vpxor %xmm5, %xmm9, %xmm5 + vpslld $ 7, %xmm4, %xmm12 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm4, %xmm12, %xmm4 + vpslld $ 7, %xmm5, %xmm12 + vpsrld $25, %xmm5, %xmm5 + vpxor %xmm5, %xmm12, %xmm5 + vpxor %xmm6, %xmm10, %xmm6 + vpxor %xmm7, %xmm11, %xmm7 + vpslld $ 7, %xmm6, %xmm12 + vpsrld $25, %xmm6, %xmm6 + vpxor 
%xmm6, %xmm12, %xmm6 + vpslld $ 7, %xmm7, %xmm12 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm7, %xmm12, %xmm7 + vpaddd %xmm0, %xmm5, %xmm0 + vpaddd %xmm1, %xmm6, %xmm1 + vpxor %xmm15, %xmm0, %xmm15 + vpxor 96(%rsp), %xmm1, %xmm12 + vpaddd %xmm2, %xmm7, %xmm2 + vpaddd %xmm3, %xmm4, %xmm3 + vpxor %xmm13, %xmm2, %xmm13 + vpxor %xmm14, %xmm3, %xmm14 + vpshufb 448(%rsp), %xmm15, %xmm15 + vpshufb 448(%rsp), %xmm12, %xmm12 + vpaddd %xmm10, %xmm15, %xmm10 + vpaddd %xmm11, %xmm12, %xmm11 + vpshufb 448(%rsp), %xmm13, %xmm13 + vpshufb 448(%rsp), %xmm14, %xmm14 + vpaddd %xmm8, %xmm13, %xmm8 + vpaddd %xmm9, %xmm14, %xmm9 + vmovdqa %xmm15, 96(%rsp) + vpxor %xmm5, %xmm10, %xmm5 + vpxor %xmm6, %xmm11, %xmm6 + vpslld $ 12, %xmm5, %xmm15 + vpsrld $20, %xmm5, %xmm5 + vpxor %xmm5, %xmm15, %xmm5 + vpslld $ 12, %xmm6, %xmm15 + vpsrld $20, %xmm6, %xmm6 + vpxor %xmm6, %xmm15, %xmm6 + vpxor %xmm7, %xmm8, %xmm7 + vpxor %xmm4, %xmm9, %xmm4 + vpslld $ 12, %xmm7, %xmm15 + vpsrld $20, %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm7 + vpslld $ 12, %xmm4, %xmm15 + vpsrld $20, %xmm4, %xmm4 + vpxor %xmm4, %xmm15, %xmm4 + vpaddd %xmm0, %xmm5, %xmm0 + vpaddd %xmm1, %xmm6, %xmm1 + vpxor 96(%rsp), %xmm0, %xmm15 + vpxor %xmm12, %xmm1, %xmm12 + vpaddd %xmm2, %xmm7, %xmm2 + vpaddd %xmm3, %xmm4, %xmm3 + vpxor %xmm13, %xmm2, %xmm13 + vpxor %xmm14, %xmm3, %xmm14 + vpshufb 480(%rsp), %xmm15, %xmm15 + vpshufb 480(%rsp), %xmm12, %xmm12 + vpaddd %xmm10, %xmm15, %xmm10 + vpaddd %xmm11, %xmm12, %xmm11 + vpshufb 480(%rsp), %xmm13, %xmm13 + vpshufb 480(%rsp), %xmm14, %xmm14 + vpaddd %xmm8, %xmm13, %xmm8 + vpaddd %xmm9, %xmm14, %xmm9 + vmovdqa %xmm15, 96(%rsp) + vpxor %xmm5, %xmm10, %xmm5 + vpxor %xmm6, %xmm11, %xmm6 + vpslld $ 7, %xmm5, %xmm15 + vpsrld $25, %xmm5, %xmm5 + vpxor %xmm5, %xmm15, %xmm5 + vpslld $ 7, %xmm6, %xmm15 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm6, %xmm15, %xmm6 + vpxor %xmm7, %xmm8, %xmm7 + vpxor %xmm4, %xmm9, %xmm4 + vpslld $ 7, %xmm7, %xmm15 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm7 + vpslld $ 7, %xmm4, %xmm15 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm4, %xmm15, %xmm4 + vmovdqa 96(%rsp), %xmm15 + subq $2, %rax + jnz .Lchacha_blocks_avx2_mainloop2 + vmovdqa %xmm8, 192(%rsp) + vmovdqa %xmm9, 208(%rsp) + vmovdqa %xmm10, 224(%rsp) + vmovdqa %xmm11, 240(%rsp) + vmovdqa %xmm12, 256(%rsp) + vmovdqa %xmm13, 272(%rsp) + vmovdqa %xmm14, 288(%rsp) + vmovdqa %xmm15, 304(%rsp) + vpbroadcastd 0(%rsp), %xmm8 + vpbroadcastd 4+0(%rsp), %xmm9 + vpbroadcastd 8+0(%rsp), %xmm10 + vpbroadcastd 12+0(%rsp), %xmm11 + vpbroadcastd 16(%rsp), %xmm12 + vpbroadcastd 4+16(%rsp), %xmm13 + vpbroadcastd 8+16(%rsp), %xmm14 + vpbroadcastd 12+16(%rsp), %xmm15 + vpaddd %xmm8, %xmm0, %xmm0 + vpaddd %xmm9, %xmm1, %xmm1 + vpaddd %xmm10, %xmm2, %xmm2 + vpaddd %xmm11, %xmm3, %xmm3 + vpaddd %xmm12, %xmm4, %xmm4 + vpaddd %xmm13, %xmm5, %xmm5 + vpaddd %xmm14, %xmm6, %xmm6 + vpaddd %xmm15, %xmm7, %xmm7 + vpunpckldq %xmm1, %xmm0, %xmm8 + vpunpckldq %xmm3, %xmm2, %xmm9 + vpunpckhdq %xmm1, %xmm0, %xmm12 + vpunpckhdq %xmm3, %xmm2, %xmm13 + vpunpckldq %xmm5, %xmm4, %xmm10 + vpunpckldq %xmm7, %xmm6, %xmm11 + vpunpckhdq %xmm5, %xmm4, %xmm14 + vpunpckhdq %xmm7, %xmm6, %xmm15 + vpunpcklqdq %xmm9, %xmm8, %xmm0 + vpunpcklqdq %xmm11, %xmm10, %xmm1 + vpunpckhqdq %xmm9, %xmm8, %xmm2 + vpunpckhqdq %xmm11, %xmm10, %xmm3 + vpunpcklqdq %xmm13, %xmm12, %xmm4 + vpunpcklqdq %xmm15, %xmm14, %xmm5 + vpunpckhqdq %xmm13, %xmm12, %xmm6 + vpunpckhqdq %xmm15, %xmm14, %xmm7 + andq %rsi, %rsi + jz .Lchacha_blocks_avx2_noinput2 + vpxor 0(%rsi), %xmm0, %xmm0 + vpxor 16(%rsi), %xmm1, %xmm1 + vpxor 
64(%rsi), %xmm2, %xmm2 + vpxor 80(%rsi), %xmm3, %xmm3 + vpxor 128(%rsi), %xmm4, %xmm4 + vpxor 144(%rsi), %xmm5, %xmm5 + vpxor 192(%rsi), %xmm6, %xmm6 + vpxor 208(%rsi), %xmm7, %xmm7 + vmovdqu %xmm0, 0(%rdx) + vmovdqu %xmm1, 16(%rdx) + vmovdqu %xmm2, 64(%rdx) + vmovdqu %xmm3, 80(%rdx) + vmovdqu %xmm4, 128(%rdx) + vmovdqu %xmm5, 144(%rdx) + vmovdqu %xmm6, 192(%rdx) + vmovdqu %xmm7, 208(%rdx) + vmovdqa 192(%rsp), %xmm0 + vmovdqa 208(%rsp), %xmm1 + vmovdqa 224(%rsp), %xmm2 + vmovdqa 240(%rsp), %xmm3 + vmovdqa 256(%rsp), %xmm4 + vmovdqa 272(%rsp), %xmm5 + vmovdqa 288(%rsp), %xmm6 + vmovdqa 304(%rsp), %xmm7 + vpbroadcastd 32(%rsp), %xmm8 + vpbroadcastd 4+32(%rsp), %xmm9 + vpbroadcastd 8+32(%rsp), %xmm10 + vpbroadcastd 12+32(%rsp), %xmm11 + vmovdqa 128(%rsp), %xmm12 + vmovdqa 160(%rsp), %xmm13 + vpbroadcastd 8+48(%rsp), %xmm14 + vpbroadcastd 12+48(%rsp), %xmm15 + vpaddd %xmm8, %xmm0, %xmm0 + vpaddd %xmm9, %xmm1, %xmm1 + vpaddd %xmm10, %xmm2, %xmm2 + vpaddd %xmm11, %xmm3, %xmm3 + vpaddd %xmm12, %xmm4, %xmm4 + vpaddd %xmm13, %xmm5, %xmm5 + vpaddd %xmm14, %xmm6, %xmm6 + vpaddd %xmm15, %xmm7, %xmm7 + vpunpckldq %xmm1, %xmm0, %xmm8 + vpunpckldq %xmm3, %xmm2, %xmm9 + vpunpckhdq %xmm1, %xmm0, %xmm12 + vpunpckhdq %xmm3, %xmm2, %xmm13 + vpunpckldq %xmm5, %xmm4, %xmm10 + vpunpckldq %xmm7, %xmm6, %xmm11 + vpunpckhdq %xmm5, %xmm4, %xmm14 + vpunpckhdq %xmm7, %xmm6, %xmm15 + vpunpcklqdq %xmm9, %xmm8, %xmm0 + vpunpcklqdq %xmm11, %xmm10, %xmm1 + vpunpckhqdq %xmm9, %xmm8, %xmm2 + vpunpckhqdq %xmm11, %xmm10, %xmm3 + vpunpcklqdq %xmm13, %xmm12, %xmm4 + vpunpcklqdq %xmm15, %xmm14, %xmm5 + vpunpckhqdq %xmm13, %xmm12, %xmm6 + vpunpckhqdq %xmm15, %xmm14, %xmm7 + vpxor 32(%rsi), %xmm0, %xmm0 + vpxor 48(%rsi), %xmm1, %xmm1 + vpxor 96(%rsi), %xmm2, %xmm2 + vpxor 112(%rsi), %xmm3, %xmm3 + vpxor 160(%rsi), %xmm4, %xmm4 + vpxor 176(%rsi), %xmm5, %xmm5 + vpxor 224(%rsi), %xmm6, %xmm6 + vpxor 240(%rsi), %xmm7, %xmm7 + vmovdqu %xmm0, 32(%rdx) + vmovdqu %xmm1, 48(%rdx) + vmovdqu %xmm2, 96(%rdx) + vmovdqu %xmm3, 112(%rdx) + vmovdqu %xmm4, 160(%rdx) + vmovdqu %xmm5, 176(%rdx) + vmovdqu %xmm6, 224(%rdx) + vmovdqu %xmm7, 240(%rdx) + addq $256, %rsi + jmp .Lchacha_blocks_avx2_mainloop2_cont +.Lchacha_blocks_avx2_noinput2: + vmovdqu %xmm0, 0(%rdx) + vmovdqu %xmm1, 16(%rdx) + vmovdqu %xmm2, 64(%rdx) + vmovdqu %xmm3, 80(%rdx) + vmovdqu %xmm4, 128(%rdx) + vmovdqu %xmm5, 144(%rdx) + vmovdqu %xmm6, 192(%rdx) + vmovdqu %xmm7, 208(%rdx) + vmovdqa 192(%rsp), %xmm0 + vmovdqa 208(%rsp), %xmm1 + vmovdqa 224(%rsp), %xmm2 + vmovdqa 240(%rsp), %xmm3 + vmovdqa 256(%rsp), %xmm4 + vmovdqa 272(%rsp), %xmm5 + vmovdqa 288(%rsp), %xmm6 + vmovdqa 304(%rsp), %xmm7 + vpbroadcastd 32(%rsp), %xmm8 + vpbroadcastd 4+32(%rsp), %xmm9 + vpbroadcastd 8+32(%rsp), %xmm10 + vpbroadcastd 12+32(%rsp), %xmm11 + vmovdqa 128(%rsp), %xmm12 + vmovdqa 160(%rsp), %xmm13 + vpbroadcastd 8+48(%rsp), %xmm14 + vpbroadcastd 12+48(%rsp), %xmm15 + vpaddd %xmm8, %xmm0, %xmm0 + vpaddd %xmm9, %xmm1, %xmm1 + vpaddd %xmm10, %xmm2, %xmm2 + vpaddd %xmm11, %xmm3, %xmm3 + vpaddd %xmm12, %xmm4, %xmm4 + vpaddd %xmm13, %xmm5, %xmm5 + vpaddd %xmm14, %xmm6, %xmm6 + vpaddd %xmm15, %xmm7, %xmm7 + vpunpckldq %xmm1, %xmm0, %xmm8 + vpunpckldq %xmm3, %xmm2, %xmm9 + vpunpckhdq %xmm1, %xmm0, %xmm12 + vpunpckhdq %xmm3, %xmm2, %xmm13 + vpunpckldq %xmm5, %xmm4, %xmm10 + vpunpckldq %xmm7, %xmm6, %xmm11 + vpunpckhdq %xmm5, %xmm4, %xmm14 + vpunpckhdq %xmm7, %xmm6, %xmm15 + vpunpcklqdq %xmm9, %xmm8, %xmm0 + vpunpcklqdq %xmm11, %xmm10, %xmm1 + vpunpckhqdq %xmm9, %xmm8, %xmm2 + vpunpckhqdq %xmm11, %xmm10, %xmm3 + 
vpunpcklqdq %xmm13, %xmm12, %xmm4 + vpunpcklqdq %xmm15, %xmm14, %xmm5 + vpunpckhqdq %xmm13, %xmm12, %xmm6 + vpunpckhqdq %xmm15, %xmm14, %xmm7 + vmovdqu %xmm0, 32(%rdx) + vmovdqu %xmm1, 48(%rdx) + vmovdqu %xmm2, 96(%rdx) + vmovdqu %xmm3, 112(%rdx) + vmovdqu %xmm4, 160(%rdx) + vmovdqu %xmm5, 176(%rdx) + vmovdqu %xmm6, 224(%rdx) + vmovdqu %xmm7, 240(%rdx) +.Lchacha_blocks_avx2_mainloop2_cont: + addq $256, %rdx + subq $256, %rcx + cmp $256, %rcx + jae .Lchacha_blocks_avx2_atleast256 +.Lchacha_blocks_avx2_below256_fixup: + vmovdqa 448(%rsp), %xmm6 + vmovdqa 480(%rsp), %xmm7 + vmovdqa 0(%rsp), %xmm8 + vmovdqa 16(%rsp), %xmm9 + vmovdqa 32(%rsp), %xmm10 + vmovdqa 48(%rsp), %xmm11 + movq $1, %r9 +.Lchacha_blocks_avx2_below256: + vmovq %r9, %xmm5 + andq %rcx, %rcx + jz .Lchacha_blocks_avx2_done + cmpq $64, %rcx + jae .Lchacha_blocks_avx2_above63 + movq %rdx, %r9 + andq %rsi, %rsi + jz .Lchacha_blocks_avx2_noinput3 + movq %rcx, %r10 + movq %rsp, %rdx + addq %r10, %rsi + addq %r10, %rdx + negq %r10 +.Lchacha_blocks_avx2_copyinput: + movb (%rsi, %r10), %al + movb %al, (%rdx, %r10) + incq %r10 + jnz .Lchacha_blocks_avx2_copyinput + movq %rsp, %rsi +.Lchacha_blocks_avx2_noinput3: + movq %rsp, %rdx +.Lchacha_blocks_avx2_above63: + vmovdqa %xmm8, %xmm0 + vmovdqa %xmm9, %xmm1 + vmovdqa %xmm10, %xmm2 + vmovdqa %xmm11, %xmm3 + movq 64(%rsp), %rax +.Lchacha_blocks_avx2_mainloop3: + vpaddd %xmm0, %xmm1, %xmm0 + vpxor %xmm3, %xmm0, %xmm3 + vpshufb %xmm6, %xmm3, %xmm3 + vpaddd %xmm2, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm1 + vpslld $12, %xmm1, %xmm4 + vpsrld $20, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm1 + vpaddd %xmm0, %xmm1, %xmm0 + vpxor %xmm3, %xmm0, %xmm3 + vpshufb %xmm7, %xmm3, %xmm3 + vpshufd $0x93, %xmm0, %xmm0 + vpaddd %xmm2, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm1 + vpshufd $0x39, %xmm2, %xmm2 + vpslld $7, %xmm1, %xmm4 + vpsrld $25, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm1 + vpaddd %xmm0, %xmm1, %xmm0 + vpxor %xmm3, %xmm0, %xmm3 + vpshufb %xmm6, %xmm3, %xmm3 + vpaddd %xmm2, %xmm3, %xmm2 + vpxor %xmm1, %xmm2, %xmm1 + vpslld $12, %xmm1, %xmm4 + vpsrld $20, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm1 + vpaddd %xmm0, %xmm1, %xmm0 + vpxor %xmm3, %xmm0, %xmm3 + vpshufb %xmm7, %xmm3, %xmm3 + vpshufd $0x39, %xmm0, %xmm0 + vpaddd %xmm2, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm1 + vpshufd $0x93, %xmm2, %xmm2 + vpslld $7, %xmm1, %xmm4 + vpsrld $25, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm1 + subq $2, %rax + jnz .Lchacha_blocks_avx2_mainloop3 + vpaddd %xmm0, %xmm8, %xmm0 + vpaddd %xmm1, %xmm9, %xmm1 + vpaddd %xmm2, %xmm10, %xmm2 + vpaddd %xmm3, %xmm11, %xmm3 + andq %rsi, %rsi + jz .Lchacha_blocks_avx2_noinput4 + vpxor 0(%rsi), %xmm0, %xmm0 + vpxor 16(%rsi), %xmm1, %xmm1 + vpxor 32(%rsi), %xmm2, %xmm2 + vpxor 48(%rsi), %xmm3, %xmm3 + addq $64, %rsi +.Lchacha_blocks_avx2_noinput4: + vmovdqu %xmm0, 0(%rdx) + vmovdqu %xmm1, 16(%rdx) + vmovdqu %xmm2, 32(%rdx) + vmovdqu %xmm3, 48(%rdx) + vpaddq %xmm11, %xmm5, %xmm11 + cmpq $64, %rcx + jbe .Lchacha_blocks_avx2_mainloop3_finishup + addq $64, %rdx + subq $64, %rcx + jmp .Lchacha_blocks_avx2_below256 +.Lchacha_blocks_avx2_mainloop3_finishup: + cmpq $64, %rcx + je .Lchacha_blocks_avx2_done + addq %rcx, %r9 + addq %rcx, %rdx + negq %rcx +.Lchacha_blocks_avx2_copyoutput: + movb (%rdx, %rcx), %al + movb %al, (%r9, %rcx) + incq %rcx + jnz .Lchacha_blocks_avx2_copyoutput +.Lchacha_blocks_avx2_done: + vmovdqu %xmm11, 48(%rdi) + movq %rbp, %rsp + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + vzeroall + movl $(63 + 
512), %eax + ret +.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks; + +.data +.align 16 +.LC: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ + +#endif /*defined(USE_CHACHA20)*/ +#endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index de8982b..2ac5a32 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -54,6 +54,13 @@ # define USE_SSSE3 1 #endif +/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ +#undef USE_AVX2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) +# define USE_AVX2 1 +#endif + struct CHACHA20_context_s; @@ -77,6 +84,13 @@ unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in, #endif /* USE_SSSE3 */ +#ifdef USE_AVX2 + +unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in, + byte *out, size_t bytes); + +#endif /* USE_AVX2 */ + static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); @@ -314,6 +328,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, if (features & HWF_INTEL_SSSE3) ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks; #endif +#ifdef USE_AVX2 + if (features & HWF_INTEL_AVX2) + ctx->blocks = _gcry_chacha20_amd64_avx2_blocks; +#endif (void)features; @@ -422,7 +440,7 @@ selftest (void) { CHACHA20_context_t ctx; byte scratch[127 + 1]; - byte buf[256 + 64 + 4]; + byte buf[512 + 64 + 4]; int i; /* From draft-strombergson-chacha-test-vectors */ diff --git a/configure.ac b/configure.ac index 0342067..3a0fd52 100644 --- a/configure.ac +++ b/configure.ac @@ -1816,6 +1816,7 @@ if test "$found" = "1" ; then x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo" ;; esac fi commit def7d4cad386271c6d4e2f10aabe0cb4abd871e4 Author: Jussi Kivilinna Date: Sun May 11 12:00:19 2014 +0300 chacha20: add SSSE3 assembly implementation * cipher/Makefile.am: Add 'chacha20-ssse3-amd64.S'. * cipher/chacha20-ssse3-amd64.S: New. * cipher/chacha20.c (USE_SSSE3): New macro. [USE_SSSE3] (_gcry_chacha20_amd64_ssse3_blocks): New. (chacha20_do_setkey): Select SSSE3 implementation if there is HW support. * configure.ac [host=x86-64]: Add 'chacha20-ssse3-amd64.lo'. -- Add SSSE3 optimized implementation for ChaCha20. Based on implementation by Andrew Moon. 
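The assembly entry point implements the same contract as the generic
chacha20_blocks() in C: 'bytes' must be a non-zero multiple of 64, a
NULL source pointer requests raw keystream instead of XORed output
(the '_noinput' paths in the assembly), the 64-bit block counter in
state words 12/13 is advanced, and the return value is the stack
depth to scrub afterwards. A sketch of a keystream-only call under
those assumptions (the helper name is illustrative; u32 and
CHACHA20_context_t are the internal types from the patch):

static void
chacha20_next_pad (CHACHA20_context_t *ctx, u32 pad[16])
{
  unsigned int burn;

  /* NULL source selects the keystream-only path; 64 bytes is one
     block, the smallest legal request. */
  burn = ctx->blocks (ctx->input, NULL, (unsigned char *) pad, 64);
  _gcry_burn_stack (burn);
}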
Before (Intel Haswell): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 1.97 ns/B 483.6 MiB/s 6.31 c/B STREAM dec | 1.97 ns/B 484.0 MiB/s 6.31 c/B After: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.742 ns/B 1284.8 MiB/s 2.38 c/B STREAM dec | 0.741 ns/B 1286.5 MiB/s 2.37 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index bc7959a..27ca7ac 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -59,7 +59,7 @@ EXTRA_libcipher_la_SOURCES = \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ -chacha20.c \ +chacha20.c chacha20-ssse3-amd64.S \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S new file mode 100644 index 0000000..aaa7e5b --- /dev/null +++ b/cipher/chacha20-ssse3-amd64.S @@ -0,0 +1,610 @@ +/* chacha20-ssse3-amd64.S - AMD64/SSSE3 implementation of ChaCha20 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/chacha-opt + */ + +#ifdef __x86_64__ +#include + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20 + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +.text + +.align 8 +.globl _gcry_chacha20_amd64_ssse3_blocks +.type _gcry_chacha20_amd64_ssse3_blocks, at function; +_gcry_chacha20_amd64_ssse3_blocks: +.Lchacha_blocks_ssse3_local: + pushq %rbx + pushq %rbp + movq %rsp, %rbp + andq $~63, %rsp + subq $512, %rsp + leaq .LC RIP, %rax + movdqa 0(%rax), %xmm6 + movdqa 16(%rax), %xmm7 + movdqu 0(%rdi), %xmm8 + movdqu 16(%rdi), %xmm9 + movdqu 32(%rdi), %xmm10 + movdqu 48(%rdi), %xmm11 + movl $20, %eax + movq $1, %r9 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movdqa %xmm6, 80(%rsp) + movdqa %xmm7, 96(%rsp) + movq %rax, 64(%rsp) + cmpq $256, %rcx + jb .Lchacha_blocks_ssse3_below256 + pshufd $0x00, %xmm8, %xmm0 + pshufd $0x55, %xmm8, %xmm1 + pshufd $0xaa, %xmm8, %xmm2 + pshufd $0xff, %xmm8, %xmm3 + movdqa %xmm0, 128(%rsp) + movdqa %xmm1, 144(%rsp) + movdqa %xmm2, 160(%rsp) + movdqa %xmm3, 176(%rsp) + pshufd $0x00, %xmm9, %xmm0 + pshufd $0x55, %xmm9, %xmm1 + pshufd $0xaa, %xmm9, %xmm2 + pshufd $0xff, %xmm9, %xmm3 + movdqa %xmm0, 192(%rsp) + movdqa %xmm1, 208(%rsp) + movdqa %xmm2, 224(%rsp) + movdqa %xmm3, 240(%rsp) + pshufd $0x00, %xmm10, %xmm0 + pshufd $0x55, %xmm10, %xmm1 + pshufd $0xaa, %xmm10, %xmm2 + pshufd $0xff, %xmm10, %xmm3 + movdqa %xmm0, 256(%rsp) + movdqa %xmm1, 272(%rsp) + movdqa %xmm2, 288(%rsp) + movdqa %xmm3, 304(%rsp) + pshufd $0xaa, %xmm11, %xmm0 + pshufd $0xff, %xmm11, %xmm1 + movdqa %xmm0, 352(%rsp) + movdqa %xmm1, 368(%rsp) + jmp .Lchacha_blocks_ssse3_atleast256 
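+ # Four-block path: the constant, key and IV words were broadcast across
+ # the four 32-bit lanes above; the per-block counters are materialized
+ # inside the loop, so each iteration computes four 64-byte blocks.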
+.p2align 6,,63 + # align to 4 mod 64 + nop;nop;nop;nop; +.Lchacha_blocks_ssse3_atleast256: + movq 48(%rsp), %rax + leaq 1(%rax), %r8 + leaq 2(%rax), %r9 + leaq 3(%rax), %r10 + leaq 4(%rax), %rbx + movl %eax, 320(%rsp) + movl %r8d, 4+320(%rsp) + movl %r9d, 8+320(%rsp) + movl %r10d, 12+320(%rsp) + shrq $32, %rax + shrq $32, %r8 + shrq $32, %r9 + shrq $32, %r10 + movl %eax, 336(%rsp) + movl %r8d, 4+336(%rsp) + movl %r9d, 8+336(%rsp) + movl %r10d, 12+336(%rsp) + movq %rbx, 48(%rsp) + movq 64(%rsp), %rax + movdqa 128(%rsp), %xmm0 + movdqa 144(%rsp), %xmm1 + movdqa 160(%rsp), %xmm2 + movdqa 176(%rsp), %xmm3 + movdqa 192(%rsp), %xmm4 + movdqa 208(%rsp), %xmm5 + movdqa 224(%rsp), %xmm6 + movdqa 240(%rsp), %xmm7 + movdqa 256(%rsp), %xmm8 + movdqa 272(%rsp), %xmm9 + movdqa 288(%rsp), %xmm10 + movdqa 304(%rsp), %xmm11 + movdqa 320(%rsp), %xmm12 + movdqa 336(%rsp), %xmm13 + movdqa 352(%rsp), %xmm14 + movdqa 368(%rsp), %xmm15 +.Lchacha_blocks_ssse3_mainloop1: + paddd %xmm4, %xmm0 + paddd %xmm5, %xmm1 + pxor %xmm0, %xmm12 + pxor %xmm1, %xmm13 + paddd %xmm6, %xmm2 + paddd %xmm7, %xmm3 + pxor %xmm2, %xmm14 + pxor %xmm3, %xmm15 + pshufb 80(%rsp), %xmm12 + pshufb 80(%rsp), %xmm13 + paddd %xmm12, %xmm8 + paddd %xmm13, %xmm9 + pshufb 80(%rsp), %xmm14 + pshufb 80(%rsp), %xmm15 + paddd %xmm14, %xmm10 + paddd %xmm15, %xmm11 + movdqa %xmm12, 112(%rsp) + pxor %xmm8, %xmm4 + pxor %xmm9, %xmm5 + movdqa %xmm4, %xmm12 + pslld $ 12, %xmm4 + psrld $20, %xmm12 + pxor %xmm12, %xmm4 + movdqa %xmm5, %xmm12 + pslld $ 12, %xmm5 + psrld $20, %xmm12 + pxor %xmm12, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + movdqa %xmm6, %xmm12 + pslld $ 12, %xmm6 + psrld $20, %xmm12 + pxor %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + pslld $ 12, %xmm7 + psrld $20, %xmm12 + pxor %xmm12, %xmm7 + movdqa 112(%rsp), %xmm12 + paddd %xmm4, %xmm0 + paddd %xmm5, %xmm1 + pxor %xmm0, %xmm12 + pxor %xmm1, %xmm13 + paddd %xmm6, %xmm2 + paddd %xmm7, %xmm3 + pxor %xmm2, %xmm14 + pxor %xmm3, %xmm15 + pshufb 96(%rsp), %xmm12 + pshufb 96(%rsp), %xmm13 + paddd %xmm12, %xmm8 + paddd %xmm13, %xmm9 + pshufb 96(%rsp), %xmm14 + pshufb 96(%rsp), %xmm15 + paddd %xmm14, %xmm10 + paddd %xmm15, %xmm11 + movdqa %xmm12, 112(%rsp) + pxor %xmm8, %xmm4 + pxor %xmm9, %xmm5 + movdqa %xmm4, %xmm12 + pslld $ 7, %xmm4 + psrld $25, %xmm12 + pxor %xmm12, %xmm4 + movdqa %xmm5, %xmm12 + pslld $ 7, %xmm5 + psrld $25, %xmm12 + pxor %xmm12, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + movdqa %xmm6, %xmm12 + pslld $ 7, %xmm6 + psrld $25, %xmm12 + pxor %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + pslld $ 7, %xmm7 + psrld $25, %xmm12 + pxor %xmm12, %xmm7 + movdqa 112(%rsp), %xmm12 + paddd %xmm5, %xmm0 + paddd %xmm6, %xmm1 + pxor %xmm0, %xmm15 + pxor %xmm1, %xmm12 + paddd %xmm7, %xmm2 + paddd %xmm4, %xmm3 + pxor %xmm2, %xmm13 + pxor %xmm3, %xmm14 + pshufb 80(%rsp), %xmm15 + pshufb 80(%rsp), %xmm12 + paddd %xmm15, %xmm10 + paddd %xmm12, %xmm11 + pshufb 80(%rsp), %xmm13 + pshufb 80(%rsp), %xmm14 + paddd %xmm13, %xmm8 + paddd %xmm14, %xmm9 + movdqa %xmm15, 112(%rsp) + pxor %xmm10, %xmm5 + pxor %xmm11, %xmm6 + movdqa %xmm5, %xmm15 + pslld $ 12, %xmm5 + psrld $20, %xmm15 + pxor %xmm15, %xmm5 + movdqa %xmm6, %xmm15 + pslld $ 12, %xmm6 + psrld $20, %xmm15 + pxor %xmm15, %xmm6 + pxor %xmm8, %xmm7 + pxor %xmm9, %xmm4 + movdqa %xmm7, %xmm15 + pslld $ 12, %xmm7 + psrld $20, %xmm15 + pxor %xmm15, %xmm7 + movdqa %xmm4, %xmm15 + pslld $ 12, %xmm4 + psrld $20, %xmm15 + pxor %xmm15, %xmm4 + movdqa 112(%rsp), %xmm15 + paddd %xmm5, %xmm0 + paddd %xmm6, %xmm1 + pxor %xmm0, %xmm15 + pxor %xmm1, %xmm12 + paddd 
%xmm7, %xmm2 + paddd %xmm4, %xmm3 + pxor %xmm2, %xmm13 + pxor %xmm3, %xmm14 + pshufb 96(%rsp), %xmm15 + pshufb 96(%rsp), %xmm12 + paddd %xmm15, %xmm10 + paddd %xmm12, %xmm11 + pshufb 96(%rsp), %xmm13 + pshufb 96(%rsp), %xmm14 + paddd %xmm13, %xmm8 + paddd %xmm14, %xmm9 + movdqa %xmm15, 112(%rsp) + pxor %xmm10, %xmm5 + pxor %xmm11, %xmm6 + movdqa %xmm5, %xmm15 + pslld $ 7, %xmm5 + psrld $25, %xmm15 + pxor %xmm15, %xmm5 + movdqa %xmm6, %xmm15 + pslld $ 7, %xmm6 + psrld $25, %xmm15 + pxor %xmm15, %xmm6 + pxor %xmm8, %xmm7 + pxor %xmm9, %xmm4 + movdqa %xmm7, %xmm15 + pslld $ 7, %xmm7 + psrld $25, %xmm15 + pxor %xmm15, %xmm7 + movdqa %xmm4, %xmm15 + pslld $ 7, %xmm4 + psrld $25, %xmm15 + pxor %xmm15, %xmm4 + subq $2, %rax + movdqa 112(%rsp), %xmm15 + jnz .Lchacha_blocks_ssse3_mainloop1 + paddd 128(%rsp), %xmm0 + paddd 144(%rsp), %xmm1 + paddd 160(%rsp), %xmm2 + paddd 176(%rsp), %xmm3 + paddd 192(%rsp), %xmm4 + paddd 208(%rsp), %xmm5 + paddd 224(%rsp), %xmm6 + paddd 240(%rsp), %xmm7 + paddd 256(%rsp), %xmm8 + paddd 272(%rsp), %xmm9 + paddd 288(%rsp), %xmm10 + paddd 304(%rsp), %xmm11 + paddd 320(%rsp), %xmm12 + paddd 336(%rsp), %xmm13 + paddd 352(%rsp), %xmm14 + paddd 368(%rsp), %xmm15 + movdqa %xmm8, 384(%rsp) + movdqa %xmm9, 400(%rsp) + movdqa %xmm10, 416(%rsp) + movdqa %xmm11, 432(%rsp) + movdqa %xmm12, 448(%rsp) + movdqa %xmm13, 464(%rsp) + movdqa %xmm14, 480(%rsp) + movdqa %xmm15, 496(%rsp) + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm8, %xmm5 + movdqa %xmm10, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm1 + punpcklqdq %xmm6, %xmm3 + punpcklqdq %xmm9, %xmm5 + punpcklqdq %xmm11, %xmm7 + andq %rsi, %rsi + jz .Lchacha_blocks_ssse3_noinput1 + movdqu 0(%rsi), %xmm2 + movdqu 16(%rsi), %xmm6 + movdqu 64(%rsi), %xmm9 + movdqu 80(%rsi), %xmm11 + movdqu 128(%rsi), %xmm12 + movdqu 144(%rsi), %xmm13 + movdqu 192(%rsi), %xmm14 + movdqu 208(%rsi), %xmm15 + pxor %xmm2, %xmm5 + pxor %xmm6, %xmm7 + pxor %xmm9, %xmm8 + pxor %xmm11, %xmm10 + pxor %xmm12, %xmm1 + pxor %xmm13, %xmm3 + pxor %xmm14, %xmm0 + pxor %xmm15, %xmm4 + movdqu %xmm5, 0(%rdx) + movdqu %xmm7, 16(%rdx) + movdqu %xmm8, 64(%rdx) + movdqu %xmm10, 80(%rdx) + movdqu %xmm1, 128(%rdx) + movdqu %xmm3, 144(%rdx) + movdqu %xmm0, 192(%rdx) + movdqu %xmm4, 208(%rdx) + movdqa 384(%rsp), %xmm0 + movdqa 400(%rsp), %xmm1 + movdqa 416(%rsp), %xmm2 + movdqa 432(%rsp), %xmm3 + movdqa 448(%rsp), %xmm4 + movdqa 464(%rsp), %xmm5 + movdqa 480(%rsp), %xmm6 + movdqa 496(%rsp), %xmm7 + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + movdqa %xmm8, %xmm1 + movdqa %xmm0, %xmm3 + movdqa %xmm10, %xmm5 + movdqa %xmm4, %xmm7 + punpcklqdq %xmm9, %xmm1 + punpcklqdq %xmm11, %xmm5 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm3 + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm6 + movdqu 96(%rsi), %xmm9 + movdqu 112(%rsi), %xmm11 + movdqu 
160(%rsi), %xmm12 + movdqu 176(%rsi), %xmm13 + movdqu 224(%rsi), %xmm14 + movdqu 240(%rsi), %xmm15 + pxor %xmm2, %xmm1 + pxor %xmm6, %xmm5 + pxor %xmm9, %xmm8 + pxor %xmm11, %xmm10 + pxor %xmm12, %xmm3 + pxor %xmm13, %xmm7 + pxor %xmm14, %xmm0 + pxor %xmm15, %xmm4 + movdqu %xmm1, 32(%rdx) + movdqu %xmm5, 48(%rdx) + movdqu %xmm8, 96(%rdx) + movdqu %xmm10, 112(%rdx) + movdqu %xmm3, 160(%rdx) + movdqu %xmm7, 176(%rdx) + movdqu %xmm0, 224(%rdx) + movdqu %xmm4, 240(%rdx) + addq $256, %rsi + jmp .Lchacha_blocks_ssse3_mainloop_cont +.Lchacha_blocks_ssse3_noinput1: + movdqu %xmm5, 0(%rdx) + movdqu %xmm7, 16(%rdx) + movdqu %xmm8, 64(%rdx) + movdqu %xmm10, 80(%rdx) + movdqu %xmm1, 128(%rdx) + movdqu %xmm3, 144(%rdx) + movdqu %xmm0, 192(%rdx) + movdqu %xmm4, 208(%rdx) + movdqa 384(%rsp), %xmm0 + movdqa 400(%rsp), %xmm1 + movdqa 416(%rsp), %xmm2 + movdqa 432(%rsp), %xmm3 + movdqa 448(%rsp), %xmm4 + movdqa 464(%rsp), %xmm5 + movdqa 480(%rsp), %xmm6 + movdqa 496(%rsp), %xmm7 + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + movdqa %xmm8, %xmm1 + movdqa %xmm0, %xmm3 + movdqa %xmm10, %xmm5 + movdqa %xmm4, %xmm7 + punpcklqdq %xmm9, %xmm1 + punpcklqdq %xmm11, %xmm5 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm3 + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + movdqu %xmm1, 32(%rdx) + movdqu %xmm5, 48(%rdx) + movdqu %xmm8, 96(%rdx) + movdqu %xmm10, 112(%rdx) + movdqu %xmm3, 160(%rdx) + movdqu %xmm7, 176(%rdx) + movdqu %xmm0, 224(%rdx) + movdqu %xmm4, 240(%rdx) +.Lchacha_blocks_ssse3_mainloop_cont: + addq $256, %rdx + subq $256, %rcx + cmp $256, %rcx + jae .Lchacha_blocks_ssse3_atleast256 + movdqa 80(%rsp), %xmm6 + movdqa 96(%rsp), %xmm7 + movdqa 0(%rsp), %xmm8 + movdqa 16(%rsp), %xmm9 + movdqa 32(%rsp), %xmm10 + movdqa 48(%rsp), %xmm11 + movq $1, %r9 +.Lchacha_blocks_ssse3_below256: + movq %r9, %xmm5 + andq %rcx, %rcx + jz .Lchacha_blocks_ssse3_done + cmpq $64, %rcx + jae .Lchacha_blocks_ssse3_above63 + movq %rdx, %r9 + andq %rsi, %rsi + jz .Lchacha_blocks_ssse3_noinput2 + movq %rcx, %r10 + movq %rsp, %rdx + addq %r10, %rsi + addq %r10, %rdx + negq %r10 +.Lchacha_blocks_ssse3_copyinput: + movb (%rsi, %r10), %al + movb %al, (%rdx, %r10) + incq %r10 + jnz .Lchacha_blocks_ssse3_copyinput + movq %rsp, %rsi +.Lchacha_blocks_ssse3_noinput2: + movq %rsp, %rdx +.Lchacha_blocks_ssse3_above63: + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + movq 64(%rsp), %rax +.Lchacha_blocks_ssse3_mainloop2: + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshufb %xmm6, %xmm3 + paddd %xmm3, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm1, %xmm4 + pslld $12, %xmm4 + psrld $20, %xmm1 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshufb %xmm7, %xmm3 + pshufd $0x93, %xmm0, %xmm0 + paddd %xmm3, %xmm2 + pshufd $0x4e, %xmm3, %xmm3 + pxor %xmm2, %xmm1 + pshufd $0x39, %xmm2, %xmm2 + movdqa %xmm1, %xmm4 + pslld $7, %xmm4 + psrld $25, %xmm1 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshufb %xmm6, %xmm3 + paddd %xmm3, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm1, %xmm4 + pslld $12, %xmm4 + psrld $20, %xmm1 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshufb %xmm7, %xmm3 + pshufd $0x39, %xmm0, %xmm0 + paddd %xmm3, %xmm2 + pshufd $0x4e, %xmm3, %xmm3 + pxor %xmm2, 
%xmm1 + pshufd $0x93, %xmm2, %xmm2 + movdqa %xmm1, %xmm4 + pslld $7, %xmm4 + psrld $25, %xmm1 + pxor %xmm4, %xmm1 + subq $2, %rax + jnz .Lchacha_blocks_ssse3_mainloop2 + paddd %xmm8, %xmm0 + paddd %xmm9, %xmm1 + paddd %xmm10, %xmm2 + paddd %xmm11, %xmm3 + andq %rsi, %rsi + jz .Lchacha_blocks_ssse3_noinput3 + movdqu 0(%rsi), %xmm12 + movdqu 16(%rsi), %xmm13 + movdqu 32(%rsi), %xmm14 + movdqu 48(%rsi), %xmm15 + pxor %xmm12, %xmm0 + pxor %xmm13, %xmm1 + pxor %xmm14, %xmm2 + pxor %xmm15, %xmm3 + addq $64, %rsi +.Lchacha_blocks_ssse3_noinput3: + movdqu %xmm0, 0(%rdx) + movdqu %xmm1, 16(%rdx) + movdqu %xmm2, 32(%rdx) + movdqu %xmm3, 48(%rdx) + paddq %xmm5, %xmm11 + cmpq $64, %rcx + jbe .Lchacha_blocks_ssse3_mainloop2_finishup + addq $64, %rdx + subq $64, %rcx + jmp .Lchacha_blocks_ssse3_below256 +.Lchacha_blocks_ssse3_mainloop2_finishup: + cmpq $64, %rcx + je .Lchacha_blocks_ssse3_done + addq %rcx, %r9 + addq %rcx, %rdx + negq %rcx +.Lchacha_blocks_ssse3_copyoutput: + movb (%rdx, %rcx), %al + movb %al, (%r9, %rcx) + incq %rcx + jnz .Lchacha_blocks_ssse3_copyoutput +.Lchacha_blocks_ssse3_done: + movdqu %xmm11, 48(%rdi) + movq %rbp, %rsp + popq %rbp + popq %rbx + movl $(63 + 512 + 16), %eax + ret +.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks; + +.data +.align 16; +.LC: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ + +#endif /*defined(USE_CHACHA20)*/ +#endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index ff0366d..de8982b 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -47,6 +47,13 @@ #define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ #define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4) +/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ +#undef USE_SSSE3 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) +# define USE_SSSE3 1 +#endif + struct CHACHA20_context_s; @@ -63,6 +70,14 @@ typedef struct CHACHA20_context_s } CHACHA20_context_t; +#ifdef USE_SSSE3 + +unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in, + byte *out, size_t bytes); + +#endif /* USE_SSSE3 */ + + static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); @@ -279,6 +294,7 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, { static int initialized; static const char *selftest_failed; + unsigned int features = _gcry_get_hw_features (); if (!initialized) { @@ -294,6 +310,12 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, return GPG_ERR_INV_KEYLEN; ctx->blocks = chacha20_blocks; +#ifdef USE_SSSE3 + if (features & HWF_INTEL_SSSE3) + ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks; +#endif + + (void)features; chacha20_keysetup (ctx, key, keylen); diff --git a/configure.ac b/configure.ac index 7573952..0342067 100644 --- a/configure.ac +++ b/configure.ac @@ -1811,6 +1811,13 @@ LIST_MEMBER(chacha20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo" + ;; + esac fi LIST_MEMBER(dsa, $enabled_pubkey_ciphers) commit 23f33d57c9b6f2295a8ddfc9a8eee5a2c30cf406 Author: Jussi Kivilinna Date: Sun May 11 12:00:19 2014 +0300 Add ChaCha20 stream cipher * cipher/Makefile.am: Add 'chacha20.c'. 
* cipher/chacha20.c: New. * cipher/cipher.c (cipher_list): Add ChaCha20. * configure.ac: Add ChaCha20. * doc/gcrypt.texi: Add ChaCha20. * src/cipher.h (_gcry_cipher_spec_chacha20): New. * src/gcrypt.h.in (GCRY_CIPHER_CHACHA20): Add new algo. * tests/basic.c (MAX_DATA_LEN): Increase to 128 from 100. (check_stream_cipher): Add ChaCha20 test-vectors. (check_ciphers): Add ChaCha20. -- Patch adds Bernstein's ChaCha20 cipher to libgcrypt. Implementation is based on public domain implementations. Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 3c20d3c..bc7959a 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -59,6 +59,7 @@ EXTRA_libcipher_la_SOURCES = \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ +chacha20.c \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20.c b/cipher/chacha20.c new file mode 100644 index 0000000..ff0366d --- /dev/null +++ b/cipher/chacha20.c @@ -0,0 +1,504 @@ +/* chacha20.c - Bernstein's ChaCha20 cipher + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * For a description of the algorithm, see: + * http://cr.yp.to/chacha.html + */ + +/* The code is based on salsa20.c and public-domain ChaCha implementations: + * chacha-ref.c version 20080118 + * D. J. Bernstein + * Public domain. + * and + * Andrew Moon + * https://github.com/floodyberry/chacha-opt + */ + + +#include +#include +#include +#include +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" + + +#define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */ +#define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */ +#define CHACHA20_BLOCK_SIZE 64 /* Bytes. */ +#define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */ +#define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ +#define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4) + + +struct CHACHA20_context_s; + + +typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src, + byte *dst, size_t bytes); + +typedef struct CHACHA20_context_s +{ + u32 input[CHACHA20_INPUT_LENGTH]; + u32 pad[CHACHA20_INPUT_LENGTH]; + chacha20_blocks_t blocks; + unsigned int unused; /* bytes in the pad. */ +} CHACHA20_context_t; + + +static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); +static const char *selftest (void); + + + +#define QROUND(a,b,c,d) \ + do { \ + a += b; d = rol(d ^ a, 16); \ + c += d; b = rol(b ^ c, 12); \ + a += b; d = rol(d ^ a, 8); \ + c += d; b = rol(b ^ c, 7); \ + } while (0) + +#define QOUT(ai, bi, ci, di) \ + DO_OUT(ai); DO_OUT(bi); DO_OUT(ci); DO_OUT(di) + +static unsigned int +chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes) +{ + u32 pad[CHACHA20_INPUT_LENGTH]; + u32 inp[CHACHA20_INPUT_LENGTH]; + unsigned int i; + + /* Note: 'bytes' must be multiple of 64 and not zero. 
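+    Callers guarantee this: chacha20_do_encrypt_stream rounds the
+    request down to a whole number of blocks before calling in, and
+    chacha20_core always asks for exactly one block; trailing partial
+    bytes are buffered in ctx->pad instead.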
*/ + + inp[0] = state[0]; + inp[1] = state[1]; + inp[2] = state[2]; + inp[3] = state[3]; + inp[4] = state[4]; + inp[5] = state[5]; + inp[6] = state[6]; + inp[7] = state[7]; + inp[8] = state[8]; + inp[9] = state[9]; + inp[10] = state[10]; + inp[11] = state[11]; + inp[12] = state[12]; + inp[13] = state[13]; + inp[14] = state[14]; + inp[15] = state[15]; + + do + { + /* First round. */ + pad[0] = inp[0]; + pad[4] = inp[4]; + pad[8] = inp[8]; + pad[12] = inp[12]; + QROUND (pad[0], pad[4], pad[8], pad[12]); + pad[1] = inp[1]; + pad[5] = inp[5]; + pad[9] = inp[9]; + pad[13] = inp[13]; + QROUND (pad[1], pad[5], pad[9], pad[13]); + pad[2] = inp[2]; + pad[6] = inp[6]; + pad[10] = inp[10]; + pad[14] = inp[14]; + QROUND (pad[2], pad[6], pad[10], pad[14]); + pad[3] = inp[3]; + pad[7] = inp[7]; + pad[11] = inp[11]; + pad[15] = inp[15]; + QROUND (pad[3], pad[7], pad[11], pad[15]); + + QROUND (pad[0], pad[5], pad[10], pad[15]); + QROUND (pad[1], pad[6], pad[11], pad[12]); + QROUND (pad[2], pad[7], pad[8], pad[13]); + QROUND (pad[3], pad[4], pad[9], pad[14]); + + for (i = 2; i < 20 - 2; i += 2) + { + QROUND (pad[0], pad[4], pad[8], pad[12]); + QROUND (pad[1], pad[5], pad[9], pad[13]); + QROUND (pad[2], pad[6], pad[10], pad[14]); + QROUND (pad[3], pad[7], pad[11], pad[15]); + + QROUND (pad[0], pad[5], pad[10], pad[15]); + QROUND (pad[1], pad[6], pad[11], pad[12]); + QROUND (pad[2], pad[7], pad[8], pad[13]); + QROUND (pad[3], pad[4], pad[9], pad[14]); + } + + QROUND (pad[0], pad[4], pad[8], pad[12]); + QROUND (pad[1], pad[5], pad[9], pad[13]); + QROUND (pad[2], pad[6], pad[10], pad[14]); + QROUND (pad[3], pad[7], pad[11], pad[15]); + + if (src) + { +#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, \ + (pad[idx] + inp[idx]) ^ \ + buf_get_le32(src + (idx) * 4)) + /* Last round. */ + QROUND (pad[0], pad[5], pad[10], pad[15]); + QOUT(0, 5, 10, 15); + QROUND (pad[1], pad[6], pad[11], pad[12]); + QOUT(1, 6, 11, 12); + QROUND (pad[2], pad[7], pad[8], pad[13]); + QOUT(2, 7, 8, 13); + QROUND (pad[3], pad[4], pad[9], pad[14]); + QOUT(3, 4, 9, 14); +#undef DO_OUT + } + else + { +#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, pad[idx] + inp[idx]) + /* Last round. */ + QROUND (pad[0], pad[5], pad[10], pad[15]); + QOUT(0, 5, 10, 15); + QROUND (pad[1], pad[6], pad[11], pad[12]); + QOUT(1, 6, 11, 12); + QROUND (pad[2], pad[7], pad[8], pad[13]); + QOUT(2, 7, 8, 13); + QROUND (pad[3], pad[4], pad[9], pad[14]); + QOUT(3, 4, 9, 14); +#undef DO_OUT + } + + /* Update counter. */ + inp[13] += (!++inp[12]); + + bytes -= CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + src += (src) ? CHACHA20_BLOCK_SIZE : 0; + } + while (bytes >= CHACHA20_BLOCK_SIZE); + + state[12] = inp[12]; + state[13] = inp[13]; + + /* burn_stack */ + return (2 * CHACHA20_INPUT_LENGTH * sizeof(u32) + 6 * sizeof(void *)); +} + +#undef QROUND +#undef QOUT + + +static unsigned int +chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx) +{ + return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE); +} + + +static void +chacha20_keysetup (CHACHA20_context_t * ctx, const byte * key, + unsigned int keylen) +{ + /* These constants are the little endian encoding of the string + "expand 32-byte k". For the 128 bit variant, the "32" in that + string will be fixed up to "16". 
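+    Concretely, the fixup below subtracts 0x02000000 from input[1] to
+    turn the byte '3' (0x33) into '1' (0x31), and adds 0x00000004 to
+    input[2] to turn '2' (0x32) into '6' (0x36), yielding the little
+    endian encoding of "expand 16-byte k".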
*/ + ctx->input[0] = 0x61707865; /* "apxe" */ + ctx->input[1] = 0x3320646e; /* "3 dn" */ + ctx->input[2] = 0x79622d32; /* "yb-2" */ + ctx->input[3] = 0x6b206574; /* "k et" */ + + ctx->input[4] = buf_get_le32 (key + 0); + ctx->input[5] = buf_get_le32 (key + 4); + ctx->input[6] = buf_get_le32 (key + 8); + ctx->input[7] = buf_get_le32 (key + 12); + + if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ + { + ctx->input[8] = buf_get_le32 (key + 16); + ctx->input[9] = buf_get_le32 (key + 20); + ctx->input[10] = buf_get_le32 (key + 24); + ctx->input[11] = buf_get_le32 (key + 28); + } + else /* 128 bits */ + { + ctx->input[8] = ctx->input[4]; + ctx->input[9] = ctx->input[5]; + ctx->input[10] = ctx->input[6]; + ctx->input[11] = ctx->input[7]; + + ctx->input[1] -= 0x02000000; /* Change to "1 dn". */ + ctx->input[2] += 0x00000004; /* Change to "yb-6". */ + } +} + + +static void +chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen) +{ + ctx->input[12] = 0; + + if (ivlen == CHACHA20_MAX_IV_SIZE) + { + ctx->input[13] = buf_get_le32 (iv + 0); + ctx->input[14] = buf_get_le32 (iv + 4); + ctx->input[15] = buf_get_le32 (iv + 8); + } + else if (ivlen == CHACHA20_MIN_IV_SIZE) + { + ctx->input[13] = 0; + ctx->input[14] = buf_get_le32 (iv + 0); + ctx->input[15] = buf_get_le32 (iv + 4); + } + else + { + ctx->input[13] = 0; + ctx->input[14] = 0; + ctx->input[15] = 0; + } +} + + +static gcry_err_code_t +chacha20_do_setkey (CHACHA20_context_t * ctx, + const byte * key, unsigned int keylen) +{ + static int initialized; + static const char *selftest_failed; + + if (!initialized) + { + initialized = 1; + selftest_failed = selftest (); + if (selftest_failed) + log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed); + } + if (selftest_failed) + return GPG_ERR_SELFTEST_FAILED; + + if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) + return GPG_ERR_INV_KEYLEN; + + ctx->blocks = chacha20_blocks; + + chacha20_keysetup (ctx, key, keylen); + + /* We default to a zero nonce. */ + chacha20_setiv (ctx, NULL, 0); + + return 0; +} + + +static gcry_err_code_t +chacha20_setkey (void *context, const byte * key, unsigned int keylen) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); + _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); + return rc; +} + + +static void +chacha20_setiv (void *context, const byte * iv, size_t ivlen) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + + /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ + if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE) + log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); + + if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE)) + chacha20_ivsetup (ctx, iv, ivlen); + else + chacha20_ivsetup (ctx, NULL, 0); + + /* Reset the unused pad bytes counter. */ + ctx->unused = 0; +} + + + +/* Note: This function requires LENGTH > 0. 
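+    The public entry point chacha20_encrypt_stream checks for a zero
+    LENGTH before dispatching here.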
*/ +static void +chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, + byte * outbuf, const byte * inbuf, size_t length) +{ + unsigned int nburn, burn = 0; + + if (ctx->unused) + { + unsigned char *p = (void *) ctx->pad; + size_t n; + + gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); + + n = ctx->unused; + if (n > length) + n = length; + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); + length -= n; + outbuf += n; + inbuf += n; + ctx->unused -= n; + if (!length) + return; + gcry_assert (!ctx->unused); + } + + if (length >= CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + size_t bytes = nblocks * CHACHA20_BLOCK_SIZE; + burn = ctx->blocks(ctx->input, inbuf, outbuf, bytes); + length -= bytes; + outbuf += bytes; + inbuf += bytes; + } + + if (length > 0) + { + nburn = chacha20_core (ctx->pad, ctx); + burn = nburn > burn ? nburn : burn; + + buf_xor (outbuf, inbuf, ctx->pad, length); + ctx->unused = CHACHA20_BLOCK_SIZE - length; + } + + _gcry_burn_stack (burn); +} + + +static void +chacha20_encrypt_stream (void *context, byte * outbuf, const byte * inbuf, + size_t length) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + + if (length) + chacha20_do_encrypt_stream (ctx, outbuf, inbuf, length); +} + + +static const char * +selftest (void) +{ + CHACHA20_context_t ctx; + byte scratch[127 + 1]; + byte buf[256 + 64 + 4]; + int i; + + /* From draft-strombergson-chacha-test-vectors */ + static byte key_1[] = { + 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78, + 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35, + 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb, + 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d + }; + static const byte nonce_1[] = + { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 }; + static const byte plaintext_1[127] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + static const byte ciphertext_1[127] = { + 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9, + 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06, + 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00, + 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf, + 0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd, + 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f, + 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f, + 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92, + 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9, + 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36, + 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1, + 0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38, + 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea, + 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0, + 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27, + 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33 + }; + + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + 
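+  /* The last byte of 'scratch' is a canary; it must still be zero
+     after encrypting the 127-byte vector, proving that the cipher
+     did not write past the requested length.  */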
scratch[sizeof (scratch) - 1] = 0; + chacha20_encrypt_stream (&ctx, scratch, plaintext_1, sizeof plaintext_1); + if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) + return "ChaCha20 encryption test 1 failed."; + if (scratch[sizeof (scratch) - 1]) + return "ChaCha20 wrote too much."; + chacha20_setkey (&ctx, key_1, sizeof (key_1)); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + chacha20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1); + if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) + return "ChaCha20 decryption test 1 failed."; + + for (i = 0; i < sizeof buf; i++) + buf[i] = i; + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + /*encrypt */ + chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf); + /*decrypt */ + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + chacha20_encrypt_stream (&ctx, buf, buf, 1); + chacha20_encrypt_stream (&ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1); + chacha20_encrypt_stream (&ctx, buf + (sizeof buf) - 1, + buf + (sizeof buf) - 1, 1); + for (i = 0; i < sizeof buf; i++) + if (buf[i] != (byte) i) + return "ChaCha20 encryption test 2 failed."; + + return NULL; +} + + +gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = { + GCRY_CIPHER_CHACHA20, + {0, 0}, /* flags */ + "CHACHA20", /* name */ + NULL, /* aliases */ + NULL, /* oids */ + 1, /* blocksize in bytes. */ + CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */ + sizeof (CHACHA20_context_t), + chacha20_setkey, + NULL, + NULL, + chacha20_encrypt_stream, + chacha20_encrypt_stream, + NULL, + NULL, + chacha20_setiv +}; diff --git a/cipher/cipher.c b/cipher/cipher.c index 6552ed3..4751302 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -83,6 +83,9 @@ static gcry_cipher_spec_t *cipher_list[] = #if USE_GOST28147 &_gcry_cipher_spec_gost28147, #endif +#if USE_CHACHA20 + &_gcry_cipher_spec_chacha20, +#endif NULL }; diff --git a/configure.ac b/configure.ac index 6539a96..7573952 100644 --- a/configure.ac +++ b/configure.ac @@ -187,7 +187,7 @@ LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" -available_ciphers="$available_ciphers camellia idea salsa20 gost28147" +available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" enabled_ciphers="" # Definitions for public-key ciphers. @@ -1807,6 +1807,12 @@ if test "$found" = "1" ; then AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included]) fi +LIST_MEMBER(chacha20, $enabled_ciphers) +if test "$found" = "1" ; then + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" + AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) +fi + LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index c5c3b45..d202b8b 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1564,6 +1564,10 @@ This is the Salsa20/12 - reduced round version of Salsa20 stream cipher. The GOST 28147-89 cipher, defined in the respective GOST standard. Translation of this GOST into English is provided in the RFC-5830. + at item GCRY_CIPHER_CHACHA20 + at cindex ChaCha20 +This is the ChaCha20 stream cipher. + @end table @node Available cipher modes @@ -1720,9 +1724,9 @@ vector is passed as the buffer @var{K} of length @var{l} bytes and copied to internal data structures. 
The function checks that the IV matches the requirement of the selected algorithm and mode. -This function is also used with the Salsa20 stream cipher to set or -update the required nonce. In this case it needs to be called after -setting the key. +This function is also used with Salsa20 and ChaCha20 stream ciphers +to set or update the required nonce. In this case it needs to be +called after setting the key. This function is also used with the AEAD cipher modes to set or update the required nonce. diff --git a/src/cipher.h b/src/cipher.h index 5d1b5f6..ed57d3c 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -251,6 +251,7 @@ extern gcry_cipher_spec_t _gcry_cipher_spec_idea; extern gcry_cipher_spec_t _gcry_cipher_spec_salsa20; extern gcry_cipher_spec_t _gcry_cipher_spec_salsa20r12; extern gcry_cipher_spec_t _gcry_cipher_spec_gost28147; +extern gcry_cipher_spec_t _gcry_cipher_spec_chacha20; /* Declarations for the digest specifications. */ extern gcry_md_spec_t _gcry_digest_spec_crc32; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index c84a3f7..d4e9bb2 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -880,7 +880,8 @@ enum gcry_cipher_algos GCRY_CIPHER_CAMELLIA256 = 312, GCRY_CIPHER_SALSA20 = 313, GCRY_CIPHER_SALSA20R12 = 314, - GCRY_CIPHER_GOST28147 = 315 + GCRY_CIPHER_GOST28147 = 315, + GCRY_CIPHER_CHACHA20 = 316 }; /* The Rijndael algorithm is basically AES, so provide some macros. */ diff --git a/tests/basic.c b/tests/basic.c index 5c6c51c..406d82d 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -195,7 +195,7 @@ show_mac_not_available (int algo) -#define MAX_DATA_LEN 100 +#define MAX_DATA_LEN 128 void progress_handler (void *cb_data, const char *what, int printchar, @@ -2583,8 +2583,331 @@ check_stream_cipher (void) "\x44\xC9\x70\x0A\x0F\x21\x38\xE8\xC1\xA2\x86\xFB\x8C\x1F\xBF\xA0" } } - } + }, #endif /*USE_SALSA20*/ +#ifdef USE_CHACHA20 + /* From draft-strombergson-chacha-test-vectors-01 */ + { + "ChaCha20 128 bit, TC1", + GCRY_CIPHER_CHACHA20, 16, 8, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + { + { 8, + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x89\x67\x09\x52\x60\x83\x64\xfd" + }, + { 112, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x89\x67\x09\x52\x60\x83\x64\xfd\x00\xb2\xf9\x09\x36\xf0\x31\xc8" + "\xe7\x56\xe1\x5d\xba\x04\xb8\x49\x3d\x00\x42\x92\x59\xb2\x0f\x46" + "\xcc\x04\xf1\x11\x24\x6b\x6c\x2c\xe0\x66\xbe\x3b\xfb\x32\xd9\xaa" + "\x0f\xdd\xfb\xc1\x21\x23\xd4\xb9\xe4\x4f\x34\xdc\xa0\x5a\x10\x3f" + "\x6c\xd1\x35\xc2\x87\x8c\x83\x2b\x58\x96\xb1\x34\xf6\x14\x2a\x9d" + "\x4d\x8d\x0d\x8f\x10\x26\xd2\x0a\x0a\x81\x51\x2c\xbc\xe6\xe9\x75" + "\x8a\x71\x43\xd0\x21\x97\x80\x22\xa3\x84\x14\x1a\x80\xce\xa3\x06" + }, + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x89\x67\x09\x52\x60\x83\x64\xfd\x00\xb2\xf9\x09\x36\xf0\x31\xc8" + "\xe7\x56\xe1\x5d\xba\x04\xb8\x49\x3d\x00\x42\x92\x59\xb2\x0f\x46" + "\xcc\x04\xf1\x11\x24\x6b\x6c\x2c\xe0\x66\xbe\x3b\xfb\x32\xd9\xaa" + "\x0f\xdd\xfb\xc1\x21\x23\xd4\xb9\xe4\x4f\x34\xdc\xa0\x5a\x10\x3f" + "\x6c\xd1\x35\xc2\x87\x8c\x83\x2b\x58\x96\xb1\x34\xf6\x14\x2a\x9d" + "\x4d\x8d\x0d\x8f\x10\x26\xd2\x0a\x0a\x81\x51\x2c\xbc\xe6\xe9\x75" + "\x8a\x71\x43\xd0\x21\x97\x80\x22\xa3\x84\x14\x1a\x80\xce\xa3\x06" + "\x2f\x41\xf6\x7a\x75\x2e\x66\xad\x34\x11\x98\x4c\x78\x7e\x30\xad" + } + } + }, + { + "ChaCha20 256 bit, TC1", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + { + { 8, + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90" + }, + { 112, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90\x40\x5d\x6a\xe5\x53\x86\xbd\x28" + "\xbd\xd2\x19\xb8\xa0\x8d\xed\x1a\xa8\x36\xef\xcc\x8b\x77\x0d\xc7" + "\xda\x41\x59\x7c\x51\x57\x48\x8d\x77\x24\xe0\x3f\xb8\xd8\x4a\x37" + "\x6a\x43\xb8\xf4\x15\x18\xa1\x1c\xc3\x87\xb6\x69\xb2\xee\x65\x86" + "\x9f\x07\xe7\xbe\x55\x51\x38\x7a\x98\xba\x97\x7c\x73\x2d\x08\x0d" + "\xcb\x0f\x29\xa0\x48\xe3\x65\x69\x12\xc6\x53\x3e\x32\xee\x7a\xed" + "\x29\xb7\x21\x76\x9c\xe6\x4e\x43\xd5\x71\x33\xb0\x74\xd8\x39\xd5" + }, + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90\x40\x5d\x6a\xe5\x53\x86\xbd\x28" + "\xbd\xd2\x19\xb8\xa0\x8d\xed\x1a\xa8\x36\xef\xcc\x8b\x77\x0d\xc7" + "\xda\x41\x59\x7c\x51\x57\x48\x8d\x77\x24\xe0\x3f\xb8\xd8\x4a\x37" + "\x6a\x43\xb8\xf4\x15\x18\xa1\x1c\xc3\x87\xb6\x69\xb2\xee\x65\x86" + "\x9f\x07\xe7\xbe\x55\x51\x38\x7a\x98\xba\x97\x7c\x73\x2d\x08\x0d" + "\xcb\x0f\x29\xa0\x48\xe3\x65\x69\x12\xc6\x53\x3e\x32\xee\x7a\xed" + "\x29\xb7\x21\x76\x9c\xe6\x4e\x43\xd5\x71\x33\xb0\x74\xd8\x39\xd5" + "\x31\xed\x1f\x28\x51\x0a\xfb\x45\xac\xe1\x0a\x1f\x4b\x79\x4d\x6f" + } + } + }, + { + "ChaCha20 256 bit, TC2", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xc5\xd3\x0a\x7c\xe1\xec\x11\x93\x78\xc8\x4f\x48\x7d\x77\x5a\x85" + "\x42\xf1\x3e\xce\x23\x8a\x94\x55\xe8\x22\x9e\x88\x8d\xe8\x5b\xbd" + "\x29\xeb\x63\xd0\xa1\x7a\x5b\x99\x9b\x52\xda\x22\xbe\x40\x23\xeb" + "\x07\x62\x0a\x54\xf6\xfa\x6a\xd8\x73\x7b\x71\xeb\x04\x64\xda\xc0" + "\x10\xf6\x56\xe6\xd1\xfd\x55\x05\x3e\x50\xc4\x87\x5c\x99\x30\xa3" + "\x3f\x6d\x02\x63\xbd\x14\xdf\xd6\xab\x8c\x70\x52\x1c\x19\x33\x8b" + "\x23\x08\xb9\x5c\xf8\xd0\xbb\x7d\x20\x2d\x21\x02\x78\x0e\xa3\x52" + "\x8f\x1c\xb4\x85\x60\xf7\x6b\x20\xf3\x82\xb9\x42\x50\x0f\xce\xac" + } + } + }, + { + "ChaCha20 256 bit, TC3", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x01\x00\x00\x00\x00\x00\x00\x00", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xef\x3f\xdf\xd6\xc6\x15\x78\xfb\xf5\xcf\x35\xbd\x3d\xd3\x3b\x80" + "\x09\x63\x16\x34\xd2\x1e\x42\xac\x33\x96\x0b\xd1\x38\xe5\x0d\x32" + "\x11\x1e\x4c\xaf\x23\x7e\xe5\x3c\xa8\xad\x64\x26\x19\x4a\x88\x54" + "\x5d\xdc\x49\x7a\x0b\x46\x6e\x7d\x6b\xbd\xb0\x04\x1b\x2f\x58\x6b" + "\x53\x05\xe5\xe4\x4a\xff\x19\xb2\x35\x93\x61\x44\x67\x5e\xfb\xe4" + "\x40\x9e\xb7\xe8\xe5\xf1\x43\x0f\x5f\x58\x36\xae\xb4\x9b\xb5\x32" + "\x8b\x01\x7c\x4b\x9d\xc1\x1f\x8a\x03\x86\x3f\xa8\x03\xdc\x71\xd5" + "\x72\x6b\x2b\x6b\x31\xaa\x32\x70\x8a\xfe\x5a\xf1\xd6\xb6\x90\x58" + } + } + }, + { + "ChaCha20 256 bit, TC4", + GCRY_CIPHER_CHACHA20, 32, 8, + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff", + "\xff\xff\xff\xff\xff\xff\xff\xff", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xd9\xbf\x3f\x6b\xce\x6e\xd0\xb5\x42\x54\x55\x77\x67\xfb\x57\x44" + "\x3d\xd4\x77\x89\x11\xb6\x06\x05\x5c\x39\xcc\x25\xe6\x74\xb8\x36" + "\x3f\xea\xbc\x57\xfd\xe5\x4f\x79\x0c\x52\xc8\xae\x43\x24\x0b\x79" + "\xd4\x90\x42\xb7\x77\xbf\xd6\xcb\x80\xe9\x31\x27\x0b\x7f\x50\xeb" + 
"\x5b\xac\x2a\xcd\x86\xa8\x36\xc5\xdc\x98\xc1\x16\xc1\x21\x7e\xc3" + "\x1d\x3a\x63\xa9\x45\x13\x19\xf0\x97\xf3\xb4\xd6\xda\xb0\x77\x87" + "\x19\x47\x7d\x24\xd2\x4b\x40\x3a\x12\x24\x1d\x7c\xca\x06\x4f\x79" + "\x0f\x1d\x51\xcc\xaf\xf6\xb1\x66\x7d\x4b\xbc\xa1\x95\x8c\x43\x06" + } + } + }, + { + "ChaCha20 256 bit, TC5", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55" + "\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55", + "\x55\x55\x55\x55\x55\x55\x55\x55", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xbe\xa9\x41\x1a\xa4\x53\xc5\x43\x4a\x5a\xe8\xc9\x28\x62\xf5\x64" + "\x39\x68\x55\xa9\xea\x6e\x22\xd6\xd3\xb5\x0a\xe1\xb3\x66\x33\x11" + "\xa4\xa3\x60\x6c\x67\x1d\x60\x5c\xe1\x6c\x3a\xec\xe8\xe6\x1e\xa1" + "\x45\xc5\x97\x75\x01\x7b\xee\x2f\xa6\xf8\x8a\xfc\x75\x80\x69\xf7" + "\xe0\xb8\xf6\x76\xe6\x44\x21\x6f\x4d\x2a\x34\x22\xd7\xfa\x36\xc6" + "\xc4\x93\x1a\xca\x95\x0e\x9d\xa4\x27\x88\xe6\xd0\xb6\xd1\xcd\x83" + "\x8e\xf6\x52\xe9\x7b\x14\x5b\x14\x87\x1e\xae\x6c\x68\x04\xc7\x00" + "\x4d\xb5\xac\x2f\xce\x4c\x68\xc7\x26\xd0\x04\xb1\x0f\xca\xba\x86" + } + } + }, + { + "ChaCha20 256 bit, TC6", + GCRY_CIPHER_CHACHA20, 32, 8, + "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa" + "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa", + "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x9a\xa2\xa9\xf6\x56\xef\xde\x5a\xa7\x59\x1c\x5f\xed\x4b\x35\xae" + "\xa2\x89\x5d\xec\x7c\xb4\x54\x3b\x9e\x9f\x21\xf5\xe7\xbc\xbc\xf3" + "\xc4\x3c\x74\x8a\x97\x08\x88\xf8\x24\x83\x93\xa0\x9d\x43\xe0\xb7" + "\xe1\x64\xbc\x4d\x0b\x0f\xb2\x40\xa2\xd7\x21\x15\xc4\x80\x89\x06" + "\x72\x18\x44\x89\x44\x05\x45\xd0\x21\xd9\x7e\xf6\xb6\x93\xdf\xe5" + "\xb2\xc1\x32\xd4\x7e\x6f\x04\x1c\x90\x63\x65\x1f\x96\xb6\x23\xe6" + "\x2a\x11\x99\x9a\x23\xb6\xf7\xc4\x61\xb2\x15\x30\x26\xad\x5e\x86" + "\x6a\x2e\x59\x7e\xd0\x7b\x84\x01\xde\xc6\x3a\x09\x34\xc6\xb2\xa9" + } + } + }, + { + "ChaCha20 256 bit, TC7", + GCRY_CIPHER_CHACHA20, 32, 8, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff" + "\xff\xee\xdd\xcc\xbb\xaa\x99\x88\x77\x66\x55\x44\x33\x22\x11\x00", + "\x0f\x1e\x2d\x3c\x4b\x5a\x69\x78", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" 
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x9f\xad\xf4\x09\xc0\x08\x11\xd0\x04\x31\xd6\x7e\xfb\xd8\x8f\xba" + "\x59\x21\x8d\x5d\x67\x08\xb1\xd6\x85\x86\x3f\xab\xbb\x0e\x96\x1e" + "\xea\x48\x0f\xd6\xfb\x53\x2b\xfd\x49\x4b\x21\x51\x01\x50\x57\x42" + "\x3a\xb6\x0a\x63\xfe\x4f\x55\xf7\xa2\x12\xe2\x16\x7c\xca\xb9\x31" + "\xfb\xfd\x29\xcf\x7b\xc1\xd2\x79\xed\xdf\x25\xdd\x31\x6b\xb8\x84" + "\x3d\x6e\xde\xe0\xbd\x1e\xf1\x21\xd1\x2f\xa1\x7c\xbc\x2c\x57\x4c" + "\xcc\xab\x5e\x27\x51\x67\xb0\x8b\xd6\x86\xf8\xa0\x9d\xf8\x7e\xc3" + "\xff\xb3\x53\x61\xb9\x4e\xbf\xa1\x3f\xec\x0e\x48\x89\xd1\x8d\xa5" + } + } + }, + { + "ChaCha20 256 bit, TC8", + GCRY_CIPHER_CHACHA20, 32, 8, + "\xc4\x6e\xc1\xb1\x8c\xe8\xa8\x78\x72\x5a\x37\xe7\x80\xdf\xb7\x35" + "\x1f\x68\xed\x2e\x19\x4c\x79\xfb\xc6\xae\xbe\xe1\xa6\x67\x97\x5d", + "\x1a\xda\x31\xd5\xcf\x68\x82\x21", + { + { 128, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xf6\x3a\x89\xb7\x5c\x22\x71\xf9\x36\x88\x16\x54\x2b\xa5\x2f\x06" + "\xed\x49\x24\x17\x92\x30\x2b\x00\xb5\xe8\xf8\x0a\xe9\xa4\x73\xaf" + "\xc2\x5b\x21\x8f\x51\x9a\xf0\xfd\xd4\x06\x36\x2e\x8d\x69\xde\x7f" + "\x54\xc6\x04\xa6\xe0\x0f\x35\x3f\x11\x0f\x77\x1b\xdc\xa8\xab\x92" + "\xe5\xfb\xc3\x4e\x60\xa1\xd9\xa9\xdb\x17\x34\x5b\x0a\x40\x27\x36" + "\x85\x3b\xf9\x10\xb0\x60\xbd\xf1\xf8\x97\xb6\x29\x0f\x01\xd1\x38" + "\xae\x2c\x4c\x90\x22\x5b\xa9\xea\x14\xd5\x18\xf5\x59\x29\xde\xa0" + "\x98\xca\x7a\x6c\xcf\xe6\x12\x27\x05\x3c\x84\xe4\x9a\x4a\x33\x32" + }, + { 127, + "\xf6\x3a\x89\xb7\x5c\x22\x71\xf9\x36\x88\x16\x54\x2b\xa5\x2f\x06" + "\xed\x49\x24\x17\x92\x30\x2b\x00\xb5\xe8\xf8\x0a\xe9\xa4\x73\xaf" + "\xc2\x5b\x21\x8f\x51\x9a\xf0\xfd\xd4\x06\x36\x2e\x8d\x69\xde\x7f" + "\x54\xc6\x04\xa6\xe0\x0f\x35\x3f\x11\x0f\x77\x1b\xdc\xa8\xab\x92" + "\xe5\xfb\xc3\x4e\x60\xa1\xd9\xa9\xdb\x17\x34\x5b\x0a\x40\x27\x36" + "\x85\x3b\xf9\x10\xb0\x60\xbd\xf1\xf8\x97\xb6\x29\x0f\x01\xd1\x38" + "\xae\x2c\x4c\x90\x22\x5b\xa9\xea\x14\xd5\x18\xf5\x59\x29\xde\xa0" + "\x98\xca\x7a\x6c\xcf\xe6\x12\x27\x05\x3c\x84\xe4\x9a\x4a\x33", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + } + } + }, + /* from draft-nir-cfrg-chacha20-poly1305-02 */ + { + "ChaCha20 256 bit, IV96-bit", + GCRY_CIPHER_CHACHA20, 32, 12, + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + 
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", + "\x07\x00\x00\x00\x40\x41\x42\x43\x44\x45\x46\x47", + { + { 64, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x7b\xac\x2b\x25\x2d\xb4\x47\xaf\x09\xb6\x7a\x55\xa4\xe9\x55\x84" + "\x0a\xe1\xd6\x73\x10\x75\xd9\xeb\x2a\x93\x75\x78\x3e\xd5\x53\xff" + "\xa2\x7e\xcc\xde\xad\xdb\x4d\xb4\xd1\x17\x9c\xe4\xc9\x0b\x43\xd8" + "\xbc\xb7\x94\x8c\x4b\x4b\x7d\x8b\x7d\xf6\x27\x39\x32\xa4\x69\x16" + }, + }, + }, +#endif /*USE_CHACHA20*/ }; gcry_cipher_hd_t hde, hdd; @@ -3649,6 +3972,9 @@ check_ciphers (void) GCRY_CIPHER_SALSA20, GCRY_CIPHER_SALSA20R12, #endif +#if USE_CHACHA20 + GCRY_CIPHER_CHACHA20, +#endif 0 }; int i; ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 1 + cipher/chacha20-avx2-amd64.S | 949 +++++++++++++++++++++++++++++++++++++++++ cipher/chacha20-ssse3-amd64.S | 610 ++++++++++++++++++++++++++ cipher/chacha20.c | 544 +++++++++++++++++++++++ cipher/cipher.c | 3 + configure.ac | 16 +- doc/gcrypt.texi | 10 +- src/cipher.h | 1 + src/gcrypt.h.in | 3 +- tests/basic.c | 330 +++++++++++++- 10 files changed, 2460 insertions(+), 7 deletions(-) create mode 100644 cipher/chacha20-avx2-amd64.S create mode 100644 cipher/chacha20-ssse3-amd64.S create mode 100644 cipher/chacha20.c hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cloos at jhcloos.com Mon May 12 19:44:48 2014 From: cloos at jhcloos.com (James Cloos) Date: Mon, 12 May 2014 13:44:48 -0400 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-70-ga39ee75 In-Reply-To: (by Jussi Kivilinna's message of "Sun, 11 May 2014 11:06:09 +0200") References: Message-ID: JK> chacha20: add AVX2/AMD64 assembly implementation JK> Based on public domain implementation by Andrew Moon at JK> https://github.com/floodyberry/chacha-opt It would be cool to grab the sse2 versions, too. Even the sse2 version is significantly faster than the compiled version. There is still a large deployment of systems which lack ssse3 or avx support. Plus, most of the (affordable) KVM-based virtual offerrings run qemu-kvm with options which block everything newer than sse2, even when the host cpu supports them. (One hopes that those vendors will upgrade to qemu 2 and start using -cpu host, but one cannot hold one's breath that long.) 
-JimC -- James Cloos OpenPGP: 0x997A9F17ED7DAEA6 From smueller at chronox.de Tue May 13 02:40:05 2014 From: smueller at chronox.de (Stephan Mueller) Date: Tue, 13 May 2014 02:40:05 +0200 Subject: [PATCH v6 7/7] User interface to DRBG In-Reply-To: <1768928.lDN30gJEhR@myon.chronox.de> References: <1573675.V5Uuq2JQxy@myon.chronox.de> <1840716.S0MU8MFyFb@myon.chronox.de> <1768928.lDN30gJEhR@myon.chronox.de> Message-ID: <1428285.cGI62b364h@myon.chronox.de> Changes v4: * add explicit type casting from void to unsigned char as reported in https://bugzilla.novell.com/show_bug.cgi?id=877233 Signed-off-by: Stephan Mueller --- diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index c84a3f7..569d7a0 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -193,7 +193,7 @@ gcry_error_t gcry_err_make_from_errno (gcry_err_source_t source, int err); /* Return an error value with the system error ERR. */ gcry_err_code_t gcry_error_from_errno (int err); - + /* NOTE: Since Libgcrypt 1.6 the thread callbacks are not anymore used. However we keep it to allow for some source code compatibility if used in the standard way. */ @@ -228,7 +228,7 @@ struct gcry_thread_cbs (GCRY_THREAD_OPTION_PTHREAD | (GCRY_THREAD_OPTION_VERSION << 8))} - + /* A generic context object as used by some functions. */ struct gcry_context; typedef struct gcry_context *gcry_ctx_t; @@ -254,7 +254,7 @@ typedef struct } gcry_buffer_t; - + /* Check that the library fulfills the version requirement. */ const char *gcry_check_version (const char *req_version); @@ -329,13 +329,14 @@ enum gcry_ctl_cmds GCRYCTL_SET_CCM_LENGTHS = 69, GCRYCTL_CLOSE_RANDOM_DEVICE = 70, GCRYCTL_INACTIVATE_FIPS_FLAG = 71, - GCRYCTL_REACTIVATE_FIPS_FLAG = 72 + GCRYCTL_REACTIVATE_FIPS_FLAG = 72, + GCRYCTL_DRBG_REINIT = 73, }; /* Perform various operations defined by CMD. */ gcry_error_t gcry_control (enum gcry_ctl_cmds CMD, ...); - + /* S-expression management. */ /* The object to represent an S-expression as used with the public key @@ -477,7 +478,7 @@ gpg_error_t gcry_sexp_extract_param (gcry_sexp_t sexp, const char *list, ...) 
_GCRY_GCC_ATTR_SENTINEL(0); - + /******************************************* * * * Multi Precision Integer Functions * @@ -833,7 +834,7 @@ gcry_mpi_t _gcry_mpi_get_const (int no); #endif /* GCRYPT_NO_MPI_MACROS */ - + /************************************ * * * Symmetric Cipher Functions * @@ -1015,7 +1016,7 @@ size_t gcry_cipher_get_algo_blklen (int algo); #define gcry_cipher_test_algo(a) \ gcry_cipher_algo_info( (a), GCRYCTL_TEST_ALGO, NULL, NULL ) - + /************************************ * * * Asymmetric Cipher Functions * @@ -1114,7 +1115,7 @@ gcry_sexp_t gcry_pk_get_param (int algo, const char *name); gcry_error_t gcry_pubkey_get_sexp (gcry_sexp_t *r_sexp, int mode, gcry_ctx_t ctx); - + /************************************ * * @@ -1291,7 +1292,7 @@ void gcry_md_debug (gcry_md_hd_t hd, const char *suffix); #define gcry_md_get_asnoid(a,b,n) \ gcry_md_algo_info((a), GCRYCTL_GET_ASNOID, (b), (n)) - + /********************************************** * * @@ -1411,7 +1412,7 @@ int gcry_mac_map_name (const char *name) _GCRY_GCC_ATTR_PURE; #define gcry_mac_test_algo(a) \ gcry_mac_algo_info( (a), GCRYCTL_TEST_ALGO, NULL, NULL ) - + /****************************** * * * Key Derivation Functions * @@ -1439,7 +1440,7 @@ gpg_error_t gcry_kdf_derive (const void *passphrase, size_t passphraselen, - + /************************************ * * * Random Generating Functions * @@ -1508,7 +1509,7 @@ void gcry_create_nonce (void *buffer, size_t length); - + /*******************************/ /* */ /* Prime Number Functions */ @@ -1567,7 +1568,7 @@ void gcry_prime_release_factors (gcry_mpi_t *factors); gcry_error_t gcry_prime_check (gcry_mpi_t x, unsigned int flags); - + /************************************ * * * Miscellaneous Stuff * @@ -1672,6 +1673,136 @@ int gcry_is_secure (const void *a) _GCRY_GCC_ATTR_PURE; /* Return true if Libgcrypt is in FIPS mode. */ #define gcry_fips_mode_active() !!gcry_control (GCRYCTL_FIPS_MODE_P, 0) +/* DRBG test data */ +struct drbg_test_data { + struct drbg_string *testentropy; /* TEST PARAMETER: test entropy */ + int fail_seed_source:1; /* if set, the seed function will return an + * error */ +}; + +/* DRBG input data structure for DRBG generate with additional information + * string */ +struct drbg_gen { + unsigned char *outbuf; /* output buffer for random numbers */ + unsigned int outlen; /* size of output buffer */ + struct drbg_string *addtl; /* input buffer for + * additional information string */ + struct drbg_test_data *test_data; /* test data */ +}; + +/* + * Concatenation Helper and string operation helper + * + * SP800-90A requires the concatenation of different data. To avoid copying + * buffers around or allocate additional memory, the following data structure + * is used to point to the original memory with its size. In addition, it + * is used to build a linked list. The linked list defines the concatenation + * of individual buffers. The order of memory block referenced in that + * linked list determines the order of concatenation. 
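+ *
+ * For example, the concatenation entropy || nonce can be handed to a
+ * DRBG function without copying as:
+ *
+ *   struct drbg_string s1, s2;
+ *   drbg_string_fill(&s1, entropy, entropylen);
+ *   drbg_string_fill(&s2, nonce, noncelen);
+ *   s1.next = &s2;
+ *
+ * with &s1 passed as the seed material.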
+ */ +/* DRBG string definition */ +struct drbg_string { + const unsigned char *buf; + size_t len; + struct drbg_string *next; +}; + +static inline void drbg_string_fill(struct drbg_string *string, + const unsigned char *buf, size_t len) +{ + string->buf = buf; + string->len = len; + string->next = NULL; +} + +/* this is a wrapper function for users of libgcrypt */ +static inline void gcry_randomize_drbg(void *outbuf, size_t outlen, + enum gcry_random_level level, + struct drbg_string *addtl) +{ + struct drbg_gen genbuf; + genbuf.outbuf = (unsigned char *)outbuf; + genbuf.outlen = outlen; + genbuf.addtl = addtl; + genbuf.test_data = NULL; + gcry_randomize(&genbuf, 0, level); +} + +/* this is a wrapper function for users of libgcrypt */ +static inline void gcry_randomize_drbg_test(void *outbuf, size_t outlen, + enum gcry_random_level level, + struct drbg_string *addtl, + struct drbg_test_data *test_data) +{ + struct drbg_gen genbuf; + genbuf.outbuf = (unsigned char *)outbuf; + genbuf.outlen = outlen; + genbuf.addtl = addtl; + genbuf.test_data = test_data; + gcry_randomize(&genbuf, 0, level); +} + + +/* + * DRBG flags bitmasks + * + * 31 (B) 28 19 (A) 0 + * +-+-+-+--------+---+-----------+-----+ + * |~|~|u|~~~~~~~~| 3 | 2 | 1 | + * +-+-+-+--------+- -+-----------+-----+ + * ctl flg| |drbg use selection flags + * + */ + +/* internal state control flags (B) */ +#define DRBG_PREDICTION_RESIST ((u_int32_t)1<<28) + +/* CTR type modifiers (A.1)*/ +#define DRBG_CTRAES ((u_int32_t)1<<0) +#define DRBG_CTRSERPENT ((u_int32_t)1<<1) +#define DRBG_CTRTWOFISH ((u_int32_t)1<<2) +#define DRBG_CTR_MASK (DRBG_CTRAES | DRBG_CTRSERPENT | DRBG_CTRTWOFISH) + +/* HASH type modifiers (A.2)*/ +#define DRBG_HASHSHA1 ((u_int32_t)1<<4) +#define DRBG_HASHSHA224 ((u_int32_t)1<<5) +#define DRBG_HASHSHA256 ((u_int32_t)1<<6) +#define DRBG_HASHSHA384 ((u_int32_t)1<<7) +#define DRBG_HASHSHA512 ((u_int32_t)1<<8) +#define DRBG_HASH_MASK (DRBG_HASHSHA1 | DRBG_HASHSHA224 | \ + DRBG_HASHSHA256 | DRBG_HASHSHA384 | \ + DRBG_HASHSHA512) +/* type modifiers (A.3)*/ +#define DRBG_HMAC ((u_int32_t)1<<12) +#define DRBG_SYM128 ((u_int32_t)1<<13) +#define DRBG_SYM192 ((u_int32_t)1<<14) +#define DRBG_SYM256 ((u_int32_t)1<<15) +#define DRBG_TYPE_MASK (DRBG_HMAC | DRBG_SYM128 | DRBG_SYM192 | \ + DRBG_SYM256) +#define DRBG_CIPHER_MASK (DRBG_CTR_MASK | DRBG_HASH_MASK | DRBG_TYPE_MASK) + +#define DRBG_PR_CTRAES128 (DRBG_PREDICTION_RESIST | DRBG_CTRAES | DRBG_SYM128) +#define DRBG_PR_CTRAES192 (DRBG_PREDICTION_RESIST | DRBG_CTRAES | DRBG_SYM192) +#define DRBG_PR_CTRAES256 (DRBG_PREDICTION_RESIST | DRBG_CTRAES | DRBG_SYM256) +#define DRBG_NOPR_CTRAES128 (DRBG_CTRAES | DRBG_SYM128) +#define DRBG_NOPR_CTRAES192 (DRBG_CTRAES | DRBG_SYM192) +#define DRBG_NOPR_CTRAES256 (DRBG_CTRAES | DRBG_SYM256) +#define DRBG_PR_HASHSHA1 (DRBG_PREDICTION_RESIST | DRBG_HASHSHA1) +#define DRBG_PR_HASHSHA256 (DRBG_PREDICTION_RESIST | DRBG_HASHSHA256) +#define DRBG_PR_HASHSHA384 (DRBG_PREDICTION_RESIST | DRBG_HASHSHA384) +#define DRBG_PR_HASHSHA512 (DRBG_PREDICTION_RESIST | DRBG_HASHSHA512) +#define DRBG_NOPR_HASHSHA1 (DRBG_HASHSHA1) +#define DRBG_NOPR_HASHSHA256 (DRBG_HASHSHA256) +#define DRBG_NOPR_HASHSHA384 (DRBG_HASHSHA384) +#define DRBG_NOPR_HASHSHA512 (DRBG_HASHSHA512) +#define DRBG_PR_HMACSHA1 (DRBG_PREDICTION_RESIST | DRBG_HASHSHA1 | DRBG_HMAC) +#define DRBG_PR_HMACSHA256 (DRBG_PREDICTION_RESIST | DRBG_HASHSHA256|DRBG_HMAC) +#define DRBG_PR_HMACSHA384 (DRBG_PREDICTION_RESIST | DRBG_HASHSHA384|DRBG_HMAC) +#define DRBG_PR_HMACSHA512 (DRBG_PREDICTION_RESIST | 
DRBG_HASHSHA512|DRBG_HMAC) +#define DRBG_NOPR_HMACSHA1 (DRBG_HASHSHA1 | DRBG_HMAC) +#define DRBG_NOPR_HMACSHA256 (DRBG_HASHSHA256 | DRBG_HMAC) +#define DRBG_NOPR_HMACSHA384 (DRBG_HASHSHA384 | DRBG_HMAC) +#define DRBG_NOPR_HMACSHA512 (DRBG_HASHSHA512 | DRBG_HMAC) #if 0 /* (Keep Emacsens' auto-indent happy.) */ { From jussi.kivilinna at iki.fi Tue May 13 21:07:57 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 13 May 2014 22:07:57 +0300 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-70-ga39ee75 In-Reply-To: References: Message-ID: <53726D8D.3040707@iki.fi> On 12.05.2014 20:44, James Cloos wrote: > JK> chacha20: add AVX2/AMD64 assembly implementation > JK> Based on public domain implementation by Andrew Moon at > JK> https://github.com/floodyberry/chacha-opt > > It would be cool to grab the sse2 versions, too. > > Even the sse2 version is significantly faster than the compiled version. > > There is still a large deployment of systems which lack ssse3 or avx > support. > > Plus, most of the (affordable) KVM-based virtual offerrings run qemu-kvm > with options which block everything newer than sse2, even when the host > cpu supports them. (One hopes that those vendors will upgrade to qemu 2 > and start using -cpu host, but one cannot hold one's breath that long.) > I picked SSSE3 version since majority of systems today have support for it, but that necessary isn't the case with virtual systems. I'll look at this after I get Poly1305 SSE2+AVX2 patches ready and out. -Jussi > -JimC > -- > James Cloos OpenPGP: 0x997A9F17ED7DAEA6 > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 713 bytes Desc: OpenPGP digital signature URL: From cvs at cvs.gnupg.org Fri May 16 19:19:28 2014 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Fri, 16 May 2014 19:19:28 +0200 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-73-gc20daee Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via c20daeeb05329bfc6cc2c562cbd4b965291fe0e1 (commit) via a3062db748f272e0f7346e1ed9e0bf7ed61a4eae (commit) via a7d9eeeba632b7eb4a5b15ff17f6565181642f3c (commit) from a39ee7555691d18cae97560f130aaf952bfbd278 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit c20daeeb05329bfc6cc2c562cbd4b965291fe0e1 Author: Jussi Kivilinna Date: Mon May 12 20:14:32 2014 +0300 chacha20/AVX2: clear upper-halfs of YMM registers on entry * cipher/chacha20-avx2-amd64.S (_gcry_chacha20_amd64_avx2_blocks): Add 'vzeroupper' at beginning. 
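(Presumably to avoid the expensive AVX/SSE state-transition penalties
that Intel CPUs impose when legacy SSE code runs while the upper halves
of the YMM registers are dirty; 'vzeroupper' puts the registers into a
clean state before the 256-bit code starts.)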
-- Signed-off-by: Jussi Kivilinna diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S index 0c5f3f9..1f33de8 100644 --- a/cipher/chacha20-avx2-amd64.S +++ b/cipher/chacha20-avx2-amd64.S @@ -42,6 +42,7 @@ .type _gcry_chacha20_amd64_avx2_blocks, at function; _gcry_chacha20_amd64_avx2_blocks: .Lchacha_blocks_avx2_local: + vzeroupper pushq %rbx pushq %rbp pushq %r12 commit a3062db748f272e0f7346e1ed9e0bf7ed61a4eae Author: Jussi Kivilinna Date: Mon May 12 20:11:33 2014 +0300 chacha20/AVX2: check for ENABLE_AVX2_SUPPORT instead of HAVE_GCC_INLINE_ASM_AVX2 * cipher/chacha20.c (USE_AVX2): Enable depending on ENABLE_AVX2_SUPPORT, not HAVE_GCC_INLINE_ASM_AVX2. * cipher/chacha20-avx2-amd64.S: Ditto. -- Signed-off-by: Jussi Kivilinna diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S index c50a0c0..0c5f3f9 100644 --- a/cipher/chacha20-avx2-amd64.S +++ b/cipher/chacha20-avx2-amd64.S @@ -27,7 +27,7 @@ #include #if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_AVX2) && USE_CHACHA20 + defined(ENABLE_AVX2_SUPPORT) && USE_CHACHA20 #ifdef __PIC__ # define RIP (%rip) diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 2ac5a32..e2cf442 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -57,7 +57,7 @@ /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_AVX2) + defined(ENABLE_AVX2_SUPPORT) # define USE_AVX2 1 #endif commit a7d9eeeba632b7eb4a5b15ff17f6565181642f3c Author: Jussi Kivilinna Date: Mon May 12 19:55:35 2014 +0300 chacha20/SSSE3: clear XMM registers after use * cipher/chacha20-ssse3-amd64.S (_gcry_chacha20_amd64_ssse3_blocks): On return, clear XMM registers. -- Signed-off-by: Jussi Kivilinna diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S index aaa7e5b..50c2ff8 100644 --- a/cipher/chacha20-ssse3-amd64.S +++ b/cipher/chacha20-ssse3-amd64.S @@ -594,9 +594,25 @@ _gcry_chacha20_amd64_ssse3_blocks: .Lchacha_blocks_ssse3_done: movdqu %xmm11, 48(%rdi) movq %rbp, %rsp + pxor %xmm15, %xmm15 + pxor %xmm7, %xmm7 + pxor %xmm14, %xmm14 + pxor %xmm6, %xmm6 + pxor %xmm13, %xmm13 + pxor %xmm5, %xmm5 + pxor %xmm12, %xmm12 + pxor %xmm4, %xmm4 popq %rbp popq %rbx movl $(63 + 512 + 16), %eax + pxor %xmm11, %xmm11 + pxor %xmm3, %xmm3 + pxor %xmm10, %xmm10 + pxor %xmm2, %xmm2 + pxor %xmm9, %xmm9 + pxor %xmm1, %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm0, %xmm0 ret .size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks; ----------------------------------------------------------------------- Summary of changes: cipher/chacha20-avx2-amd64.S | 3 ++- cipher/chacha20-ssse3-amd64.S | 16 ++++++++++++++++ cipher/chacha20.c | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Fri May 16 20:08:13 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 16 May 2014 21:08:13 +0300 Subject: [PATCH 1/2] poly1305: add AMD64/SSE2 optimized implementation Message-ID: <20140516180813.4774.38392.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'poly1305-sse2-amd64.S'. * cipher/poly1305-internal.h (POLY1305_USE_SSE2) (POLY1305_SSE2_BLOCKSIZE, POLY1305_SSE2_STATESIZE) (POLY1305_SSE2_ALIGNMENT): New. 
(POLY1305_LARGEST_BLOCKSIZE, POLY1305_LARGEST_STATESIZE) (POLY1305_STATE_ALIGNMENT): Use SSE2 versions when needed. * cipher/poly1305-sse2-amd64.S: New. * cipher/poly1305.c [POLY1305_USE_SSE2] (_gcry_poly1305_amd64_sse2_init_ext) (_gcry_poly1305_amd64_sse2_finish_ext) (_gcry_poly1305_amd64_sse2_blocks, poly1305_amd64_sse2_ops): New. (_gcry_polu1305_init) [POLY1305_USE_SSE2]: Use SSE2 version. * configure.ac [host=x86_64]: Add 'poly1305-sse2-amd64.lo'. -- Add Andrew Moon's public domain SSE2 implementation of Poly1305. Original source is available at: https://github.com/floodyberry/poly1305-opt Benchmarks on Intel i5-4570 (haswell): Old: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.844 ns/B 1130.2 MiB/s 2.70 c/B New: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.448 ns/B 2129.5 MiB/s 1.43 c/B Benchmarks on Intel i5-2450M (sandy-bridge): Old: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 1.25 ns/B 763.0 MiB/s 3.12 c/B New: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.605 ns/B 1575.9 MiB/s 1.51 c/B Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 cipher/poly1305-internal.h | 28 + cipher/poly1305-sse2-amd64.S | 1035 ++++++++++++++++++++++++++++++++++++++++++ cipher/poly1305.c | 23 + configure.ac | 7 5 files changed, 1091 insertions(+), 3 deletions(-) create mode 100644 cipher/poly1305-sse2-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 4468647..a32ae89 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -72,6 +72,7 @@ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ +poly1305-sse2-amd64.S \ rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \ rmd160.c \ rsa.c \ diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index d2c6b5c..fa3fe75 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -44,15 +44,37 @@ #define POLY1305_REF_ALIGNMENT sizeof(void *) +/* POLY1305_USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */ +#undef POLY1305_USE_SSE2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define POLY1305_USE_SSE2 1 +# define POLY1305_SSE2_BLOCKSIZE 32 +# define POLY1305_SSE2_STATESIZE 248 +# define POLY1305_SSE2_ALIGNMENT 16 +#endif + + /* Largest block-size used in any implementation (optimized implementations * might use block-size multiple of 16). */ -#define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE +#ifdef POLY1305_USE_SSE2 +# define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE +#else +# define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE +#endif /* Largest state-size used in any implementation. */ -#define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE +#ifdef POLY1305_USE_SSE2 +# define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE +#else +# define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE +#endif /* Minimum alignment for state pointer passed to implementations. */ -#define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT +#ifdef POLY1305_USE_SSE2 +# define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT +#else +# define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT +#endif typedef struct poly1305_key_s diff --git a/cipher/poly1305-sse2-amd64.S b/cipher/poly1305-sse2-amd64.S new file mode 100644 index 0000000..106b119 --- /dev/null +++ b/cipher/poly1305-sse2-amd64.S @@ -0,0 +1,1035 @@ +/* poly1305-sse2-amd64.S - AMD64/SSE2 implementation of Poly1305 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. 
+ * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/poly1305-opt + */ + +#include + +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) + +.text + + +.align 8 +.globl _gcry_poly1305_amd64_sse2_init_ext +.type _gcry_poly1305_amd64_sse2_init_ext, at function; +_gcry_poly1305_amd64_sse2_init_ext: +.Lpoly1305_init_ext_x86_local: + xor %edx, %edx + pushq %r12 + pushq %r13 + pushq %r14 + movq %rdx, %r10 + movq $-1, %rcx + testq %r10, %r10 + pxor %xmm0, %xmm0 + movq $0xfffffc0ffff, %r9 + movdqa %xmm0, (%rdi) + cmove %rcx, %r10 + movdqa %xmm0, 16(%rdi) + movq $0xffc0fffffff, %rcx + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movq 8(%rsi), %r11 + movq %r11, %r8 + movq (%rsi), %r12 + andq %r12, %rcx + shrq $44, %r12 + shlq $20, %r8 + shrq $24, %r11 + orq %r8, %r12 + movq $0xffffffc0f, %r8 + andq %r9, %r12 + andq %r8, %r11 + movl %ecx, %r8d + andl $67108863, %r8d + movq %rcx, %r9 + movl %r8d, 84(%rdi) + movq %r12, %r8 + shrq $26, %r9 + shlq $18, %r8 + orq %r8, %r9 + movq %r12, %r8 + shrq $8, %r8 + andl $67108863, %r9d + andl $67108863, %r8d + movl %r9d, 92(%rdi) + movq %r12, %r9 + movl %r8d, 100(%rdi) + movq %r11, %r8 + shrq $34, %r9 + shlq $10, %r8 + orq %r8, %r9 + movq %r11, %r8 + shrq $16, %r8 + andl $67108863, %r9d + movl %r9d, 108(%rdi) + cmpq $16, %r10 + movl %r8d, 116(%rdi) + movl 16(%rsi), %r8d + movl %r8d, 124(%rdi) + movl 20(%rsi), %r8d + movl %r8d, 132(%rdi) + movl 24(%rsi), %r8d + movl %r8d, 140(%rdi) + movl 28(%rsi), %esi + movl %esi, 148(%rdi) + jbe .Lpoly1305_init_ext_sse2_done + lea (%r11,%r11,4), %r14 + shlq $2, %r14 + lea (%r12,%r12), %rax + mulq %r14 + movq %rax, %r13 + movq %rcx, %rax + movq %rdx, %r8 + mulq %rcx + addq %rax, %r13 + lea (%rcx,%rcx), %rax + movq %r13, %r9 + adcq %rdx, %r8 + mulq %r12 + shlq $20, %r8 + movq %rax, %rsi + shrq $44, %r9 + movq %r11, %rax + orq %r9, %r8 + movq %rdx, %r9 + mulq %r14 + addq %rax, %rsi + movq %rcx, %rax + adcq %rdx, %r9 + addq %r11, %r11 + mulq %r11 + addq %rsi, %r8 + movq %rax, %r11 + movq %r12, %rax + movq %rdx, %rcx + adcq $0, %r9 + mulq %r12 + addq %rax, %r11 + movq %r8, %rsi + adcq %rdx, %rcx + shlq $20, %r9 + shrq $44, %rsi + orq %rsi, %r9 + movq $0xfffffffffff, %rsi + addq %r11, %r9 + movq %r9, %r12 + adcq $0, %rcx + andq %rsi, %r13 + shlq $22, %rcx + andq %rsi, %r8 + shrq $42, %r12 + orq %r12, %rcx + movq %rsi, %r12 + lea (%rcx,%rcx,4), %rcx + addq %rcx, %r13 + movq %rsi, %rcx + andq %r13, %rcx + shrq $44, %r13 + movq %rcx, %r14 + addq %r13, %r8 + movq $0x3ffffffffff, %r13 + andq %r8, %r12 + andq %r13, %r9 + shrq $44, %r8 + movq %r12, %r11 + addq %r8, %r9 + movq %r12, %rax + movq %r9, %r13 + movl %ecx, %r8d + shrq $26, %r14 + andl $67108863, %r8d + shlq $18, %r11 + shrq $34, %rax + orq %r11, %r14 + shlq $10, %r13 + movq %r12, %r11 + orq %r13, %rax + movq %r9, %r13 + shrq $8, %r11 + shrq $16, 
%r13 + andl $67108863, %r14d + andl $67108863, %r11d + andl $67108863, %eax + movl %r8d, 88(%rdi) + cmpq $64, %r10 + movl %r8d, 80(%rdi) + movl %r14d, 104(%rdi) + movl %r14d, 96(%rdi) + movl %r11d, 120(%rdi) + movl %r11d, 112(%rdi) + movl %eax, 136(%rdi) + movl %eax, 128(%rdi) + movl %r13d, 152(%rdi) + movl %r13d, 144(%rdi) + jbe .Lpoly1305_init_ext_sse2_done + lea (%r9,%r9,4), %r14 + shlq $2, %r14 + lea (%r12,%r12), %rax + mulq %r14 + movq %rax, %r8 + movq %rcx, %rax + movq %rdx, %r10 + mulq %rcx + addq %rax, %r8 + lea (%rcx,%rcx), %rax + movq %r8, %r11 + adcq %rdx, %r10 + andq %rsi, %r8 + mulq %r12 + shlq $20, %r10 + movq %rax, %r13 + shrq $44, %r11 + movq %r9, %rax + orq %r11, %r10 + movq %rdx, %r11 + mulq %r14 + addq %rax, %r13 + movq %rcx, %rax + adcq %rdx, %r11 + addq %r9, %r9 + mulq %r9 + addq %r13, %r10 + movq %rax, %r9 + movq %r12, %rax + movq %rdx, %rcx + adcq $0, %r11 + mulq %r12 + addq %rax, %r9 + movq %r10, %r13 + adcq %rdx, %rcx + andq %rsi, %r10 + shlq $20, %r11 + shrq $44, %r13 + orq %r13, %r11 + addq %r9, %r11 + movq %rsi, %r9 + movq %r11, %r12 + adcq $0, %rcx + shlq $22, %rcx + shrq $42, %r12 + orq %r12, %rcx + lea (%rcx,%rcx,4), %rcx + addq %rcx, %r8 + andq %r8, %r9 + shrq $44, %r8 + movl %r9d, %eax + addq %r8, %r10 + movq $0x3ffffffffff, %r8 + andq %r10, %rsi + andq %r8, %r11 + shrq $44, %r10 + movq %rsi, %r8 + addq %r10, %r11 + andl $67108863, %eax + shrq $26, %r9 + movq %r11, %r10 + shlq $18, %r8 + shlq $10, %r10 + orq %r8, %r9 + movq %rsi, %r8 + shrq $34, %rsi + andl $67108863, %r9d + shrq $8, %r8 + orq %r10, %rsi + shrq $16, %r11 + andl $67108863, %r8d + andl $67108863, %esi + movl %eax, 168(%rdi) + movl %eax, 160(%rdi) + movl %r9d, 184(%rdi) + movl %r9d, 176(%rdi) + movl %r8d, 200(%rdi) + movl %r8d, 192(%rdi) + movl %esi, 216(%rdi) + movl %esi, 208(%rdi) + movl %r11d, 232(%rdi) + movl %r11d, 224(%rdi) +.Lpoly1305_init_ext_sse2_done: + movq $0, 240(%rdi) + popq %r14 + popq %r13 + popq %r12 + ret +.size _gcry_poly1305_amd64_sse2_init_ext,.-_gcry_poly1305_amd64_sse2_init_ext; + + +.align 8 +.globl _gcry_poly1305_amd64_sse2_finish_ext +.type _gcry_poly1305_amd64_sse2_finish_ext, at function; +_gcry_poly1305_amd64_sse2_finish_ext: +.Lpoly1305_finish_ext_x86_local: + pushq %rbp + movq %rsp, %rbp + subq $64, %rsp + andq $~63, %rsp + movq %rdx, 32(%rsp) + movq %rcx, 40(%rsp) + andq %rdx, %rdx + jz .Lpoly1305_finish_x86_no_leftover + pxor %xmm0, %xmm0 + movdqa %xmm0, 0+0(%rsp) + movdqa %xmm0, 16+0(%rsp) + leaq 0(%rsp), %r8 + testq $16, %rdx + jz .Lpoly1305_finish_x86_skip16 + movdqu 0(%rsi), %xmm0 + movdqa %xmm0, 0(%r8) + addq $16, %rsi + addq $16, %r8 +.Lpoly1305_finish_x86_skip16: + testq $8, %rdx + jz .Lpoly1305_finish_x86_skip8 + movq 0(%rsi), %rax + movq %rax, 0(%r8) + addq $8, %rsi + addq $8, %r8 +.Lpoly1305_finish_x86_skip8: + testq $4, %rdx + jz .Lpoly1305_finish_x86_skip4 + movl 0(%rsi), %eax + movl %eax, 0(%r8) + addq $4, %rsi + addq $4, %r8 +.Lpoly1305_finish_x86_skip4: + testq $2, %rdx + jz .Lpoly1305_finish_x86_skip2 + movw 0(%rsi), %ax + movw %ax, 0(%r8) + addq $2, %rsi + addq $2, %r8 +.Lpoly1305_finish_x86_skip2: + testq $1, %rdx + jz .Lpoly1305_finish_x86_skip1 + movb 0(%rsi), %al + movb %al, 0(%r8) + addq $1, %r8 +.Lpoly1305_finish_x86_skip1: + cmpq $16, %rdx + je .Lpoly1305_finish_x86_is16 + movb $1, 0(%r8) +.Lpoly1305_finish_x86_is16: + movq $4, %rax + jae .Lpoly1305_finish_x86_16andover + movq $8, %rax +.Lpoly1305_finish_x86_16andover: + orq %rax, 240(%rdi) + leaq 0(%rsp), %rsi + movq $32, %rdx + callq .Lpoly1305_blocks_x86_local 
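+	/* The buffered final bytes (zero-padded and flagged above) have now
+	 * been run through the block function; the code below folds the
+	 * per-lane accumulators into a single value and adds the encoded
+	 * key pad to produce the final 16-byte tag. */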
+.Lpoly1305_finish_x86_no_leftover: + testq $1, 240(%rdi) + jz .Lpoly1305_finish_x86_not_started + movq 32(%rsp), %rdx + andq %rdx, %rdx + jz .Lpoly1305_finish_x86_r2r + cmpq $16, %rdx + jg .Lpoly1305_finish_x86_r2r + xorl %r10d, %r10d + movl 84(%rdi), %eax + movl 92(%rdi), %ecx + movl 100(%rdi), %edx + movl 108(%rdi), %r8d + movl 116(%rdi), %r9d + movl %eax, 80(%rdi) + movl $1, 8+80(%rdi) + movl %ecx, 96(%rdi) + movl %r10d, 8+96(%rdi) + movl %edx, 112(%rdi) + movl %r10d, 8+112(%rdi) + movl %r8d, 128(%rdi) + movl %r10d, 8+128(%rdi) + movl %r9d, 144(%rdi) + movl %r10d, 8+144(%rdi) + jmp .Lpoly1305_finish_x86_combine +.Lpoly1305_finish_x86_r2r: + movl 84(%rdi), %eax + movl 92(%rdi), %ecx + movl 100(%rdi), %edx + movl 108(%rdi), %r8d + movl 116(%rdi), %r9d + movl %eax, 8+80(%rdi) + movl %ecx, 8+96(%rdi) + movl %edx, 8+112(%rdi) + movl %r8d, 8+128(%rdi) + movl %r9d, 8+144(%rdi) +.Lpoly1305_finish_x86_combine: + xorq %rsi, %rsi + movq $32, %rdx + callq .Lpoly1305_blocks_x86_local +.Lpoly1305_finish_x86_not_started: + movq 0(%rdi), %r8 + movq 8(%rdi), %r9 + movq %r9, %r10 + movq 16(%rdi), %r11 + shlq $44, %r9 + shrq $20, %r10 + shlq $24, %r11 + orq %r9, %r8 + orq %r11, %r10 + pxor %xmm0, %xmm0 + movl 124(%rdi), %eax + movl 132(%rdi), %ecx + movl 140(%rdi), %edx + movl 148(%rdi), %esi + movq 40(%rsp), %r11 + shlq $32, %rcx + shlq $32, %rsi + orq %rcx, %rax + orq %rsi, %rdx + addq %r8, %rax + adcq %r10, %rdx + movq %rax, 0(%r11) + movq %rdx, 8(%r11) + movq %rbp, %rax + subq %rsp, %rax + movq %rbp, %rsp + movdqa %xmm0, 0(%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movdqa %xmm0, 80(%rdi) + movdqa %xmm0, 96(%rdi) + movdqa %xmm0, 112(%rdi) + movdqa %xmm0, 128(%rdi) + movdqa %xmm0, 144(%rdi) + movdqa %xmm0, 160(%rdi) + movdqa %xmm0, 176(%rdi) + movdqa %xmm0, 192(%rdi) + movdqa %xmm0, 208(%rdi) + movdqa %xmm0, 224(%rdi) + popq %rbp + addq $8, %rax + ret +.size _gcry_poly1305_amd64_sse2_finish_ext,.-_gcry_poly1305_amd64_sse2_finish_ext; + + +.align 8 +.globl _gcry_poly1305_amd64_sse2_blocks +.type _gcry_poly1305_amd64_sse2_blocks, at function; +_gcry_poly1305_amd64_sse2_blocks: +.Lpoly1305_blocks_x86_local: + pushq %rbp + movq %rsp, %rbp + pushq %rbx + andq $-64, %rsp + subq $328, %rsp + movq 240(%rdi), %rax + movl $(1<<24), %r8d + movl $((1<<26)-1), %r9d + movd %r8, %xmm0 + movd %r9, %xmm5 + pshufd $0x44, %xmm0, %xmm0 + pshufd $0x44, %xmm5, %xmm5 + testb $4, %al + je .Lpoly1305_blocks_x86_3 + psrldq $8, %xmm0 +.Lpoly1305_blocks_x86_3: + testb $8, %al + je .Lpoly1305_blocks_x86_4 + pxor %xmm0, %xmm0 +.Lpoly1305_blocks_x86_4: + movdqa %xmm0, 168(%rsp) + testb $1, %al + jne .Lpoly1305_blocks_x86_5 + movq 16(%rsi), %xmm0 + movdqa %xmm5, %xmm7 + movdqa %xmm5, %xmm10 + movq (%rsi), %xmm6 + orq $1, %rax + subq $32, %rdx + movq 8(%rsi), %xmm1 + punpcklqdq %xmm0, %xmm6 + movq 24(%rsi), %xmm0 + pand %xmm6, %xmm7 + movdqa %xmm6, %xmm9 + psrlq $52, %xmm6 + addq $32, %rsi + punpcklqdq %xmm0, %xmm1 + movdqa %xmm1, %xmm0 + psrlq $26, %xmm9 + psllq $12, %xmm0 + movq %rax, 240(%rdi) + pand %xmm5, %xmm9 + por %xmm0, %xmm6 + psrlq $40, %xmm1 + pand %xmm6, %xmm10 + por 168(%rsp), %xmm1 + psrlq $26, %xmm6 + pand %xmm5, %xmm6 +.Lpoly1305_blocks_x86_6: + movdqa 80(%rdi), %xmm13 + cmpq $63, %rdx + movl $(5), %r8d + movd %r8, %xmm14 + pshufd $0x44, %xmm14, %xmm14 + movdqa 96(%rdi), %xmm15 + movdqa %xmm13, -8(%rsp) + movdqa 112(%rdi), %xmm0 + movdqa %xmm14, 136(%rsp) + movdqa 128(%rdi), %xmm3 + movdqa %xmm15, 312(%rsp) + pmuludq %xmm14, %xmm15 + movdqa 144(%rdi), 
%xmm13 + movdqa %xmm0, 232(%rsp) + pmuludq %xmm14, %xmm0 + movdqa %xmm3, 152(%rsp) + pmuludq %xmm14, %xmm3 + movdqa %xmm13, 56(%rsp) + pmuludq %xmm14, %xmm13 + movdqa %xmm15, 40(%rsp) + movdqa %xmm0, -24(%rsp) + movdqa %xmm3, -40(%rsp) + movdqa %xmm13, -56(%rsp) + jbe .Lpoly1305_blocks_x86_7 + movdqa 192(%rdi), %xmm15 + leaq 32(%rsi), %rax + movq %rdx, %rcx + movdqa 176(%rdi), %xmm14 + movdqa %xmm15, %xmm2 + movdqa 208(%rdi), %xmm0 + movdqa %xmm15, 216(%rsp) + movdqa %xmm14, 296(%rsp) + movdqa 224(%rdi), %xmm3 + pmuludq 136(%rsp), %xmm14 + movdqa -24(%rsp), %xmm13 + movdqa %xmm14, 8(%rsp) + pmuludq 136(%rsp), %xmm2 + movdqa -40(%rsp), %xmm14 + movdqa %xmm0, 120(%rsp) + pmuludq 136(%rsp), %xmm0 + movdqa %xmm3, 24(%rsp) + movdqa 160(%rdi), %xmm12 + movdqa %xmm0, %xmm8 + movdqa -56(%rsp), %xmm15 + movdqa %xmm13, 88(%rsp) + pmuludq 136(%rsp), %xmm3 + movdqa %xmm2, 104(%rsp) + movdqa %xmm0, %xmm13 + movdqa -8(%rsp), %xmm11 + movdqa %xmm3, 280(%rsp) + movdqa %xmm2, %xmm3 + movdqa %xmm0, 200(%rsp) + movdqa %xmm14, 184(%rsp) + movdqa %xmm15, 264(%rsp) + jmp .Lpoly1305_blocks_x86_8 +.p2align 6,,63 +.Lpoly1305_blocks_x86_13: + movdqa 200(%rsp), %xmm13 + movdqa %xmm3, %xmm6 + movdqa 200(%rsp), %xmm8 + movdqa 104(%rsp), %xmm3 +.Lpoly1305_blocks_x86_8: + movdqa 8(%rsp), %xmm4 + pmuludq %xmm6, %xmm3 + subq $64, %rcx + pmuludq %xmm10, %xmm8 + movdqa 104(%rsp), %xmm2 + movdqa 200(%rsp), %xmm0 + pmuludq %xmm1, %xmm4 + movdqa 280(%rsp), %xmm15 + pmuludq %xmm6, %xmm13 + movdqa 280(%rsp), %xmm14 + pmuludq %xmm1, %xmm0 + paddq %xmm3, %xmm4 + pmuludq %xmm1, %xmm2 + movdqa 280(%rsp), %xmm3 + paddq %xmm8, %xmm4 + pmuludq %xmm9, %xmm15 + movdqa 280(%rsp), %xmm8 + pmuludq %xmm10, %xmm14 + pmuludq %xmm6, %xmm8 + paddq %xmm13, %xmm2 + movdqa %xmm6, %xmm13 + pmuludq %xmm1, %xmm3 + paddq %xmm15, %xmm4 + movdqa 296(%rsp), %xmm15 + pmuludq %xmm12, %xmm13 + paddq %xmm14, %xmm2 + movdqa %xmm7, %xmm14 + paddq %xmm8, %xmm0 + pmuludq %xmm12, %xmm14 + movdqa %xmm9, %xmm8 + pmuludq 296(%rsp), %xmm6 + pmuludq %xmm12, %xmm8 + movdqa %xmm6, 248(%rsp) + pmuludq %xmm10, %xmm15 + movq -16(%rax), %xmm6 + paddq %xmm13, %xmm3 + movdqa %xmm10, %xmm13 + paddq %xmm14, %xmm4 + movq -8(%rax), %xmm14 + paddq %xmm8, %xmm2 + movq -32(%rax), %xmm8 + pmuludq %xmm12, %xmm13 + paddq %xmm15, %xmm3 + pmuludq %xmm12, %xmm1 + movdqa 216(%rsp), %xmm15 + pmuludq 216(%rsp), %xmm10 + punpcklqdq %xmm6, %xmm8 + movq -24(%rax), %xmm6 + pmuludq %xmm9, %xmm15 + paddq %xmm13, %xmm0 + movdqa 296(%rsp), %xmm13 + paddq 248(%rsp), %xmm1 + punpcklqdq %xmm14, %xmm6 + movdqa 296(%rsp), %xmm14 + pmuludq %xmm9, %xmm13 + pmuludq 120(%rsp), %xmm9 + movdqa %xmm15, 72(%rsp) + paddq %xmm10, %xmm1 + movdqa 216(%rsp), %xmm15 + pmuludq %xmm7, %xmm14 + movdqa %xmm6, %xmm10 + paddq %xmm9, %xmm1 + pmuludq %xmm7, %xmm15 + paddq %xmm13, %xmm0 + paddq 72(%rsp), %xmm3 + movdqa 120(%rsp), %xmm13 + psllq $12, %xmm10 + paddq %xmm14, %xmm2 + movdqa %xmm5, %xmm14 + pand %xmm8, %xmm14 + pmuludq %xmm7, %xmm13 + paddq %xmm15, %xmm0 + movdqa %xmm14, 248(%rsp) + movdqa %xmm8, %xmm14 + psrlq $52, %xmm8 + movdqu (%rax), %xmm9 + por %xmm10, %xmm8 + pmuludq 24(%rsp), %xmm7 + movdqu 16(%rax), %xmm10 + paddq %xmm13, %xmm3 + pxor %xmm13, %xmm13 + movdqa %xmm9, %xmm15 + paddq %xmm7, %xmm1 + movdqa %xmm6, %xmm7 + movdqa %xmm10, -72(%rsp) + punpckldq %xmm10, %xmm15 + movdqa %xmm15, %xmm10 + punpckldq %xmm13, %xmm10 + punpckhdq -72(%rsp), %xmm9 + psrlq $40, %xmm6 + movdqa %xmm10, 72(%rsp) + movdqa %xmm9, %xmm10 + punpckhdq %xmm13, %xmm9 + psllq $18, %xmm9 + paddq 72(%rsp), %xmm4 + addq $64, %rax + paddq 
%xmm9, %xmm3 + movdqa 40(%rsp), %xmm9 + cmpq $63, %rcx + punpckhdq %xmm13, %xmm15 + psllq $6, %xmm15 + punpckldq %xmm13, %xmm10 + paddq %xmm15, %xmm2 + psllq $12, %xmm10 + por 168(%rsp), %xmm6 + pmuludq %xmm6, %xmm9 + movdqa 88(%rsp), %xmm15 + paddq %xmm10, %xmm0 + movdqa 88(%rsp), %xmm13 + psrlq $14, %xmm7 + pand %xmm5, %xmm8 + movdqa 184(%rsp), %xmm10 + pand %xmm5, %xmm7 + pmuludq %xmm7, %xmm15 + paddq %xmm9, %xmm4 + pmuludq %xmm6, %xmm13 + movdqa 184(%rsp), %xmm9 + paddq 168(%rsp), %xmm1 + pmuludq %xmm7, %xmm10 + pmuludq %xmm6, %xmm9 + paddq %xmm15, %xmm4 + movdqa 184(%rsp), %xmm15 + paddq %xmm13, %xmm2 + psrlq $26, %xmm14 + movdqa 264(%rsp), %xmm13 + paddq %xmm10, %xmm2 + pmuludq %xmm8, %xmm15 + pand %xmm5, %xmm14 + paddq %xmm9, %xmm0 + pmuludq %xmm6, %xmm13 + movdqa 264(%rsp), %xmm9 + movdqa 264(%rsp), %xmm10 + pmuludq %xmm11, %xmm6 + pmuludq %xmm8, %xmm9 + paddq %xmm15, %xmm4 + movdqa 264(%rsp), %xmm15 + pmuludq %xmm14, %xmm10 + paddq %xmm13, %xmm3 + movdqa %xmm7, %xmm13 + pmuludq %xmm7, %xmm15 + paddq %xmm6, %xmm1 + movdqa 312(%rsp), %xmm6 + paddq %xmm9, %xmm2 + pmuludq %xmm11, %xmm13 + movdqa 248(%rsp), %xmm9 + paddq %xmm10, %xmm4 + pmuludq %xmm8, %xmm6 + pmuludq 312(%rsp), %xmm7 + paddq %xmm15, %xmm0 + movdqa %xmm9, %xmm10 + movdqa %xmm14, %xmm15 + pmuludq %xmm11, %xmm10 + paddq %xmm13, %xmm3 + movdqa %xmm8, %xmm13 + pmuludq %xmm11, %xmm13 + paddq %xmm6, %xmm3 + paddq %xmm7, %xmm1 + movdqa 232(%rsp), %xmm6 + pmuludq %xmm11, %xmm15 + pmuludq 232(%rsp), %xmm8 + paddq %xmm10, %xmm4 + paddq %xmm8, %xmm1 + movdqa 312(%rsp), %xmm10 + paddq %xmm13, %xmm0 + pmuludq %xmm14, %xmm6 + movdqa 312(%rsp), %xmm13 + pmuludq %xmm9, %xmm10 + paddq %xmm15, %xmm2 + movdqa 232(%rsp), %xmm7 + pmuludq %xmm14, %xmm13 + pmuludq 152(%rsp), %xmm14 + paddq %xmm14, %xmm1 + pmuludq %xmm9, %xmm7 + paddq %xmm6, %xmm3 + paddq %xmm10, %xmm2 + movdqa 152(%rsp), %xmm10 + paddq %xmm13, %xmm0 + pmuludq %xmm9, %xmm10 + paddq %xmm7, %xmm0 + movdqa %xmm4, %xmm7 + psrlq $26, %xmm7 + pmuludq 56(%rsp), %xmm9 + pand %xmm5, %xmm4 + paddq %xmm7, %xmm2 + paddq %xmm9, %xmm1 + paddq %xmm10, %xmm3 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm9 + movdqa %xmm3, %xmm6 + psrlq $26, %xmm7 + pand %xmm5, %xmm3 + psrlq $26, %xmm6 + paddq %xmm7, %xmm0 + pand %xmm5, %xmm9 + paddq %xmm6, %xmm1 + movdqa %xmm0, %xmm10 + movdqa %xmm1, %xmm6 + pand %xmm5, %xmm10 + pand %xmm5, %xmm1 + psrlq $26, %xmm6 + pmuludq 136(%rsp), %xmm6 + paddq %xmm6, %xmm4 + movdqa %xmm0, %xmm6 + psrlq $26, %xmm6 + movdqa %xmm4, %xmm2 + movdqa %xmm4, %xmm7 + paddq %xmm6, %xmm3 + psrlq $26, %xmm2 + pand %xmm5, %xmm7 + movdqa %xmm3, %xmm0 + paddq %xmm2, %xmm9 + pand %xmm5, %xmm3 + psrlq $26, %xmm0 + paddq %xmm0, %xmm1 + ja .Lpoly1305_blocks_x86_13 + leaq -64(%rdx), %rax + movdqa %xmm3, %xmm6 + andl $63, %edx + andq $-64, %rax + leaq 64(%rsi,%rax), %rsi +.Lpoly1305_blocks_x86_7: + cmpq $31, %rdx + jbe .Lpoly1305_blocks_x86_9 + movdqa -24(%rsp), %xmm13 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm3 + movdqa 40(%rsp), %xmm11 + movdqa %xmm1, %xmm12 + testq %rsi, %rsi + movdqa -40(%rsp), %xmm2 + pmuludq %xmm13, %xmm0 + movdqa %xmm1, %xmm8 + pmuludq %xmm1, %xmm11 + movdqa %xmm10, %xmm4 + movdqa %xmm1, %xmm14 + pmuludq %xmm2, %xmm3 + movdqa %xmm6, %xmm15 + pmuludq %xmm1, %xmm13 + movdqa %xmm7, %xmm1 + pmuludq %xmm2, %xmm12 + paddq %xmm0, %xmm11 + movdqa -56(%rsp), %xmm0 + pmuludq %xmm10, %xmm2 + paddq %xmm3, %xmm13 + pmuludq %xmm0, %xmm4 + movdqa %xmm9, %xmm3 + pmuludq %xmm0, %xmm3 + paddq %xmm2, %xmm11 + pmuludq %xmm0, %xmm8 + movdqa %xmm6, %xmm2 + pmuludq %xmm0, %xmm2 + movdqa 
-8(%rsp), %xmm0 + paddq %xmm4, %xmm13 + movdqa 312(%rsp), %xmm4 + paddq %xmm3, %xmm11 + pmuludq 312(%rsp), %xmm6 + movdqa 312(%rsp), %xmm3 + pmuludq %xmm0, %xmm1 + paddq %xmm2, %xmm12 + pmuludq %xmm0, %xmm15 + movdqa %xmm9, %xmm2 + pmuludq %xmm0, %xmm2 + pmuludq %xmm7, %xmm3 + paddq %xmm1, %xmm11 + movdqa 232(%rsp), %xmm1 + pmuludq %xmm0, %xmm14 + paddq %xmm15, %xmm8 + pmuludq %xmm10, %xmm0 + paddq %xmm2, %xmm13 + movdqa 312(%rsp), %xmm2 + pmuludq %xmm10, %xmm4 + paddq %xmm3, %xmm13 + movdqa 152(%rsp), %xmm3 + pmuludq %xmm9, %xmm2 + paddq %xmm6, %xmm14 + pmuludq 232(%rsp), %xmm10 + paddq %xmm0, %xmm12 + pmuludq %xmm9, %xmm1 + paddq %xmm10, %xmm14 + movdqa 232(%rsp), %xmm0 + pmuludq %xmm7, %xmm3 + paddq %xmm4, %xmm8 + pmuludq 152(%rsp), %xmm9 + paddq %xmm2, %xmm12 + paddq %xmm9, %xmm14 + pmuludq %xmm7, %xmm0 + paddq %xmm1, %xmm8 + pmuludq 56(%rsp), %xmm7 + paddq %xmm3, %xmm8 + paddq %xmm7, %xmm14 + paddq %xmm0, %xmm12 + je .Lpoly1305_blocks_x86_10 + movdqu (%rsi), %xmm1 + pxor %xmm0, %xmm0 + paddq 168(%rsp), %xmm14 + movdqu 16(%rsi), %xmm2 + movdqa %xmm1, %xmm3 + punpckldq %xmm2, %xmm3 + punpckhdq %xmm2, %xmm1 + movdqa %xmm3, %xmm4 + movdqa %xmm1, %xmm2 + punpckldq %xmm0, %xmm4 + punpckhdq %xmm0, %xmm3 + punpckhdq %xmm0, %xmm1 + punpckldq %xmm0, %xmm2 + movdqa %xmm2, %xmm0 + psllq $6, %xmm3 + paddq %xmm4, %xmm11 + psllq $12, %xmm0 + paddq %xmm3, %xmm13 + psllq $18, %xmm1 + paddq %xmm0, %xmm12 + paddq %xmm1, %xmm8 +.Lpoly1305_blocks_x86_10: + movdqa %xmm11, %xmm9 + movdqa %xmm8, %xmm1 + movdqa %xmm11, %xmm7 + psrlq $26, %xmm9 + movdqa %xmm8, %xmm6 + pand %xmm5, %xmm7 + paddq %xmm13, %xmm9 + psrlq $26, %xmm1 + pand %xmm5, %xmm6 + movdqa %xmm9, %xmm10 + paddq %xmm14, %xmm1 + pand %xmm5, %xmm9 + psrlq $26, %xmm10 + movdqa %xmm1, %xmm0 + pand %xmm5, %xmm1 + paddq %xmm12, %xmm10 + psrlq $26, %xmm0 + pmuludq 136(%rsp), %xmm0 + movdqa %xmm10, %xmm2 + paddq %xmm0, %xmm7 + psrlq $26, %xmm2 + movdqa %xmm7, %xmm0 + pand %xmm5, %xmm10 + paddq %xmm2, %xmm6 + psrlq $26, %xmm0 + pand %xmm5, %xmm7 + movdqa %xmm6, %xmm2 + paddq %xmm0, %xmm9 + pand %xmm5, %xmm6 + psrlq $26, %xmm2 + paddq %xmm2, %xmm1 +.Lpoly1305_blocks_x86_9: + testq %rsi, %rsi + je .Lpoly1305_blocks_x86_11 + movdqa %xmm7, 0(%rdi) + movdqa %xmm9, 16(%rdi) + movdqa %xmm10, 32(%rdi) + movdqa %xmm6, 48(%rdi) + movdqa %xmm1, 64(%rdi) + movq -8(%rbp), %rbx + leave + ret +.Lpoly1305_blocks_x86_5: + movdqa 0(%rdi), %xmm7 + movdqa 16(%rdi), %xmm9 + movdqa 32(%rdi), %xmm10 + movdqa 48(%rdi), %xmm6 + movdqa 64(%rdi), %xmm1 + jmp .Lpoly1305_blocks_x86_6 +.Lpoly1305_blocks_x86_11: + movdqa %xmm7, %xmm0 + movdqa %xmm9, %xmm2 + movdqa %xmm6, %xmm3 + psrldq $8, %xmm0 + movabsq $4398046511103, %rbx + paddq %xmm0, %xmm7 + psrldq $8, %xmm2 + movdqa %xmm10, %xmm0 + movd %xmm7, %edx + paddq %xmm2, %xmm9 + psrldq $8, %xmm0 + movl %edx, %ecx + movd %xmm9, %eax + paddq %xmm0, %xmm10 + shrl $26, %ecx + psrldq $8, %xmm3 + movdqa %xmm1, %xmm0 + addl %ecx, %eax + movd %xmm10, %ecx + paddq %xmm3, %xmm6 + movl %eax, %r9d + shrl $26, %eax + psrldq $8, %xmm0 + addl %ecx, %eax + movd %xmm6, %ecx + paddq %xmm0, %xmm1 + movl %eax, %esi + andl $67108863, %r9d + movd %xmm1, %r10d + shrl $26, %esi + andl $67108863, %eax + andl $67108863, %edx + addl %ecx, %esi + salq $8, %rax + movl %r9d, %ecx + shrl $18, %r9d + movl %esi, %r8d + shrl $26, %esi + andl $67108863, %r8d + addl %r10d, %esi + orq %r9, %rax + salq $16, %rsi + movq %r8, %r9 + shrl $10, %r8d + salq $26, %rcx + orq %r8, %rsi + salq $34, %r9 + orq %rdx, %rcx + movq %rsi, %r8 + shrq $42, %rsi + movabsq $17592186044415, 
%rdx + orq %r9, %rax + andq %rbx, %r8 + leaq (%rsi,%rsi,4), %rsi + andq %rdx, %rcx + andq %rdx, %rax + movabsq $-4398046511104, %r10 + addq %rsi, %rcx + movq %rcx, %rsi + shrq $44, %rcx + addq %rcx, %rax + andq %rdx, %rsi + movq %rax, %rcx + shrq $44, %rax + addq %r8, %rax + andq %rdx, %rcx + andq %rax, %rbx + shrq $42, %rax + leaq (%rsi,%rax,4), %rsi + addq %rbx, %r10 + addq %rax, %rsi + movq %rsi, %r8 + shrq $44, %rsi + andq %rdx, %r8 + addq %rcx, %rsi + leaq 5(%r8), %r9 + movq %r9, %r11 + andq %rdx, %r9 + shrq $44, %r11 + addq %rsi, %r11 + movq %r11, %rax + andq %r11, %rdx + shrq $44, %rax + addq %rax, %r10 + movq %r10, %rax + shrq $63, %rax + subq $1, %rax + movq %rax, %rcx + andq %rax, %r9 + andq %rax, %rdx + notq %rcx + andq %r10, %rax + andq %rcx, %r8 + andq %rcx, %rsi + andq %rbx, %rcx + orq %r9, %r8 + orq %rdx, %rsi + orq %rax, %rcx + movq %r8, 0(%rdi) + movq %rsi, 8(%rdi) + movq %rcx, 16(%rdi) + movq -8(%rbp), %rbx + movq %rbp, %rax + subq %rsp, %rax + pxor %xmm15, %xmm15 + pxor %xmm7, %xmm7 + pxor %xmm14, %xmm14 + pxor %xmm6, %xmm6 + pxor %xmm13, %xmm13 + pxor %xmm5, %xmm5 + pxor %xmm12, %xmm12 + pxor %xmm4, %xmm4 + leave + addq $8, %rax + pxor %xmm11, %xmm11 + pxor %xmm3, %xmm3 + pxor %xmm10, %xmm10 + pxor %xmm2, %xmm2 + pxor %xmm9, %xmm9 + pxor %xmm1, %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm0, %xmm0 + ret +.size _gcry_poly1305_amd64_sse2_blocks,.-_gcry_poly1305_amd64_sse2_blocks; + +#endif diff --git a/cipher/poly1305.c b/cipher/poly1305.c index 472ae42..cd1902a 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -38,6 +38,25 @@ static const char *selftest (void); +#ifdef POLY1305_USE_SSE2 + +void _gcry_poly1305_amd64_sse2_init_ext(void *state, const poly1305_key_t *key); +unsigned int _gcry_poly1305_amd64_sse2_finish_ext(void *state, const byte *m, + size_t remaining, + byte mac[16]); +unsigned int _gcry_poly1305_amd64_sse2_blocks(void *ctx, const byte *m, + size_t bytes); + +static const poly1305_ops_t poly1305_amd64_sse2_ops = { + POLY1305_SSE2_BLOCKSIZE, + _gcry_poly1305_amd64_sse2_init_ext, + _gcry_poly1305_amd64_sse2_blocks, + _gcry_poly1305_amd64_sse2_finish_ext +}; + +#endif + + #ifdef HAVE_U64_TYPEDEF /* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit @@ -612,7 +631,11 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; +#ifdef POLY1305_USE_SSE2 + ctx->ops = &poly1305_amd64_sse2_ops; +#else ctx->ops = &poly1305_default_ops; +#endif buf_cpy (keytmp.b, key, POLY1305_KEYLEN); poly1305_init (ctx, &keytmp); diff --git a/configure.ac b/configure.ac index 3a0fd52..4dc36d5 100644 --- a/configure.ac +++ b/configure.ac @@ -1821,6 +1821,13 @@ if test "$found" = "1" ; then esac fi +case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-sse2-amd64.lo" + ;; +esac + LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" From jussi.kivilinna at iki.fi Fri May 16 20:08:18 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 16 May 2014 21:08:18 +0300 Subject: [PATCH 2/2] poly1305: add AMD64/AVX2 optimized implementation In-Reply-To: <20140516180813.4774.38392.stgit@localhost6.localdomain6> References: <20140516180813.4774.38392.stgit@localhost6.localdomain6> Message-ID: <20140516180818.4774.14840.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'poly1305-avx2-amd64.S'. * cipher/poly1305-avx2-amd64.S: New. 
* cipher/poly1305-internal.h (POLY1305_USE_AVX2)
(POLY1305_AVX2_BLOCKSIZE, POLY1305_AVX2_STATESIZE)
(POLY1305_AVX2_ALIGNMENT): New.
(POLY1305_LARGEST_BLOCKSIZE, POLY1305_LARGEST_STATESIZE)
(POLY1305_STATE_ALIGNMENT): Use AVX2 versions when needed.
* cipher/poly1305.c [POLY1305_USE_AVX2]
(_gcry_poly1305_amd64_avx2_init_ext)
(_gcry_poly1305_amd64_avx2_finish_ext)
(_gcry_poly1305_amd64_avx2_blocks, poly1305_amd64_avx2_ops): New.
(_gcry_poly1305_init) [POLY1305_USE_AVX2]: Use AVX2 implementation
if AVX2 supported by CPU.
* configure.ac [host=x86_64]: Add 'poly1305-avx2-amd64.lo'.
--

Add Andrew Moon's public domain AVX2 implementation of Poly1305.
Original source is available at:
https://github.com/floodyberry/poly1305-opt

Benchmarks on Intel i5-4570 (haswell):

Old:
          |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305 |     0.448 ns/B     2129.5 MiB/s      1.43 c/B

New:
          |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305 |     0.205 ns/B     4647.1 MiB/s     0.739 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am           |    2
 cipher/poly1305-avx2-amd64.S |  954 ++++++++++++++++++++++++++++++++++++++++++
 cipher/poly1305-internal.h   |   23 +
 cipher/poly1305.c            |   26 +
 configure.ac                 |    1
 5 files changed, 1002 insertions(+), 4 deletions(-)
 create mode 100644 cipher/poly1305-avx2-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index a32ae89..19b0097 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -72,7 +72,7 @@ gost28147.c gost.h \
 gostr3411-94.c \
 md4.c \
 md5.c \
-poly1305-sse2-amd64.S \
+poly1305-sse2-amd64.S poly1305-avx2-amd64.S \
 rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \
 rmd160.c \
 rsa.c \
diff --git a/cipher/poly1305-avx2-amd64.S b/cipher/poly1305-avx2-amd64.S
new file mode 100644
index 0000000..0ba7e76
--- /dev/null
+++ b/cipher/poly1305-avx2-amd64.S
@@ -0,0 +1,954 @@
+/* poly1305-avx2-amd64.S - AMD64/AVX2 implementation of Poly1305
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/poly1305-opt + */ + +#include + +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(ENABLE_AVX2_SUPPORT) + +.text + + +.align 8 +.globl _gcry_poly1305_amd64_avx2_init_ext +.type _gcry_poly1305_amd64_avx2_init_ext, at function; +_gcry_poly1305_amd64_avx2_init_ext: +.Lpoly1305_init_ext_avx2_local: + xor %edx, %edx + vzeroupper + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rcx + vpxor %ymm0, %ymm0, %ymm0 + movq $-1, %r8 + testq %rcx, %rcx + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm0, 32(%rdi) + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm0, 128(%rdi) + movq 8(%rsi), %r9 + cmove %r8, %rcx + movq $0xffc0fffffff, %r8 + movq %r9, %r13 + movq (%rsi), %r10 + andq %r10, %r8 + shrq $44, %r10 + movq %r8, %r14 + shlq $20, %r13 + orq %r13, %r10 + movq $0xfffffc0ffff, %r13 + shrq $24, %r9 + andq %r13, %r10 + movq $0xffffffc0f, %r13 + andq %r13, %r9 + movl %r8d, %r13d + andl $67108863, %r13d + movl %r13d, 164(%rdi) + movq %r10, %r13 + shrq $26, %r14 + shlq $18, %r13 + orq %r13, %r14 + movq %r10, %r13 + shrq $8, %r13 + andl $67108863, %r14d + andl $67108863, %r13d + movl %r14d, 172(%rdi) + movq %r10, %r14 + movl %r13d, 180(%rdi) + movq %r9, %r13 + shrq $34, %r14 + shlq $10, %r13 + orq %r13, %r14 + movq %r9, %r13 + shrq $16, %r13 + andl $67108863, %r14d + movl %r14d, 188(%rdi) + movl %r13d, 196(%rdi) + cmpq $16, %rcx + jbe .Lpoly1305_init_ext_avx2_continue + lea (%r9,%r9,4), %r11 + shlq $2, %r11 + lea (%r10,%r10), %rax + mulq %r11 + movq %rax, %r13 + movq %r8, %rax + movq %rdx, %r14 + mulq %r8 + addq %rax, %r13 + lea (%r8,%r8), %rax + movq %r13, %r12 + adcq %rdx, %r14 + mulq %r10 + shlq $20, %r14 + movq %rax, %r15 + shrq $44, %r12 + movq %r11, %rax + orq %r12, %r14 + movq %rdx, %r12 + mulq %r9 + addq %rax, %r15 + movq %r8, %rax + adcq %rdx, %r12 + addq %r15, %r14 + lea (%r9,%r9), %r15 + movq %r14, %rbx + adcq $0, %r12 + mulq %r15 + shlq $20, %r12 + movq %rdx, %r11 + shrq $44, %rbx + orq %rbx, %r12 + movq %rax, %rbx + movq %r10, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %r11 + addq %rbx, %r12 + movq $0xfffffffffff, %rbx + movq %r12, %r15 + adcq $0, %r11 + andq %rbx, %r13 + shlq $22, %r11 + andq %rbx, %r14 + shrq $42, %r15 + orq %r15, %r11 + lea (%r11,%r11,4), %r11 + addq %r11, %r13 + movq %rbx, %r11 + andq %r13, %r11 + shrq $44, %r13 + movq %r11, %r15 + addq %r13, %r14 + movq $0x3ffffffffff, %r13 + andq %r14, %rbx + andq %r13, %r12 + movq %rbx, %r13 + shrq $26, %r15 + shlq $18, %r13 + orq %r13, %r15 + movq %rbx, %r13 + shrq $44, %r14 + shrq $8, %r13 + addq %r14, %r12 + movl %r11d, %r14d + andl $67108863, %r15d + andl $67108863, %r14d + andl $67108863, %r13d + movl %r14d, 204(%rdi) + movq %rbx, %r14 + movl %r13d, 220(%rdi) + movq %r12, %r13 + shrq $34, %r14 + shlq $10, %r13 + orq %r13, %r14 + movq %r12, %r13 + shrq $16, %r13 + andl $67108863, %r14d + movl %r15d, 212(%rdi) + movl %r14d, 228(%rdi) + movl %r13d, 236(%rdi) + cmpq $32, %rcx + jbe .Lpoly1305_init_ext_avx2_continue + movq %r9, %rax + lea (%rbx,%rbx,4), %r14 + shlq $2, %r14 + mulq %r14 + movq %rdi, -32(%rsp) + lea (%r12,%r12,4), %rdi + shlq $2, %rdi + movq %rax, %r14 + movq %r10, %rax + movq %rdx, %r15 + mulq %rdi + movq %rax, %r13 + movq %r11, %rax + movq %rcx, -16(%rsp) + movq %rdx, %rcx + mulq %r8 + addq %rax, %r13 + movq %rdi, %rax + movq %rsi, -24(%rsp) + adcq %rdx, %rcx + addq %r13, %r14 + adcq %rcx, %r15 + movq %r14, %rcx + mulq %r9 + shlq $20, %r15 
+ movq %rax, %r13 + shrq $44, %rcx + movq %r11, %rax + orq %rcx, %r15 + movq %rdx, %rcx + mulq %r10 + movq %rax, %rsi + movq %rbx, %rax + movq %rdx, %rdi + mulq %r8 + addq %rax, %rsi + movq %r11, %rax + adcq %rdx, %rdi + addq %rsi, %r13 + adcq %rdi, %rcx + addq %r13, %r15 + movq %r15, %rdi + adcq $0, %rcx + mulq %r9 + shlq $20, %rcx + movq %rdx, %rsi + shrq $44, %rdi + orq %rdi, %rcx + movq %rax, %rdi + movq %rbx, %rax + mulq %r10 + movq %rax, %r9 + movq %r8, %rax + movq %rdx, %r10 + movq $0xfffffffffff, %r8 + mulq %r12 + addq %rax, %r9 + adcq %rdx, %r10 + andq %r8, %r14 + addq %r9, %rdi + adcq %r10, %rsi + andq %r8, %r15 + addq %rdi, %rcx + movq $0x3ffffffffff, %rdi + movq %rcx, %r10 + adcq $0, %rsi + andq %rdi, %rcx + shlq $22, %rsi + shrq $42, %r10 + orq %r10, %rsi + movq -32(%rsp), %rdi + lea (%rsi,%rsi,4), %r9 + movq %r8, %rsi + addq %r9, %r14 + andq %r14, %rsi + shrq $44, %r14 + addq %r14, %r15 + andq %r15, %r8 + shrq $44, %r15 + movq %r8, %r14 + addq %r15, %rcx + movl %esi, %r15d + movq %rcx, %r10 + movq %r8, %r9 + shrq $26, %rsi + andl $67108863, %r15d + shlq $18, %r14 + shrq $34, %r8 + orq %r14, %rsi + shlq $10, %r10 + shrq $8, %r9 + orq %r10, %r8 + shrq $16, %rcx + andl $67108863, %esi + movl %esi, 252(%rdi) + andl $67108863, %r9d + movl %ecx, 276(%rdi) + andl $67108863, %r8d + movl %r15d, 244(%rdi) + movl %r9d, 260(%rdi) + movl %r8d, 268(%rdi) + movq -16(%rsp), %rcx + movq -24(%rsp), %rsi +.Lpoly1305_init_ext_avx2_continue: + movl 16(%rsi), %r8d + movl %r8d, 284(%rdi) + movl 20(%rsi), %r9d + movl %r9d, 292(%rdi) + movl 24(%rsi), %r10d + movl %r10d, 300(%rdi) + movl 28(%rsi), %esi + movl %esi, 308(%rdi) + cmpq $48, %rcx + jbe .Lpoly1305_init_ext_avx2_done + lea (%r12,%r12,4), %r9 + shlq $2, %r9 + lea (%rbx,%rbx), %rax + mulq %r9 + movq %rax, %rsi + movq %r11, %rax + movq %rdx, %r8 + mulq %r11 + addq %rax, %rsi + lea (%r11,%r11), %rax + movq %rsi, %r10 + adcq %rdx, %r8 + mulq %rbx + movq %rax, %r13 + movq %r12, %rax + movq %rdx, %rcx + addq %r12, %r12 + mulq %r9 + addq %rax, %r13 + movq %r11, %rax + movq $0xfffffffffff, %r9 + adcq %rdx, %rcx + andq %r9, %rsi + mulq %r12 + shlq $20, %r8 + movq %rax, %r11 + shrq $44, %r10 + movq %rbx, %rax + orq %r10, %r8 + movq %rdx, %r12 + mulq %rbx + addq %r13, %r8 + movq %r8, %r14 + adcq $0, %rcx + andq %r9, %r8 + addq %rax, %r11 + adcq %rdx, %r12 + shlq $20, %rcx + shrq $44, %r14 + orq %r14, %rcx + addq %r11, %rcx + movq %rcx, %rbx + adcq $0, %r12 + shlq $22, %r12 + shrq $42, %rbx + orq %rbx, %r12 + movq %r9, %rbx + lea (%r12,%r12,4), %r15 + addq %r15, %rsi + andq %rsi, %rbx + shrq $44, %rsi + movl %ebx, %r11d + addq %rsi, %r8 + movq $0x3ffffffffff, %rsi + andq %r8, %r9 + andq %rsi, %rcx + shrq $44, %r8 + movq %r9, %rax + addq %r8, %rcx + movq %r9, %r8 + movq %rcx, %r10 + andl $67108863, %r11d + shrq $26, %rbx + shlq $18, %r8 + shrq $34, %r9 + orq %r8, %rbx + shlq $10, %r10 + shrq $8, %rax + orq %r10, %r9 + shrq $16, %rcx + andl $67108863, %ebx + andl $67108863, %eax + andl $67108863, %r9d + movl %r11d, 184(%rdi) + movl %r11d, 176(%rdi) + movl %r11d, 168(%rdi) + movl %r11d, 160(%rdi) + movl %ebx, 216(%rdi) + movl %ebx, 208(%rdi) + movl %ebx, 200(%rdi) + movl %ebx, 192(%rdi) + movl %eax, 248(%rdi) + movl %eax, 240(%rdi) + movl %eax, 232(%rdi) + movl %eax, 224(%rdi) + movl %r9d, 280(%rdi) + movl %r9d, 272(%rdi) + movl %r9d, 264(%rdi) + movl %r9d, 256(%rdi) + movl %ecx, 312(%rdi) + movl %ecx, 304(%rdi) + movl %ecx, 296(%rdi) + movl %ecx, 288(%rdi) +.Lpoly1305_init_ext_avx2_done: + movq $0, 320(%rdi) + vzeroall + popq %rbx + popq %r15 + popq %r14 + 
popq %r13 + popq %r12 + ret +.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext; + + +.align 8 +.globl _gcry_poly1305_amd64_avx2_blocks +.type _gcry_poly1305_amd64_avx2_blocks, at function; +_gcry_poly1305_amd64_avx2_blocks: +.Lpoly1305_blocks_avx2_local: + vzeroupper + pushq %rbp + movq %rsp, %rbp + pushq %rbx + andq $-64, %rsp + subq $200, %rsp + movl $((1<<26)-1), %r8d + movl $(5), %r9d + movl $((1<<24)), %r10d + vmovd %r8d, %xmm0 + vmovd %r9d, %xmm8 + vmovd %r10d, %xmm7 + vpbroadcastq %xmm0, %ymm0 + vpbroadcastq %xmm8, %ymm8 + vpbroadcastq %xmm7, %ymm7 + vmovdqa %ymm7, 168(%rsp) + movq 320(%rdi), %rax + testb $60, %al + je .Lpoly1305_blocks_avx2_9 + vmovdqa 168(%rsp), %ymm7 + vpsrldq $8, %ymm7, %ymm1 + vmovdqa %ymm1, 168(%rsp) + testb $4, %al + je .Lpoly1305_blocks_avx2_10 + vpermq $192, %ymm1, %ymm7 + vmovdqa %ymm7, 168(%rsp) +.Lpoly1305_blocks_avx2_10: + testb $8, %al + je .Lpoly1305_blocks_avx2_11 + vpermq $240, 168(%rsp), %ymm7 + vmovdqa %ymm7, 168(%rsp) +.Lpoly1305_blocks_avx2_11: + testb $16, %al + je .Lpoly1305_blocks_avx2_12 + vpermq $252, 168(%rsp), %ymm6 + vmovdqa %ymm6, 168(%rsp) +.Lpoly1305_blocks_avx2_12: + testb $32, %al + je .Lpoly1305_blocks_avx2_9 + vpxor %xmm6, %xmm6, %xmm6 + vmovdqa %ymm6, 168(%rsp) +.Lpoly1305_blocks_avx2_9: + testb $1, %al + jne .Lpoly1305_blocks_avx2_13 + vmovdqu (%rsi), %ymm3 + vmovdqu 32(%rsi), %ymm1 + vpunpcklqdq %ymm1, %ymm3, %ymm2 + vpunpckhqdq %ymm1, %ymm3, %ymm1 + vpermq $216, %ymm2, %ymm2 + vpermq $216, %ymm1, %ymm1 + vpand %ymm2, %ymm0, %ymm5 + vpsrlq $26, %ymm2, %ymm4 + vpand %ymm4, %ymm0, %ymm4 + vpsllq $12, %ymm1, %ymm3 + vpsrlq $52, %ymm2, %ymm2 + vpor %ymm3, %ymm2, %ymm2 + vpand %ymm2, %ymm0, %ymm3 + vpsrlq $26, %ymm2, %ymm2 + vpand %ymm2, %ymm0, %ymm2 + vpsrlq $40, %ymm1, %ymm1 + vpor 168(%rsp), %ymm1, %ymm1 + addq $64, %rsi + subq $64, %rdx + orq $1, 320(%rdi) + jmp .Lpoly1305_blocks_avx2_14 +.Lpoly1305_blocks_avx2_13: + vmovdqa (%rdi), %ymm5 + vmovdqa 32(%rdi), %ymm4 + vmovdqa 64(%rdi), %ymm3 + vmovdqa 96(%rdi), %ymm2 + vmovdqa 128(%rdi), %ymm1 +.Lpoly1305_blocks_avx2_14: + cmpq $63, %rdx + jbe .Lpoly1305_blocks_avx2_15 + vmovdqa 160(%rdi), %ymm6 + vmovdqa %ymm8, 136(%rsp) + vmovdqa 192(%rdi), %ymm7 + vpmuludq %ymm8, %ymm7, %ymm11 + vmovdqa %ymm11, 104(%rsp) + vmovdqa 224(%rdi), %ymm11 + vmovdqa %ymm11, 72(%rsp) + vpmuludq %ymm11, %ymm8, %ymm11 + vmovdqa %ymm11, 40(%rsp) + vmovdqa 256(%rdi), %ymm11 + vmovdqa %ymm11, 8(%rsp) + vpmuludq %ymm11, %ymm8, %ymm11 + vmovdqa %ymm11, -24(%rsp) + vmovdqa 288(%rdi), %ymm13 + vmovdqa %ymm13, -56(%rsp) + vpmuludq %ymm13, %ymm8, %ymm13 + vmovdqa %ymm13, -88(%rsp) +.Lpoly1305_blocks_avx2_16: + vpmuludq 104(%rsp), %ymm1, %ymm14 + vmovdqa 40(%rsp), %ymm13 + vpmuludq %ymm13, %ymm2, %ymm8 + vpmuludq %ymm13, %ymm1, %ymm13 + vmovdqa -24(%rsp), %ymm9 + vpmuludq %ymm9, %ymm2, %ymm10 + vpmuludq %ymm9, %ymm1, %ymm11 + vpaddq %ymm8, %ymm14, %ymm14 + vpmuludq %ymm9, %ymm3, %ymm8 + vmovdqa -88(%rsp), %ymm12 + vpmuludq %ymm12, %ymm1, %ymm9 + vpaddq %ymm10, %ymm13, %ymm13 + vpmuludq %ymm12, %ymm4, %ymm15 + vmovdqa %ymm12, %ymm10 + vpmuludq %ymm12, %ymm3, %ymm12 + vpaddq %ymm8, %ymm14, %ymm14 + vpmuludq %ymm10, %ymm2, %ymm10 + vpmuludq %ymm6, %ymm2, %ymm8 + vpaddq %ymm15, %ymm14, %ymm14 + vpmuludq %ymm6, %ymm1, %ymm1 + vpaddq %ymm12, %ymm13, %ymm13 + vpmuludq %ymm6, %ymm5, %ymm15 + vpaddq %ymm10, %ymm11, %ymm11 + vpmuludq %ymm6, %ymm4, %ymm12 + vpaddq %ymm8, %ymm9, %ymm9 + vpmuludq %ymm6, %ymm3, %ymm10 + vpmuludq %ymm7, %ymm3, %ymm8 + vpaddq %ymm15, %ymm14, %ymm14 + vpmuludq %ymm7, 
%ymm2, %ymm2 + vpaddq %ymm12, %ymm13, %ymm12 + vpmuludq %ymm7, %ymm5, %ymm15 + vpaddq %ymm10, %ymm11, %ymm10 + vpmuludq %ymm7, %ymm4, %ymm13 + vpaddq %ymm8, %ymm9, %ymm8 + vmovdqa 72(%rsp), %ymm9 + vpmuludq %ymm9, %ymm4, %ymm11 + vpaddq %ymm2, %ymm1, %ymm1 + vpmuludq %ymm9, %ymm3, %ymm3 + vpaddq %ymm15, %ymm12, %ymm12 + vpmuludq %ymm9, %ymm5, %ymm15 + vpaddq %ymm13, %ymm10, %ymm10 + vmovdqa 8(%rsp), %ymm2 + vpmuludq %ymm2, %ymm5, %ymm9 + vpaddq %ymm11, %ymm8, %ymm8 + vpmuludq %ymm2, %ymm4, %ymm4 + vpaddq %ymm3, %ymm1, %ymm1 + vpmuludq -56(%rsp), %ymm5, %ymm5 + vpaddq %ymm15, %ymm10, %ymm10 + vpaddq %ymm9, %ymm8, %ymm8 + vpaddq %ymm4, %ymm1, %ymm1 + vpaddq %ymm5, %ymm1, %ymm5 + vmovdqu (%rsi), %ymm3 + vmovdqu 32(%rsi), %ymm2 + vperm2i128 $32, %ymm2, %ymm3, %ymm1 + vperm2i128 $49, %ymm2, %ymm3, %ymm2 + vpunpckldq %ymm2, %ymm1, %ymm15 + vpunpckhdq %ymm2, %ymm1, %ymm2 + vpxor %xmm4, %xmm4, %xmm4 + vpunpckldq %ymm4, %ymm15, %ymm1 + vpunpckhdq %ymm4, %ymm15, %ymm15 + vpunpckldq %ymm4, %ymm2, %ymm3 + vpunpckhdq %ymm4, %ymm2, %ymm2 + vpsllq $6, %ymm15, %ymm15 + vpsllq $12, %ymm3, %ymm3 + vpsllq $18, %ymm2, %ymm2 + vpaddq %ymm1, %ymm14, %ymm14 + vpaddq %ymm15, %ymm12, %ymm12 + vpaddq %ymm3, %ymm10, %ymm10 + vpaddq %ymm2, %ymm8, %ymm8 + vpaddq 168(%rsp), %ymm5, %ymm5 + addq $64, %rsi + vpsrlq $26, %ymm14, %ymm4 + vpsrlq $26, %ymm8, %ymm2 + vpand %ymm0, %ymm14, %ymm14 + vpand %ymm0, %ymm8, %ymm8 + vpaddq %ymm4, %ymm12, %ymm12 + vpaddq %ymm2, %ymm5, %ymm5 + vpsrlq $26, %ymm12, %ymm3 + vpsrlq $26, %ymm5, %ymm9 + vpand %ymm0, %ymm12, %ymm12 + vpand %ymm0, %ymm5, %ymm11 + vpaddq %ymm3, %ymm10, %ymm3 + vpmuludq 136(%rsp), %ymm9, %ymm9 + vpaddq %ymm9, %ymm14, %ymm14 + vpsrlq $26, %ymm3, %ymm2 + vpsrlq $26, %ymm14, %ymm4 + vpand %ymm0, %ymm3, %ymm3 + vpand %ymm0, %ymm14, %ymm5 + vpaddq %ymm2, %ymm8, %ymm2 + vpaddq %ymm4, %ymm12, %ymm4 + vpsrlq $26, %ymm2, %ymm1 + vpand %ymm0, %ymm2, %ymm2 + vpaddq %ymm1, %ymm11, %ymm1 + subq $64, %rdx + cmpq $63, %rdx + ja .Lpoly1305_blocks_avx2_16 +.Lpoly1305_blocks_avx2_15: + testb $64, 320(%rdi) + jne .Lpoly1305_blocks_avx2_17 + vmovdqa %ymm5, (%rdi) + vmovdqa %ymm4, 32(%rdi) + vmovdqa %ymm3, 64(%rdi) + vmovdqa %ymm2, 96(%rdi) + vmovdqa %ymm1, 128(%rdi) + jmp .Lpoly1305_blocks_avx2_8 +.Lpoly1305_blocks_avx2_17: + vpermq $245, %ymm5, %ymm0 + vpaddq %ymm0, %ymm5, %ymm5 + vpermq $245, %ymm4, %ymm0 + vpaddq %ymm0, %ymm4, %ymm4 + vpermq $245, %ymm3, %ymm0 + vpaddq %ymm0, %ymm3, %ymm3 + vpermq $245, %ymm2, %ymm0 + vpaddq %ymm0, %ymm2, %ymm2 + vpermq $245, %ymm1, %ymm0 + vpaddq %ymm0, %ymm1, %ymm1 + vpermq $170, %ymm5, %ymm0 + vpaddq %ymm0, %ymm5, %ymm5 + vpermq $170, %ymm4, %ymm0 + vpaddq %ymm0, %ymm4, %ymm4 + vpermq $170, %ymm3, %ymm0 + vpaddq %ymm0, %ymm3, %ymm3 + vpermq $170, %ymm2, %ymm0 + vpaddq %ymm0, %ymm2, %ymm2 + vpermq $170, %ymm1, %ymm0 + vpaddq %ymm0, %ymm1, %ymm1 + vmovd %xmm5, %eax + vmovd %xmm4, %edx + movl %eax, %ecx + shrl $26, %ecx + addl %edx, %ecx + movl %ecx, %edx + andl $67108863, %edx + vmovd %xmm3, %esi + shrl $26, %ecx + movl %ecx, %r11d + addl %esi, %r11d + vmovd %xmm2, %ecx + movl %r11d, %r10d + shrl $26, %r10d + addl %ecx, %r10d + movl %r10d, %r9d + andl $67108863, %r9d + vmovd %xmm1, %r8d + movl %edx, %esi + salq $26, %rsi + andl $67108863, %eax + orq %rax, %rsi + movabsq $17592186044415, %rax + andq %rax, %rsi + andl $67108863, %r11d + salq $8, %r11 + shrl $18, %edx + movl %edx, %edx + orq %r11, %rdx + movq %r9, %rcx + salq $34, %rcx + orq %rcx, %rdx + andq %rax, %rdx + shrl $26, %r10d + addl %r10d, %r8d + salq $16, %r8 + shrl $10, %r9d + movl 
%r9d, %r9d + orq %r9, %r8 + movabsq $4398046511103, %r10 + movq %r8, %r9 + andq %r10, %r9 + shrq $42, %r8 + leaq (%r8,%r8,4), %rcx + addq %rcx, %rsi + movq %rsi, %r8 + andq %rax, %r8 + movq %rsi, %rcx + shrq $44, %rcx + addq %rdx, %rcx + movq %rcx, %rsi + andq %rax, %rsi + shrq $44, %rcx + movq %rcx, %rdx + addq %r9, %rdx + andq %rdx, %r10 + shrq $42, %rdx + leaq (%r8,%rdx,4), %rcx + leaq (%rcx,%rdx), %rdx + movq %rdx, %rbx + andq %rax, %rbx + shrq $44, %rdx + movq %rdx, %r11 + addq %rsi, %r11 + leaq 5(%rbx), %r9 + movq %r9, %r8 + shrq $44, %r8 + addq %r11, %r8 + movabsq $-4398046511104, %rsi + addq %r10, %rsi + movq %r8, %rdx + shrq $44, %rdx + addq %rdx, %rsi + movq %rsi, %rdx + shrq $63, %rdx + subq $1, %rdx + movq %rdx, %rcx + notq %rcx + andq %rcx, %rbx + andq %rcx, %r11 + andq %r10, %rcx + andq %rax, %r9 + andq %rdx, %r9 + orq %r9, %rbx + movq %rbx, (%rdi) + andq %r8, %rax + andq %rdx, %rax + orq %rax, %r11 + movq %r11, 8(%rdi) + andq %rsi, %rdx + orq %rcx, %rdx + movq %rdx, 16(%rdi) +.Lpoly1305_blocks_avx2_8: + movq -8(%rbp), %rbx + vzeroall + movq %rbp, %rax + subq %rsp, %rax + leave + addq $8, %rax + ret +.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks; + + +.align 8 +.globl _gcry_poly1305_amd64_avx2_finish_ext +.type _gcry_poly1305_amd64_avx2_finish_ext, at function; +_gcry_poly1305_amd64_avx2_finish_ext: +.Lpoly1305_finish_ext_avx2_local: + vzeroupper + pushq %rbp + movq %rsp, %rbp + pushq %r13 + pushq %r12 + pushq %rbx + andq $-64, %rsp + subq $64, %rsp + movq %rdi, %rbx + movq %rdx, %r13 + movq %rcx, %r12 + testq %rdx, %rdx + je .Lpoly1305_finish_ext_avx2_22 + vpxor %xmm0, %xmm0, %xmm0 + vmovdqa %ymm0, (%rsp) + vmovdqa %ymm0, 32(%rsp) + movq %rsp, %rax + subq %rsp, %rsi + testb $32, %dl + je .Lpoly1305_finish_ext_avx2_23 + vmovdqu (%rsp,%rsi), %ymm0 + vmovdqa %ymm0, (%rsp) + leaq 32(%rsp), %rax +.Lpoly1305_finish_ext_avx2_23: + testb $16, %r13b + je .Lpoly1305_finish_ext_avx2_24 + vmovdqu (%rax,%rsi), %xmm0 + vmovdqa %xmm0, (%rax) + addq $16, %rax +.Lpoly1305_finish_ext_avx2_24: + testb $8, %r13b + je .Lpoly1305_finish_ext_avx2_25 + movq (%rax,%rsi), %rdx + movq %rdx, (%rax) + addq $8, %rax +.Lpoly1305_finish_ext_avx2_25: + testb $4, %r13b + je .Lpoly1305_finish_ext_avx2_26 + movl (%rax,%rsi), %edx + movl %edx, (%rax) + addq $4, %rax +.Lpoly1305_finish_ext_avx2_26: + testb $2, %r13b + je .Lpoly1305_finish_ext_avx2_27 + movzwl (%rax,%rsi), %edx + movw %dx, (%rax) + addq $2, %rax +.Lpoly1305_finish_ext_avx2_27: + testb $1, %r13b + je .Lpoly1305_finish_ext_avx2_28 + movzbl (%rax,%rsi), %edx + movb %dl, (%rax) +.Lpoly1305_finish_ext_avx2_28: + testb $15, %r13b + je .Lpoly1305_finish_ext_avx2_29 + movb $1, (%rsp,%r13) +.Lpoly1305_finish_ext_avx2_29: + cmpq $47, %r13 + jbe .Lpoly1305_finish_ext_avx2_30 + orq $4, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_30: + cmpq $31, %r13 + jbe .Lpoly1305_finish_ext_avx2_32 + orq $8, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_32: + cmpq $15, %r13 + jbe .Lpoly1305_finish_ext_avx2_33 + orq $16, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_33: + orq $32, 320(%rbx) +.Lpoly1305_finish_ext_avx2_31: + testb $1, 320(%rbx) + je .Lpoly1305_finish_ext_avx2_34 + cmpq $32, %r13 + ja .Lpoly1305_finish_ext_avx2_34 + cmpq $17, %r13 + sbbq %rsi, %rsi + notq %rsi + addq $2, %rsi + cmpq $17, %r13 + sbbq %rax, %rax + movq %rbx, %rdx + addq $23, %rax + leaq (%rbx,%rax,8), %rax + movl $0, %ecx +.Lpoly1305_finish_ext_avx2_37: + movl 244(%rdx), %edi + 
movl %edi, (%rax) + movl 252(%rdx), %edi + movl %edi, 32(%rax) + movl 260(%rdx), %edi + movl %edi, 64(%rax) + movl 268(%rdx), %edi + movl %edi, 96(%rax) + movl 276(%rdx), %edi + movl %edi, 128(%rax) + addq $1, %rcx + subq $40, %rdx + addq $8, %rax + cmpq %rcx, %rsi + ja .Lpoly1305_finish_ext_avx2_37 +.Lpoly1305_finish_ext_avx2_34: + movl $64, %edx + movq %rsp, %rsi + movq %rbx, %rdi + call .Lpoly1305_blocks_avx2_local +.Lpoly1305_finish_ext_avx2_22: + movq 320(%rbx), %r8 + testb $1, %r8b + je .Lpoly1305_finish_ext_avx2_38 + leaq -1(%r13), %rax + cmpq $47, %rax + ja .Lpoly1305_finish_ext_avx2_46 + cmpq $32, %r13 + ja .Lpoly1305_finish_ext_avx2_47 + cmpq $17, %r13 + sbbq %r9, %r9 + addq $2, %r9 + movl $0, %edi + cmpq $17, %r13 + sbbq %rax, %rax + notq %rax + andl $5, %eax + jmp .Lpoly1305_finish_ext_avx2_39 +.Lpoly1305_finish_ext_avx2_41: + movl (%rdx), %esi + movl %esi, (%rax) + movl 8(%rdx), %esi + movl %esi, 32(%rax) + movl 16(%rdx), %esi + movl %esi, 64(%rax) + movl 24(%rdx), %esi + movl %esi, 96(%rax) + movl 32(%rdx), %esi + movl %esi, 128(%rax) + addq $1, %rcx + subq $40, %rdx + addq $8, %rax + movq %rcx, %rsi + subq %rdi, %rsi + cmpq %rsi, %r9 + ja .Lpoly1305_finish_ext_avx2_41 + cmpq $3, %rcx + ja .Lpoly1305_finish_ext_avx2_42 + leaq 160(%rbx,%rcx,8), %rax +.Lpoly1305_finish_ext_avx2_43: + movl $1, (%rax) + movl $0, 32(%rax) + movl $0, 64(%rax) + movl $0, 96(%rax) + movl $0, 128(%rax) + addq $1, %rcx + addq $8, %rax + cmpq $4, %rcx + jne .Lpoly1305_finish_ext_avx2_43 +.Lpoly1305_finish_ext_avx2_42: + orq $96, %r8 + movq %r8, 320(%rbx) + vpxor %ymm0, %ymm0, %ymm0 + vmovdqa %ymm0, (%rsp) + vmovdqa %ymm0, 32(%rsp) + movl $64, %edx + movq %rsp, %rsi + movq %rbx, %rdi + call .Lpoly1305_blocks_avx2_local +.Lpoly1305_finish_ext_avx2_38: + movq 8(%rbx), %rax + movq %rax, %rdx + salq $44, %rdx + orq (%rbx), %rdx + shrq $20, %rax + movl $24, %edi + shlx %rdi, 16(%rbx), %rcx + orq %rcx, %rax + movl 292(%rbx), %ecx + salq $32, %rcx + movl 284(%rbx), %esi + orq %rsi, %rcx + movl 308(%rbx), %esi + salq $32, %rsi + movl 300(%rbx), %edi + orq %rdi, %rsi + addq %rcx, %rdx + adcq %rsi, %rax + movq %rdx, (%r12) + movq %rax, 8(%r12) + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %ymm0, (%rbx) + vmovdqu %ymm0, 32(%rbx) + vmovdqu %ymm0, 64(%rbx) + vmovdqu %ymm0, 96(%rbx) + vmovdqu %ymm0, 128(%rbx) + vmovdqu %ymm0, 160(%rbx) + vmovdqu %ymm0, 192(%rbx) + vmovdqu %ymm0, 224(%rbx) + jmp .Lpoly1305_finish_ext_avx2_49 +.Lpoly1305_finish_ext_avx2_46: + movl $3, %r9d + movl $1, %edi + movl $10, %eax + jmp .Lpoly1305_finish_ext_avx2_39 +.Lpoly1305_finish_ext_avx2_47: + movl $3, %r9d + movl $0, %edi + movl $10, %eax +.Lpoly1305_finish_ext_avx2_39: + leaq 164(%rbx,%rax,8), %rdx + leaq 160(%rbx,%rdi,8), %rax + movq %rdi, %rcx + jmp .Lpoly1305_finish_ext_avx2_41 +.Lpoly1305_finish_ext_avx2_49: + movq %rbp, %rax + subq %rsp, %rax + leaq -24(%rbp), %rsp + vzeroall + popq %rbx + popq %r12 + popq %r13 + popq %rbp + addq $(8*5), %rax +ret +.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext; + +#endif diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index fa3fe75..0299c43 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -54,23 +54,40 @@ #endif +/* POLY1305_USE_AVX2 indicates whether to compile with AMD64 AVX2 code. 
*/ +#undef POLY1305_USE_AVX2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(ENABLE_AVX2_SUPPORT) +# define POLY1305_USE_AVX2 1 +# define POLY1305_AVX2_BLOCKSIZE 64 +# define POLY1305_AVX2_STATESIZE 328 +# define POLY1305_AVX2_ALIGNMENT 32 +#endif + + /* Largest block-size used in any implementation (optimized implementations * might use block-size multiple of 16). */ -#ifdef POLY1305_USE_SSE2 +#ifdef POLY1305_USE_AVX2 +# define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE +#elif defined(POLY1305_USE_SSE2) # define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE #else # define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE #endif /* Largest state-size used in any implementation. */ -#ifdef POLY1305_USE_SSE2 +#ifdef POLY1305_USE_AVX2 +# define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE +#elif defined(POLY1305_USE_SSE2) # define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE #else # define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE #endif /* Minimum alignment for state pointer passed to implementations. */ -#ifdef POLY1305_USE_SSE2 +#ifdef POLY1305_USE_AVX2 +# define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT +#elif defined(POLY1305_USE_SSE2) # define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT #else # define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT diff --git a/cipher/poly1305.c b/cipher/poly1305.c index cd1902a..fe241c1 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -57,6 +57,25 @@ static const poly1305_ops_t poly1305_amd64_sse2_ops = { #endif +#ifdef POLY1305_USE_AVX2 + +void _gcry_poly1305_amd64_avx2_init_ext(void *state, const poly1305_key_t *key); +unsigned int _gcry_poly1305_amd64_avx2_finish_ext(void *state, const byte *m, + size_t remaining, + byte mac[16]); +unsigned int _gcry_poly1305_amd64_avx2_blocks(void *ctx, const byte *m, + size_t bytes); + +static const poly1305_ops_t poly1305_amd64_avx2_ops = { + POLY1305_AVX2_BLOCKSIZE, + _gcry_poly1305_amd64_avx2_init_ext, + _gcry_poly1305_amd64_avx2_blocks, + _gcry_poly1305_amd64_avx2_finish_ext +}; + +#endif + + #ifdef HAVE_U64_TYPEDEF /* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit @@ -616,6 +635,7 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, static int initialized; static const char *selftest_failed; poly1305_key_t keytmp; + unsigned int features = _gcry_get_hw_features (); if (!initialized) { @@ -637,6 +657,12 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, ctx->ops = &poly1305_default_ops; #endif +#ifdef POLY1305_USE_AVX2 + if (features & HWF_INTEL_AVX2) + ctx->ops = &poly1305_amd64_avx2_ops; +#endif + (void)features; + buf_cpy (keytmp.b, key, POLY1305_KEYLEN); poly1305_init (ctx, &keytmp); diff --git a/configure.ac b/configure.ac index 4dc36d5..47a322b 100644 --- a/configure.ac +++ b/configure.ac @@ -1825,6 +1825,7 @@ case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-sse2-amd64.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-avx2-amd64.lo" ;; esac From cvs at cvs.gnupg.org Fri May 16 20:22:56 2014 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Fri, 16 May 2014 20:22:56 +0200 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-76-ge813958 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". 
The branch, master has been updated via e813958419b0ec4439e6caf07d3b2234cffa2bfa (commit) via 73b3b75c2221a6e3bed4117e0a206a1193acd2ed (commit) via b8794fed68ebe7567f4617141f0996ad290d9120 (commit) from c20daeeb05329bfc6cc2c562cbd4b965291fe0e1 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit e813958419b0ec4439e6caf07d3b2234cffa2bfa Author: Jussi Kivilinna Date: Sun May 11 12:00:19 2014 +0300 Add Poly1305 based cipher AEAD mode * cipher/Makefile.am: Add 'cipher-poly1305.c'. * cipher/cipher-internal.h (gcry_cipher_handle): Add 'u_mode.poly1305'. (_gcry_cipher_poly1305_encrypt, _gcry_cipher_poly1305_decrypt) (_gcry_cipher_poly1305_setiv, _gcry_cipher_poly1305_authenticate) (_gcry_cipher_poly1305_get_tag, _gcry_cipher_poly1305_check_tag): New. * cipher/cipher-poly1305.c: New. * cipher/cipher.c (_gcry_cipher_open_internal, cipher_setkey) (cipher_reset, cipher_encrypt, cipher_decrypt, _gcry_cipher_setiv) (_gcry_cipher_authenticate, _gcry_cipher_gettag) (_gcry_cipher_checktag): Handle 'GCRY_CIPHER_MODE_POLY1305'. (cipher_setiv): Move handling of 'GCRY_CIPHER_MODE_GCM' to ... (_gcry_cipher_setiv): ... here, as with other modes. * src/gcrypt.h.in: Add 'GCRY_CIPHER_MODE_POLY1305'. * tests/basic.c (_check_poly1305_cipher, check_poly1305_cipher): New. (check_ciphers): Add Poly1305 check. (check_cipher_modes): Call 'check_poly1305_cipher'. * tests/bench-slope.c (bench_gcm_encrypt_do_bench): Rename to bench_aead_... and take nonce as argument. (bench_gcm_decrypt_do_bench, bench_gcm_authenticate_do_bench): Ditto. (bench_gcm_encrypt_do_bench, bench_gcm_decrypt_do_bench) (bench_gcm_authenticate_do_bench, bench_poly1305_encrypt_do_bench) (bench_poly1305_decrypt_do_bench) (bench_poly1305_authenticate_do_bench, poly1305_encrypt_ops) (poly1305_decrypt_ops, poly1305_authenticate_ops): New. (cipher_modes): Add Poly1305. (cipher_bench_one): Add special handling for Poly1305. -- Patch adds Poly1305 based AEAD cipher mode to libgcrypt. ChaCha20 variant of this mode is proposed for use in TLS and ipsec: https://tools.ietf.org/html/draft-agl-tls-chacha20poly1305-04 http://tools.ietf.org/html/draft-nir-ipsecme-chacha20-poly1305-02 Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index a8b86e6..4468647 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -40,7 +40,7 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \ -cipher-ccm.c cipher-cmac.c cipher-gcm.c \ +cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-poly1305.c \ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index cdac445..f6bda66 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -20,6 +20,9 @@ #ifndef G10_CIPHER_INTERNAL_H #define G10_CIPHER_INTERNAL_H +#include "./poly1305-internal.h" + + /* The maximum supported size of a block in bytes. */ #define MAX_BLOCKSIZE 16 @@ -154,6 +157,17 @@ struct gcry_cipher_handle } ccm; #endif + /* Mode specific storage for Poly1305 mode. */ + struct { + /* byte counter for AAD and data. 
*/
+    u32 bytecount[2];
+
+    unsigned int aad_finalized:1;
+    unsigned int bytecount_over_limits:1;
+
+    poly1305_context_t ctx;
+  } poly1305;
+
   /* Mode specific storage for CMAC mode. */
   struct {
     unsigned int tag:1; /* Set to 1 if tag has been finalized.  */
@@ -319,4 +333,28 @@ void _gcry_cipher_gcm_setkey
 /*           */ (gcry_cipher_hd_t c);


+/*-- cipher-poly1305.c --*/
+gcry_err_code_t _gcry_cipher_poly1305_encrypt
+/*           */ (gcry_cipher_hd_t c,
+                 unsigned char *outbuf, size_t outbuflen,
+                 const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_decrypt
+/*           */ (gcry_cipher_hd_t c,
+                 unsigned char *outbuf, size_t outbuflen,
+                 const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_setiv
+/*           */ (gcry_cipher_hd_t c,
+                 const unsigned char *iv, size_t ivlen);
+gcry_err_code_t _gcry_cipher_poly1305_authenticate
+/*           */ (gcry_cipher_hd_t c,
+                 const unsigned char *aadbuf, size_t aadbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_get_tag
+/*           */ (gcry_cipher_hd_t c,
+                 unsigned char *outtag, size_t taglen);
+gcry_err_code_t _gcry_cipher_poly1305_check_tag
+/*           */ (gcry_cipher_hd_t c,
+                 const unsigned char *intag, size_t taglen);
+void _gcry_cipher_poly1305_setkey
+/*           */ (gcry_cipher_hd_t c);
+
 #endif /*G10_CIPHER_INTERNAL_H*/
diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c
new file mode 100644
index 0000000..a22ffa3
--- /dev/null
+++ b/cipher/cipher-poly1305.c
@@ -0,0 +1,296 @@
+/* cipher-poly1305.c - Poly1305 based AEAD cipher mode
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+#include "./poly1305-internal.h"
+
+
+static inline int
+poly1305_bytecounter_add (u32 ctr[2], size_t add)
+{
+  int overflow = 0;
+
+  if (sizeof(add) > sizeof(u32))
+    {
+      u32 high_add = ((add >> 31) >> 1) & 0xffffffff;
+      ctr[1] += high_add;
+      if (ctr[1] < high_add)
+        overflow = 1;
+    }
+
+  ctr[0] += add;
+  if (ctr[0] >= add)
+    return overflow;
+
+  ctr[1] += 1;
+  return (ctr[1] < 1) || overflow;
+}
+
+
+static void
+poly1305_fill_bytecount (gcry_cipher_hd_t c)
+{
+  u32 lenbuf[2];
+
+  lenbuf[0] = le_bswap32(c->u_mode.poly1305.bytecount[0]);
+  lenbuf[1] = le_bswap32(c->u_mode.poly1305.bytecount[1]);
+  _gcry_poly1305_update (&c->u_mode.poly1305.ctx, (byte*)lenbuf,
+                         sizeof(lenbuf));
+
+  wipememory(lenbuf, sizeof(lenbuf));
+}
+
+
+static void
+poly1305_aad_finish (gcry_cipher_hd_t c)
+{
+  /* Start of encryption marks end of AAD stream.
*/ + poly1305_fill_bytecount(c); + + c->u_mode.poly1305.aad_finalized = 1; + + c->u_mode.poly1305.bytecount[0] = 0; + c->u_mode.poly1305.bytecount[1] = 0; +} + + +static gcry_err_code_t +poly1305_set_zeroiv (gcry_cipher_hd_t c) +{ + byte zero[8] = { 0, }; + + return _gcry_cipher_poly1305_setiv (c, zero, sizeof(zero)); +} + + +gcry_err_code_t +_gcry_cipher_poly1305_authenticate (gcry_cipher_hd_t c, + const byte * aadbuf, size_t aadbuflen) +{ + if (c->u_mode.poly1305.bytecount_over_limits) + return GPG_ERR_INV_LENGTH; + if (c->u_mode.poly1305.aad_finalized) + return GPG_ERR_INV_STATE; + if (c->marks.tag) + return GPG_ERR_INV_STATE; + + if (!c->marks.iv) + poly1305_set_zeroiv(c); + + if (poly1305_bytecounter_add(c->u_mode.poly1305.bytecount, aadbuflen)) + { + c->u_mode.poly1305.bytecount_over_limits = 1; + return GPG_ERR_INV_LENGTH; + } + + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, aadbuf, aadbuflen); + + return 0; +} + + +gcry_err_code_t +_gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c, + byte *outbuf, size_t outbuflen, + const byte *inbuf, size_t inbuflen) +{ + gcry_err_code_t err; + + if (outbuflen < inbuflen) + return GPG_ERR_BUFFER_TOO_SHORT; + if (c->marks.tag) + return GPG_ERR_INV_STATE; + if (c->u_mode.poly1305.bytecount_over_limits) + return GPG_ERR_INV_LENGTH; + + if (!c->marks.iv) + { + err = poly1305_set_zeroiv(c); + if (err) + return err; + } + + if (!c->u_mode.poly1305.aad_finalized) + poly1305_aad_finish(c); + + if (poly1305_bytecounter_add(c->u_mode.poly1305.bytecount, inbuflen)) + { + c->u_mode.poly1305.bytecount_over_limits = 1; + return GPG_ERR_INV_LENGTH; + } + + c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, inbuflen); + + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, inbuflen); + + return 0; +} + + +gcry_err_code_t +_gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c, + byte *outbuf, size_t outbuflen, + const byte *inbuf, size_t inbuflen) +{ + gcry_err_code_t err; + + if (outbuflen < inbuflen) + return GPG_ERR_BUFFER_TOO_SHORT; + if (c->marks.tag) + return GPG_ERR_INV_STATE; + if (c->u_mode.poly1305.bytecount_over_limits) + return GPG_ERR_INV_LENGTH; + + if (!c->marks.iv) + { + err = poly1305_set_zeroiv(c); + if (err) + return err; + } + + if (!c->u_mode.poly1305.aad_finalized) + poly1305_aad_finish(c); + + if (poly1305_bytecounter_add(c->u_mode.poly1305.bytecount, inbuflen)) + { + c->u_mode.poly1305.bytecount_over_limits = 1; + return GPG_ERR_INV_LENGTH; + } + + _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, inbuflen); + + c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, inbuflen); + return 0; +} + + +static gcry_err_code_t +_gcry_cipher_poly1305_tag (gcry_cipher_hd_t c, + byte * outbuf, size_t outbuflen, int check) +{ + gcry_err_code_t err; + + if (outbuflen < GCRY_GCM_BLOCK_LEN) + return GPG_ERR_BUFFER_TOO_SHORT; + if (c->u_mode.poly1305.bytecount_over_limits) + return GPG_ERR_INV_LENGTH; + + if (!c->marks.iv) + { + err = poly1305_set_zeroiv(c); + if (err) + return err; + } + + if (!c->u_mode.poly1305.aad_finalized) + poly1305_aad_finish(c); + + if (!c->marks.tag) + { + /* Write data-length to poly1305. */ + poly1305_fill_bytecount(c); + + _gcry_poly1305_finish(&c->u_mode.poly1305.ctx, c->u_iv.iv); + + c->marks.tag = 1; + } + + if (check) + return buf_eq_const(outbuf, c->u_iv.iv, outbuflen) ? 
+ GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM; + + memcpy (outbuf, c->u_iv.iv, outbuflen); + return GPG_ERR_NO_ERROR; +} + + +gcry_err_code_t +_gcry_cipher_poly1305_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, + size_t taglen) +{ + return _gcry_cipher_poly1305_tag (c, outtag, taglen, 0); +} + +gcry_err_code_t +_gcry_cipher_poly1305_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, + size_t taglen) +{ + return _gcry_cipher_poly1305_tag (c, (unsigned char *) intag, taglen, 1); +} + + +void +_gcry_cipher_poly1305_setkey (gcry_cipher_hd_t c) +{ + c->u_mode.poly1305.bytecount[0] = 0; + c->u_mode.poly1305.bytecount[1] = 0; + + c->u_mode.poly1305.bytecount_over_limits = 0; + c->u_mode.poly1305.aad_finalized = 0; + c->marks.tag = 0; + c->marks.iv = 0; +} + + +gcry_err_code_t +_gcry_cipher_poly1305_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen) +{ + byte tmpbuf[64]; /* size of ChaCha20/Salsa20 block */ + gcry_err_code_t err; + + if (!iv && ivlen > 0) + return GPG_ERR_INV_ARG; + + memset(&c->u_mode.poly1305.ctx, 0, sizeof(c->u_mode.poly1305.ctx)); + + c->u_mode.poly1305.bytecount[0] = 0; + c->u_mode.poly1305.bytecount[1] = 0; + + c->u_mode.poly1305.bytecount_over_limits = 0; + c->u_mode.poly1305.aad_finalized = 0; + c->marks.tag = 0; + c->marks.iv = 0; + + /* Set up IV for stream cipher. */ + c->spec->setiv (&c->context.c, iv, ivlen); + + /* Get the first block from ChaCha20/Salsa20. */ + memset(tmpbuf, 0, sizeof(tmpbuf)); + c->spec->stencrypt(&c->context.c, tmpbuf, tmpbuf, sizeof(tmpbuf)); + + /* Use the first 32-bytes as Poly1305 key. */ + err = _gcry_poly1305_init (&c->u_mode.poly1305.ctx, tmpbuf, POLY1305_KEYLEN); + + wipememory(tmpbuf, sizeof(tmpbuf)); + + if (err) + return err; + + c->marks.iv = 1; + return 0; +} diff --git a/cipher/cipher.c b/cipher/cipher.c index 4751302..da59061 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -418,6 +418,15 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, err = GPG_ERR_INV_CIPHER_MODE; break; + case GCRY_CIPHER_MODE_POLY1305: + if (!spec->stencrypt || !spec->stdecrypt || !spec->setiv) + err = GPG_ERR_INV_CIPHER_MODE; + else if (spec->algo != GCRY_CIPHER_SALSA20 && + spec->algo != GCRY_CIPHER_SALSA20R12 && + spec->algo != GCRY_CIPHER_CHACHA20) + err = GPG_ERR_INV_CIPHER_MODE; + break; + case GCRY_CIPHER_MODE_STREAM: if (!spec->stencrypt || !spec->stdecrypt) err = GPG_ERR_INV_CIPHER_MODE; @@ -611,6 +620,10 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen) _gcry_cipher_gcm_setkey (c); break; + case GCRY_CIPHER_MODE_POLY1305: + _gcry_cipher_poly1305_setkey (c); + break; + default: break; }; @@ -627,10 +640,6 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen) static gcry_err_code_t cipher_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen) { - /* GCM has its own IV handler */ - if (c->mode == GCRY_CIPHER_MODE_GCM) - return _gcry_cipher_gcm_setiv (c, iv, ivlen); - /* If the cipher has its own IV handler, we use only this one. This is currently used for stream ciphers requiring a nonce. 
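
From the application side the new mode plugs into the usual AEAD call sequence; the one extra rule is that all AAD must be passed in before the first encrypt or decrypt call, since poly1305_aad_finish closes the AAD stream at that point. A minimal usage sketch (the key, nonce and AAD bytes below are placeholders, not test vectors):

#include <gcrypt.h>
#include <stdio.h>

int
main (void)
{
  gcry_cipher_hd_t hd;
  unsigned char key[32] = { 0 };    /* placeholder key */
  unsigned char nonce[8] = { 0 };   /* 8-byte IV; ChaCha20 also takes 12 */
  unsigned char aad[10] = { 0 };    /* placeholder AAD */
  unsigned char buf[16] = "attack at dawn!";
  unsigned char tag[16];
  gcry_error_t err;

  err = gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20,
                          GCRY_CIPHER_MODE_POLY1305, 0);
  if (!err)
    err = gcry_cipher_setkey (hd, key, sizeof key);
  if (!err)
    err = gcry_cipher_setiv (hd, nonce, sizeof nonce);
  if (!err)
    err = gcry_cipher_authenticate (hd, aad, sizeof aad); /* AAD first */
  if (!err)
    err = gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0); /* in-place */
  if (!err)
    err = gcry_cipher_gettag (hd, tag, sizeof tag);
  if (err)
    fprintf (stderr, "poly1305 mode failed: %s\n", gpg_strerror (err));
  gcry_cipher_close (hd);
  return !!err;
}
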
*/ if (c->spec->setiv) @@ -699,6 +708,10 @@ cipher_reset (gcry_cipher_hd_t c) } break; + case GCRY_CIPHER_MODE_POLY1305: + memset (&c->u_mode.poly1305, 0, sizeof c->u_mode.poly1305); + break; + #ifdef HAVE_U64_TYPEDEF case GCRY_CIPHER_MODE_CCM: memset (&c->u_mode.ccm, 0, sizeof c->u_mode.ccm); @@ -811,6 +824,11 @@ cipher_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, rc = _gcry_cipher_gcm_encrypt (c, outbuf, outbuflen, inbuf, inbuflen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_encrypt (c, outbuf, outbuflen, + inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_STREAM: c->spec->stencrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf, inbuflen); @@ -919,6 +937,11 @@ cipher_decrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, rc = _gcry_cipher_gcm_decrypt (c, outbuf, outbuflen, inbuf, inbuflen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_decrypt (c, outbuf, outbuflen, + inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_STREAM: c->spec->stdecrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf, inbuflen); @@ -1000,6 +1023,14 @@ _gcry_cipher_setiv (gcry_cipher_hd_t hd, const void *iv, size_t ivlen) rc = _gcry_cipher_ccm_set_nonce (hd, iv, ivlen); break; + case GCRY_CIPHER_MODE_GCM: + rc = _gcry_cipher_gcm_setiv (hd, iv, ivlen); + break; + + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_setiv (hd, iv, ivlen); + break; + default: rc = cipher_setiv (hd, iv, ivlen); break; @@ -1050,6 +1081,10 @@ _gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf, rc = _gcry_cipher_gcm_authenticate (hd, abuf, abuflen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_authenticate (hd, abuf, abuflen); + break; + default: log_error ("gcry_cipher_authenticate: invalid mode %d\n", hd->mode); rc = GPG_ERR_INV_CIPHER_MODE; @@ -1079,6 +1114,10 @@ _gcry_cipher_gettag (gcry_cipher_hd_t hd, void *outtag, size_t taglen) rc = _gcry_cipher_gcm_get_tag (hd, outtag, taglen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_get_tag (hd, outtag, taglen); + break; + default: log_error ("gcry_cipher_gettag: invalid mode %d\n", hd->mode); rc = GPG_ERR_INV_CIPHER_MODE; @@ -1108,6 +1147,10 @@ _gcry_cipher_checktag (gcry_cipher_hd_t hd, const void *intag, size_t taglen) rc = _gcry_cipher_gcm_check_tag (hd, intag, taglen); break; + case GCRY_CIPHER_MODE_POLY1305: + rc = _gcry_cipher_poly1305_check_tag (hd, intag, taglen); + break; + default: log_error ("gcry_cipher_checktag: invalid mode %d\n", hd->mode); rc = GPG_ERR_INV_CIPHER_MODE; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 3145020..bd38a24 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -895,16 +895,17 @@ enum gcry_cipher_algos supported for each algorithm. */ enum gcry_cipher_modes { - GCRY_CIPHER_MODE_NONE = 0, /* Not yet specified. */ - GCRY_CIPHER_MODE_ECB = 1, /* Electronic codebook. */ - GCRY_CIPHER_MODE_CFB = 2, /* Cipher feedback. */ - GCRY_CIPHER_MODE_CBC = 3, /* Cipher block chaining. */ - GCRY_CIPHER_MODE_STREAM = 4, /* Used with stream ciphers. */ - GCRY_CIPHER_MODE_OFB = 5, /* Outer feedback. */ - GCRY_CIPHER_MODE_CTR = 6, /* Counter. */ - GCRY_CIPHER_MODE_AESWRAP= 7, /* AES-WRAP algorithm. */ - GCRY_CIPHER_MODE_CCM = 8, /* Counter with CBC-MAC. */ - GCRY_CIPHER_MODE_GCM = 9 /* Galois Counter Mode. */ + GCRY_CIPHER_MODE_NONE = 0, /* Not yet specified. */ + GCRY_CIPHER_MODE_ECB = 1, /* Electronic codebook. */ + GCRY_CIPHER_MODE_CFB = 2, /* Cipher feedback. */ + GCRY_CIPHER_MODE_CBC = 3, /* Cipher block chaining. 
*/ + GCRY_CIPHER_MODE_STREAM = 4, /* Used with stream ciphers. */ + GCRY_CIPHER_MODE_OFB = 5, /* Outer feedback. */ + GCRY_CIPHER_MODE_CTR = 6, /* Counter. */ + GCRY_CIPHER_MODE_AESWRAP = 7, /* AES-WRAP algorithm. */ + GCRY_CIPHER_MODE_CCM = 8, /* Counter with CBC-MAC. */ + GCRY_CIPHER_MODE_GCM = 9, /* Galois Counter Mode. */ + GCRY_CIPHER_MODE_POLY1305 = 10, /* Poly1305 based AEAD mode. */ }; /* Flags used with the open function. */ diff --git a/tests/basic.c b/tests/basic.c index de10617..9740919 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -1620,6 +1620,320 @@ check_gcm_cipher (void) static void +_check_poly1305_cipher (unsigned int step) +{ + struct tv + { + int algo; + char key[MAX_DATA_LEN]; + char iv[MAX_DATA_LEN]; + int ivlen; + unsigned char aad[MAX_DATA_LEN]; + int aadlen; + unsigned char plaintext[MAX_DATA_LEN]; + int inlen; + char out[MAX_DATA_LEN]; + char tag[MAX_DATA_LEN]; + } tv[] = + { + /* draft-agl-tls-chacha20poly1305-04 */ + { GCRY_CIPHER_CHACHA20, + "\x42\x90\xbc\xb1\x54\x17\x35\x31\xf3\x14\xaf\x57\xf3\xbe\x3b\x50" + "\x06\xda\x37\x1e\xce\x27\x2a\xfa\x1b\x5d\xbd\xd1\x10\x0a\x10\x07", + "\xcd\x7c\xf6\x7b\xe3\x9c\x79\x4a", 8, + "\x87\xe2\x29\xd4\x50\x08\x45\xa0\x79\xc0", 10, + "\x86\xd0\x99\x74\x84\x0b\xde\xd2\xa5\xca", 10, + "\xe3\xe4\x46\xf7\xed\xe9\xa1\x9b\x62\xa4", + "\x67\x7d\xab\xf4\xe3\xd2\x4b\x87\x6b\xb2\x84\x75\x38\x96\xe1\xd6" }, + /* draft-nir-cfrg-chacha20-poly1305-03 */ + { GCRY_CIPHER_CHACHA20, + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", + "\x07\x00\x00\x00\x40\x41\x42\x43\x44\x45\x46\x47", 12, + "\x50\x51\x52\x53\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7", 12, + "Ladies and Gentlemen of the class of '99: If I could offer you " + "only one tip for the future, sunscreen would be it.", 114, + "\xd3\x1a\x8d\x34\x64\x8e\x60\xdb\x7b\x86\xaf\xbc\x53\xef\x7e\xc2" + "\xa4\xad\xed\x51\x29\x6e\x08\xfe\xa9\xe2\xb5\xa7\x36\xee\x62\xd6" + "\x3d\xbe\xa4\x5e\x8c\xa9\x67\x12\x82\xfa\xfb\x69\xda\x92\x72\x8b" + "\x1a\x71\xde\x0a\x9e\x06\x0b\x29\x05\xd6\xa5\xb6\x7e\xcd\x3b\x36" + "\x92\xdd\xbd\x7f\x2d\x77\x8b\x8c\x98\x03\xae\xe3\x28\x09\x1b\x58" + "\xfa\xb3\x24\xe4\xfa\xd6\x75\x94\x55\x85\x80\x8b\x48\x31\xd7\xbc" + "\x3f\xf4\xde\xf0\x8e\x4b\x7a\x9d\xe5\x76\xd2\x65\x86\xce\xc6\x4b" + "\x61\x16", + "\x18\xfb\x11\xa5\x03\x1a\xd1\x3a\x7e\x3b\x03\xd4\x6e\xe3\xa6\xa7" } + }; + + gcry_cipher_hd_t hde, hdd; + unsigned char out[MAX_DATA_LEN]; + unsigned char tag[16]; + int i, keylen; + gcry_error_t err = 0; + size_t pos, poslen; + int byteNum; + + if (verbose) + fprintf (stderr, " Starting POLY1305 checks.\n"); + + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) + { + if (verbose) + fprintf (stderr, " checking POLY1305 mode for %s [%i]\n", + gcry_cipher_algo_name (tv[i].algo), + tv[i].algo); + err = gcry_cipher_open (&hde, tv[i].algo, GCRY_CIPHER_MODE_POLY1305, 0); + if (!err) + err = gcry_cipher_open (&hdd, tv[i].algo, GCRY_CIPHER_MODE_POLY1305, 0); + if (err) + { + fail ("poly1305, gcry_cipher_open failed: %s\n", gpg_strerror (err)); + return; + } + + keylen = gcry_cipher_get_algo_keylen(tv[i].algo); + if (!keylen) + { + fail ("poly1305, gcry_cipher_get_algo_keylen failed\n"); + return; + } + + err = gcry_cipher_setkey (hde, tv[i].key, keylen); + if (!err) + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); + if (err) + { + fail ("poly1305, gcry_cipher_setkey failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_setiv 
(hde, tv[i].iv, tv[i].ivlen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].iv, tv[i].ivlen); + if (err) + { + fail ("poly1305, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + for (pos = 0; pos < tv[i].aadlen; pos += step) + { + poslen = (pos + step < tv[i].aadlen) ? step : tv[i].aadlen - pos; + + err = gcry_cipher_authenticate(hde, tv[i].aad + pos, poslen); + if (err) + { + fail ("poly1305, gcry_cipher_authenticate (%d) (%d:%d) failed: " + "%s\n", i, pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + err = gcry_cipher_authenticate(hdd, tv[i].aad + pos, poslen); + if (err) + { + fail ("poly1305, de gcry_cipher_authenticate (%d) (%d:%d) failed: " + "%s\n", i, pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + for (pos = 0; pos < tv[i].inlen; pos += step) + { + poslen = (pos + step < tv[i].inlen) ? step : tv[i].inlen - pos; + + err = gcry_cipher_encrypt (hde, out + pos, poslen, + tv[i].plaintext + pos, poslen); + if (err) + { + fail ("poly1305, gcry_cipher_encrypt (%d) (%d:%d) failed: %s\n", + i, pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("poly1305, encrypt mismatch entry %d (step %d)\n", i, step); + + for (pos = 0; pos < tv[i].inlen; pos += step) + { + poslen = (pos + step < tv[i].inlen) ? step : tv[i].inlen - pos; + + err = gcry_cipher_decrypt (hdd, out + pos, poslen, NULL, 0); + if (err) + { + fail ("poly1305, gcry_cipher_decrypt (%d) (%d:%d) failed: %s\n", + i, pos, step, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("poly1305, decrypt mismatch entry %d (step %d)\n", i, step); + + err = gcry_cipher_gettag (hde, out, 16); + if (err) + { + fail ("poly1305, gcry_cipher_gettag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].tag, out, 16)) + fail ("poly1305, encrypt tag mismatch entry %d\n", i); + + + err = gcry_cipher_checktag (hdd, out, 16); + if (err) + { + fail ("poly1305, gcry_cipher_checktag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_reset(hde); + if (!err) + err = gcry_cipher_reset(hdd); + if (err) + { + fail ("poly1305, gcry_cipher_reset (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* gcry_cipher_reset clears the IV */ + err = gcry_cipher_setiv (hde, tv[i].iv, tv[i].ivlen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].iv, tv[i].ivlen); + if (err) + { + fail ("poly1305, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* this time we authenticate, encrypt and decrypt one byte at a time */ + for (byteNum = 0; byteNum < tv[i].aadlen; ++byteNum) + { + err = gcry_cipher_authenticate(hde, tv[i].aad + byteNum, 1); + if (err) + { + fail ("poly1305, gcry_cipher_authenticate (%d) (byte-buf) failed: " + "%s\n", i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + err = gcry_cipher_authenticate(hdd, tv[i].aad + byteNum, 1); + if (err) + { + fail ("poly1305, de gcry_cipher_authenticate (%d) (byte-buf) " + "failed: %s\n", 
i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_encrypt (hde, out+byteNum, 1, + (tv[i].plaintext) + byteNum, + 1); + if (err) + { + fail ("poly1305, gcry_cipher_encrypt (%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("poly1305, encrypt mismatch entry %d, (byte-buf)\n", i); + + err = gcry_cipher_gettag (hde, tag, 16); + if (err) + { + fail ("poly1305, gcry_cipher_gettag(%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].tag, tag, 16)) + fail ("poly1305, encrypt tag mismatch entry %d, (byte-buf)\n", i); + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_decrypt (hdd, out+byteNum, 1, NULL, 0); + if (err) + { + fail ("poly1305, gcry_cipher_decrypt (%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("poly1305, decrypt mismatch entry %d\n", i); + + err = gcry_cipher_checktag (hdd, tag, 16); + if (err) + { + fail ("poly1305, gcry_cipher_checktag(%d) (byte-buf) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + } + if (verbose) + fprintf (stderr, " Completed POLY1305 checks.\n"); +} + + +static void +check_poly1305_cipher (void) +{ + /* Large buffers, no splitting. */ + _check_poly1305_cipher(0xffffffff); + /* Split input to one byte buffers. */ + _check_poly1305_cipher(1); + /* Split input to 7 byte buffers. */ + _check_poly1305_cipher(7); + /* Split input to 16 byte buffers. 
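
The step sizes are not arbitrary: 0xffffffff processes each vector in a single call; steps of 1 and 7 force both Poly1305's 16-byte block buffer and the stream cipher's 64-byte pad to be refilled from leftovers on nearly every call; and 16 is Poly1305-block aligned while still splitting ChaCha20 blocks (for the 114-byte vector, 7 * 16 + 2 = 114, so the final call is a 2-byte tail).
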
*/ + _check_poly1305_cipher(16); +} + + +static void check_ccm_cipher (void) { #ifdef HAVE_U64_TYPEDEF @@ -4019,6 +4333,10 @@ check_ciphers (void) gcry_cipher_algo_name (algos2[i])); check_one_cipher (algos2[i], GCRY_CIPHER_MODE_STREAM, 0); + if (algos2[i] == GCRY_CIPHER_CHACHA20 || + algos2[i] == GCRY_CIPHER_SALSA20 || + algos2[i] == GCRY_CIPHER_SALSA20R12) + check_one_cipher (algos2[i], GCRY_CIPHER_MODE_POLY1305, 0); } /* we have now run all cipher's selftests */ @@ -4042,6 +4360,7 @@ check_cipher_modes(void) check_ofb_cipher (); check_ccm_cipher (); check_gcm_cipher (); + check_poly1305_cipher (); check_stream_cipher (); check_stream_cipher_large_block (); diff --git a/tests/bench-slope.c b/tests/bench-slope.c index a911ef8..7bf587f 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -907,15 +907,14 @@ static struct bench_ops ccm_authenticate_ops = { static void -bench_gcm_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) +bench_aead_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen, + const char *nonce, size_t noncelen) { gcry_cipher_hd_t hd = obj->priv; int err; char tag[16]; - char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, - 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; - gcry_cipher_setiv (hd, nonce, sizeof (nonce)); + gcry_cipher_setiv (hd, nonce, noncelen); err = gcry_cipher_encrypt (hd, buf, buflen, buf, buflen); if (err) @@ -937,15 +936,14 @@ bench_gcm_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) } static void -bench_gcm_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) +bench_aead_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen, + const char *nonce, size_t noncelen) { gcry_cipher_hd_t hd = obj->priv; int err; char tag[16] = { 0, }; - char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, - 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; - gcry_cipher_setiv (hd, nonce, sizeof (nonce)); + gcry_cipher_setiv (hd, nonce, noncelen); err = gcry_cipher_decrypt (hd, buf, buflen, buf, buflen); if (err) @@ -969,17 +967,16 @@ bench_gcm_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) } static void -bench_gcm_authenticate_do_bench (struct bench_obj *obj, void *buf, - size_t buflen) +bench_aead_authenticate_do_bench (struct bench_obj *obj, void *buf, + size_t buflen, const char *nonce, + size_t noncelen) { gcry_cipher_hd_t hd = obj->priv; int err; char tag[16] = { 0, }; - char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, - 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; char data = 0xff; - gcry_cipher_setiv (hd, nonce, sizeof (nonce)); + gcry_cipher_setiv (hd, nonce, noncelen); err = gcry_cipher_authenticate (hd, buf, buflen); if (err) @@ -1009,6 +1006,34 @@ bench_gcm_authenticate_do_bench (struct bench_obj *obj, void *buf, } } + +static void +bench_gcm_encrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; + bench_aead_encrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_gcm_decrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; + bench_aead_decrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_gcm_authenticate_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[12] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, + 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; + 
bench_aead_authenticate_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + static struct bench_ops gcm_encrypt_ops = { &bench_encrypt_init, &bench_encrypt_free, @@ -1028,6 +1053,49 @@ static struct bench_ops gcm_authenticate_ops = { }; +static void +bench_poly1305_encrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[8] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + bench_aead_encrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_poly1305_decrypt_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[8] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + bench_aead_decrypt_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static void +bench_poly1305_authenticate_do_bench (struct bench_obj *obj, void *buf, + size_t buflen) +{ + char nonce[8] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + bench_aead_authenticate_do_bench (obj, buf, buflen, nonce, sizeof(nonce)); +} + +static struct bench_ops poly1305_encrypt_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_poly1305_encrypt_do_bench +}; + +static struct bench_ops poly1305_decrypt_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_poly1305_decrypt_do_bench +}; + +static struct bench_ops poly1305_authenticate_ops = { + &bench_encrypt_init, + &bench_encrypt_free, + &bench_poly1305_authenticate_do_bench +}; + + static struct bench_cipher_mode cipher_modes[] = { {GCRY_CIPHER_MODE_ECB, "ECB enc", &encrypt_ops}, {GCRY_CIPHER_MODE_ECB, "ECB dec", &decrypt_ops}, @@ -1047,6 +1115,9 @@ static struct bench_cipher_mode cipher_modes[] = { {GCRY_CIPHER_MODE_GCM, "GCM enc", &gcm_encrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM dec", &gcm_decrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM auth", &gcm_authenticate_ops}, + {GCRY_CIPHER_MODE_POLY1305, "POLY1305 enc", &poly1305_encrypt_ops}, + {GCRY_CIPHER_MODE_POLY1305, "POLY1305 dec", &poly1305_decrypt_ops}, + {GCRY_CIPHER_MODE_POLY1305, "POLY1305 auth", &poly1305_authenticate_ops}, {0}, }; @@ -1066,8 +1137,9 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) if (!blklen) return; - /* Stream cipher? Only test with ECB. */ - if (blklen == 1 && mode.mode != GCRY_CIPHER_MODE_ECB) + /* Stream cipher? Only test with "ECB" and POLY1305. */ + if (blklen == 1 && (mode.mode != GCRY_CIPHER_MODE_ECB && + mode.mode != GCRY_CIPHER_MODE_POLY1305)) return; if (blklen == 1 && mode.mode == GCRY_CIPHER_MODE_ECB) { @@ -1075,6 +1147,12 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) mode.name = mode.ops == &encrypt_ops ? "STREAM enc" : "STREAM dec"; } + /* Poly1305 has restrictions for cipher algorithm */ + if (mode.mode == GCRY_CIPHER_MODE_POLY1305 && + (algo != GCRY_CIPHER_SALSA20 && algo != GCRY_CIPHER_SALSA20R12 && + algo != GCRY_CIPHER_CHACHA20)) + return; + /* CCM has restrictions for block-size */ if (mode.mode == GCRY_CIPHER_MODE_CCM && blklen != GCRY_CCM_BLOCK_LEN) return; commit 73b3b75c2221a6e3bed4117e0a206a1193acd2ed Author: Jussi Kivilinna Date: Sun May 11 12:00:19 2014 +0300 Add Poly1305-AES (-Camellia, etc) MACs * cipher/mac-internal.h (_gcry_mac_type_spec_poly1305_aes) (_gcry_mac_type_spec_poly1305_camellia) (_gcry_mac_type_spec_poly1305_twofish) (_gcry_mac_type_spec_poly1305_serpent) (_gcry_mac_type_spec_poly1305_seed): New. * cipher/mac-poly1305.c (poly1305mac_context_s): Add 'hd' and 'nonce_set'. (poly1305mac_open, poly1305mac_close, poly1305mac_setkey): Add handling for Poly1305-*** MACs. (poly1305mac_prepare_key, poly1305mac_setiv): New. 
(poly1305mac_reset, poly1305mac_write, poly1305mac_read): Add handling for 'nonce_set'. (poly1305mac_ops): Add 'poly1305mac_setiv'. (_gcry_mac_type_spec_poly1305_aes) (_gcry_mac_type_spec_poly1305_camellia) (_gcry_mac_type_spec_poly1305_twofish) (_gcry_mac_type_spec_poly1305_serpent) (_gcry_mac_type_spec_poly1305_seed): New. * cipher/mac.c (mac_list): Add Poly1305-AES, Poly1305-Twofish, Poly1305-Serpent, Poly1305-SEED and Poly1305-Camellia. * src/gcrypt.h.in: Add 'GCRY_MAC_POLY1305_AES', 'GCRY_MAC_POLY1305_CAMELLIA', 'GCRY_MAC_POLY1305_TWOFISH', 'GCRY_MAC_POLY1305_SERPENT' and 'GCRY_MAC_POLY1305_SEED'. * tests/basic.c (check_mac): Add Poly1305-AES test vectors. * tests/bench-slope.c (bench_mac_init): Set IV for Poly1305-*** MACs. * tests/bench-slope.c (mac_bench): Set IV for Poly1305-*** MACs. -- Patch adds Bernstein's Poly1305-AES message authentication code to libgcrypt and other variants of Poly1305-<128-bit block cipher>. Signed-off-by: Jussi Kivilinna diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h index 81b6185..f65a8ae 100644 --- a/cipher/mac-internal.h +++ b/cipher/mac-internal.h @@ -217,3 +217,18 @@ extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia; * The Poly1305 MAC algorithm specifications (mac-poly1305.c). */ extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac; +#if USE_AES +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aes; +#endif +#if USE_CAMELLIA +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_camellia; +#endif +#if USE_TWOFISH +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_twofish; +#endif +#if USE_SERPENT +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_serpent; +#endif +#if USE_SEED +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed; +#endif diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c index e265b64..76b369a 100644 --- a/cipher/mac-poly1305.c +++ b/cipher/mac-poly1305.c @@ -30,8 +30,10 @@ struct poly1305mac_context_s { poly1305_context_t ctx; + gcry_cipher_hd_t hd; struct { unsigned int key_set:1; + unsigned int nonce_set:1; unsigned int tag:1; } marks; byte tag[POLY1305_TAGLEN]; @@ -44,6 +46,9 @@ poly1305mac_open (gcry_mac_hd_t h) { struct poly1305mac_context_s *mac_ctx; int secure = (h->magic == CTX_MAGIC_SECURE); + unsigned int flags = (secure ? GCRY_CIPHER_SECURE : 0); + gcry_err_code_t err; + int cipher_algo; if (secure) mac_ctx = xtrycalloc_secure (1, sizeof(*mac_ctx)); @@ -55,14 +60,71 @@ poly1305mac_open (gcry_mac_hd_t h) h->u.poly1305mac.ctx = mac_ctx; + switch (h->spec->algo) + { + default: + /* already checked. */ + case GCRY_MAC_POLY1305: + /* plain Poly1305. 
*/ + cipher_algo = -1; + return 0; + case GCRY_MAC_POLY1305_AES: + cipher_algo = GCRY_CIPHER_AES; + break; + case GCRY_MAC_POLY1305_CAMELLIA: + cipher_algo = GCRY_CIPHER_CAMELLIA128; + break; + case GCRY_MAC_POLY1305_TWOFISH: + cipher_algo = GCRY_CIPHER_TWOFISH; + break; + case GCRY_MAC_POLY1305_SERPENT: + cipher_algo = GCRY_CIPHER_SERPENT128; + break; + case GCRY_MAC_POLY1305_SEED: + cipher_algo = GCRY_CIPHER_SEED; + break; + } + + err = _gcry_cipher_open_internal (&mac_ctx->hd, cipher_algo, + GCRY_CIPHER_MODE_ECB, flags); + if (err) + goto err_free; + return 0; + +err_free: + xfree(h->u.poly1305mac.ctx); + return err; } static void poly1305mac_close (gcry_mac_hd_t h) { - xfree(h->u.poly1305mac.ctx); + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + + if (h->spec->algo != GCRY_MAC_POLY1305) + _gcry_cipher_close (mac_ctx->hd); + + xfree(mac_ctx); +} + + +static gcry_err_code_t +poly1305mac_prepare_key (gcry_mac_hd_t h, const unsigned char *key, size_t keylen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + size_t block_keylen = keylen - 16; + + /* Need at least 16 + 1 byte key. */ + if (keylen <= 16) + return GPG_ERR_INV_KEYLEN; + + /* For Poly1305-AES, first part of key is passed to Poly1305 as is. */ + memcpy (mac_ctx->key, key + block_keylen, 16); + + /* Remaining part is used as key for the block cipher. */ + return _gcry_cipher_setkey (mac_ctx->hd, key, block_keylen); } @@ -77,22 +139,74 @@ poly1305mac_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen) memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); mac_ctx->marks.key_set = 0; + mac_ctx->marks.nonce_set = 0; mac_ctx->marks.tag = 0; - if (keylen != POLY1305_KEYLEN) - return GPG_ERR_INV_KEYLEN; - - memcpy(mac_ctx->key, key, POLY1305_KEYLEN); + if (h->spec->algo != GCRY_MAC_POLY1305) + { + err = poly1305mac_prepare_key (h, key, keylen); + if (err) + return err; - err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); - if (err) + /* Poly1305-AES/etc also need nonce. */ + mac_ctx->marks.key_set = 1; + mac_ctx->marks.nonce_set = 0; + } + else { - memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); - return err; + /* For plain Poly1305, key is the nonce and setup is complete now. */ + + if (keylen != POLY1305_KEYLEN) + return GPG_ERR_INV_KEYLEN; + + memcpy (mac_ctx->key, key, keylen); + + err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); + if (err) + { + memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); + return err; + } + + mac_ctx->marks.key_set = 1; + mac_ctx->marks.nonce_set = 1; } - mac_ctx->marks.key_set = 1; + return 0; +} + + +static gcry_err_code_t +poly1305mac_setiv (gcry_mac_hd_t h, const unsigned char *iv, size_t ivlen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + gcry_err_code_t err; + + if (h->spec->algo == GCRY_MAC_POLY1305) + return GPG_ERR_INV_ARG; + + if (ivlen != 16) + return GPG_ERR_INV_ARG; + if (!mac_ctx->marks.key_set) + return 0; + + memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); + memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag)); + mac_ctx->marks.nonce_set = 0; + mac_ctx->marks.tag = 0; + + /* Prepare second part of the poly1305 key. 
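
In Bernstein's Poly1305-AES the tag is Poly1305_r(m) plus the pad AES_k(nonce) mod 2^128, so only the pad half of the internal 32-byte Poly1305 key depends on the nonce: setkey stores r (the trailing 16 bytes of the user key) plus the block-cipher key (the leading bytes), and setiv fills key+16 with AES_k(nonce). Seen from the public API, a sketch with placeholder key/nonce bytes (the real vectors from poly1305-20050329.pdf are in tests/basic.c):

#include <gcrypt.h>
#include <stdio.h>

int
main (void)
{
  gcry_mac_hd_t hd;
  /* Key layout: 16-byte AES key followed by the 16-byte Poly1305 'r'. */
  unsigned char key[32] = { 0 };    /* placeholder */
  unsigned char nonce[16] = { 0 };  /* must be fresh for every message */
  unsigned char tag[16];
  size_t taglen = sizeof tag;
  gcry_error_t err;

  err = gcry_mac_open (&hd, GCRY_MAC_POLY1305_AES, 0, NULL);
  if (!err)
    err = gcry_mac_setkey (hd, key, sizeof key);
  if (!err)
    err = gcry_mac_setiv (hd, nonce, sizeof nonce);
  if (!err)
    err = gcry_mac_write (hd, "msg", 3);
  if (!err)
    err = gcry_mac_read (hd, tag, &taglen);
  if (err)
    fprintf (stderr, "poly1305-aes failed: %s\n", gpg_strerror (err));
  gcry_mac_close (hd);
  return !!err;
}
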
*/ + + err = _gcry_cipher_encrypt (mac_ctx->hd, mac_ctx->key + 16, 16, iv, 16); + if (err) + return err; + + err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); + if (err) + return err; + + mac_ctx->marks.nonce_set = 1; return 0; } @@ -102,13 +216,14 @@ poly1305mac_reset (gcry_mac_hd_t h) { struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; - if (!mac_ctx->marks.key_set) + if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set) return GPG_ERR_INV_STATE; memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag)); mac_ctx->marks.key_set = 1; + mac_ctx->marks.nonce_set = 1; mac_ctx->marks.tag = 0; return _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); @@ -120,7 +235,8 @@ poly1305mac_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen) { struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; - if (!mac_ctx->marks.key_set || mac_ctx->marks.tag) + if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set || + mac_ctx->marks.tag) return GPG_ERR_INV_STATE; _gcry_poly1305_update (&mac_ctx->ctx, buf, buflen); @@ -133,7 +249,7 @@ poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen) { struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; - if (!mac_ctx->marks.key_set) + if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set) return GPG_ERR_INV_STATE; if (!mac_ctx->marks.tag) @@ -197,7 +313,7 @@ static gcry_mac_spec_ops_t poly1305mac_ops = { poly1305mac_open, poly1305mac_close, poly1305mac_setkey, - NULL, + poly1305mac_setiv, poly1305mac_reset, poly1305mac_write, poly1305mac_read, @@ -211,3 +327,33 @@ gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac = { GCRY_MAC_POLY1305, {0, 0}, "POLY1305", &poly1305mac_ops }; +#if USE_AES +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aes = { + GCRY_MAC_POLY1305_AES, {0, 0}, "POLY1305_AES", + &poly1305mac_ops +}; +#endif +#if USE_CAMELLIA +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_camellia = { + GCRY_MAC_POLY1305_CAMELLIA, {0, 0}, "POLY1305_CAMELLIA", + &poly1305mac_ops +}; +#endif +#if USE_TWOFISH +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_twofish = { + GCRY_MAC_POLY1305_TWOFISH, {0, 0}, "POLY1305_TWOFISH", + &poly1305mac_ops +}; +#endif +#if USE_SERPENT +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_serpent = { + GCRY_MAC_POLY1305_SERPENT, {0, 0}, "POLY1305_SERPENT", + &poly1305mac_ops +}; +#endif +#if USE_SEED +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed = { + GCRY_MAC_POLY1305_SEED, {0, 0}, "POLY1305_SEED", + &poly1305mac_ops +}; +#endif diff --git a/cipher/mac.c b/cipher/mac.c index e583369..30117b9 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -75,14 +75,17 @@ static gcry_mac_spec_t *mac_list[] = { #if USE_AES &_gcry_mac_type_spec_cmac_aes, &_gcry_mac_type_spec_gmac_aes, + &_gcry_mac_type_spec_poly1305mac_aes, #endif #if USE_TWOFISH &_gcry_mac_type_spec_cmac_twofish, &_gcry_mac_type_spec_gmac_twofish, + &_gcry_mac_type_spec_poly1305mac_twofish, #endif #if USE_SERPENT &_gcry_mac_type_spec_cmac_serpent, &_gcry_mac_type_spec_gmac_serpent, + &_gcry_mac_type_spec_poly1305mac_serpent, #endif #if USE_RFC2268 &_gcry_mac_type_spec_cmac_rfc2268, @@ -90,10 +93,12 @@ static gcry_mac_spec_t *mac_list[] = { #if USE_SEED &_gcry_mac_type_spec_cmac_seed, &_gcry_mac_type_spec_gmac_seed, + &_gcry_mac_type_spec_poly1305mac_seed, #endif #if USE_CAMELLIA &_gcry_mac_type_spec_cmac_camellia, &_gcry_mac_type_spec_gmac_camellia, + &_gcry_mac_type_spec_poly1305mac_camellia, #endif #ifdef USE_IDEA 
&_gcry_mac_type_spec_cmac_idea, diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 8648e96..3145020 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1343,7 +1343,12 @@ enum gcry_mac_algos GCRY_MAC_GMAC_SERPENT = 404, GCRY_MAC_GMAC_SEED = 405, - GCRY_MAC_POLY1305 = 501 + GCRY_MAC_POLY1305 = 501, + GCRY_MAC_POLY1305_AES = 502, + GCRY_MAC_POLY1305_CAMELLIA = 503, + GCRY_MAC_POLY1305_TWOFISH = 504, + GCRY_MAC_POLY1305_SERPENT = 505, + GCRY_MAC_POLY1305_SEED = 506 }; /* Flags used with the open function. */ diff --git a/tests/basic.c b/tests/basic.c index cc0f4c1..de10617 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -5596,6 +5596,39 @@ check_mac (void) "\x12\x97\x6a\x08\xc4\x42\x6d\x0c\xe8\xa8\x24\x07\xc4\xf4\x82\x07" "\x80\xf8\xc2\x0a\xa7\x12\x02\xd1\xe2\x91\x79\xcb\xcb\x55\x5a\x57", "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b" }, + /* from http://cr.yp.to/mac/poly1305-20050329.pdf */ + { GCRY_MAC_POLY1305_AES, + "\xf3\xf6", + "\xec\x07\x4c\x83\x55\x80\x74\x17\x01\x42\x5b\x62\x32\x35\xad\xd6" + "\x85\x1f\xc4\x0c\x34\x67\xac\x0b\xe0\x5c\xc2\x04\x04\xf3\xf7\x00", + "\xf4\xc6\x33\xc3\x04\x4f\xc1\x45\xf8\x4f\x33\x5c\xb8\x19\x53\xde", + "\xfb\x44\x73\x50\xc4\xe8\x68\xc5\x2a\xc3\x27\x5c\xf9\xd4\x32\x7e", + 0, 32 }, + { GCRY_MAC_POLY1305_AES, + "", + "\x75\xde\xaa\x25\xc0\x9f\x20\x8e\x1d\xc4\xce\x6b\x5c\xad\x3f\xbf" + "\xa0\xf3\x08\x00\x00\xf4\x64\x00\xd0\xc7\xe9\x07\x6c\x83\x44\x03", + "\xdd\x3f\xab\x22\x51\xf1\x1a\xc7\x59\xf0\x88\x71\x29\xcc\x2e\xe7", + "\x61\xee\x09\x21\x8d\x29\xb0\xaa\xed\x7e\x15\x4a\x2c\x55\x09\xcc", + 0, 32 }, + { GCRY_MAC_POLY1305_AES, + "\x66\x3c\xea\x19\x0f\xfb\x83\xd8\x95\x93\xf3\xf4\x76\xb6\xbc\x24" + "\xd7\xe6\x79\x10\x7e\xa2\x6a\xdb\x8c\xaf\x66\x52\xd0\x65\x61\x36", + "\x6a\xcb\x5f\x61\xa7\x17\x6d\xd3\x20\xc5\xc1\xeb\x2e\xdc\xdc\x74" + "\x48\x44\x3d\x0b\xb0\xd2\x11\x09\xc8\x9a\x10\x0b\x5c\xe2\xc2\x08", + "\x0e\xe1\xc1\x6b\xb7\x3f\x0f\x4f\xd1\x98\x81\x75\x3c\x01\xcd\xbe", + "\xae\x21\x2a\x55\x39\x97\x29\x59\x5d\xea\x45\x8b\xc6\x21\xff\x0e", + 0, 32 }, + { GCRY_MAC_POLY1305_AES, + "\xab\x08\x12\x72\x4a\x7f\x1e\x34\x27\x42\xcb\xed\x37\x4d\x94\xd1" + "\x36\xc6\xb8\x79\x5d\x45\xb3\x81\x98\x30\xf2\xc0\x44\x91\xfa\xf0" + "\x99\x0c\x62\xe4\x8b\x80\x18\xb2\xc3\xe4\xa0\xfa\x31\x34\xcb\x67" + "\xfa\x83\xe1\x58\xc9\x94\xd9\x61\xc4\xcb\x21\x09\x5c\x1b\xf9", + "\xe1\xa5\x66\x8a\x4d\x5b\x66\xa5\xf6\x8c\xc5\x42\x4e\xd5\x98\x2d" + "\x12\x97\x6a\x08\xc4\x42\x6d\x0c\xe8\xa8\x24\x07\xc4\xf4\x82\x07", + "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b", + "\x9a\xe8\x31\xe7\x43\x97\x8d\x3a\x23\x52\x7c\x71\x28\x14\x9e\x3a", + 0, 32 }, { 0 }, }; int i; diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 3d8ae37..a911ef8 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -1316,6 +1316,19 @@ bench_mac_init (struct bench_obj *obj) exit (1); } + switch (mode->algo) + { + default: + break; + case GCRY_MAC_POLY1305_AES: + case GCRY_MAC_POLY1305_CAMELLIA: + case GCRY_MAC_POLY1305_TWOFISH: + case GCRY_MAC_POLY1305_SERPENT: + case GCRY_MAC_POLY1305_SEED: + gcry_mac_setiv (hd, key, 16); + break; + } + obj->priv = hd; return 0; diff --git a/tests/benchmark.c b/tests/benchmark.c index 9fd716d..042e721 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -509,6 +509,18 @@ mac_bench ( const char *algoname ) for (i=0; i < bufsize; i++) buf[i] = i; + if (algo >= GCRY_MAC_POLY1305_AES && algo <= GCRY_MAC_POLY1305_SEED) + { + static const char iv[16] = { 1, 2, 3, 4, }; + err = gcry_mac_setiv(hd, iv, sizeof(iv)); + if (err) + { + fprintf 
(stderr, PGM ": error setting nonce for mac algorithm `%s': %s\n", + algoname, gpg_strerror (err)); + exit (1); + } + } + printf ("%-20s", gcry_mac_algo_name (algo)); start_timer (); commit b8794fed68ebe7567f4617141f0996ad290d9120 Author: Jussi Kivilinna Date: Sun May 11 12:00:19 2014 +0300 Add Poly1305 MAC * cipher/Makefile.am: Add 'mac-poly1305.c', 'poly1305.c' and 'poly1305-internal.h'. * cipher/mac-internal.h (poly1305mac_context_s): New. (gcry_mac_handle): Add 'u.poly1305mac'. (_gcry_mac_type_spec_poly1305mac): New. * cipher/mac-poly1305.c: New. * cipher/mac.c (mac_list): Add Poly1305. * cipher/poly1305-internal.h: New. * cipher/poly1305.c: New. * src/gcrypt.h.in: Add 'GCRY_MAC_POLY1305'. * tests/basic.c (check_mac): Add Poly1035 test vectors; Allow overriding lengths of data and key buffers. * tests/bench-slope.c (mac_bench): Increase max algo number from 500 to 600. * tests/benchmark.c (mac_bench): Ditto. -- Patch adds Bernstein's Poly1305 message authentication code to libgcrypt. Implementation is based on Andrew Moon's public domain implementation from: https://github.com/floodyberry/poly1305-opt The algorithm added by this patch is the plain Poly1305 without AES and takes 32-bit key that must not be reused. Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 26d13d2..a8b86e6 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -45,7 +45,8 @@ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ mac.c mac-internal.h \ -mac-hmac.c mac-cmac.c mac-gmac.c \ +mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \ +poly1305.c poly1305-internal.h \ kdf.c kdf-internal.h \ hmac-tests.c \ bithelp.h \ diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h index 9895a54..81b6185 100644 --- a/cipher/mac-internal.h +++ b/cipher/mac-internal.h @@ -17,9 +17,17 @@ * License along with this program; if not, see . */ +#include + +#include "g10lib.h" + + /* The data object used to hold a handle to an encryption object. */ struct gcry_mac_handle; +/* The data object used to hold poly1305-mac context. */ +struct poly1305mac_context_s; + /* * @@ -84,7 +92,6 @@ typedef struct gcry_mac_spec } gcry_mac_spec_t; - /* The handle structure. */ struct gcry_mac_handle { @@ -106,6 +113,9 @@ struct gcry_mac_handle gcry_cipher_hd_t ctx; int cipher_algo; } gmac; + struct { + struct poly1305mac_context_s *ctx; + } poly1305mac; } u; }; @@ -202,3 +212,8 @@ extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_seed; #if USE_CAMELLIA extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia; #endif + +/* + * The Poly1305 MAC algorithm specifications (mac-poly1305.c). + */ +extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac; diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c new file mode 100644 index 0000000..e265b64 --- /dev/null +++ b/cipher/mac-poly1305.c @@ -0,0 +1,213 @@ +/* mac-poly1305.c - Poly1305 based MACs + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include +#include +#include +#include +#include + +#include "g10lib.h" +#include "mac-internal.h" +#include "poly1305-internal.h" + + +struct poly1305mac_context_s { + poly1305_context_t ctx; + struct { + unsigned int key_set:1; + unsigned int tag:1; + } marks; + byte tag[POLY1305_TAGLEN]; + byte key[POLY1305_KEYLEN]; +}; + + +static gcry_err_code_t +poly1305mac_open (gcry_mac_hd_t h) +{ + struct poly1305mac_context_s *mac_ctx; + int secure = (h->magic == CTX_MAGIC_SECURE); + + if (secure) + mac_ctx = xtrycalloc_secure (1, sizeof(*mac_ctx)); + else + mac_ctx = xtrycalloc (1, sizeof(*mac_ctx)); + + if (!mac_ctx) + return gpg_err_code_from_syserror (); + + h->u.poly1305mac.ctx = mac_ctx; + + return 0; +} + + +static void +poly1305mac_close (gcry_mac_hd_t h) +{ + xfree(h->u.poly1305mac.ctx); +} + + +static gcry_err_code_t +poly1305mac_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + gcry_err_code_t err; + + memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); + memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag)); + memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); + + mac_ctx->marks.key_set = 0; + mac_ctx->marks.tag = 0; + + if (keylen != POLY1305_KEYLEN) + return GPG_ERR_INV_KEYLEN; + + memcpy(mac_ctx->key, key, POLY1305_KEYLEN); + + err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); + if (err) + { + memset(&mac_ctx->key, 0, sizeof(mac_ctx->key)); + return err; + } + + mac_ctx->marks.key_set = 1; + + return 0; +} + + +static gcry_err_code_t +poly1305mac_reset (gcry_mac_hd_t h) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + + if (!mac_ctx->marks.key_set) + return GPG_ERR_INV_STATE; + + memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); + memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag)); + + mac_ctx->marks.key_set = 1; + mac_ctx->marks.tag = 0; + + return _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN); +} + + +static gcry_err_code_t +poly1305mac_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + + if (!mac_ctx->marks.key_set || mac_ctx->marks.tag) + return GPG_ERR_INV_STATE; + + _gcry_poly1305_update (&mac_ctx->ctx, buf, buflen); + return 0; +} + + +static gcry_err_code_t +poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + + if (!mac_ctx->marks.key_set) + return GPG_ERR_INV_STATE; + + if (!mac_ctx->marks.tag) + { + _gcry_poly1305_finish(&mac_ctx->ctx, mac_ctx->tag); + + memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx)); + mac_ctx->marks.tag = 1; + } + + if (*outlen <= POLY1305_TAGLEN) + buf_cpy (outbuf, mac_ctx->tag, *outlen); + else + { + buf_cpy (outbuf, mac_ctx->tag, POLY1305_TAGLEN); + *outlen = POLY1305_TAGLEN; + } + + return 0; +} + + +static gcry_err_code_t +poly1305mac_verify (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen) +{ + struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx; + gcry_err_code_t err; + size_t outlen = 0; + + /* Check and finalize tag. */ + err = poly1305mac_read(h, NULL, &outlen); + if (err) + return err; + + if (buflen > POLY1305_TAGLEN) + return GPG_ERR_INV_LENGTH; + + return buf_eq_const (buf, mac_ctx->tag, buflen) ? 
0 : GPG_ERR_CHECKSUM; +} + + +static unsigned int +poly1305mac_get_maclen (int algo) +{ + (void)algo; + + return POLY1305_TAGLEN; +} + + +static unsigned int +poly1305mac_get_keylen (int algo) +{ + (void)algo; + + return POLY1305_KEYLEN; +} + + +static gcry_mac_spec_ops_t poly1305mac_ops = { + poly1305mac_open, + poly1305mac_close, + poly1305mac_setkey, + NULL, + poly1305mac_reset, + poly1305mac_write, + poly1305mac_read, + poly1305mac_verify, + poly1305mac_get_maclen, + poly1305mac_get_keylen +}; + + +gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac = { + GCRY_MAC_POLY1305, {0, 0}, "POLY1305", + &poly1305mac_ops +}; diff --git a/cipher/mac.c b/cipher/mac.c index 7805467..e583369 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -101,6 +101,7 @@ static gcry_mac_spec_t *mac_list[] = { #if USE_GOST28147 &_gcry_mac_type_spec_cmac_gost28147, #endif + &_gcry_mac_type_spec_poly1305mac, NULL, }; diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h new file mode 100644 index 0000000..d2c6b5c --- /dev/null +++ b/cipher/poly1305-internal.h @@ -0,0 +1,93 @@ +/* poly1305-internal.h - Poly1305 internals + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef G10_POLY1305_INTERNAL_H +#define G10_POLY1305_INTERNAL_H + +#include +#include +#include +#include +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" + + +#define POLY1305_TAGLEN 16 +#define POLY1305_KEYLEN 32 + + +/* Block-size used in default implementation. */ +#define POLY1305_REF_BLOCKSIZE 16 + +/* State size of default implementation. */ +#define POLY1305_REF_STATESIZE 64 + +/* State alignment for default implementation. */ +#define POLY1305_REF_ALIGNMENT sizeof(void *) + + +/* Largest block-size used in any implementation (optimized implementations + * might use block-size multiple of 16). */ +#define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE + +/* Largest state-size used in any implementation. */ +#define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE + +/* Minimum alignment for state pointer passed to implementations. 
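
The context reserves POLY1305_LARGEST_STATESIZE + POLY1305_STATE_ALIGNMENT bytes and poly1305_get_state() (below) rounds the state pointer up to the next alignment boundary, so optimized implementations with stricter requirements (e.g. 16-byte aligned SSE2/AVX state) can be dropped in without changing the context layout. The round-up idiom in isolation, as a standalone sketch rather than patch code:

#include <stdio.h>
#include <stdint.h>

#define ALIGN 16u   /* any power of two; the patch uses sizeof(void *) */

static void *
align_up (void *p)
{
  uint8_t *c = p;

  c += ALIGN - 1;                    /* step past the next boundary...  */
  c -= (uintptr_t) c & (ALIGN - 1);  /* ...then mask back down onto it  */
  return c;
}

int
main (void)
{
  uint8_t buf[64 + ALIGN];           /* state size + worst-case padding */
  void *state = align_up (buf);

  printf ("offset into buf: %u\n",
          (unsigned) ((uint8_t *) state - buf));
  return 0;
}
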
*/ +#define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT + + +typedef struct poly1305_key_s +{ + byte b[POLY1305_KEYLEN]; +} poly1305_key_t; + + +typedef struct poly1305_ops_s +{ + size_t block_size; + void (*init_ext) (void *ctx, const poly1305_key_t * key); + unsigned int (*blocks) (void *ctx, const byte * m, size_t bytes); + unsigned int (*finish_ext) (void *ctx, const byte * m, size_t remaining, + byte mac[POLY1305_TAGLEN]); +} poly1305_ops_t; + + +typedef struct poly1305_context_s +{ + byte state[POLY1305_LARGEST_STATESIZE + POLY1305_STATE_ALIGNMENT]; + byte buffer[POLY1305_LARGEST_BLOCKSIZE]; + const poly1305_ops_t *ops; + unsigned int leftover; +} poly1305_context_t; + + +gcry_err_code_t _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, + size_t keylen); + +void _gcry_poly1305_finish (poly1305_context_t * ctx, + byte mac[POLY1305_TAGLEN]); + +void _gcry_poly1305_update (poly1305_context_t * ctx, const byte * buf, + size_t buflen); + + +#endif /* G10_POLY1305_INTERNAL_H */ diff --git a/cipher/poly1305.c b/cipher/poly1305.c new file mode 100644 index 0000000..472ae42 --- /dev/null +++ b/cipher/poly1305.c @@ -0,0 +1,766 @@ +/* poly1305.c - Poly1305 internals and generic implementation + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* The code is based on public-domain Poly1305 implementation by + * Andrew Moon at + * https://github.com/floodyberry/poly1305-opt + */ + +#include +#include +#include +#include + +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "poly1305-internal.h" + + +static const char *selftest (void); + + + +#ifdef HAVE_U64_TYPEDEF + +/* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit + * multiplication and 64 bit addition. 
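
For reference, the arithmetic behind the 26-bit limb split used here:

  p = 2^130 - 5, so 2^130 == 5 (mod p)
  h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104

In the product h*r, a term h_i*r_j with i+j >= 5 carries a factor of 2^130, i.e. it re-enters at position i+j-5 multiplied by 5; that is why s1..s4 = 5*r1..5*r4 are precomputed and why the final carry does 'h0 += c * 5'. The bounds also work out: the clamped r limbs stay below 2^26, so each s_j is below 2^29; with the h limbs only slightly above 2^26, each 32x32-bit product is below roughly 2^56 and a five-term sum below roughly 2^59, comfortably inside a u64.
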
+ */ + +typedef struct poly1305_state_ref32_s +{ + u32 r[5]; + u32 h[5]; + u32 pad[4]; + byte final; +} poly1305_state_ref32_t; + + +static void +poly1305_init_ext_ref32 (void *state, const poly1305_key_t * key) +{ + poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; + + gcry_assert (sizeof (*st) + POLY1305_STATE_ALIGNMENT <= + sizeof (((poly1305_context_t *) 0)->state)); + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = (buf_get_le32 (&key->b[0])) & 0x3ffffff; + st->r[1] = (buf_get_le32 (&key->b[3]) >> 2) & 0x3ffff03; + st->r[2] = (buf_get_le32 (&key->b[6]) >> 4) & 0x3ffc0ff; + st->r[3] = (buf_get_le32 (&key->b[9]) >> 6) & 0x3f03fff; + st->r[4] = (buf_get_le32 (&key->b[12]) >> 8) & 0x00fffff; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + + /* save pad for later */ + st->pad[0] = buf_get_le32 (&key->b[16]); + st->pad[1] = buf_get_le32 (&key->b[20]); + st->pad[2] = buf_get_le32 (&key->b[24]); + st->pad[3] = buf_get_le32 (&key->b[28]); + + st->final = 0; +} + + +static unsigned int +poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes) +{ + poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; + const u32 hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */ + u32 r0, r1, r2, r3, r4; + u32 s1, s2, s3, s4; + u32 h0, h1, h2, h3, h4; + u64 d0, d1, d2, d3, d4; + u32 c; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + r4 = st->r[4]; + + s1 = r1 * 5; + s2 = r2 * 5; + s3 = r3 * 5; + s4 = r4 * 5; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (bytes >= POLY1305_REF_BLOCKSIZE) + { + /* h += m[i] */ + h0 += (buf_get_le32 (m + 0)) & 0x3ffffff; + h1 += (buf_get_le32 (m + 3) >> 2) & 0x3ffffff; + h2 += (buf_get_le32 (m + 6) >> 4) & 0x3ffffff; + h3 += (buf_get_le32 (m + 9) >> 6) & 0x3ffffff; + h4 += (buf_get_le32 (m + 12) >> 8) | hibit; + + /* h *= r */ + d0 = + ((u64) h0 * r0) + ((u64) h1 * s4) + + ((u64) h2 * s3) + ((u64) h3 * s2) + ((u64) h4 * s1); + d1 = + ((u64) h0 * r1) + ((u64) h1 * r0) + + ((u64) h2 * s4) + ((u64) h3 * s3) + ((u64) h4 * s2); + d2 = + ((u64) h0 * r2) + ((u64) h1 * r1) + + ((u64) h2 * r0) + ((u64) h3 * s4) + ((u64) h4 * s3); + d3 = + ((u64) h0 * r3) + ((u64) h1 * r2) + + ((u64) h2 * r1) + ((u64) h3 * r0) + ((u64) h4 * s4); + d4 = + ((u64) h0 * r4) + ((u64) h1 * r3) + + ((u64) h2 * r2) + ((u64) h3 * r1) + ((u64) h4 * r0); + + /* (partial) h %= p */ + c = (u32) (d0 >> 26); + h0 = (u32) d0 & 0x3ffffff; + d1 += c; + c = (u32) (d1 >> 26); + h1 = (u32) d1 & 0x3ffffff; + d2 += c; + c = (u32) (d2 >> 26); + h2 = (u32) d2 & 0x3ffffff; + d3 += c; + c = (u32) (d3 >> 26); + h3 = (u32) d3 & 0x3ffffff; + d4 += c; + c = (u32) (d4 >> 26); + h4 = (u32) d4 & 0x3ffffff; + h0 += c * 5; + c = (h0 >> 26); + h0 = h0 & 0x3ffffff; + h1 += c; + + m += POLY1305_REF_BLOCKSIZE; + bytes -= POLY1305_REF_BLOCKSIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; + + return (16 * sizeof (u32) + 5 * sizeof (u64) + 5 * sizeof (void *)); +} + + +static unsigned int +poly1305_finish_ext_ref32 (void *state, const byte * m, + size_t remaining, byte mac[POLY1305_TAGLEN]) +{ + poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state; + u32 h0, h1, h2, h3, h4, c; + u32 g0, g1, g2, g3, g4; + u64 f; + u32 mask; + unsigned int burn = 0; + + /* process the remaining block */ + if (remaining) + { + byte final[POLY1305_REF_BLOCKSIZE] = { 0 }; + size_t i; + for (i = 0; i < remaining; i++) + final[i] = m[i]; + 
final[remaining] = 1; + st->final = 1; + burn = poly1305_blocks_ref32 (st, final, POLY1305_REF_BLOCKSIZE); + } + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + c = h1 >> 26; + h1 = h1 & 0x3ffffff; + h2 += c; + c = h2 >> 26; + h2 = h2 & 0x3ffffff; + h3 += c; + c = h3 >> 26; + h3 = h3 & 0x3ffffff; + h4 += c; + c = h4 >> 26; + h4 = h4 & 0x3ffffff; + h0 += c * 5; + c = h0 >> 26; + h0 = h0 & 0x3ffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; + c = g0 >> 26; + g0 &= 0x3ffffff; + g1 = h1 + c; + c = g1 >> 26; + g1 &= 0x3ffffff; + g2 = h2 + c; + c = g2 >> 26; + g2 &= 0x3ffffff; + g3 = h3 + c; + c = g3 >> 26; + g3 &= 0x3ffffff; + g4 = h4 + c - (1 << 26); + + /* select h if h < p, or h + -p if h >= p */ + mask = (g4 >> ((sizeof (u32) * 8) - 1)) - 1; + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + /* h = h % (2^128) */ + h0 = ((h0) | (h1 << 26)) & 0xffffffff; + h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; + h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; + h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; + + /* mac = (h + pad) % (2^128) */ + f = (u64) h0 + st->pad[0]; + h0 = (u32) f; + f = (u64) h1 + st->pad[1] + (f >> 32); + h1 = (u32) f; + f = (u64) h2 + st->pad[2] + (f >> 32); + h2 = (u32) f; + f = (u64) h3 + st->pad[3] + (f >> 32); + h3 = (u32) f; + + buf_put_le32 (mac + 0, h0); + buf_put_le32 (mac + 4, h1); + buf_put_le32 (mac + 8, h2); + buf_put_le32 (mac + 12, h3); + + /* zero out the state */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + st->r[0] = 0; + st->r[1] = 0; + st->r[2] = 0; + st->r[3] = 0; + st->r[4] = 0; + st->pad[0] = 0; + st->pad[1] = 0; + st->pad[2] = 0; + st->pad[3] = 0; + + /* burn_stack */ + return (13 * sizeof (u32) + sizeof (u64) + + POLY1305_REF_BLOCKSIZE + 6 * sizeof (void *)) + burn; +} + + +static const poly1305_ops_t poly1305_default_ops = { + POLY1305_REF_BLOCKSIZE, + poly1305_init_ext_ref32, + poly1305_blocks_ref32, + poly1305_finish_ext_ref32 +}; + +#else /* !HAVE_U64_TYPEDEF */ + +/* Reference unoptimized poly1305 implementation using 8 bit * 8 bit = 16 bit + * multiplication and 16 bit addition, used when we don't have 'u64'. 
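
Here the limbs are single bytes, seventeen of them (136 bits), so the 131-bit intermediate values still fit. The wrap factor changes accordingly: for j > i the product h[j]*r[i+17-j] carries weight 2^(8*(i+17)) = 2^(8i) * 2^136, and

  2^136 = 2^6 * 2^130 == 2^6 * 5 = 320 = (5 << 6)  (mod 2^130 - 5)
  (v << 8) + (v << 6) = 256*v + 64*v = 320*v

which is exactly the '(5 << 6)' multiplier in poly1305_blocks_ref8 below. Likewise poly1305_squeeze_ref8 masks h[16] to two bits (keeping h below 2^130) and folds the bits above back in multiplied by 5.
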
+ */ + +typedef struct poly1305_state_ref8_t +{ + byte h[17]; + byte r[17]; + byte pad[17]; + byte final; +} poly1305_state_ref8_t; + + +static void +poly1305_init_ext_ref8 (void *state, const poly1305_key_t * key) +{ + poly1305_state_ref8_t *st = (poly1305_state_ref8_t *) state; + size_t i; + + /* h = 0 */ + for (i = 0; i < 17; i++) + st->h[i] = 0; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = key->b[0]; + st->r[1] = key->b[1]; + st->r[2] = key->b[2]; + st->r[3] = key->b[3] & 0x0f; + st->r[4] = key->b[4] & 0xfc; + st->r[5] = key->b[5]; + st->r[6] = key->b[6]; + st->r[7] = key->b[7] & 0x0f; + st->r[8] = key->b[8] & 0xfc; + st->r[9] = key->b[9]; + st->r[10] = key->b[10]; + st->r[11] = key->b[11] & 0x0f; + st->r[12] = key->b[12] & 0xfc; + st->r[13] = key->b[13]; + st->r[14] = key->b[14]; + st->r[15] = key->b[15] & 0x0f; + st->r[16] = 0; + + /* save pad for later */ + for (i = 0; i < 16; i++) + st->pad[i] = key->b[i + 16]; + st->pad[16] = 0; + + st->final = 0; +} + + +static void +poly1305_add_ref8 (byte h[17], const byte c[17]) +{ + u16 u; + unsigned int i; + for (u = 0, i = 0; i < 17; i++) + { + u += (u16) h[i] + (u16) c[i]; + h[i] = (byte) u & 0xff; + u >>= 8; + } +} + + +static void +poly1305_squeeze_ref8 (byte h[17], u32 hr[17]) +{ + u32 u; + unsigned int i; + u = 0; + for (i = 0; i < 16; i++) + { + u += hr[i]; + h[i] = (byte) u & 0xff; + u >>= 8; + } + u += hr[16]; + h[16] = (byte) u & 0x03; + u >>= 2; + u += (u << 2); /* u *= 5; */ + for (i = 0; i < 16; i++) + { + u += h[i]; + h[i] = (byte) u & 0xff; + u >>= 8; + } + h[16] += (byte) u; +} + + +static void +poly1305_freeze_ref8 (byte h[17]) +{ + static const byte minusp[17] = { + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfc + }; + byte horig[17], negative; + unsigned int i; + + /* compute h + -p */ + for (i = 0; i < 17; i++) + horig[i] = h[i]; + poly1305_add_ref8 (h, minusp); + + /* select h if h < p, or h + -p if h >= p */ + negative = -(h[16] >> 7); + for (i = 0; i < 17; i++) + h[i] ^= negative & (horig[i] ^ h[i]); +} + + +static unsigned int +poly1305_blocks_ref8 (void *state, const byte * m, size_t bytes) +{ + poly1305_state_ref8_t *st = (poly1305_state_ref8_t *) state; + const byte hibit = st->final ^ 1; /* 1 << 128 */ + + while (bytes >= POLY1305_REF_BLOCKSIZE) + { + u32 hr[17], u; + byte c[17]; + unsigned int i, j; + + /* h += m */ + for (i = 0; i < 16; i++) + c[i] = m[i]; + c[16] = hibit; + poly1305_add_ref8 (st->h, c); + + /* h *= r */ + for (i = 0; i < 17; i++) + { + u = 0; + for (j = 0; j <= i; j++) + { + u += (u16) st->h[j] * st->r[i - j]; + } + for (j = i + 1; j < 17; j++) + { + u32 v = (u16) st->h[j] * st->r[i + 17 - j]; + v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */ + u += v; + } + hr[i] = u; + } + + /* (partial) h %= p */ + poly1305_squeeze_ref8 (st->h, hr); + + m += POLY1305_REF_BLOCKSIZE; + bytes -= POLY1305_REF_BLOCKSIZE; + } + + /* burn_stack */ + return (18 + 2) * sizeof (u32) + 18 + 6 * sizeof (void *) + + 6 * sizeof (void *); +} + + +static unsigned int +poly1305_finish_ext_ref8 (void *state, const byte * m, size_t remaining, + byte mac[POLY1305_TAGLEN]) +{ + poly1305_state_ref8_t *st = (poly1305_state_ref8_t *) state; + size_t i; + unsigned int burn = 0; + + /* process the remaining block */ + if (remaining) + { + byte final[POLY1305_REF_BLOCKSIZE] = { 0 }; + for (i = 0; i < remaining; i++) + final[i] = m[i]; + final[remaining] = 1; + st->final = 1; + burn = poly1305_blocks_ref8 (st, final, POLY1305_REF_BLOCKSIZE); + } + + /* fully 
reduce h */ + poly1305_freeze_ref8 (st->h); + + /* h = (h + pad) % (1 << 128) */ + poly1305_add_ref8 (st->h, st->pad); + for (i = 0; i < 16; i++) + mac[i] = st->h[i]; + + /* zero out the state */ + for (i = 0; i < 17; i++) + st->h[i] = 0; + for (i = 0; i < 17; i++) + st->r[i] = 0; + for (i = 0; i < 17; i++) + st->pad[i] = 0; + + /* burn_stack */ + return POLY1305_REF_BLOCKSIZE + 18 + 16 * sizeof (void *) + burn; +} + + +static const poly1305_ops_t poly1305_default_ops = { + POLY1305_REF_BLOCKSIZE, + poly1305_init_ext_ref8, + poly1305_blocks_ref8, + poly1305_finish_ext_ref8 +}; + +#endif /* HAVE_U64_TYPEDEF */ + + + +static inline void * +poly1305_get_state (poly1305_context_t * ctx) +{ + byte *c = ctx->state; + c += POLY1305_STATE_ALIGNMENT - 1; + c -= (uintptr_t) c & (POLY1305_STATE_ALIGNMENT - 1); + return c; +} + + +static void +poly1305_init (poly1305_context_t * ctx, const poly1305_key_t * key) +{ + void *state = poly1305_get_state (ctx); + + ctx->leftover = 0; + + ctx->ops->init_ext (state, key); +} + + +void +_gcry_poly1305_update (poly1305_context_t * ctx, const byte * m, size_t bytes) +{ + void *state = poly1305_get_state (ctx); + unsigned int burn = 0; + size_t block_size = ctx->ops->block_size; + + /* handle leftover */ + if (ctx->leftover) + { + size_t want = (block_size - ctx->leftover); + if (want > bytes) + want = bytes; + buf_cpy (ctx->buffer + ctx->leftover, m, want); + bytes -= want; + m += want; + ctx->leftover += want; + if (ctx->leftover < block_size) + return; + burn = ctx->ops->blocks (state, ctx->buffer, block_size); + ctx->leftover = 0; + } + + /* process full blocks */ + if (bytes >= block_size) + { + size_t want = (bytes & ~(block_size - 1)); + burn = ctx->ops->blocks (state, m, want); + m += want; + bytes -= want; + } + + /* store leftover */ + if (bytes) + { + buf_cpy (ctx->buffer + ctx->leftover, m, bytes); + ctx->leftover += bytes; + } + + if (burn) + _gcry_burn_stack (burn); +} + + +void +_gcry_poly1305_finish (poly1305_context_t * ctx, byte mac[POLY1305_TAGLEN]) +{ + void *state = poly1305_get_state (ctx); + unsigned int burn; + + burn = ctx->ops->finish_ext (state, ctx->buffer, ctx->leftover, mac); + + _gcry_burn_stack (burn); +} + + +gcry_err_code_t +_gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, + size_t keylen) +{ + static int initialized; + static const char *selftest_failed; + poly1305_key_t keytmp; + + if (!initialized) + { + initialized = 1; + selftest_failed = selftest (); + if (selftest_failed) + log_error ("Poly1305 selftest failed (%s)\n", selftest_failed); + } + + if (keylen != POLY1305_KEYLEN) + return GPG_ERR_INV_KEYLEN; + + if (selftest_failed) + return GPG_ERR_SELFTEST_FAILED; + + ctx->ops = &poly1305_default_ops; + + buf_cpy (keytmp.b, key, POLY1305_KEYLEN); + poly1305_init (ctx, &keytmp); + + wipememory (&keytmp, sizeof (keytmp)); + + return 0; +} + + +static void +poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes, + const byte * key) +{ + poly1305_context_t ctx; + + memset (&ctx, 0, sizeof (ctx)); + + _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN); + _gcry_poly1305_update (&ctx, m, bytes); + _gcry_poly1305_finish (&ctx, mac); + + wipememory (&ctx, sizeof (ctx)); +} + + +static const char * +selftest (void) +{ + /* example from nacl */ + static const byte nacl_key[POLY1305_KEYLEN] = { + 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91, + 0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25, + 0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65, + 0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80, + }; + + static 
const byte nacl_msg[131] = { + 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73, + 0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce, + 0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4, + 0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a, + 0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b, + 0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72, + 0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2, + 0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38, + 0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a, + 0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae, + 0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea, + 0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda, + 0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde, + 0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3, + 0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6, + 0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74, + 0xe3, 0x55, 0xa5 + }; + + static const byte nacl_mac[16] = { + 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5, + 0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9 + }; + + /* generates a final value of (2^130 - 2) == 3 */ + static const byte wrap_key[POLY1305_KEYLEN] = { + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + static const byte wrap_msg[16] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + + static const byte wrap_mac[16] = { + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + /* mac of the macs of messages of length 0 to 256, where the key and messages + * have all their values set to the length + */ + static const byte total_key[POLY1305_KEYLEN] = { + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + + static const byte total_mac[16] = { + 0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd, + 0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39 + }; + + poly1305_context_t ctx; + poly1305_context_t total_ctx; + byte all_key[POLY1305_KEYLEN]; + byte all_msg[256]; + byte mac[16]; + size_t i, j; + + memset (&ctx, 0, sizeof (ctx)); + memset (&total_ctx, 0, sizeof (total_ctx)); + + memset (mac, 0, sizeof (mac)); + poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key); + if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) + return "Poly1305 test 1 failed."; + + /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so + * make sure everything still works varying between them */ + memset (mac, 0, sizeof (mac)); + _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN); + _gcry_poly1305_update (&ctx, nacl_msg + 0, 32); + _gcry_poly1305_update (&ctx, nacl_msg + 32, 64); + _gcry_poly1305_update (&ctx, nacl_msg + 96, 16); + _gcry_poly1305_update (&ctx, nacl_msg + 112, 8); + _gcry_poly1305_update (&ctx, nacl_msg + 120, 4); + _gcry_poly1305_update (&ctx, nacl_msg + 124, 2); + _gcry_poly1305_update (&ctx, nacl_msg + 126, 1); + _gcry_poly1305_update (&ctx, nacl_msg + 127, 1); + _gcry_poly1305_update (&ctx, nacl_msg + 128, 1); + _gcry_poly1305_update (&ctx, nacl_msg + 129, 1); + _gcry_poly1305_update (&ctx, nacl_msg + 130, 1); + _gcry_poly1305_finish (&ctx, mac); + if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) + return "Poly1305 test 2 failed."; + + memset (mac, 0, sizeof (mac)); + poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key); + if (memcmp (wrap_mac, mac, 
sizeof (nacl_mac)) != 0) + return "Poly1305 test 3 failed."; + + _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN); + for (i = 0; i < 256; i++) + { + /* set key and message to 'i,i,i..' */ + for (j = 0; j < sizeof (all_key); j++) + all_key[j] = i; + for (j = 0; j < i; j++) + all_msg[j] = i; + poly1305_auth (mac, all_msg, i, all_key); + _gcry_poly1305_update (&total_ctx, mac, 16); + } + _gcry_poly1305_finish (&total_ctx, mac); + if (memcmp (total_mac, mac, sizeof (total_mac)) != 0) + return "Poly1305 test 4 failed."; + + return NULL; +} diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index d4e9bb2..8648e96 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1341,7 +1341,9 @@ enum gcry_mac_algos GCRY_MAC_GMAC_CAMELLIA = 402, GCRY_MAC_GMAC_TWOFISH = 403, GCRY_MAC_GMAC_SERPENT = 404, - GCRY_MAC_GMAC_SEED = 405 + GCRY_MAC_GMAC_SEED = 405, + + GCRY_MAC_POLY1305 = 501 }; /* Flags used with the open function. */ diff --git a/tests/basic.c b/tests/basic.c index 406d82d..cc0f4c1 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -5041,6 +5041,8 @@ check_mac (void) const char *key; const char *expect; const char *iv; + unsigned int dlen; + unsigned int klen; } algos[] = { { GCRY_MAC_HMAC_MD5, "what do ya want for nothing?", "Jefe", @@ -5491,6 +5493,109 @@ check_mac (void) "\xc9\xfc\xa7\x29\xab\x60\xad\xa0", "\x20\x4b\xdb\x1b\xd6\x21\x54\xbf\x08\x92\x2a\xaa\x54\xee\xd7\x05", "\x05\xad\x13\xa5\xe2\xc2\xab\x66\x7e\x1a\x6f\xbc" }, + /* from NaCl */ + { GCRY_MAC_POLY1305, + "\x8e\x99\x3b\x9f\x48\x68\x12\x73\xc2\x96\x50\xba\x32\xfc\x76\xce" + "\x48\x33\x2e\xa7\x16\x4d\x96\xa4\x47\x6f\xb8\xc5\x31\xa1\x18\x6a" + "\xc0\xdf\xc1\x7c\x98\xdc\xe8\x7b\x4d\xa7\xf0\x11\xec\x48\xc9\x72" + "\x71\xd2\xc2\x0f\x9b\x92\x8f\xe2\x27\x0d\x6f\xb8\x63\xd5\x17\x38" + "\xb4\x8e\xee\xe3\x14\xa7\xcc\x8a\xb9\x32\x16\x45\x48\xe5\x26\xae" + "\x90\x22\x43\x68\x51\x7a\xcf\xea\xbd\x6b\xb3\x73\x2b\xc0\xe9\xda" + "\x99\x83\x2b\x61\xca\x01\xb6\xde\x56\x24\x4a\x9e\x88\xd5\xf9\xb3" + "\x79\x73\xf6\x22\xa4\x3d\x14\xa6\x59\x9b\x1f\x65\x4c\xb4\x5a\x74" + "\xe3\x55\xa5", + "\xee\xa6\xa7\x25\x1c\x1e\x72\x91\x6d\x11\xc2\xcb\x21\x4d\x3c\x25" + "\x25\x39\x12\x1d\x8e\x23\x4e\x65\x2d\x65\x1f\xa4\xc8\xcf\xf8\x80", + "\xf3\xff\xc7\x70\x3f\x94\x00\xe5\x2a\x7d\xfb\x4b\x3d\x33\x05\xd9" }, + /* from draft-nir-cfrg-chacha20-poly1305-03 */ + { GCRY_MAC_POLY1305, + "Cryptographic Forum Research Group", + "\x85\xd6\xbe\x78\x57\x55\x6d\x33\x7f\x44\x52\xfe\x42\xd5\x06\xa8" + "\x01\x03\x80\x8a\xfb\x0d\xb2\xfd\x4a\xbf\xf6\xaf\x41\x49\xf5\x1b", + "\xa8\x06\x1d\xc1\x30\x51\x36\xc6\xc2\x2b\x8b\xaf\x0c\x01\x27\xa9" }, + { GCRY_MAC_POLY1305, + "'Twas brillig, and the slithy toves\n" + "Did gyre and gimble in the wabe:\n" + "All mimsy were the borogoves,\n" + "And the mome raths outgrabe.", + "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a\xf3\x33\x88\x86\x04\xf6\xb5\xf0" + "\x47\x39\x17\xc1\x40\x2b\x80\x09\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + "\x45\x41\x66\x9a\x7e\xaa\xee\x61\xe7\x08\xdc\x7c\xbc\xc5\xeb\x62" }, + { GCRY_MAC_POLY1305, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + 
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + NULL, + 191, 32 }, + { GCRY_MAC_POLY1305, + "Any submission to the IETF intended by the Contributor for " + "publication as all or part of an IETF Internet-Draft or RFC and " + "any statement made within the context of an IETF activity is " + "considered an \"IETF Contribution\". Such statements include " + "oral statements in IETF sessions, as well as written and " + "electronic communications made at any time or place, which are " + "addressed to", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x36\xe5\xf6\xb5\xc5\xe0\x60\x70\xf0\xef\xca\x96\x22\x7a\x86\x3e", + "\x36\xe5\xf6\xb5\xc5\xe0\x60\x70\xf0\xef\xca\x96\x22\x7a\x86\x3e", + NULL, + 0, 32 }, + { GCRY_MAC_POLY1305, + "Any submission to the IETF intended by the Contributor for " + "publication as all or part of an IETF Internet-Draft or RFC and " + "any statement made within the context of an IETF activity is " + "considered an \"IETF Contribution\". Such statements include " + "oral statements in IETF sessions, as well as written and " + "electronic communications made at any time or place, which are " + "addressed to", + "\x36\xe5\xf6\xb5\xc5\xe0\x60\x70\xf0\xef\xca\x96\x22\x7a\x86\x3e" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\xf3\x47\x7e\x7c\xd9\x54\x17\xaf\x89\xa6\xb8\x79\x4c\x31\x0c\xf0", + NULL, + 0, 32 }, + /* from http://cr.yp.to/mac/poly1305-20050329.pdf */ + { GCRY_MAC_POLY1305, + "\xf3\xf6", + "\x85\x1f\xc4\x0c\x34\x67\xac\x0b\xe0\x5c\xc2\x04\x04\xf3\xf7\x00" + "\x58\x0b\x3b\x0f\x94\x47\xbb\x1e\x69\xd0\x95\xb5\x92\x8b\x6d\xbc", + "\xf4\xc6\x33\xc3\x04\x4f\xc1\x45\xf8\x4f\x33\x5c\xb8\x19\x53\xde", + NULL, + 0, 32 }, + { GCRY_MAC_POLY1305, + "", + "\xa0\xf3\x08\x00\x00\xf4\x64\x00\xd0\xc7\xe9\x07\x6c\x83\x44\x03" + "\xdd\x3f\xab\x22\x51\xf1\x1a\xc7\x59\xf0\x88\x71\x29\xcc\x2e\xe7", + "\xdd\x3f\xab\x22\x51\xf1\x1a\xc7\x59\xf0\x88\x71\x29\xcc\x2e\xe7", + NULL, + 0, 32 }, + { GCRY_MAC_POLY1305, + "\x66\x3c\xea\x19\x0f\xfb\x83\xd8\x95\x93\xf3\xf4\x76\xb6\xbc\x24" + "\xd7\xe6\x79\x10\x7e\xa2\x6a\xdb\x8c\xaf\x66\x52\xd0\x65\x61\x36", + "\x48\x44\x3d\x0b\xb0\xd2\x11\x09\xc8\x9a\x10\x0b\x5c\xe2\xc2\x08" + "\x83\x14\x9c\x69\xb5\x61\xdd\x88\x29\x8a\x17\x98\xb1\x07\x16\xef", + "\x0e\xe1\xc1\x6b\xb7\x3f\x0f\x4f\xd1\x98\x81\x75\x3c\x01\xcd\xbe", + NULL, + 0, 32 }, + { GCRY_MAC_POLY1305, + "\xab\x08\x12\x72\x4a\x7f\x1e\x34\x27\x42\xcb\xed\x37\x4d\x94\xd1" + "\x36\xc6\xb8\x79\x5d\x45\xb3\x81\x98\x30\xf2\xc0\x44\x91\xfa\xf0" + "\x99\x0c\x62\xe4\x8b\x80\x18\xb2\xc3\xe4\xa0\xfa\x31\x34\xcb\x67" + "\xfa\x83\xe1\x58\xc9\x94\xd9\x61\xc4\xcb\x21\x09\x5c\x1b\xf9", + "\x12\x97\x6a\x08\xc4\x42\x6d\x0c\xe8\xa8\x24\x07\xc4\xf4\x82\x07" + "\x80\xf8\xc2\x0a\xa7\x12\x02\xd1\xe2\x91\x79\xcb\xcb\x55\x5a\x57", + "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b" }, { 0 }, }; int i; @@ -5500,6 +5605,8 @@ check_mac (void) for (i = 0; algos[i].algo; i++) { + size_t klen, dlen; + if (gcry_mac_test_algo (algos[i].algo)) { show_mac_not_available 
(algos[i].algo); @@ -5520,13 +5627,14 @@ check_mac (void) algos[i].algo, (int)strlen(algos[i].key), (int)strlen(algos[i].data)); - check_one_mac (algos[i].algo, algos[i].data, strlen (algos[i].data), - algos[i].key, strlen(algos[i].key), algos[i].iv, - algos[i].iv ? strlen(algos[i].iv) : 0, + klen = algos[i].klen ? algos[i].klen : strlen(algos[i].key); + dlen = algos[i].dlen ? algos[i].dlen : strlen (algos[i].data); + + check_one_mac (algos[i].algo, algos[i].data, dlen, algos[i].key, klen, + algos[i].iv, algos[i].iv ? strlen(algos[i].iv) : 0, algos[i].expect, 0); - check_one_mac (algos[i].algo, algos[i].data, strlen (algos[i].data), - algos[i].key, strlen(algos[i].key), algos[i].iv, - algos[i].iv ? strlen(algos[i].iv) : 0, + check_one_mac (algos[i].algo, algos[i].data, dlen, algos[i].key, klen, + algos[i].iv, algos[i].iv ? strlen(algos[i].iv) : 0, algos[i].expect, 1); } diff --git a/tests/bench-slope.c b/tests/bench-slope.c index bd05064..3d8ae37 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -1405,7 +1405,7 @@ mac_bench (char **argv, int argc) } else { - for (i = 1; i < 500; i++) + for (i = 1; i < 600; i++) if (!gcry_mac_test_algo (i)) _mac_bench (i); } diff --git a/tests/benchmark.c b/tests/benchmark.c index 5efc083..9fd716d 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -461,7 +461,7 @@ mac_bench ( const char *algoname ) if (!algoname) { - for (i=1; i < 500; i++) + for (i=1; i < 600; i++) if (in_fips_mode && i == GCRY_MAC_HMAC_MD5) ; /* Don't use MD5 in fips mode. */ else if ( !gcry_mac_test_algo (i) ) ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 5 +- cipher/cipher-internal.h | 38 +++ cipher/cipher-poly1305.c | 296 +++++++++++++++++ cipher/cipher.c | 51 ++- cipher/mac-internal.h | 32 +- cipher/mac-poly1305.c | 359 +++++++++++++++++++++ cipher/mac.c | 6 + cipher/poly1305-internal.h | 93 ++++++ cipher/poly1305.c | 766 ++++++++++++++++++++++++++++++++++++++++++++ src/gcrypt.h.in | 30 +- tests/basic.c | 472 ++++++++++++++++++++++++++- tests/bench-slope.c | 123 ++++++- tests/benchmark.c | 14 +- 13 files changed, 2244 insertions(+), 41 deletions(-) create mode 100644 cipher/cipher-poly1305.c create mode 100644 cipher/mac-poly1305.c create mode 100644 cipher/poly1305-internal.h create mode 100644 cipher/poly1305.c hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Sat May 17 16:38:55 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 17 May 2014 17:38:55 +0300 Subject: [PATCH] chacha20: add SSE2/AMD64 optimized implementation Message-ID: <20140517143855.17845.12795.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'chacha20-sse2-amd64.S'. * cipher/chacha20-sse2-amd64.S: New. * cipher/chacha20.c (USE_SSE2): New. [USE_SSE2] (_gcry_chacha20_amd64_sse2_blocks): New. (chacha20_do_setkey) [USE_SSE2]: Use SSE2 implementation for blocks function. * configure.ac [host=x86-64]: Add 'chacha20-sse2-amd64.lo'. -- Add Andrew Moon's public domain SSE2 implementation of ChaCha20. 
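All ChaCha20 block backends share one entry-point shape; as a sketch (the prototype is the one declared in the chacha20.c hunk below, while the behavioural notes are inferred from the generic chacha20_blocks code and from the assembly):

  /* Write 'bytes' bytes of keystream (or keystream XORed with 'in',
   * when 'in' is non-NULL) to 'out' and advance the block counter kept
   * in 'state'; returns a stack-burn estimate for _gcry_burn_stack().
   * The generic C chacha20_blocks requires a non-zero multiple of 64
   * bytes, whereas this assembly version also buffers a trailing
   * partial block on the stack (see its copyinput/copyoutput paths). */
  unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in,
                                                byte *out, size_t bytes);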
Original source is available at: https://github.com/floodyberry/chacha-opt Benchmark on Intel i5-4570 (haswell), with "--disable-hwf intel-avx2 --disable-hwf intel-ssse3": Old: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 1.97 ns/B 483.8 MiB/s 6.31 c/B STREAM dec | 1.97 ns/B 483.6 MiB/s 6.31 c/B New: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.931 ns/B 1024.7 MiB/s 2.98 c/B STREAM dec | 0.930 ns/B 1025.0 MiB/s 2.98 c/B Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/chacha20-sse2-amd64.S | 652 ++++++++++++++++++++++++++++++++++++++++++ cipher/chacha20.c | 18 + configure.ac | 1 4 files changed, 672 insertions(+), 1 deletion(-) create mode 100644 cipher/chacha20-sse2-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 19b0097..8a3bd19 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -60,7 +60,7 @@ EXTRA_libcipher_la_SOURCES = \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ -chacha20.c chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ +chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20-sse2-amd64.S b/cipher/chacha20-sse2-amd64.S new file mode 100644 index 0000000..4811f40 --- /dev/null +++ b/cipher/chacha20-sse2-amd64.S @@ -0,0 +1,652 @@ +/* chacha20-sse2-amd64.S - AMD64/SSE2 implementation of ChaCha20 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/chacha-opt + */ + +#ifdef __x86_64__ +#include + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && USE_CHACHA20 + +.text + +.align 8 +.globl _gcry_chacha20_amd64_sse2_blocks +.type _gcry_chacha20_amd64_sse2_blocks, at function; +_gcry_chacha20_amd64_sse2_blocks: +.Lchacha_blocks_sse2_local: + pushq %rbx + pushq %rbp + movq %rsp, %rbp + andq $~63, %rsp + subq $512, %rsp + movdqu (%rdi), %xmm8 + movdqu 16(%rdi), %xmm9 + movdqu 32(%rdi), %xmm10 + movdqu 48(%rdi), %xmm11 + movq $20, %rax + movq $1, %r9 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movq %rax, 64(%rsp) + cmpq $256, %rcx + jb .Lchacha_blocks_sse2_below256 + pshufd $0x00, %xmm8, %xmm0 + pshufd $0x55, %xmm8, %xmm1 + pshufd $0xaa, %xmm8, %xmm2 + pshufd $0xff, %xmm8, %xmm3 + movdqa %xmm0, 128(%rsp) + movdqa %xmm1, 144(%rsp) + movdqa %xmm2, 160(%rsp) + movdqa %xmm3, 176(%rsp) + pshufd $0x00, %xmm9, %xmm0 + pshufd $0x55, %xmm9, %xmm1 + pshufd $0xaa, %xmm9, %xmm2 + pshufd $0xff, %xmm9, %xmm3 + movdqa %xmm0, 192(%rsp) + movdqa %xmm1, 208(%rsp) + movdqa %xmm2, 224(%rsp) + movdqa %xmm3, 240(%rsp) + pshufd $0x00, %xmm10, %xmm0 + pshufd $0x55, %xmm10, %xmm1 + pshufd $0xaa, %xmm10, %xmm2 + pshufd $0xff, %xmm10, %xmm3 + movdqa %xmm0, 256(%rsp) + movdqa %xmm1, 272(%rsp) + movdqa %xmm2, 288(%rsp) + movdqa %xmm3, 304(%rsp) + pshufd $0xaa, %xmm11, %xmm0 + pshufd $0xff, %xmm11, %xmm1 + movdqa %xmm0, 352(%rsp) + movdqa %xmm1, 368(%rsp) + jmp .Lchacha_blocks_sse2_atleast256 +.p2align 6,,63 +.Lchacha_blocks_sse2_atleast256: + movq 48(%rsp), %rax + leaq 1(%rax), %r8 + leaq 2(%rax), %r9 + leaq 3(%rax), %r10 + leaq 4(%rax), %rbx + movl %eax, 320(%rsp) + movl %r8d, 4+320(%rsp) + movl %r9d, 8+320(%rsp) + movl %r10d, 12+320(%rsp) + shrq $32, %rax + shrq $32, %r8 + shrq $32, %r9 + shrq $32, %r10 + movl %eax, 336(%rsp) + movl %r8d, 4+336(%rsp) + movl %r9d, 8+336(%rsp) + movl %r10d, 12+336(%rsp) + movq %rbx, 48(%rsp) + movq 64(%rsp), %rax + movdqa 128(%rsp), %xmm0 + movdqa 144(%rsp), %xmm1 + movdqa 160(%rsp), %xmm2 + movdqa 176(%rsp), %xmm3 + movdqa 192(%rsp), %xmm4 + movdqa 208(%rsp), %xmm5 + movdqa 224(%rsp), %xmm6 + movdqa 240(%rsp), %xmm7 + movdqa 256(%rsp), %xmm8 + movdqa 272(%rsp), %xmm9 + movdqa 288(%rsp), %xmm10 + movdqa 304(%rsp), %xmm11 + movdqa 320(%rsp), %xmm12 + movdqa 336(%rsp), %xmm13 + movdqa 352(%rsp), %xmm14 + movdqa 368(%rsp), %xmm15 +.Lchacha_blocks_sse2_mainloop1: + paddd %xmm4, %xmm0 + paddd %xmm5, %xmm1 + pxor %xmm0, %xmm12 + pxor %xmm1, %xmm13 + paddd %xmm6, %xmm2 + paddd %xmm7, %xmm3 + movdqa %xmm6, 96(%rsp) + pxor %xmm2, %xmm14 + pxor %xmm3, %xmm15 + pshuflw $0xb1,%xmm12,%xmm12 + pshufhw $0xb1,%xmm12,%xmm12 + pshuflw $0xb1,%xmm13,%xmm13 + pshufhw $0xb1,%xmm13,%xmm13 + pshuflw $0xb1,%xmm14,%xmm14 + pshufhw $0xb1,%xmm14,%xmm14 + pshuflw $0xb1,%xmm15,%xmm15 + pshufhw $0xb1,%xmm15,%xmm15 + paddd %xmm12, %xmm8 + paddd %xmm13, %xmm9 + paddd %xmm14, %xmm10 + paddd %xmm15, %xmm11 + movdqa %xmm12, 112(%rsp) + pxor %xmm8, %xmm4 + pxor %xmm9, %xmm5 + movdqa 96(%rsp), %xmm6 + movdqa %xmm4, %xmm12 + pslld $ 12, %xmm4 + psrld $20, %xmm12 + pxor %xmm12, %xmm4 + movdqa %xmm5, %xmm12 + pslld $ 12, %xmm5 + psrld $20, %xmm12 + pxor %xmm12, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + movdqa %xmm6, %xmm12 + pslld $ 12, %xmm6 + psrld $20, %xmm12 + pxor %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + pslld $ 12, %xmm7 + psrld $20, %xmm12 + pxor %xmm12, %xmm7 + movdqa 112(%rsp), %xmm12 + paddd 
%xmm4, %xmm0 + paddd %xmm5, %xmm1 + pxor %xmm0, %xmm12 + pxor %xmm1, %xmm13 + paddd %xmm6, %xmm2 + paddd %xmm7, %xmm3 + movdqa %xmm6, 96(%rsp) + pxor %xmm2, %xmm14 + pxor %xmm3, %xmm15 + movdqa %xmm12, %xmm6 + pslld $ 8, %xmm12 + psrld $24, %xmm6 + pxor %xmm6, %xmm12 + movdqa %xmm13, %xmm6 + pslld $ 8, %xmm13 + psrld $24, %xmm6 + pxor %xmm6, %xmm13 + paddd %xmm12, %xmm8 + paddd %xmm13, %xmm9 + movdqa %xmm14, %xmm6 + pslld $ 8, %xmm14 + psrld $24, %xmm6 + pxor %xmm6, %xmm14 + movdqa %xmm15, %xmm6 + pslld $ 8, %xmm15 + psrld $24, %xmm6 + pxor %xmm6, %xmm15 + paddd %xmm14, %xmm10 + paddd %xmm15, %xmm11 + movdqa %xmm12, 112(%rsp) + pxor %xmm8, %xmm4 + pxor %xmm9, %xmm5 + movdqa 96(%rsp), %xmm6 + movdqa %xmm4, %xmm12 + pslld $ 7, %xmm4 + psrld $25, %xmm12 + pxor %xmm12, %xmm4 + movdqa %xmm5, %xmm12 + pslld $ 7, %xmm5 + psrld $25, %xmm12 + pxor %xmm12, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + movdqa %xmm6, %xmm12 + pslld $ 7, %xmm6 + psrld $25, %xmm12 + pxor %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + pslld $ 7, %xmm7 + psrld $25, %xmm12 + pxor %xmm12, %xmm7 + movdqa 112(%rsp), %xmm12 + paddd %xmm5, %xmm0 + paddd %xmm6, %xmm1 + pxor %xmm0, %xmm15 + pxor %xmm1, %xmm12 + paddd %xmm7, %xmm2 + paddd %xmm4, %xmm3 + movdqa %xmm7, 96(%rsp) + pxor %xmm2, %xmm13 + pxor %xmm3, %xmm14 + pshuflw $0xb1,%xmm15,%xmm15 + pshufhw $0xb1,%xmm15,%xmm15 + pshuflw $0xb1,%xmm12,%xmm12 + pshufhw $0xb1,%xmm12,%xmm12 + pshuflw $0xb1,%xmm13,%xmm13 + pshufhw $0xb1,%xmm13,%xmm13 + pshuflw $0xb1,%xmm14,%xmm14 + pshufhw $0xb1,%xmm14,%xmm14 + paddd %xmm15, %xmm10 + paddd %xmm12, %xmm11 + paddd %xmm13, %xmm8 + paddd %xmm14, %xmm9 + movdqa %xmm15, 112(%rsp) + pxor %xmm10, %xmm5 + pxor %xmm11, %xmm6 + movdqa 96(%rsp), %xmm7 + movdqa %xmm5, %xmm15 + pslld $ 12, %xmm5 + psrld $20, %xmm15 + pxor %xmm15, %xmm5 + movdqa %xmm6, %xmm15 + pslld $ 12, %xmm6 + psrld $20, %xmm15 + pxor %xmm15, %xmm6 + pxor %xmm8, %xmm7 + pxor %xmm9, %xmm4 + movdqa %xmm7, %xmm15 + pslld $ 12, %xmm7 + psrld $20, %xmm15 + pxor %xmm15, %xmm7 + movdqa %xmm4, %xmm15 + pslld $ 12, %xmm4 + psrld $20, %xmm15 + pxor %xmm15, %xmm4 + movdqa 112(%rsp), %xmm15 + paddd %xmm5, %xmm0 + paddd %xmm6, %xmm1 + pxor %xmm0, %xmm15 + pxor %xmm1, %xmm12 + paddd %xmm7, %xmm2 + paddd %xmm4, %xmm3 + movdqa %xmm7, 96(%rsp) + pxor %xmm2, %xmm13 + pxor %xmm3, %xmm14 + movdqa %xmm15, %xmm7 + pslld $ 8, %xmm15 + psrld $24, %xmm7 + pxor %xmm7, %xmm15 + movdqa %xmm12, %xmm7 + pslld $ 8, %xmm12 + psrld $24, %xmm7 + pxor %xmm7, %xmm12 + paddd %xmm15, %xmm10 + paddd %xmm12, %xmm11 + movdqa %xmm13, %xmm7 + pslld $ 8, %xmm13 + psrld $24, %xmm7 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pslld $ 8, %xmm14 + psrld $24, %xmm7 + pxor %xmm7, %xmm14 + paddd %xmm13, %xmm8 + paddd %xmm14, %xmm9 + movdqa %xmm15, 112(%rsp) + pxor %xmm10, %xmm5 + pxor %xmm11, %xmm6 + movdqa 96(%rsp), %xmm7 + movdqa %xmm5, %xmm15 + pslld $ 7, %xmm5 + psrld $25, %xmm15 + pxor %xmm15, %xmm5 + movdqa %xmm6, %xmm15 + pslld $ 7, %xmm6 + psrld $25, %xmm15 + pxor %xmm15, %xmm6 + pxor %xmm8, %xmm7 + pxor %xmm9, %xmm4 + movdqa %xmm7, %xmm15 + pslld $ 7, %xmm7 + psrld $25, %xmm15 + pxor %xmm15, %xmm7 + movdqa %xmm4, %xmm15 + pslld $ 7, %xmm4 + psrld $25, %xmm15 + pxor %xmm15, %xmm4 + movdqa 112(%rsp), %xmm15 + subq $2, %rax + jnz .Lchacha_blocks_sse2_mainloop1 + paddd 128(%rsp), %xmm0 + paddd 144(%rsp), %xmm1 + paddd 160(%rsp), %xmm2 + paddd 176(%rsp), %xmm3 + paddd 192(%rsp), %xmm4 + paddd 208(%rsp), %xmm5 + paddd 224(%rsp), %xmm6 + paddd 240(%rsp), %xmm7 + paddd 256(%rsp), %xmm8 + paddd 272(%rsp), %xmm9 + paddd 288(%rsp), 
%xmm10 + paddd 304(%rsp), %xmm11 + paddd 320(%rsp), %xmm12 + paddd 336(%rsp), %xmm13 + paddd 352(%rsp), %xmm14 + paddd 368(%rsp), %xmm15 + movdqa %xmm8, 384(%rsp) + movdqa %xmm9, 400(%rsp) + movdqa %xmm10, 416(%rsp) + movdqa %xmm11, 432(%rsp) + movdqa %xmm12, 448(%rsp) + movdqa %xmm13, 464(%rsp) + movdqa %xmm14, 480(%rsp) + movdqa %xmm15, 496(%rsp) + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm8, %xmm5 + movdqa %xmm10, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm1 + punpcklqdq %xmm6, %xmm3 + punpcklqdq %xmm9, %xmm5 + punpcklqdq %xmm11, %xmm7 + andq %rsi, %rsi + jz .Lchacha_blocks_sse2_noinput1 + movdqu 0(%rsi), %xmm2 + movdqu 16(%rsi), %xmm6 + movdqu 64(%rsi), %xmm9 + movdqu 80(%rsi), %xmm11 + movdqu 128(%rsi), %xmm12 + movdqu 144(%rsi), %xmm13 + movdqu 192(%rsi), %xmm14 + movdqu 208(%rsi), %xmm15 + pxor %xmm2, %xmm5 + pxor %xmm6, %xmm7 + pxor %xmm9, %xmm8 + pxor %xmm11, %xmm10 + pxor %xmm12, %xmm1 + pxor %xmm13, %xmm3 + pxor %xmm14, %xmm0 + pxor %xmm15, %xmm4 + movdqu %xmm5, 0(%rdx) + movdqu %xmm7, 16(%rdx) + movdqu %xmm8, 64(%rdx) + movdqu %xmm10, 80(%rdx) + movdqu %xmm1, 128(%rdx) + movdqu %xmm3, 144(%rdx) + movdqu %xmm0, 192(%rdx) + movdqu %xmm4, 208(%rdx) + movdqa 384(%rsp), %xmm0 + movdqa 400(%rsp), %xmm1 + movdqa 416(%rsp), %xmm2 + movdqa 432(%rsp), %xmm3 + movdqa 448(%rsp), %xmm4 + movdqa 464(%rsp), %xmm5 + movdqa 480(%rsp), %xmm6 + movdqa 496(%rsp), %xmm7 + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + movdqa %xmm8, %xmm1 + movdqa %xmm0, %xmm3 + movdqa %xmm10, %xmm5 + movdqa %xmm4, %xmm7 + punpcklqdq %xmm9, %xmm1 + punpcklqdq %xmm11, %xmm5 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm3 + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm6 + movdqu 96(%rsi), %xmm9 + movdqu 112(%rsi), %xmm11 + movdqu 160(%rsi), %xmm12 + movdqu 176(%rsi), %xmm13 + movdqu 224(%rsi), %xmm14 + movdqu 240(%rsi), %xmm15 + pxor %xmm2, %xmm1 + pxor %xmm6, %xmm5 + pxor %xmm9, %xmm8 + pxor %xmm11, %xmm10 + pxor %xmm12, %xmm3 + pxor %xmm13, %xmm7 + pxor %xmm14, %xmm0 + pxor %xmm15, %xmm4 + movdqu %xmm1, 32(%rdx) + movdqu %xmm5, 48(%rdx) + movdqu %xmm8, 96(%rdx) + movdqu %xmm10, 112(%rdx) + movdqu %xmm3, 160(%rdx) + movdqu %xmm7, 176(%rdx) + movdqu %xmm0, 224(%rdx) + movdqu %xmm4, 240(%rdx) + addq $256, %rsi + jmp .Lchacha_blocks_sse2_mainloop_cont +.Lchacha_blocks_sse2_noinput1: + movdqu %xmm5, 0(%rdx) + movdqu %xmm7, 16(%rdx) + movdqu %xmm8, 64(%rdx) + movdqu %xmm10, 80(%rdx) + movdqu %xmm1, 128(%rdx) + movdqu %xmm3, 144(%rdx) + movdqu %xmm0, 192(%rdx) + movdqu %xmm4, 208(%rdx) + movdqa 384(%rsp), %xmm0 + movdqa 400(%rsp), %xmm1 + movdqa 416(%rsp), %xmm2 + movdqa 432(%rsp), %xmm3 + movdqa 448(%rsp), %xmm4 + movdqa 464(%rsp), %xmm5 + movdqa 480(%rsp), %xmm6 + movdqa 496(%rsp), %xmm7 + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + 
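/* No-input path: transpose the second half of the state words (sums
   8..15 of the four parallel blocks, just reloaded from the stack)
   back into per-block order before storing the raw keystream. */ +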
punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + movdqa %xmm8, %xmm1 + movdqa %xmm0, %xmm3 + movdqa %xmm10, %xmm5 + movdqa %xmm4, %xmm7 + punpcklqdq %xmm9, %xmm1 + punpcklqdq %xmm11, %xmm5 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm3 + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + movdqu %xmm1, 32(%rdx) + movdqu %xmm5, 48(%rdx) + movdqu %xmm8, 96(%rdx) + movdqu %xmm10, 112(%rdx) + movdqu %xmm3, 160(%rdx) + movdqu %xmm7, 176(%rdx) + movdqu %xmm0, 224(%rdx) + movdqu %xmm4, 240(%rdx) +.Lchacha_blocks_sse2_mainloop_cont: + addq $256, %rdx + subq $256, %rcx + cmp $256, %rcx + jae .Lchacha_blocks_sse2_atleast256 + movdqa 0(%rsp), %xmm8 + movdqa 16(%rsp), %xmm9 + movdqa 32(%rsp), %xmm10 + movdqa 48(%rsp), %xmm11 + movq $1, %r9 +.Lchacha_blocks_sse2_below256: + movq %r9, %xmm5 + andq %rcx, %rcx + jz .Lchacha_blocks_sse2_done + cmpq $64, %rcx + jae .Lchacha_blocks_sse2_above63 + movq %rdx, %r9 + andq %rsi, %rsi + jz .Lchacha_blocks_sse2_noinput2 + movq %rcx, %r10 + movq %rsp, %rdx + addq %r10, %rsi + addq %r10, %rdx + negq %r10 +.Lchacha_blocks_sse2_copyinput: + movb (%rsi, %r10), %al + movb %al, (%rdx, %r10) + incq %r10 + jnz .Lchacha_blocks_sse2_copyinput + movq %rsp, %rsi +.Lchacha_blocks_sse2_noinput2: + movq %rsp, %rdx +.Lchacha_blocks_sse2_above63: + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + movq 64(%rsp), %rax +.Lchacha_blocks_sse2_mainloop2: + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshuflw $0xb1,%xmm3,%xmm3 + pshufhw $0xb1,%xmm3,%xmm3 + paddd %xmm3, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm1,%xmm4 + pslld $12, %xmm1 + psrld $20, %xmm4 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + movdqa %xmm3,%xmm4 + pslld $8, %xmm3 + psrld $24, %xmm4 + pshufd $0x93,%xmm0,%xmm0 + pxor %xmm4, %xmm3 + paddd %xmm3, %xmm2 + pshufd $0x4e,%xmm3,%xmm3 + pxor %xmm2, %xmm1 + pshufd $0x39,%xmm2,%xmm2 + movdqa %xmm1,%xmm4 + pslld $7, %xmm1 + psrld $25, %xmm4 + pxor %xmm4, %xmm1 + subq $2, %rax + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshuflw $0xb1,%xmm3,%xmm3 + pshufhw $0xb1,%xmm3,%xmm3 + paddd %xmm3, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm1,%xmm4 + pslld $12, %xmm1 + psrld $20, %xmm4 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + movdqa %xmm3,%xmm4 + pslld $8, %xmm3 + psrld $24, %xmm4 + pshufd $0x39,%xmm0,%xmm0 + pxor %xmm4, %xmm3 + paddd %xmm3, %xmm2 + pshufd $0x4e,%xmm3,%xmm3 + pxor %xmm2, %xmm1 + pshufd $0x93,%xmm2,%xmm2 + movdqa %xmm1,%xmm4 + pslld $7, %xmm1 + psrld $25, %xmm4 + pxor %xmm4, %xmm1 + jnz .Lchacha_blocks_sse2_mainloop2 + paddd %xmm8, %xmm0 + paddd %xmm9, %xmm1 + paddd %xmm10, %xmm2 + paddd %xmm11, %xmm3 + andq %rsi, %rsi + jz .Lchacha_blocks_sse2_noinput3 + movdqu 0(%rsi), %xmm12 + movdqu 16(%rsi), %xmm13 + movdqu 32(%rsi), %xmm14 + movdqu 48(%rsi), %xmm15 + pxor %xmm12, %xmm0 + pxor %xmm13, %xmm1 + pxor %xmm14, %xmm2 + pxor %xmm15, %xmm3 + addq $64, %rsi +.Lchacha_blocks_sse2_noinput3: + movdqu %xmm0, 0(%rdx) + movdqu %xmm1, 16(%rdx) + movdqu %xmm2, 32(%rdx) + movdqu %xmm3, 48(%rdx) + paddq %xmm5, %xmm11 + cmpq $64, %rcx + jbe .Lchacha_blocks_sse2_mainloop2_finishup + addq $64, %rdx + subq $64, %rcx + jmp .Lchacha_blocks_sse2_below256 +.Lchacha_blocks_sse2_mainloop2_finishup: + cmpq $64, %rcx + je .Lchacha_blocks_sse2_done + addq %rcx, %r9 + addq %rcx, %rdx + negq %rcx 
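+/* Fewer than 64 bytes were requested: the block was generated into
+ * the stack scratch buffer, so copy only the remaining %rcx bytes to
+ * the caller's output pointer saved in %r9. */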
+.Lchacha_blocks_sse2_copyoutput: + movb (%rdx, %rcx), %al + movb %al, (%r9, %rcx) + incq %rcx + jnz .Lchacha_blocks_sse2_copyoutput +.Lchacha_blocks_sse2_done: + movdqu %xmm11, 48(%rdi) + movq %rbp, %rsp + pxor %xmm15, %xmm15 + pxor %xmm7, %xmm7 + pxor %xmm14, %xmm14 + pxor %xmm6, %xmm6 + pxor %xmm13, %xmm13 + pxor %xmm5, %xmm5 + pxor %xmm12, %xmm12 + pxor %xmm4, %xmm4 + popq %rbp + popq %rbx + movl $(63 + 512 + 16), %eax + pxor %xmm11, %xmm11 + pxor %xmm3, %xmm3 + pxor %xmm10, %xmm10 + pxor %xmm2, %xmm2 + pxor %xmm9, %xmm9 + pxor %xmm1, %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm0, %xmm0 + ret +.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks; + +#endif /*defined(USE_CHACHA20)*/ +#endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index e2cf442..03416d4 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -47,6 +47,12 @@ #define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ #define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4) +/* USE_SSE2 indicates whether to compile with Intel SSE2 code. */ +#undef USE_SSE2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define USE_SSE2 1 +#endif + /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ @@ -77,6 +83,13 @@ typedef struct CHACHA20_context_s } CHACHA20_context_t; +#ifdef USE_SSE2 + +unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in, + byte *out, size_t bytes); + +#endif /* USE_SSE2 */ + #ifdef USE_SSSE3 unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in, @@ -323,7 +336,12 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; +#ifdef USE_SSE2 + ctx->blocks = _gcry_chacha20_amd64_sse2_blocks; +#else ctx->blocks = chacha20_blocks; +#endif + #ifdef USE_SSSE3 if (features & HWF_INTEL_SSSE3) ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks; diff --git a/configure.ac b/configure.ac index 47a322b..c5952c7 100644 --- a/configure.ac +++ b/configure.ac @@ -1815,6 +1815,7 @@ if test "$found" = "1" ; then case "${host}" in x86_64-*-*) # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-sse2-amd64.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo" ;; From jussi.kivilinna at iki.fi Sat May 17 17:42:56 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 17 May 2014 18:42:56 +0300 Subject: [PATCH] Add Poly1305 to documentation Message-ID: <20140517154256.21505.88570.stgit@localhost6.localdomain6> * doc/gcrypt.texi: Add documentation for Poly1305 MACs and AEAD mode. -- Signed-off-by: Jussi Kivilinna --- doc/gcrypt.texi | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index d202b8b..ab89cbe 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1629,6 +1629,11 @@ Galois/Counter Mode (GCM) is an Authenticated Encryption with Associated Data (AEAD) block cipher mode, which is specified in 'NIST Special Publication 800-38D'. + at item GCRY_CIPHER_MODE_POLY1305 + at cindex Poly1305 based AEAD mode +Poly1305 is an Authenticated Encryption with Associated Data (AEAD) +block cipher mode. + @end table @node Working with cipher handles @@ -1655,12 +1660,13 @@ The cipher mode to use must be specified via @var{mode}. 
See @xref{Available cipher modes}, for a list of supported cipher modes and the according constants. Note that some modes are incompatible with some algorithms - in particular, stream mode -(@code{GCRY_CIPHER_MODE_STREAM}) only works with stream ciphers. The -block cipher modes (@code{GCRY_CIPHER_MODE_ECB}, +(@code{GCRY_CIPHER_MODE_STREAM}) only works with stream ciphers. +Poly1305 AEAD mode (@code{GCRY_CIPHER_MODE_POLY1305}) only works with +ChaCha and Salsa stream ciphers. The block cipher modes (@code{GCRY_CIPHER_MODE_ECB}, @code{GCRY_CIPHER_MODE_CBC}, @code{GCRY_CIPHER_MODE_CFB}, @code{GCRY_CIPHER_MODE_OFB} and @code{GCRY_CIPHER_MODE_CTR}) will work -with any block cipher algorithm. @code{GCRY_CIPHER_MODE_CCM} and - at code{GCRY_CIPHER_MODE_GCM} modes will only work with block cipher algorithms +with any block cipher algorithm. CCM mode (@code{GCRY_CIPHER_MODE_CCM}) and +GCM mode (@code{GCRY_CIPHER_MODE_GCM}) will only work with block cipher algorithms which have the block size of 16 bytes. The third argument @var{flags} can either be passed as @code{0} or as @@ -3548,6 +3554,30 @@ block cipher algorithm. This is GMAC message authentication algorithm based on the SEED block cipher algorithm. + at item GCRY_MAC_POLY1305 +This is plain Poly1305 message authentication algorithm, used with +one-time key. + + at item GCRY_MAC_POLY1305_AES +This is Poly1305-AES message authentication algorithm, used with +key and one-time nonce. + + at item GCRY_MAC_POLY1305_CAMELLIA +This is Poly1305-Camellia message authentication algorithm, used with +key and one-time nonce. + + at item GCRY_MAC_POLY1305_TWOFISH +This is Poly1305-Twofish message authentication algorithm, used with +key and one-time nonce. + + at item GCRY_MAC_POLY1305_SERPENT +This is Poly1305-Serpent message authentication algorithm, used with +key and one-time nonce. + + at item GCRY_MAC_POLY1305_SEED +This is Poly1305-SEED message authentication algorithm, used with +key and one-time nonce. + @end table @c end table of MAC algorithms @@ -3593,8 +3623,8 @@ underlying block cipher. @end deftypefun -GMAC algorithms need initialization vector to be set, which can be -performed with function: +GMAC algorithms and Poly1305-with-cipher algorithms need initialization vector to be set, +which can be performed with function: @deftypefun gcry_error_t gcry_mac_setiv (gcry_mac_hd_t @var{h}, const void *@var{iv}, size_t @var{ivlen}) From cvs at cvs.gnupg.org Sun May 18 11:57:58 2014 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sun, 18 May 2014 11:57:58 +0200 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-80-gbf49439 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via bf4943932dae95a0573b63bf32a9b9acd5a6ddf3 (commit) via 323b1eb80ff3396d83fedbe5bba9a4e6c412d192 (commit) via 98f021961ee65669037bc8bb552a69fd78f610fc (commit) via 297532602ed2d881d8fdc393d1961068a143a891 (commit) from e813958419b0ec4439e6caf07d3b2234cffa2bfa (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below.
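As a usage sketch for the interfaces documented in the Poly1305 patch above (public gcry_mac_* API only; error handling omitted):

  #include <gcrypt.h>

  /* One-shot Poly1305 over 'msg'; 'key' is 32 bytes and, as the
   * documentation stresses, must be used for one message only. */
  static void
  poly1305_tag (const unsigned char key[32], const void *msg,
                size_t msglen, unsigned char tag[16])
  {
    gcry_mac_hd_t hd;
    size_t taglen = 16;

    gcry_mac_open (&hd, GCRY_MAC_POLY1305, 0, NULL);
    gcry_mac_setkey (hd, key, 32);
    gcry_mac_write (hd, msg, msglen);
    gcry_mac_read (hd, tag, &taglen);
    gcry_mac_close (hd);
  }

The GCRY_MAC_POLY1305_AES style variants additionally take a 16-byte one-time nonce via gcry_mac_setiv before any data is written.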
- Log ----------------------------------------------------------------- commit bf4943932dae95a0573b63bf32a9b9acd5a6ddf3 Author: Jussi Kivilinna Date: Sat May 17 18:30:39 2014 +0300 Add Poly1305 to documentation * doc/gcrypt.texi: Add documentation for Poly1305 MACs and AEAD mode. -- Signed-off-by: Jussi Kivilinna diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index d202b8b..d59c095 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1629,6 +1629,11 @@ Galois/Counter Mode (GCM) is an Authenticated Encryption with Associated Data (AEAD) block cipher mode, which is specified in 'NIST Special Publication 800-38D'. + at item GCRY_CIPHER_MODE_POLY1305 + at cindex Poly1305 based AEAD mode +Poly1305 is an Authenticated Encryption with Associated Data (AEAD) +mode, which can be used with ChaCha20 and Salsa20 stream ciphers. + @end table @node Working with cipher handles @@ -1655,12 +1660,13 @@ The cipher mode to use must be specified via @var{mode}. See @xref{Available cipher modes}, for a list of supported cipher modes and the according constants. Note that some modes are incompatible with some algorithms - in particular, stream mode -(@code{GCRY_CIPHER_MODE_STREAM}) only works with stream ciphers. The -block cipher modes (@code{GCRY_CIPHER_MODE_ECB}, +(@code{GCRY_CIPHER_MODE_STREAM}) only works with stream ciphers. +Poly1305 AEAD mode (@code{GCRY_CIPHER_MODE_POLY1305}) only works with +ChaCha and Salsa stream ciphers. The block cipher modes (@code{GCRY_CIPHER_MODE_ECB}, @code{GCRY_CIPHER_MODE_CBC}, @code{GCRY_CIPHER_MODE_CFB}, @code{GCRY_CIPHER_MODE_OFB} and @code{GCRY_CIPHER_MODE_CTR}) will work -with any block cipher algorithm. @code{GCRY_CIPHER_MODE_CCM} and - at code{GCRY_CIPHER_MODE_GCM} modes will only work with block cipher algorithms +with any block cipher algorithm. CCM mode (@code{GCRY_CIPHER_MODE_CCM}) and +GCM mode (@code{GCRY_CIPHER_MODE_GCM}) will only work with block cipher algorithms which have the block size of 16 bytes. The third argument @var{flags} can either be passed as @code{0} or as @@ -3548,6 +3554,30 @@ block cipher algorithm. This is GMAC message authentication algorithm based on the SEED block cipher algorithm. + at item GCRY_MAC_POLY1305 +This is plain Poly1305 message authentication algorithm, used with +one-time key. + + at item GCRY_MAC_POLY1305_AES +This is Poly1305-AES message authentication algorithm, used with +key and one-time nonce. + + at item GCRY_MAC_POLY1305_CAMELLIA +This is Poly1305-Camellia message authentication algorithm, used with +key and one-time nonce. + + at item GCRY_MAC_POLY1305_TWOFISH +This is Poly1305-Twofish message authentication algorithm, used with +key and one-time nonce. + + at item GCRY_MAC_POLY1305_SERPENT +This is Poly1305-Serpent message authentication algorithm, used with +key and one-time nonce. + + at item GCRY_MAC_POLY1305_SEED +This is Poly1305-SEED message authentication algorithm, used with +key and one-time nonce. + @end table @c end table of MAC algorithms @@ -3593,8 +3623,8 @@ underlying block cipher.
@end deftypefun -GMAC algorithms need initialization vector to be set, which can be -performed with function: +GMAC algorithms and Poly1305-with-cipher algorithms need initialization vector to be set, +which can be performed with function: @deftypefun gcry_error_t gcry_mac_setiv (gcry_mac_hd_t @var{h}, const void *@var{iv}, size_t @var{ivlen}) commit 323b1eb80ff3396d83fedbe5bba9a4e6c412d192 Author: Jussi Kivilinna Date: Fri May 16 21:28:26 2014 +0300 chacha20: add SSE2/AMD64 optimized implementation * cipher/Makefile.am: Add 'chacha20-sse2-amd64.S'. * cipher/chacha20-sse2-amd64.S: New. * cipher/chacha20.c (USE_SSE2): New. [USE_SSE2] (_gcry_chacha20_amd64_sse2_blocks): New. (chacha20_do_setkey) [USE_SSE2]: Use SSE2 implementation for blocks function. * configure.ac [host=x86-64]: Add 'chacha20-sse2-amd64.lo'. -- Add Andrew Moon's public domain SSE2 implementation of ChaCha20. Original source is available at: https://github.com/floodyberry/chacha-opt Benchmark on Intel i5-4570 (haswell), with "--disable-hwf intel-avx2 --disable-hwf intel-ssse3": Old: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 1.97 ns/B 483.8 MiB/s 6.31 c/B STREAM dec | 1.97 ns/B 483.6 MiB/s 6.31 c/B New: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 0.931 ns/B 1024.7 MiB/s 2.98 c/B STREAM dec | 0.930 ns/B 1025.0 MiB/s 2.98 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 19b0097..8a3bd19 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -60,7 +60,7 @@ EXTRA_libcipher_la_SOURCES = \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ -chacha20.c chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ +chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20-sse2-amd64.S b/cipher/chacha20-sse2-amd64.S new file mode 100644 index 0000000..4811f40 --- /dev/null +++ b/cipher/chacha20-sse2-amd64.S @@ -0,0 +1,652 @@ +/* chacha20-sse2-amd64.S - AMD64/SSE2 implementation of ChaCha20 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/chacha-opt + */ + +#ifdef __x86_64__ +#include + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && USE_CHACHA20 + +.text + +.align 8 +.globl _gcry_chacha20_amd64_sse2_blocks +.type _gcry_chacha20_amd64_sse2_blocks, at function; +_gcry_chacha20_amd64_sse2_blocks: +.Lchacha_blocks_sse2_local: + pushq %rbx + pushq %rbp + movq %rsp, %rbp + andq $~63, %rsp + subq $512, %rsp + movdqu (%rdi), %xmm8 + movdqu 16(%rdi), %xmm9 + movdqu 32(%rdi), %xmm10 + movdqu 48(%rdi), %xmm11 + movq $20, %rax + movq $1, %r9 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movq %rax, 64(%rsp) + cmpq $256, %rcx + jb .Lchacha_blocks_sse2_below256 + pshufd $0x00, %xmm8, %xmm0 + pshufd $0x55, %xmm8, %xmm1 + pshufd $0xaa, %xmm8, %xmm2 + pshufd $0xff, %xmm8, %xmm3 + movdqa %xmm0, 128(%rsp) + movdqa %xmm1, 144(%rsp) + movdqa %xmm2, 160(%rsp) + movdqa %xmm3, 176(%rsp) + pshufd $0x00, %xmm9, %xmm0 + pshufd $0x55, %xmm9, %xmm1 + pshufd $0xaa, %xmm9, %xmm2 + pshufd $0xff, %xmm9, %xmm3 + movdqa %xmm0, 192(%rsp) + movdqa %xmm1, 208(%rsp) + movdqa %xmm2, 224(%rsp) + movdqa %xmm3, 240(%rsp) + pshufd $0x00, %xmm10, %xmm0 + pshufd $0x55, %xmm10, %xmm1 + pshufd $0xaa, %xmm10, %xmm2 + pshufd $0xff, %xmm10, %xmm3 + movdqa %xmm0, 256(%rsp) + movdqa %xmm1, 272(%rsp) + movdqa %xmm2, 288(%rsp) + movdqa %xmm3, 304(%rsp) + pshufd $0xaa, %xmm11, %xmm0 + pshufd $0xff, %xmm11, %xmm1 + movdqa %xmm0, 352(%rsp) + movdqa %xmm1, 368(%rsp) + jmp .Lchacha_blocks_sse2_atleast256 +.p2align 6,,63 +.Lchacha_blocks_sse2_atleast256: + movq 48(%rsp), %rax + leaq 1(%rax), %r8 + leaq 2(%rax), %r9 + leaq 3(%rax), %r10 + leaq 4(%rax), %rbx + movl %eax, 320(%rsp) + movl %r8d, 4+320(%rsp) + movl %r9d, 8+320(%rsp) + movl %r10d, 12+320(%rsp) + shrq $32, %rax + shrq $32, %r8 + shrq $32, %r9 + shrq $32, %r10 + movl %eax, 336(%rsp) + movl %r8d, 4+336(%rsp) + movl %r9d, 8+336(%rsp) + movl %r10d, 12+336(%rsp) + movq %rbx, 48(%rsp) + movq 64(%rsp), %rax + movdqa 128(%rsp), %xmm0 + movdqa 144(%rsp), %xmm1 + movdqa 160(%rsp), %xmm2 + movdqa 176(%rsp), %xmm3 + movdqa 192(%rsp), %xmm4 + movdqa 208(%rsp), %xmm5 + movdqa 224(%rsp), %xmm6 + movdqa 240(%rsp), %xmm7 + movdqa 256(%rsp), %xmm8 + movdqa 272(%rsp), %xmm9 + movdqa 288(%rsp), %xmm10 + movdqa 304(%rsp), %xmm11 + movdqa 320(%rsp), %xmm12 + movdqa 336(%rsp), %xmm13 + movdqa 352(%rsp), %xmm14 + movdqa 368(%rsp), %xmm15 +.Lchacha_blocks_sse2_mainloop1: + paddd %xmm4, %xmm0 + paddd %xmm5, %xmm1 + pxor %xmm0, %xmm12 + pxor %xmm1, %xmm13 + paddd %xmm6, %xmm2 + paddd %xmm7, %xmm3 + movdqa %xmm6, 96(%rsp) + pxor %xmm2, %xmm14 + pxor %xmm3, %xmm15 + pshuflw $0xb1,%xmm12,%xmm12 + pshufhw $0xb1,%xmm12,%xmm12 + pshuflw $0xb1,%xmm13,%xmm13 + pshufhw $0xb1,%xmm13,%xmm13 + pshuflw $0xb1,%xmm14,%xmm14 + pshufhw $0xb1,%xmm14,%xmm14 + pshuflw $0xb1,%xmm15,%xmm15 + pshufhw $0xb1,%xmm15,%xmm15 + paddd %xmm12, %xmm8 + paddd %xmm13, %xmm9 + paddd %xmm14, %xmm10 + paddd %xmm15, %xmm11 + movdqa %xmm12, 112(%rsp) + pxor %xmm8, %xmm4 + pxor %xmm9, %xmm5 + movdqa 96(%rsp), %xmm6 + movdqa %xmm4, %xmm12 + pslld $ 12, %xmm4 + psrld $20, %xmm12 + pxor %xmm12, %xmm4 + movdqa %xmm5, %xmm12 + pslld $ 12, %xmm5 + psrld $20, %xmm12 + pxor %xmm12, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + movdqa %xmm6, %xmm12 + pslld $ 12, %xmm6 + psrld $20, %xmm12 + pxor %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + pslld $ 12, %xmm7 + psrld $20, %xmm12 + pxor %xmm12, %xmm7 + movdqa 112(%rsp), %xmm12 + paddd 
%xmm4, %xmm0 + paddd %xmm5, %xmm1 + pxor %xmm0, %xmm12 + pxor %xmm1, %xmm13 + paddd %xmm6, %xmm2 + paddd %xmm7, %xmm3 + movdqa %xmm6, 96(%rsp) + pxor %xmm2, %xmm14 + pxor %xmm3, %xmm15 + movdqa %xmm12, %xmm6 + pslld $ 8, %xmm12 + psrld $24, %xmm6 + pxor %xmm6, %xmm12 + movdqa %xmm13, %xmm6 + pslld $ 8, %xmm13 + psrld $24, %xmm6 + pxor %xmm6, %xmm13 + paddd %xmm12, %xmm8 + paddd %xmm13, %xmm9 + movdqa %xmm14, %xmm6 + pslld $ 8, %xmm14 + psrld $24, %xmm6 + pxor %xmm6, %xmm14 + movdqa %xmm15, %xmm6 + pslld $ 8, %xmm15 + psrld $24, %xmm6 + pxor %xmm6, %xmm15 + paddd %xmm14, %xmm10 + paddd %xmm15, %xmm11 + movdqa %xmm12, 112(%rsp) + pxor %xmm8, %xmm4 + pxor %xmm9, %xmm5 + movdqa 96(%rsp), %xmm6 + movdqa %xmm4, %xmm12 + pslld $ 7, %xmm4 + psrld $25, %xmm12 + pxor %xmm12, %xmm4 + movdqa %xmm5, %xmm12 + pslld $ 7, %xmm5 + psrld $25, %xmm12 + pxor %xmm12, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + movdqa %xmm6, %xmm12 + pslld $ 7, %xmm6 + psrld $25, %xmm12 + pxor %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + pslld $ 7, %xmm7 + psrld $25, %xmm12 + pxor %xmm12, %xmm7 + movdqa 112(%rsp), %xmm12 + paddd %xmm5, %xmm0 + paddd %xmm6, %xmm1 + pxor %xmm0, %xmm15 + pxor %xmm1, %xmm12 + paddd %xmm7, %xmm2 + paddd %xmm4, %xmm3 + movdqa %xmm7, 96(%rsp) + pxor %xmm2, %xmm13 + pxor %xmm3, %xmm14 + pshuflw $0xb1,%xmm15,%xmm15 + pshufhw $0xb1,%xmm15,%xmm15 + pshuflw $0xb1,%xmm12,%xmm12 + pshufhw $0xb1,%xmm12,%xmm12 + pshuflw $0xb1,%xmm13,%xmm13 + pshufhw $0xb1,%xmm13,%xmm13 + pshuflw $0xb1,%xmm14,%xmm14 + pshufhw $0xb1,%xmm14,%xmm14 + paddd %xmm15, %xmm10 + paddd %xmm12, %xmm11 + paddd %xmm13, %xmm8 + paddd %xmm14, %xmm9 + movdqa %xmm15, 112(%rsp) + pxor %xmm10, %xmm5 + pxor %xmm11, %xmm6 + movdqa 96(%rsp), %xmm7 + movdqa %xmm5, %xmm15 + pslld $ 12, %xmm5 + psrld $20, %xmm15 + pxor %xmm15, %xmm5 + movdqa %xmm6, %xmm15 + pslld $ 12, %xmm6 + psrld $20, %xmm15 + pxor %xmm15, %xmm6 + pxor %xmm8, %xmm7 + pxor %xmm9, %xmm4 + movdqa %xmm7, %xmm15 + pslld $ 12, %xmm7 + psrld $20, %xmm15 + pxor %xmm15, %xmm7 + movdqa %xmm4, %xmm15 + pslld $ 12, %xmm4 + psrld $20, %xmm15 + pxor %xmm15, %xmm4 + movdqa 112(%rsp), %xmm15 + paddd %xmm5, %xmm0 + paddd %xmm6, %xmm1 + pxor %xmm0, %xmm15 + pxor %xmm1, %xmm12 + paddd %xmm7, %xmm2 + paddd %xmm4, %xmm3 + movdqa %xmm7, 96(%rsp) + pxor %xmm2, %xmm13 + pxor %xmm3, %xmm14 + movdqa %xmm15, %xmm7 + pslld $ 8, %xmm15 + psrld $24, %xmm7 + pxor %xmm7, %xmm15 + movdqa %xmm12, %xmm7 + pslld $ 8, %xmm12 + psrld $24, %xmm7 + pxor %xmm7, %xmm12 + paddd %xmm15, %xmm10 + paddd %xmm12, %xmm11 + movdqa %xmm13, %xmm7 + pslld $ 8, %xmm13 + psrld $24, %xmm7 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pslld $ 8, %xmm14 + psrld $24, %xmm7 + pxor %xmm7, %xmm14 + paddd %xmm13, %xmm8 + paddd %xmm14, %xmm9 + movdqa %xmm15, 112(%rsp) + pxor %xmm10, %xmm5 + pxor %xmm11, %xmm6 + movdqa 96(%rsp), %xmm7 + movdqa %xmm5, %xmm15 + pslld $ 7, %xmm5 + psrld $25, %xmm15 + pxor %xmm15, %xmm5 + movdqa %xmm6, %xmm15 + pslld $ 7, %xmm6 + psrld $25, %xmm15 + pxor %xmm15, %xmm6 + pxor %xmm8, %xmm7 + pxor %xmm9, %xmm4 + movdqa %xmm7, %xmm15 + pslld $ 7, %xmm7 + psrld $25, %xmm15 + pxor %xmm15, %xmm7 + movdqa %xmm4, %xmm15 + pslld $ 7, %xmm4 + psrld $25, %xmm15 + pxor %xmm15, %xmm4 + movdqa 112(%rsp), %xmm15 + subq $2, %rax + jnz .Lchacha_blocks_sse2_mainloop1 + paddd 128(%rsp), %xmm0 + paddd 144(%rsp), %xmm1 + paddd 160(%rsp), %xmm2 + paddd 176(%rsp), %xmm3 + paddd 192(%rsp), %xmm4 + paddd 208(%rsp), %xmm5 + paddd 224(%rsp), %xmm6 + paddd 240(%rsp), %xmm7 + paddd 256(%rsp), %xmm8 + paddd 272(%rsp), %xmm9 + paddd 288(%rsp), 
%xmm10 + paddd 304(%rsp), %xmm11 + paddd 320(%rsp), %xmm12 + paddd 336(%rsp), %xmm13 + paddd 352(%rsp), %xmm14 + paddd 368(%rsp), %xmm15 + movdqa %xmm8, 384(%rsp) + movdqa %xmm9, 400(%rsp) + movdqa %xmm10, 416(%rsp) + movdqa %xmm11, 432(%rsp) + movdqa %xmm12, 448(%rsp) + movdqa %xmm13, 464(%rsp) + movdqa %xmm14, 480(%rsp) + movdqa %xmm15, 496(%rsp) + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm8, %xmm5 + movdqa %xmm10, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm1 + punpcklqdq %xmm6, %xmm3 + punpcklqdq %xmm9, %xmm5 + punpcklqdq %xmm11, %xmm7 + andq %rsi, %rsi + jz .Lchacha_blocks_sse2_noinput1 + movdqu 0(%rsi), %xmm2 + movdqu 16(%rsi), %xmm6 + movdqu 64(%rsi), %xmm9 + movdqu 80(%rsi), %xmm11 + movdqu 128(%rsi), %xmm12 + movdqu 144(%rsi), %xmm13 + movdqu 192(%rsi), %xmm14 + movdqu 208(%rsi), %xmm15 + pxor %xmm2, %xmm5 + pxor %xmm6, %xmm7 + pxor %xmm9, %xmm8 + pxor %xmm11, %xmm10 + pxor %xmm12, %xmm1 + pxor %xmm13, %xmm3 + pxor %xmm14, %xmm0 + pxor %xmm15, %xmm4 + movdqu %xmm5, 0(%rdx) + movdqu %xmm7, 16(%rdx) + movdqu %xmm8, 64(%rdx) + movdqu %xmm10, 80(%rdx) + movdqu %xmm1, 128(%rdx) + movdqu %xmm3, 144(%rdx) + movdqu %xmm0, 192(%rdx) + movdqu %xmm4, 208(%rdx) + movdqa 384(%rsp), %xmm0 + movdqa 400(%rsp), %xmm1 + movdqa 416(%rsp), %xmm2 + movdqa 432(%rsp), %xmm3 + movdqa 448(%rsp), %xmm4 + movdqa 464(%rsp), %xmm5 + movdqa 480(%rsp), %xmm6 + movdqa 496(%rsp), %xmm7 + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + movdqa %xmm8, %xmm1 + movdqa %xmm0, %xmm3 + movdqa %xmm10, %xmm5 + movdqa %xmm4, %xmm7 + punpcklqdq %xmm9, %xmm1 + punpcklqdq %xmm11, %xmm5 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm3 + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm6 + movdqu 96(%rsi), %xmm9 + movdqu 112(%rsi), %xmm11 + movdqu 160(%rsi), %xmm12 + movdqu 176(%rsi), %xmm13 + movdqu 224(%rsi), %xmm14 + movdqu 240(%rsi), %xmm15 + pxor %xmm2, %xmm1 + pxor %xmm6, %xmm5 + pxor %xmm9, %xmm8 + pxor %xmm11, %xmm10 + pxor %xmm12, %xmm3 + pxor %xmm13, %xmm7 + pxor %xmm14, %xmm0 + pxor %xmm15, %xmm4 + movdqu %xmm1, 32(%rdx) + movdqu %xmm5, 48(%rdx) + movdqu %xmm8, 96(%rdx) + movdqu %xmm10, 112(%rdx) + movdqu %xmm3, 160(%rdx) + movdqu %xmm7, 176(%rdx) + movdqu %xmm0, 224(%rdx) + movdqu %xmm4, 240(%rdx) + addq $256, %rsi + jmp .Lchacha_blocks_sse2_mainloop_cont +.Lchacha_blocks_sse2_noinput1: + movdqu %xmm5, 0(%rdx) + movdqu %xmm7, 16(%rdx) + movdqu %xmm8, 64(%rdx) + movdqu %xmm10, 80(%rdx) + movdqu %xmm1, 128(%rdx) + movdqu %xmm3, 144(%rdx) + movdqu %xmm0, 192(%rdx) + movdqu %xmm4, 208(%rdx) + movdqa 384(%rsp), %xmm0 + movdqa 400(%rsp), %xmm1 + movdqa 416(%rsp), %xmm2 + movdqa 432(%rsp), %xmm3 + movdqa 448(%rsp), %xmm4 + movdqa 464(%rsp), %xmm5 + movdqa 480(%rsp), %xmm6 + movdqa 496(%rsp), %xmm7 + movdqa %xmm0, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm6, %xmm11 + 
punpckldq %xmm1, %xmm8 + punpckldq %xmm3, %xmm9 + punpckhdq %xmm1, %xmm0 + punpckhdq %xmm3, %xmm2 + punpckldq %xmm5, %xmm10 + punpckldq %xmm7, %xmm11 + punpckhdq %xmm5, %xmm4 + punpckhdq %xmm7, %xmm6 + movdqa %xmm8, %xmm1 + movdqa %xmm0, %xmm3 + movdqa %xmm10, %xmm5 + movdqa %xmm4, %xmm7 + punpcklqdq %xmm9, %xmm1 + punpcklqdq %xmm11, %xmm5 + punpckhqdq %xmm9, %xmm8 + punpckhqdq %xmm11, %xmm10 + punpcklqdq %xmm2, %xmm3 + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm2, %xmm0 + punpckhqdq %xmm6, %xmm4 + movdqu %xmm1, 32(%rdx) + movdqu %xmm5, 48(%rdx) + movdqu %xmm8, 96(%rdx) + movdqu %xmm10, 112(%rdx) + movdqu %xmm3, 160(%rdx) + movdqu %xmm7, 176(%rdx) + movdqu %xmm0, 224(%rdx) + movdqu %xmm4, 240(%rdx) +.Lchacha_blocks_sse2_mainloop_cont: + addq $256, %rdx + subq $256, %rcx + cmp $256, %rcx + jae .Lchacha_blocks_sse2_atleast256 + movdqa 0(%rsp), %xmm8 + movdqa 16(%rsp), %xmm9 + movdqa 32(%rsp), %xmm10 + movdqa 48(%rsp), %xmm11 + movq $1, %r9 +.Lchacha_blocks_sse2_below256: + movq %r9, %xmm5 + andq %rcx, %rcx + jz .Lchacha_blocks_sse2_done + cmpq $64, %rcx + jae .Lchacha_blocks_sse2_above63 + movq %rdx, %r9 + andq %rsi, %rsi + jz .Lchacha_blocks_sse2_noinput2 + movq %rcx, %r10 + movq %rsp, %rdx + addq %r10, %rsi + addq %r10, %rdx + negq %r10 +.Lchacha_blocks_sse2_copyinput: + movb (%rsi, %r10), %al + movb %al, (%rdx, %r10) + incq %r10 + jnz .Lchacha_blocks_sse2_copyinput + movq %rsp, %rsi +.Lchacha_blocks_sse2_noinput2: + movq %rsp, %rdx +.Lchacha_blocks_sse2_above63: + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + movq 64(%rsp), %rax +.Lchacha_blocks_sse2_mainloop2: + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshuflw $0xb1,%xmm3,%xmm3 + pshufhw $0xb1,%xmm3,%xmm3 + paddd %xmm3, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm1,%xmm4 + pslld $12, %xmm1 + psrld $20, %xmm4 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + movdqa %xmm3,%xmm4 + pslld $8, %xmm3 + psrld $24, %xmm4 + pshufd $0x93,%xmm0,%xmm0 + pxor %xmm4, %xmm3 + paddd %xmm3, %xmm2 + pshufd $0x4e,%xmm3,%xmm3 + pxor %xmm2, %xmm1 + pshufd $0x39,%xmm2,%xmm2 + movdqa %xmm1,%xmm4 + pslld $7, %xmm1 + psrld $25, %xmm4 + pxor %xmm4, %xmm1 + subq $2, %rax + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + pshuflw $0xb1,%xmm3,%xmm3 + pshufhw $0xb1,%xmm3,%xmm3 + paddd %xmm3, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm1,%xmm4 + pslld $12, %xmm1 + psrld $20, %xmm4 + pxor %xmm4, %xmm1 + paddd %xmm1, %xmm0 + pxor %xmm0, %xmm3 + movdqa %xmm3,%xmm4 + pslld $8, %xmm3 + psrld $24, %xmm4 + pshufd $0x39,%xmm0,%xmm0 + pxor %xmm4, %xmm3 + paddd %xmm3, %xmm2 + pshufd $0x4e,%xmm3,%xmm3 + pxor %xmm2, %xmm1 + pshufd $0x93,%xmm2,%xmm2 + movdqa %xmm1,%xmm4 + pslld $7, %xmm1 + psrld $25, %xmm4 + pxor %xmm4, %xmm1 + jnz .Lchacha_blocks_sse2_mainloop2 + paddd %xmm8, %xmm0 + paddd %xmm9, %xmm1 + paddd %xmm10, %xmm2 + paddd %xmm11, %xmm3 + andq %rsi, %rsi + jz .Lchacha_blocks_sse2_noinput3 + movdqu 0(%rsi), %xmm12 + movdqu 16(%rsi), %xmm13 + movdqu 32(%rsi), %xmm14 + movdqu 48(%rsi), %xmm15 + pxor %xmm12, %xmm0 + pxor %xmm13, %xmm1 + pxor %xmm14, %xmm2 + pxor %xmm15, %xmm3 + addq $64, %rsi +.Lchacha_blocks_sse2_noinput3: + movdqu %xmm0, 0(%rdx) + movdqu %xmm1, 16(%rdx) + movdqu %xmm2, 32(%rdx) + movdqu %xmm3, 48(%rdx) + paddq %xmm5, %xmm11 + cmpq $64, %rcx + jbe .Lchacha_blocks_sse2_mainloop2_finishup + addq $64, %rdx + subq $64, %rcx + jmp .Lchacha_blocks_sse2_below256 +.Lchacha_blocks_sse2_mainloop2_finishup: + cmpq $64, %rcx + je .Lchacha_blocks_sse2_done + addq %rcx, %r9 + addq %rcx, %rdx + negq %rcx 
+.Lchacha_blocks_sse2_copyoutput: + movb (%rdx, %rcx), %al + movb %al, (%r9, %rcx) + incq %rcx + jnz .Lchacha_blocks_sse2_copyoutput +.Lchacha_blocks_sse2_done: + movdqu %xmm11, 48(%rdi) + movq %rbp, %rsp + pxor %xmm15, %xmm15 + pxor %xmm7, %xmm7 + pxor %xmm14, %xmm14 + pxor %xmm6, %xmm6 + pxor %xmm13, %xmm13 + pxor %xmm5, %xmm5 + pxor %xmm12, %xmm12 + pxor %xmm4, %xmm4 + popq %rbp + popq %rbx + movl $(63 + 512 + 16), %eax + pxor %xmm11, %xmm11 + pxor %xmm3, %xmm3 + pxor %xmm10, %xmm10 + pxor %xmm2, %xmm2 + pxor %xmm9, %xmm9 + pxor %xmm1, %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm0, %xmm0 + ret +.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks; + +#endif /*defined(USE_CHACHA20)*/ +#endif /*__x86_64*/
diff --git a/cipher/chacha20.c b/cipher/chacha20.c index e2cf442..03416d4 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -47,6 +47,12 @@ #define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ #define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4) +/* USE_SSE2 indicates whether to compile with Intel SSE2 code. */ +#undef USE_SSE2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define USE_SSE2 1 +#endif + /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ @@ -77,6 +83,13 @@ typedef struct CHACHA20_context_s } CHACHA20_context_t; +#ifdef USE_SSE2 + +unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in, + byte *out, size_t bytes); + +#endif /* USE_SSE2 */ + #ifdef USE_SSSE3 unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in, @@ -323,7 +336,12 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; +#ifdef USE_SSE2 + ctx->blocks = _gcry_chacha20_amd64_sse2_blocks; +#else ctx->blocks = chacha20_blocks; +#endif + #ifdef USE_SSSE3 if (features & HWF_INTEL_SSSE3) ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks;
diff --git a/configure.ac b/configure.ac index 47a322b..c5952c7 100644 --- a/configure.ac +++ b/configure.ac @@ -1815,6 +1815,7 @@ if test "$found" = "1" ; then case "${host}" in x86_64-*-*) # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-sse2-amd64.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo" ;;

commit 98f021961ee65669037bc8bb552a69fd78f610fc
Author: Jussi Kivilinna
Date:   Sun May 11 20:52:27 2014 +0300

    poly1305: add AMD64/AVX2 optimized implementation

    * cipher/Makefile.am: Add 'poly1305-avx2-amd64.S'.
    * cipher/poly1305-avx2-amd64.S: New.
    * cipher/poly1305-internal.h (POLY1305_USE_AVX2)
    (POLY1305_AVX2_BLOCKSIZE, POLY1305_AVX2_STATESIZE)
    (POLY1305_AVX2_ALIGNMENT): New.
    (POLY1305_LARGEST_BLOCKSIZE, POLY1305_LARGEST_STATESIZE)
    (POLY1305_STATE_ALIGNMENT): Use AVX2 versions when needed.
    * cipher/poly1305.c [POLY1305_USE_AVX2]
    (_gcry_poly1305_amd64_avx2_init_ext)
    (_gcry_poly1305_amd64_avx2_finish_ext)
    (_gcry_poly1305_amd64_avx2_blocks, poly1305_amd64_avx2_ops): New.
    (_gcry_poly1305_init) [POLY1305_USE_AVX2]: Use AVX2 implementation
    if AVX2 supported by CPU.
    * configure.ac [host=x86_64]: Add 'poly1305-avx2-amd64.lo'.
    --
    Add Andrew Moon's public domain AVX2 implementation of Poly1305.
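From the caller's side nothing changes with this patch: _gcry_poly1305_init() selects the AVX2 ops table at run time when the CPU reports AVX2 support, so the fast path is reached through libgcrypt's ordinary MAC API. A minimal caller-side sketch follows; it is not part of the patch, and it assumes the GCRY_MAC_POLY1305 algorithm identifier from the Poly1305 MAC work in this same release series:

  #include <string.h>
  #include <gcrypt.h>   /* build with e.g.: cc mac-test.c -lgcrypt */

  int
  main (void)
  {
    gcry_mac_hd_t hd;
    unsigned char key[32] = { 0 };  /* Poly1305 takes a 32-byte key.  */
    unsigned char mac[16];          /* ...and yields a 16-byte tag.   */
    size_t maclen = sizeof mac;
    const char *msg = "test message";

    if (!gcry_check_version (NULL))
      return 1;
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

    if (gcry_mac_open (&hd, GCRY_MAC_POLY1305, 0, NULL)
        || gcry_mac_setkey (hd, key, sizeof key)
        || gcry_mac_write (hd, msg, strlen (msg))
        || gcry_mac_read (hd, mac, &maclen))
      return 1;
    gcry_mac_close (hd);            /* mac[] now holds the tag.  */
    return 0;
  }

To compare the AVX2 path against the SSE2 or generic code on the same machine, hardware-feature use can be masked before initialization, e.g. with gcry_control (GCRYCTL_DISABLE_HWF, "intel-avx2", NULL); the assumption here is that the HWF-disable control accepts that feature-name string. The throughput figures quoted just below appear to be in the output format of libgcrypt's tests/bench-slope tool.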
Original source is available at: https://github.com/floodyberry/poly1305-opt Benchmarks on Intel i5-4570 (haswell): Old: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.448 ns/B 2129.5 MiB/s 1.43 c/B New: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.205 ns/B 4643.5 MiB/s 0.657 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index a32ae89..19b0097 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -72,7 +72,7 @@ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ -poly1305-sse2-amd64.S \ +poly1305-sse2-amd64.S poly1305-avx2-amd64.S \ rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \ rmd160.c \ rsa.c \ diff --git a/cipher/poly1305-avx2-amd64.S b/cipher/poly1305-avx2-amd64.S new file mode 100644 index 0000000..0ba7e76 --- /dev/null +++ b/cipher/poly1305-avx2-amd64.S @@ -0,0 +1,954 @@ +/* poly1305-avx2-amd64.S - AMD64/AVX2 implementation of Poly1305 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/poly1305-opt + */ + +#include + +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(ENABLE_AVX2_SUPPORT) + +.text + + +.align 8 +.globl _gcry_poly1305_amd64_avx2_init_ext +.type _gcry_poly1305_amd64_avx2_init_ext, at function; +_gcry_poly1305_amd64_avx2_init_ext: +.Lpoly1305_init_ext_avx2_local: + xor %edx, %edx + vzeroupper + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rcx + vpxor %ymm0, %ymm0, %ymm0 + movq $-1, %r8 + testq %rcx, %rcx + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm0, 32(%rdi) + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm0, 128(%rdi) + movq 8(%rsi), %r9 + cmove %r8, %rcx + movq $0xffc0fffffff, %r8 + movq %r9, %r13 + movq (%rsi), %r10 + andq %r10, %r8 + shrq $44, %r10 + movq %r8, %r14 + shlq $20, %r13 + orq %r13, %r10 + movq $0xfffffc0ffff, %r13 + shrq $24, %r9 + andq %r13, %r10 + movq $0xffffffc0f, %r13 + andq %r13, %r9 + movl %r8d, %r13d + andl $67108863, %r13d + movl %r13d, 164(%rdi) + movq %r10, %r13 + shrq $26, %r14 + shlq $18, %r13 + orq %r13, %r14 + movq %r10, %r13 + shrq $8, %r13 + andl $67108863, %r14d + andl $67108863, %r13d + movl %r14d, 172(%rdi) + movq %r10, %r14 + movl %r13d, 180(%rdi) + movq %r9, %r13 + shrq $34, %r14 + shlq $10, %r13 + orq %r13, %r14 + movq %r9, %r13 + shrq $16, %r13 + andl $67108863, %r14d + movl %r14d, 188(%rdi) + movl %r13d, 196(%rdi) + cmpq $16, %rcx + jbe .Lpoly1305_init_ext_avx2_continue + lea (%r9,%r9,4), %r11 + shlq $2, %r11 + lea (%r10,%r10), %rax + mulq %r11 + movq %rax, %r13 + movq %r8, %rax + movq %rdx, %r14 + mulq %r8 + addq %rax, %r13 + lea (%r8,%r8), %rax + movq %r13, %r12 + adcq %rdx, %r14 + mulq %r10 + shlq $20, %r14 + movq %rax, %r15 + shrq $44, %r12 + movq %r11, %rax + orq %r12, %r14 + movq %rdx, %r12 
+ mulq %r9 + addq %rax, %r15 + movq %r8, %rax + adcq %rdx, %r12 + addq %r15, %r14 + lea (%r9,%r9), %r15 + movq %r14, %rbx + adcq $0, %r12 + mulq %r15 + shlq $20, %r12 + movq %rdx, %r11 + shrq $44, %rbx + orq %rbx, %r12 + movq %rax, %rbx + movq %r10, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %r11 + addq %rbx, %r12 + movq $0xfffffffffff, %rbx + movq %r12, %r15 + adcq $0, %r11 + andq %rbx, %r13 + shlq $22, %r11 + andq %rbx, %r14 + shrq $42, %r15 + orq %r15, %r11 + lea (%r11,%r11,4), %r11 + addq %r11, %r13 + movq %rbx, %r11 + andq %r13, %r11 + shrq $44, %r13 + movq %r11, %r15 + addq %r13, %r14 + movq $0x3ffffffffff, %r13 + andq %r14, %rbx + andq %r13, %r12 + movq %rbx, %r13 + shrq $26, %r15 + shlq $18, %r13 + orq %r13, %r15 + movq %rbx, %r13 + shrq $44, %r14 + shrq $8, %r13 + addq %r14, %r12 + movl %r11d, %r14d + andl $67108863, %r15d + andl $67108863, %r14d + andl $67108863, %r13d + movl %r14d, 204(%rdi) + movq %rbx, %r14 + movl %r13d, 220(%rdi) + movq %r12, %r13 + shrq $34, %r14 + shlq $10, %r13 + orq %r13, %r14 + movq %r12, %r13 + shrq $16, %r13 + andl $67108863, %r14d + movl %r15d, 212(%rdi) + movl %r14d, 228(%rdi) + movl %r13d, 236(%rdi) + cmpq $32, %rcx + jbe .Lpoly1305_init_ext_avx2_continue + movq %r9, %rax + lea (%rbx,%rbx,4), %r14 + shlq $2, %r14 + mulq %r14 + movq %rdi, -32(%rsp) + lea (%r12,%r12,4), %rdi + shlq $2, %rdi + movq %rax, %r14 + movq %r10, %rax + movq %rdx, %r15 + mulq %rdi + movq %rax, %r13 + movq %r11, %rax + movq %rcx, -16(%rsp) + movq %rdx, %rcx + mulq %r8 + addq %rax, %r13 + movq %rdi, %rax + movq %rsi, -24(%rsp) + adcq %rdx, %rcx + addq %r13, %r14 + adcq %rcx, %r15 + movq %r14, %rcx + mulq %r9 + shlq $20, %r15 + movq %rax, %r13 + shrq $44, %rcx + movq %r11, %rax + orq %rcx, %r15 + movq %rdx, %rcx + mulq %r10 + movq %rax, %rsi + movq %rbx, %rax + movq %rdx, %rdi + mulq %r8 + addq %rax, %rsi + movq %r11, %rax + adcq %rdx, %rdi + addq %rsi, %r13 + adcq %rdi, %rcx + addq %r13, %r15 + movq %r15, %rdi + adcq $0, %rcx + mulq %r9 + shlq $20, %rcx + movq %rdx, %rsi + shrq $44, %rdi + orq %rdi, %rcx + movq %rax, %rdi + movq %rbx, %rax + mulq %r10 + movq %rax, %r9 + movq %r8, %rax + movq %rdx, %r10 + movq $0xfffffffffff, %r8 + mulq %r12 + addq %rax, %r9 + adcq %rdx, %r10 + andq %r8, %r14 + addq %r9, %rdi + adcq %r10, %rsi + andq %r8, %r15 + addq %rdi, %rcx + movq $0x3ffffffffff, %rdi + movq %rcx, %r10 + adcq $0, %rsi + andq %rdi, %rcx + shlq $22, %rsi + shrq $42, %r10 + orq %r10, %rsi + movq -32(%rsp), %rdi + lea (%rsi,%rsi,4), %r9 + movq %r8, %rsi + addq %r9, %r14 + andq %r14, %rsi + shrq $44, %r14 + addq %r14, %r15 + andq %r15, %r8 + shrq $44, %r15 + movq %r8, %r14 + addq %r15, %rcx + movl %esi, %r15d + movq %rcx, %r10 + movq %r8, %r9 + shrq $26, %rsi + andl $67108863, %r15d + shlq $18, %r14 + shrq $34, %r8 + orq %r14, %rsi + shlq $10, %r10 + shrq $8, %r9 + orq %r10, %r8 + shrq $16, %rcx + andl $67108863, %esi + movl %esi, 252(%rdi) + andl $67108863, %r9d + movl %ecx, 276(%rdi) + andl $67108863, %r8d + movl %r15d, 244(%rdi) + movl %r9d, 260(%rdi) + movl %r8d, 268(%rdi) + movq -16(%rsp), %rcx + movq -24(%rsp), %rsi +.Lpoly1305_init_ext_avx2_continue: + movl 16(%rsi), %r8d + movl %r8d, 284(%rdi) + movl 20(%rsi), %r9d + movl %r9d, 292(%rdi) + movl 24(%rsi), %r10d + movl %r10d, 300(%rdi) + movl 28(%rsi), %esi + movl %esi, 308(%rdi) + cmpq $48, %rcx + jbe .Lpoly1305_init_ext_avx2_done + lea (%r12,%r12,4), %r9 + shlq $2, %r9 + lea (%rbx,%rbx), %rax + mulq %r9 + movq %rax, %rsi + movq %r11, %rax + movq %rdx, %r8 + mulq %r11 + addq %rax, %rsi + lea (%r11,%r11), %rax + movq 
%rsi, %r10 + adcq %rdx, %r8 + mulq %rbx + movq %rax, %r13 + movq %r12, %rax + movq %rdx, %rcx + addq %r12, %r12 + mulq %r9 + addq %rax, %r13 + movq %r11, %rax + movq $0xfffffffffff, %r9 + adcq %rdx, %rcx + andq %r9, %rsi + mulq %r12 + shlq $20, %r8 + movq %rax, %r11 + shrq $44, %r10 + movq %rbx, %rax + orq %r10, %r8 + movq %rdx, %r12 + mulq %rbx + addq %r13, %r8 + movq %r8, %r14 + adcq $0, %rcx + andq %r9, %r8 + addq %rax, %r11 + adcq %rdx, %r12 + shlq $20, %rcx + shrq $44, %r14 + orq %r14, %rcx + addq %r11, %rcx + movq %rcx, %rbx + adcq $0, %r12 + shlq $22, %r12 + shrq $42, %rbx + orq %rbx, %r12 + movq %r9, %rbx + lea (%r12,%r12,4), %r15 + addq %r15, %rsi + andq %rsi, %rbx + shrq $44, %rsi + movl %ebx, %r11d + addq %rsi, %r8 + movq $0x3ffffffffff, %rsi + andq %r8, %r9 + andq %rsi, %rcx + shrq $44, %r8 + movq %r9, %rax + addq %r8, %rcx + movq %r9, %r8 + movq %rcx, %r10 + andl $67108863, %r11d + shrq $26, %rbx + shlq $18, %r8 + shrq $34, %r9 + orq %r8, %rbx + shlq $10, %r10 + shrq $8, %rax + orq %r10, %r9 + shrq $16, %rcx + andl $67108863, %ebx + andl $67108863, %eax + andl $67108863, %r9d + movl %r11d, 184(%rdi) + movl %r11d, 176(%rdi) + movl %r11d, 168(%rdi) + movl %r11d, 160(%rdi) + movl %ebx, 216(%rdi) + movl %ebx, 208(%rdi) + movl %ebx, 200(%rdi) + movl %ebx, 192(%rdi) + movl %eax, 248(%rdi) + movl %eax, 240(%rdi) + movl %eax, 232(%rdi) + movl %eax, 224(%rdi) + movl %r9d, 280(%rdi) + movl %r9d, 272(%rdi) + movl %r9d, 264(%rdi) + movl %r9d, 256(%rdi) + movl %ecx, 312(%rdi) + movl %ecx, 304(%rdi) + movl %ecx, 296(%rdi) + movl %ecx, 288(%rdi) +.Lpoly1305_init_ext_avx2_done: + movq $0, 320(%rdi) + vzeroall + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + ret +.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext; + + +.align 8 +.globl _gcry_poly1305_amd64_avx2_blocks +.type _gcry_poly1305_amd64_avx2_blocks, at function; +_gcry_poly1305_amd64_avx2_blocks: +.Lpoly1305_blocks_avx2_local: + vzeroupper + pushq %rbp + movq %rsp, %rbp + pushq %rbx + andq $-64, %rsp + subq $200, %rsp + movl $((1<<26)-1), %r8d + movl $(5), %r9d + movl $((1<<24)), %r10d + vmovd %r8d, %xmm0 + vmovd %r9d, %xmm8 + vmovd %r10d, %xmm7 + vpbroadcastq %xmm0, %ymm0 + vpbroadcastq %xmm8, %ymm8 + vpbroadcastq %xmm7, %ymm7 + vmovdqa %ymm7, 168(%rsp) + movq 320(%rdi), %rax + testb $60, %al + je .Lpoly1305_blocks_avx2_9 + vmovdqa 168(%rsp), %ymm7 + vpsrldq $8, %ymm7, %ymm1 + vmovdqa %ymm1, 168(%rsp) + testb $4, %al + je .Lpoly1305_blocks_avx2_10 + vpermq $192, %ymm1, %ymm7 + vmovdqa %ymm7, 168(%rsp) +.Lpoly1305_blocks_avx2_10: + testb $8, %al + je .Lpoly1305_blocks_avx2_11 + vpermq $240, 168(%rsp), %ymm7 + vmovdqa %ymm7, 168(%rsp) +.Lpoly1305_blocks_avx2_11: + testb $16, %al + je .Lpoly1305_blocks_avx2_12 + vpermq $252, 168(%rsp), %ymm6 + vmovdqa %ymm6, 168(%rsp) +.Lpoly1305_blocks_avx2_12: + testb $32, %al + je .Lpoly1305_blocks_avx2_9 + vpxor %xmm6, %xmm6, %xmm6 + vmovdqa %ymm6, 168(%rsp) +.Lpoly1305_blocks_avx2_9: + testb $1, %al + jne .Lpoly1305_blocks_avx2_13 + vmovdqu (%rsi), %ymm3 + vmovdqu 32(%rsi), %ymm1 + vpunpcklqdq %ymm1, %ymm3, %ymm2 + vpunpckhqdq %ymm1, %ymm3, %ymm1 + vpermq $216, %ymm2, %ymm2 + vpermq $216, %ymm1, %ymm1 + vpand %ymm2, %ymm0, %ymm5 + vpsrlq $26, %ymm2, %ymm4 + vpand %ymm4, %ymm0, %ymm4 + vpsllq $12, %ymm1, %ymm3 + vpsrlq $52, %ymm2, %ymm2 + vpor %ymm3, %ymm2, %ymm2 + vpand %ymm2, %ymm0, %ymm3 + vpsrlq $26, %ymm2, %ymm2 + vpand %ymm2, %ymm0, %ymm2 + vpsrlq $40, %ymm1, %ymm1 + vpor 168(%rsp), %ymm1, %ymm1 + addq $64, %rsi + subq $64, %rdx + orq $1, 320(%rdi) + jmp 
.Lpoly1305_blocks_avx2_14 +.Lpoly1305_blocks_avx2_13: + vmovdqa (%rdi), %ymm5 + vmovdqa 32(%rdi), %ymm4 + vmovdqa 64(%rdi), %ymm3 + vmovdqa 96(%rdi), %ymm2 + vmovdqa 128(%rdi), %ymm1 +.Lpoly1305_blocks_avx2_14: + cmpq $63, %rdx + jbe .Lpoly1305_blocks_avx2_15 + vmovdqa 160(%rdi), %ymm6 + vmovdqa %ymm8, 136(%rsp) + vmovdqa 192(%rdi), %ymm7 + vpmuludq %ymm8, %ymm7, %ymm11 + vmovdqa %ymm11, 104(%rsp) + vmovdqa 224(%rdi), %ymm11 + vmovdqa %ymm11, 72(%rsp) + vpmuludq %ymm11, %ymm8, %ymm11 + vmovdqa %ymm11, 40(%rsp) + vmovdqa 256(%rdi), %ymm11 + vmovdqa %ymm11, 8(%rsp) + vpmuludq %ymm11, %ymm8, %ymm11 + vmovdqa %ymm11, -24(%rsp) + vmovdqa 288(%rdi), %ymm13 + vmovdqa %ymm13, -56(%rsp) + vpmuludq %ymm13, %ymm8, %ymm13 + vmovdqa %ymm13, -88(%rsp) +.Lpoly1305_blocks_avx2_16: + vpmuludq 104(%rsp), %ymm1, %ymm14 + vmovdqa 40(%rsp), %ymm13 + vpmuludq %ymm13, %ymm2, %ymm8 + vpmuludq %ymm13, %ymm1, %ymm13 + vmovdqa -24(%rsp), %ymm9 + vpmuludq %ymm9, %ymm2, %ymm10 + vpmuludq %ymm9, %ymm1, %ymm11 + vpaddq %ymm8, %ymm14, %ymm14 + vpmuludq %ymm9, %ymm3, %ymm8 + vmovdqa -88(%rsp), %ymm12 + vpmuludq %ymm12, %ymm1, %ymm9 + vpaddq %ymm10, %ymm13, %ymm13 + vpmuludq %ymm12, %ymm4, %ymm15 + vmovdqa %ymm12, %ymm10 + vpmuludq %ymm12, %ymm3, %ymm12 + vpaddq %ymm8, %ymm14, %ymm14 + vpmuludq %ymm10, %ymm2, %ymm10 + vpmuludq %ymm6, %ymm2, %ymm8 + vpaddq %ymm15, %ymm14, %ymm14 + vpmuludq %ymm6, %ymm1, %ymm1 + vpaddq %ymm12, %ymm13, %ymm13 + vpmuludq %ymm6, %ymm5, %ymm15 + vpaddq %ymm10, %ymm11, %ymm11 + vpmuludq %ymm6, %ymm4, %ymm12 + vpaddq %ymm8, %ymm9, %ymm9 + vpmuludq %ymm6, %ymm3, %ymm10 + vpmuludq %ymm7, %ymm3, %ymm8 + vpaddq %ymm15, %ymm14, %ymm14 + vpmuludq %ymm7, %ymm2, %ymm2 + vpaddq %ymm12, %ymm13, %ymm12 + vpmuludq %ymm7, %ymm5, %ymm15 + vpaddq %ymm10, %ymm11, %ymm10 + vpmuludq %ymm7, %ymm4, %ymm13 + vpaddq %ymm8, %ymm9, %ymm8 + vmovdqa 72(%rsp), %ymm9 + vpmuludq %ymm9, %ymm4, %ymm11 + vpaddq %ymm2, %ymm1, %ymm1 + vpmuludq %ymm9, %ymm3, %ymm3 + vpaddq %ymm15, %ymm12, %ymm12 + vpmuludq %ymm9, %ymm5, %ymm15 + vpaddq %ymm13, %ymm10, %ymm10 + vmovdqa 8(%rsp), %ymm2 + vpmuludq %ymm2, %ymm5, %ymm9 + vpaddq %ymm11, %ymm8, %ymm8 + vpmuludq %ymm2, %ymm4, %ymm4 + vpaddq %ymm3, %ymm1, %ymm1 + vpmuludq -56(%rsp), %ymm5, %ymm5 + vpaddq %ymm15, %ymm10, %ymm10 + vpaddq %ymm9, %ymm8, %ymm8 + vpaddq %ymm4, %ymm1, %ymm1 + vpaddq %ymm5, %ymm1, %ymm5 + vmovdqu (%rsi), %ymm3 + vmovdqu 32(%rsi), %ymm2 + vperm2i128 $32, %ymm2, %ymm3, %ymm1 + vperm2i128 $49, %ymm2, %ymm3, %ymm2 + vpunpckldq %ymm2, %ymm1, %ymm15 + vpunpckhdq %ymm2, %ymm1, %ymm2 + vpxor %xmm4, %xmm4, %xmm4 + vpunpckldq %ymm4, %ymm15, %ymm1 + vpunpckhdq %ymm4, %ymm15, %ymm15 + vpunpckldq %ymm4, %ymm2, %ymm3 + vpunpckhdq %ymm4, %ymm2, %ymm2 + vpsllq $6, %ymm15, %ymm15 + vpsllq $12, %ymm3, %ymm3 + vpsllq $18, %ymm2, %ymm2 + vpaddq %ymm1, %ymm14, %ymm14 + vpaddq %ymm15, %ymm12, %ymm12 + vpaddq %ymm3, %ymm10, %ymm10 + vpaddq %ymm2, %ymm8, %ymm8 + vpaddq 168(%rsp), %ymm5, %ymm5 + addq $64, %rsi + vpsrlq $26, %ymm14, %ymm4 + vpsrlq $26, %ymm8, %ymm2 + vpand %ymm0, %ymm14, %ymm14 + vpand %ymm0, %ymm8, %ymm8 + vpaddq %ymm4, %ymm12, %ymm12 + vpaddq %ymm2, %ymm5, %ymm5 + vpsrlq $26, %ymm12, %ymm3 + vpsrlq $26, %ymm5, %ymm9 + vpand %ymm0, %ymm12, %ymm12 + vpand %ymm0, %ymm5, %ymm11 + vpaddq %ymm3, %ymm10, %ymm3 + vpmuludq 136(%rsp), %ymm9, %ymm9 + vpaddq %ymm9, %ymm14, %ymm14 + vpsrlq $26, %ymm3, %ymm2 + vpsrlq $26, %ymm14, %ymm4 + vpand %ymm0, %ymm3, %ymm3 + vpand %ymm0, %ymm14, %ymm5 + vpaddq %ymm2, %ymm8, %ymm2 + vpaddq %ymm4, %ymm12, %ymm4 + vpsrlq $26, %ymm2, %ymm1 + vpand 
%ymm0, %ymm2, %ymm2 + vpaddq %ymm1, %ymm11, %ymm1 + subq $64, %rdx + cmpq $63, %rdx + ja .Lpoly1305_blocks_avx2_16 +.Lpoly1305_blocks_avx2_15: + testb $64, 320(%rdi) + jne .Lpoly1305_blocks_avx2_17 + vmovdqa %ymm5, (%rdi) + vmovdqa %ymm4, 32(%rdi) + vmovdqa %ymm3, 64(%rdi) + vmovdqa %ymm2, 96(%rdi) + vmovdqa %ymm1, 128(%rdi) + jmp .Lpoly1305_blocks_avx2_8 +.Lpoly1305_blocks_avx2_17: + vpermq $245, %ymm5, %ymm0 + vpaddq %ymm0, %ymm5, %ymm5 + vpermq $245, %ymm4, %ymm0 + vpaddq %ymm0, %ymm4, %ymm4 + vpermq $245, %ymm3, %ymm0 + vpaddq %ymm0, %ymm3, %ymm3 + vpermq $245, %ymm2, %ymm0 + vpaddq %ymm0, %ymm2, %ymm2 + vpermq $245, %ymm1, %ymm0 + vpaddq %ymm0, %ymm1, %ymm1 + vpermq $170, %ymm5, %ymm0 + vpaddq %ymm0, %ymm5, %ymm5 + vpermq $170, %ymm4, %ymm0 + vpaddq %ymm0, %ymm4, %ymm4 + vpermq $170, %ymm3, %ymm0 + vpaddq %ymm0, %ymm3, %ymm3 + vpermq $170, %ymm2, %ymm0 + vpaddq %ymm0, %ymm2, %ymm2 + vpermq $170, %ymm1, %ymm0 + vpaddq %ymm0, %ymm1, %ymm1 + vmovd %xmm5, %eax + vmovd %xmm4, %edx + movl %eax, %ecx + shrl $26, %ecx + addl %edx, %ecx + movl %ecx, %edx + andl $67108863, %edx + vmovd %xmm3, %esi + shrl $26, %ecx + movl %ecx, %r11d + addl %esi, %r11d + vmovd %xmm2, %ecx + movl %r11d, %r10d + shrl $26, %r10d + addl %ecx, %r10d + movl %r10d, %r9d + andl $67108863, %r9d + vmovd %xmm1, %r8d + movl %edx, %esi + salq $26, %rsi + andl $67108863, %eax + orq %rax, %rsi + movabsq $17592186044415, %rax + andq %rax, %rsi + andl $67108863, %r11d + salq $8, %r11 + shrl $18, %edx + movl %edx, %edx + orq %r11, %rdx + movq %r9, %rcx + salq $34, %rcx + orq %rcx, %rdx + andq %rax, %rdx + shrl $26, %r10d + addl %r10d, %r8d + salq $16, %r8 + shrl $10, %r9d + movl %r9d, %r9d + orq %r9, %r8 + movabsq $4398046511103, %r10 + movq %r8, %r9 + andq %r10, %r9 + shrq $42, %r8 + leaq (%r8,%r8,4), %rcx + addq %rcx, %rsi + movq %rsi, %r8 + andq %rax, %r8 + movq %rsi, %rcx + shrq $44, %rcx + addq %rdx, %rcx + movq %rcx, %rsi + andq %rax, %rsi + shrq $44, %rcx + movq %rcx, %rdx + addq %r9, %rdx + andq %rdx, %r10 + shrq $42, %rdx + leaq (%r8,%rdx,4), %rcx + leaq (%rcx,%rdx), %rdx + movq %rdx, %rbx + andq %rax, %rbx + shrq $44, %rdx + movq %rdx, %r11 + addq %rsi, %r11 + leaq 5(%rbx), %r9 + movq %r9, %r8 + shrq $44, %r8 + addq %r11, %r8 + movabsq $-4398046511104, %rsi + addq %r10, %rsi + movq %r8, %rdx + shrq $44, %rdx + addq %rdx, %rsi + movq %rsi, %rdx + shrq $63, %rdx + subq $1, %rdx + movq %rdx, %rcx + notq %rcx + andq %rcx, %rbx + andq %rcx, %r11 + andq %r10, %rcx + andq %rax, %r9 + andq %rdx, %r9 + orq %r9, %rbx + movq %rbx, (%rdi) + andq %r8, %rax + andq %rdx, %rax + orq %rax, %r11 + movq %r11, 8(%rdi) + andq %rsi, %rdx + orq %rcx, %rdx + movq %rdx, 16(%rdi) +.Lpoly1305_blocks_avx2_8: + movq -8(%rbp), %rbx + vzeroall + movq %rbp, %rax + subq %rsp, %rax + leave + addq $8, %rax + ret +.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks; + + +.align 8 +.globl _gcry_poly1305_amd64_avx2_finish_ext +.type _gcry_poly1305_amd64_avx2_finish_ext, at function; +_gcry_poly1305_amd64_avx2_finish_ext: +.Lpoly1305_finish_ext_avx2_local: + vzeroupper + pushq %rbp + movq %rsp, %rbp + pushq %r13 + pushq %r12 + pushq %rbx + andq $-64, %rsp + subq $64, %rsp + movq %rdi, %rbx + movq %rdx, %r13 + movq %rcx, %r12 + testq %rdx, %rdx + je .Lpoly1305_finish_ext_avx2_22 + vpxor %xmm0, %xmm0, %xmm0 + vmovdqa %ymm0, (%rsp) + vmovdqa %ymm0, 32(%rsp) + movq %rsp, %rax + subq %rsp, %rsi + testb $32, %dl + je .Lpoly1305_finish_ext_avx2_23 + vmovdqu (%rsp,%rsi), %ymm0 + vmovdqa %ymm0, (%rsp) + leaq 32(%rsp), %rax 
+.Lpoly1305_finish_ext_avx2_23: + testb $16, %r13b + je .Lpoly1305_finish_ext_avx2_24 + vmovdqu (%rax,%rsi), %xmm0 + vmovdqa %xmm0, (%rax) + addq $16, %rax +.Lpoly1305_finish_ext_avx2_24: + testb $8, %r13b + je .Lpoly1305_finish_ext_avx2_25 + movq (%rax,%rsi), %rdx + movq %rdx, (%rax) + addq $8, %rax +.Lpoly1305_finish_ext_avx2_25: + testb $4, %r13b + je .Lpoly1305_finish_ext_avx2_26 + movl (%rax,%rsi), %edx + movl %edx, (%rax) + addq $4, %rax +.Lpoly1305_finish_ext_avx2_26: + testb $2, %r13b + je .Lpoly1305_finish_ext_avx2_27 + movzwl (%rax,%rsi), %edx + movw %dx, (%rax) + addq $2, %rax +.Lpoly1305_finish_ext_avx2_27: + testb $1, %r13b + je .Lpoly1305_finish_ext_avx2_28 + movzbl (%rax,%rsi), %edx + movb %dl, (%rax) +.Lpoly1305_finish_ext_avx2_28: + testb $15, %r13b + je .Lpoly1305_finish_ext_avx2_29 + movb $1, (%rsp,%r13) +.Lpoly1305_finish_ext_avx2_29: + cmpq $47, %r13 + jbe .Lpoly1305_finish_ext_avx2_30 + orq $4, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_30: + cmpq $31, %r13 + jbe .Lpoly1305_finish_ext_avx2_32 + orq $8, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_32: + cmpq $15, %r13 + jbe .Lpoly1305_finish_ext_avx2_33 + orq $16, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_33: + orq $32, 320(%rbx) +.Lpoly1305_finish_ext_avx2_31: + testb $1, 320(%rbx) + je .Lpoly1305_finish_ext_avx2_34 + cmpq $32, %r13 + ja .Lpoly1305_finish_ext_avx2_34 + cmpq $17, %r13 + sbbq %rsi, %rsi + notq %rsi + addq $2, %rsi + cmpq $17, %r13 + sbbq %rax, %rax + movq %rbx, %rdx + addq $23, %rax + leaq (%rbx,%rax,8), %rax + movl $0, %ecx +.Lpoly1305_finish_ext_avx2_37: + movl 244(%rdx), %edi + movl %edi, (%rax) + movl 252(%rdx), %edi + movl %edi, 32(%rax) + movl 260(%rdx), %edi + movl %edi, 64(%rax) + movl 268(%rdx), %edi + movl %edi, 96(%rax) + movl 276(%rdx), %edi + movl %edi, 128(%rax) + addq $1, %rcx + subq $40, %rdx + addq $8, %rax + cmpq %rcx, %rsi + ja .Lpoly1305_finish_ext_avx2_37 +.Lpoly1305_finish_ext_avx2_34: + movl $64, %edx + movq %rsp, %rsi + movq %rbx, %rdi + call .Lpoly1305_blocks_avx2_local +.Lpoly1305_finish_ext_avx2_22: + movq 320(%rbx), %r8 + testb $1, %r8b + je .Lpoly1305_finish_ext_avx2_38 + leaq -1(%r13), %rax + cmpq $47, %rax + ja .Lpoly1305_finish_ext_avx2_46 + cmpq $32, %r13 + ja .Lpoly1305_finish_ext_avx2_47 + cmpq $17, %r13 + sbbq %r9, %r9 + addq $2, %r9 + movl $0, %edi + cmpq $17, %r13 + sbbq %rax, %rax + notq %rax + andl $5, %eax + jmp .Lpoly1305_finish_ext_avx2_39 +.Lpoly1305_finish_ext_avx2_41: + movl (%rdx), %esi + movl %esi, (%rax) + movl 8(%rdx), %esi + movl %esi, 32(%rax) + movl 16(%rdx), %esi + movl %esi, 64(%rax) + movl 24(%rdx), %esi + movl %esi, 96(%rax) + movl 32(%rdx), %esi + movl %esi, 128(%rax) + addq $1, %rcx + subq $40, %rdx + addq $8, %rax + movq %rcx, %rsi + subq %rdi, %rsi + cmpq %rsi, %r9 + ja .Lpoly1305_finish_ext_avx2_41 + cmpq $3, %rcx + ja .Lpoly1305_finish_ext_avx2_42 + leaq 160(%rbx,%rcx,8), %rax +.Lpoly1305_finish_ext_avx2_43: + movl $1, (%rax) + movl $0, 32(%rax) + movl $0, 64(%rax) + movl $0, 96(%rax) + movl $0, 128(%rax) + addq $1, %rcx + addq $8, %rax + cmpq $4, %rcx + jne .Lpoly1305_finish_ext_avx2_43 +.Lpoly1305_finish_ext_avx2_42: + orq $96, %r8 + movq %r8, 320(%rbx) + vpxor %ymm0, %ymm0, %ymm0 + vmovdqa %ymm0, (%rsp) + vmovdqa %ymm0, 32(%rsp) + movl $64, %edx + movq %rsp, %rsi + movq %rbx, %rdi + call .Lpoly1305_blocks_avx2_local +.Lpoly1305_finish_ext_avx2_38: + movq 8(%rbx), %rax + movq %rax, %rdx + salq $44, %rdx + orq (%rbx), %rdx + shrq $20, %rax + movl 
$24, %edi + shlx %rdi, 16(%rbx), %rcx + orq %rcx, %rax + movl 292(%rbx), %ecx + salq $32, %rcx + movl 284(%rbx), %esi + orq %rsi, %rcx + movl 308(%rbx), %esi + salq $32, %rsi + movl 300(%rbx), %edi + orq %rdi, %rsi + addq %rcx, %rdx + adcq %rsi, %rax + movq %rdx, (%r12) + movq %rax, 8(%r12) + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %ymm0, (%rbx) + vmovdqu %ymm0, 32(%rbx) + vmovdqu %ymm0, 64(%rbx) + vmovdqu %ymm0, 96(%rbx) + vmovdqu %ymm0, 128(%rbx) + vmovdqu %ymm0, 160(%rbx) + vmovdqu %ymm0, 192(%rbx) + vmovdqu %ymm0, 224(%rbx) + jmp .Lpoly1305_finish_ext_avx2_49 +.Lpoly1305_finish_ext_avx2_46: + movl $3, %r9d + movl $1, %edi + movl $10, %eax + jmp .Lpoly1305_finish_ext_avx2_39 +.Lpoly1305_finish_ext_avx2_47: + movl $3, %r9d + movl $0, %edi + movl $10, %eax +.Lpoly1305_finish_ext_avx2_39: + leaq 164(%rbx,%rax,8), %rdx + leaq 160(%rbx,%rdi,8), %rax + movq %rdi, %rcx + jmp .Lpoly1305_finish_ext_avx2_41 +.Lpoly1305_finish_ext_avx2_49: + movq %rbp, %rax + subq %rsp, %rax + leaq -24(%rbp), %rsp + vzeroall + popq %rbx + popq %r12 + popq %r13 + popq %rbp + addq $(8*5), %rax +ret +.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext; + +#endif diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index fa3fe75..0299c43 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -54,23 +54,40 @@ #endif +/* POLY1305_USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */ +#undef POLY1305_USE_AVX2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(ENABLE_AVX2_SUPPORT) +# define POLY1305_USE_AVX2 1 +# define POLY1305_AVX2_BLOCKSIZE 64 +# define POLY1305_AVX2_STATESIZE 328 +# define POLY1305_AVX2_ALIGNMENT 32 +#endif + + /* Largest block-size used in any implementation (optimized implementations * might use block-size multiple of 16). */ -#ifdef POLY1305_USE_SSE2 +#ifdef POLY1305_USE_AVX2 +# define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE +#elif defined(POLY1305_USE_SSE2) # define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE #else # define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE #endif /* Largest state-size used in any implementation. */ -#ifdef POLY1305_USE_SSE2 +#ifdef POLY1305_USE_AVX2 +# define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE +#elif defined(POLY1305_USE_SSE2) # define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE #else # define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE #endif /* Minimum alignment for state pointer passed to implementations. 
*/ -#ifdef POLY1305_USE_SSE2 +#ifdef POLY1305_USE_AVX2 +# define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT +#elif defined(POLY1305_USE_SSE2) # define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT #else # define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT
diff --git a/cipher/poly1305.c b/cipher/poly1305.c index cd1902a..fe241c1 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -57,6 +57,25 @@ static const poly1305_ops_t poly1305_amd64_sse2_ops = { #endif +#ifdef POLY1305_USE_AVX2 + +void _gcry_poly1305_amd64_avx2_init_ext(void *state, const poly1305_key_t *key); +unsigned int _gcry_poly1305_amd64_avx2_finish_ext(void *state, const byte *m, + size_t remaining, + byte mac[16]); +unsigned int _gcry_poly1305_amd64_avx2_blocks(void *ctx, const byte *m, + size_t bytes); + +static const poly1305_ops_t poly1305_amd64_avx2_ops = { + POLY1305_AVX2_BLOCKSIZE, + _gcry_poly1305_amd64_avx2_init_ext, + _gcry_poly1305_amd64_avx2_blocks, + _gcry_poly1305_amd64_avx2_finish_ext +}; + +#endif + + #ifdef HAVE_U64_TYPEDEF /* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit @@ -616,6 +635,7 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, static int initialized; static const char *selftest_failed; poly1305_key_t keytmp; + unsigned int features = _gcry_get_hw_features (); if (!initialized) { @@ -637,6 +657,12 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, ctx->ops = &poly1305_default_ops; #endif +#ifdef POLY1305_USE_AVX2 + if (features & HWF_INTEL_AVX2) + ctx->ops = &poly1305_amd64_avx2_ops; +#endif + (void)features; + buf_cpy (keytmp.b, key, POLY1305_KEYLEN); poly1305_init (ctx, &keytmp);
diff --git a/configure.ac b/configure.ac index 4dc36d5..47a322b 100644 --- a/configure.ac +++ b/configure.ac @@ -1825,6 +1825,7 @@ case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-sse2-amd64.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-avx2-amd64.lo" ;; esac

commit 297532602ed2d881d8fdc393d1961068a143a891
Author: Jussi Kivilinna
Date:   Sun May 11 20:18:49 2014 +0300

    poly1305: add AMD64/SSE2 optimized implementation

    * cipher/Makefile.am: Add 'poly1305-sse2-amd64.S'.
    * cipher/poly1305-internal.h (POLY1305_USE_SSE2)
    (POLY1305_SSE2_BLOCKSIZE, POLY1305_SSE2_STATESIZE)
    (POLY1305_SSE2_ALIGNMENT): New.
    (POLY1305_LARGEST_BLOCKSIZE, POLY1305_LARGEST_STATESIZE)
    (POLY1305_STATE_ALIGNMENT): Use SSE2 versions when needed.
    * cipher/poly1305-sse2-amd64.S: New.
    * cipher/poly1305.c [POLY1305_USE_SSE2]
    (_gcry_poly1305_amd64_sse2_init_ext)
    (_gcry_poly1305_amd64_sse2_finish_ext)
    (_gcry_poly1305_amd64_sse2_blocks, poly1305_amd64_sse2_ops): New.
    (_gcry_poly1305_init) [POLY1305_USE_SSE2]: Use SSE2 version.
    * configure.ac [host=x86_64]: Add 'poly1305-sse2-amd64.lo'.
    --
    Add Andrew Moon's public domain SSE2 implementation of Poly1305.
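A practical detail behind the POLY1305_LARGEST_STATESIZE and POLY1305_STATE_ALIGNMENT macros above: the SSE2 and AVX2 routines access their state with aligned vector moves (movdqa/vmovdqa), so the context sets aside somewhat more than LARGEST_STATESIZE raw bytes and hands the implementation a pointer rounded up to the alignment boundary. A self-contained sketch of that rounding (illustrative only; the sizes are taken from the macros in this patch series, not copied code):

  #include <stdint.h>
  #include <stdio.h>

  #define STATESIZE 328   /* POLY1305_AVX2_STATESIZE */
  #define ALIGNMENT  32   /* POLY1305_AVX2_ALIGNMENT */

  static unsigned char raw[STATESIZE + ALIGNMENT - 1];

  int
  main (void)
  {
    uintptr_t p = (uintptr_t) raw;
    /* Round up to the next ALIGNMENT boundary; valid because
       ALIGNMENT is a power of two.  */
    void *state = (void *) ((p + ALIGNMENT - 1) & ~(uintptr_t) (ALIGNMENT - 1));

    printf ("raw buffer at %p, aligned state at %p\n", (void *) raw, state);
    return 0;
  }

The benchmark tables below show the effect of the SSE2 code itself: roughly a 2x throughput gain over the generic implementation on both Haswell and Sandy Bridge.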
Original source is available at: https://github.com/floodyberry/poly1305-opt Benchmarks on Intel i5-4570 (haswell): Old: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.844 ns/B 1130.2 MiB/s 2.70 c/B New: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.448 ns/B 2129.5 MiB/s 1.43 c/B Benchmarks on Intel i5-2450M (sandy-bridge): Old: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 1.25 ns/B 763.0 MiB/s 3.12 c/B New: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 0.605 ns/B 1575.9 MiB/s 1.51 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 4468647..a32ae89 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -72,6 +72,7 @@ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ +poly1305-sse2-amd64.S \ rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \ rmd160.c \ rsa.c \ diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index d2c6b5c..fa3fe75 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -44,15 +44,37 @@ #define POLY1305_REF_ALIGNMENT sizeof(void *) +/* POLY1305_USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */ +#undef POLY1305_USE_SSE2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define POLY1305_USE_SSE2 1 +# define POLY1305_SSE2_BLOCKSIZE 32 +# define POLY1305_SSE2_STATESIZE 248 +# define POLY1305_SSE2_ALIGNMENT 16 +#endif + + /* Largest block-size used in any implementation (optimized implementations * might use block-size multiple of 16). */ -#define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE +#ifdef POLY1305_USE_SSE2 +# define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE +#else +# define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE +#endif /* Largest state-size used in any implementation. */ -#define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE +#ifdef POLY1305_USE_SSE2 +# define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE +#else +# define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE +#endif /* Minimum alignment for state pointer passed to implementations. */ -#define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT +#ifdef POLY1305_USE_SSE2 +# define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT +#else +# define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT +#endif typedef struct poly1305_key_s diff --git a/cipher/poly1305-sse2-amd64.S b/cipher/poly1305-sse2-amd64.S new file mode 100644 index 0000000..106b119 --- /dev/null +++ b/cipher/poly1305-sse2-amd64.S @@ -0,0 +1,1035 @@ +/* poly1305-sse2-amd64.S - AMD64/SSE2 implementation of Poly1305 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/poly1305-opt + */ + +#include + +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) + +.text + + +.align 8 +.globl _gcry_poly1305_amd64_sse2_init_ext +.type _gcry_poly1305_amd64_sse2_init_ext, at function; +_gcry_poly1305_amd64_sse2_init_ext: +.Lpoly1305_init_ext_x86_local: + xor %edx, %edx + pushq %r12 + pushq %r13 + pushq %r14 + movq %rdx, %r10 + movq $-1, %rcx + testq %r10, %r10 + pxor %xmm0, %xmm0 + movq $0xfffffc0ffff, %r9 + movdqa %xmm0, (%rdi) + cmove %rcx, %r10 + movdqa %xmm0, 16(%rdi) + movq $0xffc0fffffff, %rcx + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movq 8(%rsi), %r11 + movq %r11, %r8 + movq (%rsi), %r12 + andq %r12, %rcx + shrq $44, %r12 + shlq $20, %r8 + shrq $24, %r11 + orq %r8, %r12 + movq $0xffffffc0f, %r8 + andq %r9, %r12 + andq %r8, %r11 + movl %ecx, %r8d + andl $67108863, %r8d + movq %rcx, %r9 + movl %r8d, 84(%rdi) + movq %r12, %r8 + shrq $26, %r9 + shlq $18, %r8 + orq %r8, %r9 + movq %r12, %r8 + shrq $8, %r8 + andl $67108863, %r9d + andl $67108863, %r8d + movl %r9d, 92(%rdi) + movq %r12, %r9 + movl %r8d, 100(%rdi) + movq %r11, %r8 + shrq $34, %r9 + shlq $10, %r8 + orq %r8, %r9 + movq %r11, %r8 + shrq $16, %r8 + andl $67108863, %r9d + movl %r9d, 108(%rdi) + cmpq $16, %r10 + movl %r8d, 116(%rdi) + movl 16(%rsi), %r8d + movl %r8d, 124(%rdi) + movl 20(%rsi), %r8d + movl %r8d, 132(%rdi) + movl 24(%rsi), %r8d + movl %r8d, 140(%rdi) + movl 28(%rsi), %esi + movl %esi, 148(%rdi) + jbe .Lpoly1305_init_ext_sse2_done + lea (%r11,%r11,4), %r14 + shlq $2, %r14 + lea (%r12,%r12), %rax + mulq %r14 + movq %rax, %r13 + movq %rcx, %rax + movq %rdx, %r8 + mulq %rcx + addq %rax, %r13 + lea (%rcx,%rcx), %rax + movq %r13, %r9 + adcq %rdx, %r8 + mulq %r12 + shlq $20, %r8 + movq %rax, %rsi + shrq $44, %r9 + movq %r11, %rax + orq %r9, %r8 + movq %rdx, %r9 + mulq %r14 + addq %rax, %rsi + movq %rcx, %rax + adcq %rdx, %r9 + addq %r11, %r11 + mulq %r11 + addq %rsi, %r8 + movq %rax, %r11 + movq %r12, %rax + movq %rdx, %rcx + adcq $0, %r9 + mulq %r12 + addq %rax, %r11 + movq %r8, %rsi + adcq %rdx, %rcx + shlq $20, %r9 + shrq $44, %rsi + orq %rsi, %r9 + movq $0xfffffffffff, %rsi + addq %r11, %r9 + movq %r9, %r12 + adcq $0, %rcx + andq %rsi, %r13 + shlq $22, %rcx + andq %rsi, %r8 + shrq $42, %r12 + orq %r12, %rcx + movq %rsi, %r12 + lea (%rcx,%rcx,4), %rcx + addq %rcx, %r13 + movq %rsi, %rcx + andq %r13, %rcx + shrq $44, %r13 + movq %rcx, %r14 + addq %r13, %r8 + movq $0x3ffffffffff, %r13 + andq %r8, %r12 + andq %r13, %r9 + shrq $44, %r8 + movq %r12, %r11 + addq %r8, %r9 + movq %r12, %rax + movq %r9, %r13 + movl %ecx, %r8d + shrq $26, %r14 + andl $67108863, %r8d + shlq $18, %r11 + shrq $34, %rax + orq %r11, %r14 + shlq $10, %r13 + movq %r12, %r11 + orq %r13, %rax + movq %r9, %r13 + shrq $8, %r11 + shrq $16, %r13 + andl $67108863, %r14d + andl $67108863, %r11d + andl $67108863, %eax + movl %r8d, 88(%rdi) + cmpq $64, %r10 + movl %r8d, 80(%rdi) + movl %r14d, 104(%rdi) + movl %r14d, 96(%rdi) + movl %r11d, 120(%rdi) + movl %r11d, 112(%rdi) + movl %eax, 136(%rdi) + movl %eax, 128(%rdi) + movl %r13d, 152(%rdi) + movl %r13d, 144(%rdi) + jbe .Lpoly1305_init_ext_sse2_done + lea (%r9,%r9,4), %r14 + shlq $2, %r14 + lea (%r12,%r12), %rax + mulq %r14 + movq %rax, %r8 + movq %rcx, %rax + movq %rdx, %r10 + mulq %rcx + addq %rax, %r8 + lea (%rcx,%rcx), %rax + movq %r8, %r11 + adcq %rdx, %r10 + andq %rsi, %r8 + mulq %r12 + shlq $20, %r10 + movq %rax, %r13 + 
shrq $44, %r11 + movq %r9, %rax + orq %r11, %r10 + movq %rdx, %r11 + mulq %r14 + addq %rax, %r13 + movq %rcx, %rax + adcq %rdx, %r11 + addq %r9, %r9 + mulq %r9 + addq %r13, %r10 + movq %rax, %r9 + movq %r12, %rax + movq %rdx, %rcx + adcq $0, %r11 + mulq %r12 + addq %rax, %r9 + movq %r10, %r13 + adcq %rdx, %rcx + andq %rsi, %r10 + shlq $20, %r11 + shrq $44, %r13 + orq %r13, %r11 + addq %r9, %r11 + movq %rsi, %r9 + movq %r11, %r12 + adcq $0, %rcx + shlq $22, %rcx + shrq $42, %r12 + orq %r12, %rcx + lea (%rcx,%rcx,4), %rcx + addq %rcx, %r8 + andq %r8, %r9 + shrq $44, %r8 + movl %r9d, %eax + addq %r8, %r10 + movq $0x3ffffffffff, %r8 + andq %r10, %rsi + andq %r8, %r11 + shrq $44, %r10 + movq %rsi, %r8 + addq %r10, %r11 + andl $67108863, %eax + shrq $26, %r9 + movq %r11, %r10 + shlq $18, %r8 + shlq $10, %r10 + orq %r8, %r9 + movq %rsi, %r8 + shrq $34, %rsi + andl $67108863, %r9d + shrq $8, %r8 + orq %r10, %rsi + shrq $16, %r11 + andl $67108863, %r8d + andl $67108863, %esi + movl %eax, 168(%rdi) + movl %eax, 160(%rdi) + movl %r9d, 184(%rdi) + movl %r9d, 176(%rdi) + movl %r8d, 200(%rdi) + movl %r8d, 192(%rdi) + movl %esi, 216(%rdi) + movl %esi, 208(%rdi) + movl %r11d, 232(%rdi) + movl %r11d, 224(%rdi) +.Lpoly1305_init_ext_sse2_done: + movq $0, 240(%rdi) + popq %r14 + popq %r13 + popq %r12 + ret +.size _gcry_poly1305_amd64_sse2_init_ext,.-_gcry_poly1305_amd64_sse2_init_ext; + + +.align 8 +.globl _gcry_poly1305_amd64_sse2_finish_ext +.type _gcry_poly1305_amd64_sse2_finish_ext, at function; +_gcry_poly1305_amd64_sse2_finish_ext: +.Lpoly1305_finish_ext_x86_local: + pushq %rbp + movq %rsp, %rbp + subq $64, %rsp + andq $~63, %rsp + movq %rdx, 32(%rsp) + movq %rcx, 40(%rsp) + andq %rdx, %rdx + jz .Lpoly1305_finish_x86_no_leftover + pxor %xmm0, %xmm0 + movdqa %xmm0, 0+0(%rsp) + movdqa %xmm0, 16+0(%rsp) + leaq 0(%rsp), %r8 + testq $16, %rdx + jz .Lpoly1305_finish_x86_skip16 + movdqu 0(%rsi), %xmm0 + movdqa %xmm0, 0(%r8) + addq $16, %rsi + addq $16, %r8 +.Lpoly1305_finish_x86_skip16: + testq $8, %rdx + jz .Lpoly1305_finish_x86_skip8 + movq 0(%rsi), %rax + movq %rax, 0(%r8) + addq $8, %rsi + addq $8, %r8 +.Lpoly1305_finish_x86_skip8: + testq $4, %rdx + jz .Lpoly1305_finish_x86_skip4 + movl 0(%rsi), %eax + movl %eax, 0(%r8) + addq $4, %rsi + addq $4, %r8 +.Lpoly1305_finish_x86_skip4: + testq $2, %rdx + jz .Lpoly1305_finish_x86_skip2 + movw 0(%rsi), %ax + movw %ax, 0(%r8) + addq $2, %rsi + addq $2, %r8 +.Lpoly1305_finish_x86_skip2: + testq $1, %rdx + jz .Lpoly1305_finish_x86_skip1 + movb 0(%rsi), %al + movb %al, 0(%r8) + addq $1, %r8 +.Lpoly1305_finish_x86_skip1: + cmpq $16, %rdx + je .Lpoly1305_finish_x86_is16 + movb $1, 0(%r8) +.Lpoly1305_finish_x86_is16: + movq $4, %rax + jae .Lpoly1305_finish_x86_16andover + movq $8, %rax +.Lpoly1305_finish_x86_16andover: + orq %rax, 240(%rdi) + leaq 0(%rsp), %rsi + movq $32, %rdx + callq .Lpoly1305_blocks_x86_local +.Lpoly1305_finish_x86_no_leftover: + testq $1, 240(%rdi) + jz .Lpoly1305_finish_x86_not_started + movq 32(%rsp), %rdx + andq %rdx, %rdx + jz .Lpoly1305_finish_x86_r2r + cmpq $16, %rdx + jg .Lpoly1305_finish_x86_r2r + xorl %r10d, %r10d + movl 84(%rdi), %eax + movl 92(%rdi), %ecx + movl 100(%rdi), %edx + movl 108(%rdi), %r8d + movl 116(%rdi), %r9d + movl %eax, 80(%rdi) + movl $1, 8+80(%rdi) + movl %ecx, 96(%rdi) + movl %r10d, 8+96(%rdi) + movl %edx, 112(%rdi) + movl %r10d, 8+112(%rdi) + movl %r8d, 128(%rdi) + movl %r10d, 8+128(%rdi) + movl %r9d, 144(%rdi) + movl %r10d, 8+144(%rdi) + jmp .Lpoly1305_finish_x86_combine +.Lpoly1305_finish_x86_r2r: + movl 84(%rdi), 
%eax + movl 92(%rdi), %ecx + movl 100(%rdi), %edx + movl 108(%rdi), %r8d + movl 116(%rdi), %r9d + movl %eax, 8+80(%rdi) + movl %ecx, 8+96(%rdi) + movl %edx, 8+112(%rdi) + movl %r8d, 8+128(%rdi) + movl %r9d, 8+144(%rdi) +.Lpoly1305_finish_x86_combine: + xorq %rsi, %rsi + movq $32, %rdx + callq .Lpoly1305_blocks_x86_local +.Lpoly1305_finish_x86_not_started: + movq 0(%rdi), %r8 + movq 8(%rdi), %r9 + movq %r9, %r10 + movq 16(%rdi), %r11 + shlq $44, %r9 + shrq $20, %r10 + shlq $24, %r11 + orq %r9, %r8 + orq %r11, %r10 + pxor %xmm0, %xmm0 + movl 124(%rdi), %eax + movl 132(%rdi), %ecx + movl 140(%rdi), %edx + movl 148(%rdi), %esi + movq 40(%rsp), %r11 + shlq $32, %rcx + shlq $32, %rsi + orq %rcx, %rax + orq %rsi, %rdx + addq %r8, %rax + adcq %r10, %rdx + movq %rax, 0(%r11) + movq %rdx, 8(%r11) + movq %rbp, %rax + subq %rsp, %rax + movq %rbp, %rsp + movdqa %xmm0, 0(%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movdqa %xmm0, 80(%rdi) + movdqa %xmm0, 96(%rdi) + movdqa %xmm0, 112(%rdi) + movdqa %xmm0, 128(%rdi) + movdqa %xmm0, 144(%rdi) + movdqa %xmm0, 160(%rdi) + movdqa %xmm0, 176(%rdi) + movdqa %xmm0, 192(%rdi) + movdqa %xmm0, 208(%rdi) + movdqa %xmm0, 224(%rdi) + popq %rbp + addq $8, %rax + ret +.size _gcry_poly1305_amd64_sse2_finish_ext,.-_gcry_poly1305_amd64_sse2_finish_ext; + + +.align 8 +.globl _gcry_poly1305_amd64_sse2_blocks +.type _gcry_poly1305_amd64_sse2_blocks, at function; +_gcry_poly1305_amd64_sse2_blocks: +.Lpoly1305_blocks_x86_local: + pushq %rbp + movq %rsp, %rbp + pushq %rbx + andq $-64, %rsp + subq $328, %rsp + movq 240(%rdi), %rax + movl $(1<<24), %r8d + movl $((1<<26)-1), %r9d + movd %r8, %xmm0 + movd %r9, %xmm5 + pshufd $0x44, %xmm0, %xmm0 + pshufd $0x44, %xmm5, %xmm5 + testb $4, %al + je .Lpoly1305_blocks_x86_3 + psrldq $8, %xmm0 +.Lpoly1305_blocks_x86_3: + testb $8, %al + je .Lpoly1305_blocks_x86_4 + pxor %xmm0, %xmm0 +.Lpoly1305_blocks_x86_4: + movdqa %xmm0, 168(%rsp) + testb $1, %al + jne .Lpoly1305_blocks_x86_5 + movq 16(%rsi), %xmm0 + movdqa %xmm5, %xmm7 + movdqa %xmm5, %xmm10 + movq (%rsi), %xmm6 + orq $1, %rax + subq $32, %rdx + movq 8(%rsi), %xmm1 + punpcklqdq %xmm0, %xmm6 + movq 24(%rsi), %xmm0 + pand %xmm6, %xmm7 + movdqa %xmm6, %xmm9 + psrlq $52, %xmm6 + addq $32, %rsi + punpcklqdq %xmm0, %xmm1 + movdqa %xmm1, %xmm0 + psrlq $26, %xmm9 + psllq $12, %xmm0 + movq %rax, 240(%rdi) + pand %xmm5, %xmm9 + por %xmm0, %xmm6 + psrlq $40, %xmm1 + pand %xmm6, %xmm10 + por 168(%rsp), %xmm1 + psrlq $26, %xmm6 + pand %xmm5, %xmm6 +.Lpoly1305_blocks_x86_6: + movdqa 80(%rdi), %xmm13 + cmpq $63, %rdx + movl $(5), %r8d + movd %r8, %xmm14 + pshufd $0x44, %xmm14, %xmm14 + movdqa 96(%rdi), %xmm15 + movdqa %xmm13, -8(%rsp) + movdqa 112(%rdi), %xmm0 + movdqa %xmm14, 136(%rsp) + movdqa 128(%rdi), %xmm3 + movdqa %xmm15, 312(%rsp) + pmuludq %xmm14, %xmm15 + movdqa 144(%rdi), %xmm13 + movdqa %xmm0, 232(%rsp) + pmuludq %xmm14, %xmm0 + movdqa %xmm3, 152(%rsp) + pmuludq %xmm14, %xmm3 + movdqa %xmm13, 56(%rsp) + pmuludq %xmm14, %xmm13 + movdqa %xmm15, 40(%rsp) + movdqa %xmm0, -24(%rsp) + movdqa %xmm3, -40(%rsp) + movdqa %xmm13, -56(%rsp) + jbe .Lpoly1305_blocks_x86_7 + movdqa 192(%rdi), %xmm15 + leaq 32(%rsi), %rax + movq %rdx, %rcx + movdqa 176(%rdi), %xmm14 + movdqa %xmm15, %xmm2 + movdqa 208(%rdi), %xmm0 + movdqa %xmm15, 216(%rsp) + movdqa %xmm14, 296(%rsp) + movdqa 224(%rdi), %xmm3 + pmuludq 136(%rsp), %xmm14 + movdqa -24(%rsp), %xmm13 + movdqa %xmm14, 8(%rsp) + pmuludq 136(%rsp), %xmm2 + movdqa -40(%rsp), %xmm14 + movdqa %xmm0, 
120(%rsp) + pmuludq 136(%rsp), %xmm0 + movdqa %xmm3, 24(%rsp) + movdqa 160(%rdi), %xmm12 + movdqa %xmm0, %xmm8 + movdqa -56(%rsp), %xmm15 + movdqa %xmm13, 88(%rsp) + pmuludq 136(%rsp), %xmm3 + movdqa %xmm2, 104(%rsp) + movdqa %xmm0, %xmm13 + movdqa -8(%rsp), %xmm11 + movdqa %xmm3, 280(%rsp) + movdqa %xmm2, %xmm3 + movdqa %xmm0, 200(%rsp) + movdqa %xmm14, 184(%rsp) + movdqa %xmm15, 264(%rsp) + jmp .Lpoly1305_blocks_x86_8 +.p2align 6,,63 +.Lpoly1305_blocks_x86_13: + movdqa 200(%rsp), %xmm13 + movdqa %xmm3, %xmm6 + movdqa 200(%rsp), %xmm8 + movdqa 104(%rsp), %xmm3 +.Lpoly1305_blocks_x86_8: + movdqa 8(%rsp), %xmm4 + pmuludq %xmm6, %xmm3 + subq $64, %rcx + pmuludq %xmm10, %xmm8 + movdqa 104(%rsp), %xmm2 + movdqa 200(%rsp), %xmm0 + pmuludq %xmm1, %xmm4 + movdqa 280(%rsp), %xmm15 + pmuludq %xmm6, %xmm13 + movdqa 280(%rsp), %xmm14 + pmuludq %xmm1, %xmm0 + paddq %xmm3, %xmm4 + pmuludq %xmm1, %xmm2 + movdqa 280(%rsp), %xmm3 + paddq %xmm8, %xmm4 + pmuludq %xmm9, %xmm15 + movdqa 280(%rsp), %xmm8 + pmuludq %xmm10, %xmm14 + pmuludq %xmm6, %xmm8 + paddq %xmm13, %xmm2 + movdqa %xmm6, %xmm13 + pmuludq %xmm1, %xmm3 + paddq %xmm15, %xmm4 + movdqa 296(%rsp), %xmm15 + pmuludq %xmm12, %xmm13 + paddq %xmm14, %xmm2 + movdqa %xmm7, %xmm14 + paddq %xmm8, %xmm0 + pmuludq %xmm12, %xmm14 + movdqa %xmm9, %xmm8 + pmuludq 296(%rsp), %xmm6 + pmuludq %xmm12, %xmm8 + movdqa %xmm6, 248(%rsp) + pmuludq %xmm10, %xmm15 + movq -16(%rax), %xmm6 + paddq %xmm13, %xmm3 + movdqa %xmm10, %xmm13 + paddq %xmm14, %xmm4 + movq -8(%rax), %xmm14 + paddq %xmm8, %xmm2 + movq -32(%rax), %xmm8 + pmuludq %xmm12, %xmm13 + paddq %xmm15, %xmm3 + pmuludq %xmm12, %xmm1 + movdqa 216(%rsp), %xmm15 + pmuludq 216(%rsp), %xmm10 + punpcklqdq %xmm6, %xmm8 + movq -24(%rax), %xmm6 + pmuludq %xmm9, %xmm15 + paddq %xmm13, %xmm0 + movdqa 296(%rsp), %xmm13 + paddq 248(%rsp), %xmm1 + punpcklqdq %xmm14, %xmm6 + movdqa 296(%rsp), %xmm14 + pmuludq %xmm9, %xmm13 + pmuludq 120(%rsp), %xmm9 + movdqa %xmm15, 72(%rsp) + paddq %xmm10, %xmm1 + movdqa 216(%rsp), %xmm15 + pmuludq %xmm7, %xmm14 + movdqa %xmm6, %xmm10 + paddq %xmm9, %xmm1 + pmuludq %xmm7, %xmm15 + paddq %xmm13, %xmm0 + paddq 72(%rsp), %xmm3 + movdqa 120(%rsp), %xmm13 + psllq $12, %xmm10 + paddq %xmm14, %xmm2 + movdqa %xmm5, %xmm14 + pand %xmm8, %xmm14 + pmuludq %xmm7, %xmm13 + paddq %xmm15, %xmm0 + movdqa %xmm14, 248(%rsp) + movdqa %xmm8, %xmm14 + psrlq $52, %xmm8 + movdqu (%rax), %xmm9 + por %xmm10, %xmm8 + pmuludq 24(%rsp), %xmm7 + movdqu 16(%rax), %xmm10 + paddq %xmm13, %xmm3 + pxor %xmm13, %xmm13 + movdqa %xmm9, %xmm15 + paddq %xmm7, %xmm1 + movdqa %xmm6, %xmm7 + movdqa %xmm10, -72(%rsp) + punpckldq %xmm10, %xmm15 + movdqa %xmm15, %xmm10 + punpckldq %xmm13, %xmm10 + punpckhdq -72(%rsp), %xmm9 + psrlq $40, %xmm6 + movdqa %xmm10, 72(%rsp) + movdqa %xmm9, %xmm10 + punpckhdq %xmm13, %xmm9 + psllq $18, %xmm9 + paddq 72(%rsp), %xmm4 + addq $64, %rax + paddq %xmm9, %xmm3 + movdqa 40(%rsp), %xmm9 + cmpq $63, %rcx + punpckhdq %xmm13, %xmm15 + psllq $6, %xmm15 + punpckldq %xmm13, %xmm10 + paddq %xmm15, %xmm2 + psllq $12, %xmm10 + por 168(%rsp), %xmm6 + pmuludq %xmm6, %xmm9 + movdqa 88(%rsp), %xmm15 + paddq %xmm10, %xmm0 + movdqa 88(%rsp), %xmm13 + psrlq $14, %xmm7 + pand %xmm5, %xmm8 + movdqa 184(%rsp), %xmm10 + pand %xmm5, %xmm7 + pmuludq %xmm7, %xmm15 + paddq %xmm9, %xmm4 + pmuludq %xmm6, %xmm13 + movdqa 184(%rsp), %xmm9 + paddq 168(%rsp), %xmm1 + pmuludq %xmm7, %xmm10 + pmuludq %xmm6, %xmm9 + paddq %xmm15, %xmm4 + movdqa 184(%rsp), %xmm15 + paddq %xmm13, %xmm2 + psrlq $26, %xmm14 + movdqa 264(%rsp), %xmm13 + paddq 
%xmm10, %xmm2 + pmuludq %xmm8, %xmm15 + pand %xmm5, %xmm14 + paddq %xmm9, %xmm0 + pmuludq %xmm6, %xmm13 + movdqa 264(%rsp), %xmm9 + movdqa 264(%rsp), %xmm10 + pmuludq %xmm11, %xmm6 + pmuludq %xmm8, %xmm9 + paddq %xmm15, %xmm4 + movdqa 264(%rsp), %xmm15 + pmuludq %xmm14, %xmm10 + paddq %xmm13, %xmm3 + movdqa %xmm7, %xmm13 + pmuludq %xmm7, %xmm15 + paddq %xmm6, %xmm1 + movdqa 312(%rsp), %xmm6 + paddq %xmm9, %xmm2 + pmuludq %xmm11, %xmm13 + movdqa 248(%rsp), %xmm9 + paddq %xmm10, %xmm4 + pmuludq %xmm8, %xmm6 + pmuludq 312(%rsp), %xmm7 + paddq %xmm15, %xmm0 + movdqa %xmm9, %xmm10 + movdqa %xmm14, %xmm15 + pmuludq %xmm11, %xmm10 + paddq %xmm13, %xmm3 + movdqa %xmm8, %xmm13 + pmuludq %xmm11, %xmm13 + paddq %xmm6, %xmm3 + paddq %xmm7, %xmm1 + movdqa 232(%rsp), %xmm6 + pmuludq %xmm11, %xmm15 + pmuludq 232(%rsp), %xmm8 + paddq %xmm10, %xmm4 + paddq %xmm8, %xmm1 + movdqa 312(%rsp), %xmm10 + paddq %xmm13, %xmm0 + pmuludq %xmm14, %xmm6 + movdqa 312(%rsp), %xmm13 + pmuludq %xmm9, %xmm10 + paddq %xmm15, %xmm2 + movdqa 232(%rsp), %xmm7 + pmuludq %xmm14, %xmm13 + pmuludq 152(%rsp), %xmm14 + paddq %xmm14, %xmm1 + pmuludq %xmm9, %xmm7 + paddq %xmm6, %xmm3 + paddq %xmm10, %xmm2 + movdqa 152(%rsp), %xmm10 + paddq %xmm13, %xmm0 + pmuludq %xmm9, %xmm10 + paddq %xmm7, %xmm0 + movdqa %xmm4, %xmm7 + psrlq $26, %xmm7 + pmuludq 56(%rsp), %xmm9 + pand %xmm5, %xmm4 + paddq %xmm7, %xmm2 + paddq %xmm9, %xmm1 + paddq %xmm10, %xmm3 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm9 + movdqa %xmm3, %xmm6 + psrlq $26, %xmm7 + pand %xmm5, %xmm3 + psrlq $26, %xmm6 + paddq %xmm7, %xmm0 + pand %xmm5, %xmm9 + paddq %xmm6, %xmm1 + movdqa %xmm0, %xmm10 + movdqa %xmm1, %xmm6 + pand %xmm5, %xmm10 + pand %xmm5, %xmm1 + psrlq $26, %xmm6 + pmuludq 136(%rsp), %xmm6 + paddq %xmm6, %xmm4 + movdqa %xmm0, %xmm6 + psrlq $26, %xmm6 + movdqa %xmm4, %xmm2 + movdqa %xmm4, %xmm7 + paddq %xmm6, %xmm3 + psrlq $26, %xmm2 + pand %xmm5, %xmm7 + movdqa %xmm3, %xmm0 + paddq %xmm2, %xmm9 + pand %xmm5, %xmm3 + psrlq $26, %xmm0 + paddq %xmm0, %xmm1 + ja .Lpoly1305_blocks_x86_13 + leaq -64(%rdx), %rax + movdqa %xmm3, %xmm6 + andl $63, %edx + andq $-64, %rax + leaq 64(%rsi,%rax), %rsi +.Lpoly1305_blocks_x86_7: + cmpq $31, %rdx + jbe .Lpoly1305_blocks_x86_9 + movdqa -24(%rsp), %xmm13 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm3 + movdqa 40(%rsp), %xmm11 + movdqa %xmm1, %xmm12 + testq %rsi, %rsi + movdqa -40(%rsp), %xmm2 + pmuludq %xmm13, %xmm0 + movdqa %xmm1, %xmm8 + pmuludq %xmm1, %xmm11 + movdqa %xmm10, %xmm4 + movdqa %xmm1, %xmm14 + pmuludq %xmm2, %xmm3 + movdqa %xmm6, %xmm15 + pmuludq %xmm1, %xmm13 + movdqa %xmm7, %xmm1 + pmuludq %xmm2, %xmm12 + paddq %xmm0, %xmm11 + movdqa -56(%rsp), %xmm0 + pmuludq %xmm10, %xmm2 + paddq %xmm3, %xmm13 + pmuludq %xmm0, %xmm4 + movdqa %xmm9, %xmm3 + pmuludq %xmm0, %xmm3 + paddq %xmm2, %xmm11 + pmuludq %xmm0, %xmm8 + movdqa %xmm6, %xmm2 + pmuludq %xmm0, %xmm2 + movdqa -8(%rsp), %xmm0 + paddq %xmm4, %xmm13 + movdqa 312(%rsp), %xmm4 + paddq %xmm3, %xmm11 + pmuludq 312(%rsp), %xmm6 + movdqa 312(%rsp), %xmm3 + pmuludq %xmm0, %xmm1 + paddq %xmm2, %xmm12 + pmuludq %xmm0, %xmm15 + movdqa %xmm9, %xmm2 + pmuludq %xmm0, %xmm2 + pmuludq %xmm7, %xmm3 + paddq %xmm1, %xmm11 + movdqa 232(%rsp), %xmm1 + pmuludq %xmm0, %xmm14 + paddq %xmm15, %xmm8 + pmuludq %xmm10, %xmm0 + paddq %xmm2, %xmm13 + movdqa 312(%rsp), %xmm2 + pmuludq %xmm10, %xmm4 + paddq %xmm3, %xmm13 + movdqa 152(%rsp), %xmm3 + pmuludq %xmm9, %xmm2 + paddq %xmm6, %xmm14 + pmuludq 232(%rsp), %xmm10 + paddq %xmm0, %xmm12 + pmuludq %xmm9, %xmm1 + paddq %xmm10, %xmm14 + movdqa 232(%rsp), 
%xmm0 + pmuludq %xmm7, %xmm3 + paddq %xmm4, %xmm8 + pmuludq 152(%rsp), %xmm9 + paddq %xmm2, %xmm12 + paddq %xmm9, %xmm14 + pmuludq %xmm7, %xmm0 + paddq %xmm1, %xmm8 + pmuludq 56(%rsp), %xmm7 + paddq %xmm3, %xmm8 + paddq %xmm7, %xmm14 + paddq %xmm0, %xmm12 + je .Lpoly1305_blocks_x86_10 + movdqu (%rsi), %xmm1 + pxor %xmm0, %xmm0 + paddq 168(%rsp), %xmm14 + movdqu 16(%rsi), %xmm2 + movdqa %xmm1, %xmm3 + punpckldq %xmm2, %xmm3 + punpckhdq %xmm2, %xmm1 + movdqa %xmm3, %xmm4 + movdqa %xmm1, %xmm2 + punpckldq %xmm0, %xmm4 + punpckhdq %xmm0, %xmm3 + punpckhdq %xmm0, %xmm1 + punpckldq %xmm0, %xmm2 + movdqa %xmm2, %xmm0 + psllq $6, %xmm3 + paddq %xmm4, %xmm11 + psllq $12, %xmm0 + paddq %xmm3, %xmm13 + psllq $18, %xmm1 + paddq %xmm0, %xmm12 + paddq %xmm1, %xmm8 +.Lpoly1305_blocks_x86_10: + movdqa %xmm11, %xmm9 + movdqa %xmm8, %xmm1 + movdqa %xmm11, %xmm7 + psrlq $26, %xmm9 + movdqa %xmm8, %xmm6 + pand %xmm5, %xmm7 + paddq %xmm13, %xmm9 + psrlq $26, %xmm1 + pand %xmm5, %xmm6 + movdqa %xmm9, %xmm10 + paddq %xmm14, %xmm1 + pand %xmm5, %xmm9 + psrlq $26, %xmm10 + movdqa %xmm1, %xmm0 + pand %xmm5, %xmm1 + paddq %xmm12, %xmm10 + psrlq $26, %xmm0 + pmuludq 136(%rsp), %xmm0 + movdqa %xmm10, %xmm2 + paddq %xmm0, %xmm7 + psrlq $26, %xmm2 + movdqa %xmm7, %xmm0 + pand %xmm5, %xmm10 + paddq %xmm2, %xmm6 + psrlq $26, %xmm0 + pand %xmm5, %xmm7 + movdqa %xmm6, %xmm2 + paddq %xmm0, %xmm9 + pand %xmm5, %xmm6 + psrlq $26, %xmm2 + paddq %xmm2, %xmm1 +.Lpoly1305_blocks_x86_9: + testq %rsi, %rsi + je .Lpoly1305_blocks_x86_11 + movdqa %xmm7, 0(%rdi) + movdqa %xmm9, 16(%rdi) + movdqa %xmm10, 32(%rdi) + movdqa %xmm6, 48(%rdi) + movdqa %xmm1, 64(%rdi) + movq -8(%rbp), %rbx + leave + ret +.Lpoly1305_blocks_x86_5: + movdqa 0(%rdi), %xmm7 + movdqa 16(%rdi), %xmm9 + movdqa 32(%rdi), %xmm10 + movdqa 48(%rdi), %xmm6 + movdqa 64(%rdi), %xmm1 + jmp .Lpoly1305_blocks_x86_6 +.Lpoly1305_blocks_x86_11: + movdqa %xmm7, %xmm0 + movdqa %xmm9, %xmm2 + movdqa %xmm6, %xmm3 + psrldq $8, %xmm0 + movabsq $4398046511103, %rbx + paddq %xmm0, %xmm7 + psrldq $8, %xmm2 + movdqa %xmm10, %xmm0 + movd %xmm7, %edx + paddq %xmm2, %xmm9 + psrldq $8, %xmm0 + movl %edx, %ecx + movd %xmm9, %eax + paddq %xmm0, %xmm10 + shrl $26, %ecx + psrldq $8, %xmm3 + movdqa %xmm1, %xmm0 + addl %ecx, %eax + movd %xmm10, %ecx + paddq %xmm3, %xmm6 + movl %eax, %r9d + shrl $26, %eax + psrldq $8, %xmm0 + addl %ecx, %eax + movd %xmm6, %ecx + paddq %xmm0, %xmm1 + movl %eax, %esi + andl $67108863, %r9d + movd %xmm1, %r10d + shrl $26, %esi + andl $67108863, %eax + andl $67108863, %edx + addl %ecx, %esi + salq $8, %rax + movl %r9d, %ecx + shrl $18, %r9d + movl %esi, %r8d + shrl $26, %esi + andl $67108863, %r8d + addl %r10d, %esi + orq %r9, %rax + salq $16, %rsi + movq %r8, %r9 + shrl $10, %r8d + salq $26, %rcx + orq %r8, %rsi + salq $34, %r9 + orq %rdx, %rcx + movq %rsi, %r8 + shrq $42, %rsi + movabsq $17592186044415, %rdx + orq %r9, %rax + andq %rbx, %r8 + leaq (%rsi,%rsi,4), %rsi + andq %rdx, %rcx + andq %rdx, %rax + movabsq $-4398046511104, %r10 + addq %rsi, %rcx + movq %rcx, %rsi + shrq $44, %rcx + addq %rcx, %rax + andq %rdx, %rsi + movq %rax, %rcx + shrq $44, %rax + addq %r8, %rax + andq %rdx, %rcx + andq %rax, %rbx + shrq $42, %rax + leaq (%rsi,%rax,4), %rsi + addq %rbx, %r10 + addq %rax, %rsi + movq %rsi, %r8 + shrq $44, %rsi + andq %rdx, %r8 + addq %rcx, %rsi + leaq 5(%r8), %r9 + movq %r9, %r11 + andq %rdx, %r9 + shrq $44, %r11 + addq %rsi, %r11 + movq %r11, %rax + andq %r11, %rdx + shrq $44, %rax + addq %rax, %r10 + movq %r10, %rax + shrq $63, %rax + subq $1, %rax + movq 
%rax, %rcx + andq %rax, %r9 + andq %rax, %rdx + notq %rcx + andq %r10, %rax + andq %rcx, %r8 + andq %rcx, %rsi + andq %rbx, %rcx + orq %r9, %r8 + orq %rdx, %rsi + orq %rax, %rcx + movq %r8, 0(%rdi) + movq %rsi, 8(%rdi) + movq %rcx, 16(%rdi) + movq -8(%rbp), %rbx + movq %rbp, %rax + subq %rsp, %rax + pxor %xmm15, %xmm15 + pxor %xmm7, %xmm7 + pxor %xmm14, %xmm14 + pxor %xmm6, %xmm6 + pxor %xmm13, %xmm13 + pxor %xmm5, %xmm5 + pxor %xmm12, %xmm12 + pxor %xmm4, %xmm4 + leave + addq $8, %rax + pxor %xmm11, %xmm11 + pxor %xmm3, %xmm3 + pxor %xmm10, %xmm10 + pxor %xmm2, %xmm2 + pxor %xmm9, %xmm9 + pxor %xmm1, %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm0, %xmm0 + ret +.size _gcry_poly1305_amd64_sse2_blocks,.-_gcry_poly1305_amd64_sse2_blocks; + +#endif diff --git a/cipher/poly1305.c b/cipher/poly1305.c index 472ae42..cd1902a 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -38,6 +38,25 @@ static const char *selftest (void); +#ifdef POLY1305_USE_SSE2 + +void _gcry_poly1305_amd64_sse2_init_ext(void *state, const poly1305_key_t *key); +unsigned int _gcry_poly1305_amd64_sse2_finish_ext(void *state, const byte *m, + size_t remaining, + byte mac[16]); +unsigned int _gcry_poly1305_amd64_sse2_blocks(void *ctx, const byte *m, + size_t bytes); + +static const poly1305_ops_t poly1305_amd64_sse2_ops = { + POLY1305_SSE2_BLOCKSIZE, + _gcry_poly1305_amd64_sse2_init_ext, + _gcry_poly1305_amd64_sse2_blocks, + _gcry_poly1305_amd64_sse2_finish_ext +}; + +#endif + + #ifdef HAVE_U64_TYPEDEF /* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit @@ -612,7 +631,11 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; +#ifdef POLY1305_USE_SSE2 + ctx->ops = &poly1305_amd64_sse2_ops; +#else ctx->ops = &poly1305_default_ops; +#endif buf_cpy (keytmp.b, key, POLY1305_KEYLEN); poly1305_init (ctx, &keytmp); diff --git a/configure.ac b/configure.ac index 3a0fd52..4dc36d5 100644 --- a/configure.ac +++ b/configure.ac @@ -1821,6 +1821,13 @@ if test "$found" = "1" ; then esac fi +case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-sse2-amd64.lo" + ;; +esac + LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 3 +- ...hacha20-ssse3-amd64.S => chacha20-sse2-amd64.S} | 226 +++-- cipher/chacha20.c | 18 + cipher/poly1305-avx2-amd64.S | 954 ++++++++++++++++++ cipher/poly1305-internal.h | 45 +- cipher/poly1305-sse2-amd64.S | 1035 ++++++++++++++++++++ cipher/poly1305.c | 49 + configure.ac | 9 + doc/gcrypt.texi | 42 +- 9 files changed, 2271 insertions(+), 110 deletions(-) copy cipher/{chacha20-ssse3-amd64.S => chacha20-sse2-amd64.S} (78%) create mode 100644 cipher/poly1305-avx2-amd64.S create mode 100644 cipher/poly1305-sse2-amd64.S hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From smueller at chronox.de Tue May 20 02:23:26 2014 From: smueller at chronox.de (Stephan Mueller) Date: Tue, 20 May 2014 02:23:26 +0200 Subject: [PATCH v3 0/7] SP800-90A Deterministic Random Bit Generator In-Reply-To: <1573675.V5Uuq2JQxy@myon.chronox.de> References: <1573675.V5Uuq2JQxy@myon.chronox.de> Message-ID: <1441178.HmOpXN0I6q@myon.chronox.de> Am 
Mittwoch, 19. März 2014, 08:25:43 schrieb Stephan Mueller:

Hi,

> Hi,
>
> the following set of patches against the current GIT development tree of
> libgcrypt implements the SP800-90A DRBG and integrates it with libgcrypt.

May I ask for the inclusion of the DRBG code into libgcrypt or for
suggestions on how to improve the code?

Please note that I have seen the patches offered here appear in OpenSUSE
beta code. As this code adds two more control values, I would like to ask
for inclusion to prevent breaking binary compatibility. Of course, any
comments or change requests are highly welcome.

Please note that the code was subject to the following tests:

- CAVS testing

- stress testing of the kernel version of this code showing no breakage
  or memleaks

- standard testing using a test application

I am not aware of pending changes or change requests, although a preview
of an update to SP800-90A is available which I need to check for technical
changes (beyond dropping the Dual EC DRBG).

If needed, I can submit a full patch set to ensure nobody gets lost in the
tree of patches submitted in this thread.

As you see with the patch set, the ANSI X9.31 DRNG will be put out of
business in line with SP800-131A.

Ciao
Stephan
--
| Cui bono? |

From kmcmarti at redhat.com  Mon May 19 23:20:18 2014
From: kmcmarti at redhat.com (Kyle McMartin)
Date: Mon, 19 May 2014 17:20:18 -0400
Subject: [PATCH] libgcrypt: disable some ARM assembly when building __PIC__
Message-ID: <20140519212018.GF31851@redacted.bos.redhat.com>

Hi,

We're currently getting DT_TEXTREL set for libgcrypt.so.20... after some
investigation, it seems the camellia, cast5, and rijndael arm assembly
which was added since libgcrypt v1.5 is responsible. I'll spend some
cycles attempting to fix this when __PIC__, but for the moment, we might
as well disable them when building -fPIC.

https://bugzilla.redhat.com/show_bug.cgi?id=1069792

regards, Kyle

diff --git a/cipher/camellia-arm.S b/cipher/camellia-arm.S
index 255a246..57ed361 100644
--- a/cipher/camellia-arm.S
+++ b/cipher/camellia-arm.S
@@ -20,7 +20,7 @@

 #include

-#if defined(__ARMEL__)
+#if defined(__ARMEL__) && !defined(__PIC__)
 #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS

 .text
@@ -609,5 +609,5 @@ _gcry_camellia_arm_decrypt_block:
 .long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7
 .long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e

-#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
 #endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/camellia.h b/cipher/camellia.h
index d0e3c18..c8f9556 100644
--- a/cipher/camellia.h
+++ b/cipher/camellia.h
@@ -32,7 +32,7 @@
 #include
 /* USE_ARM_ASM indicates whether to use ARM assembly code. */
 # undef USE_ARM_ASM
-# if defined(__ARMEL__)
+# if defined(__ARMEL__) && !defined(__PIC__)
 # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
 # define USE_ARM_ASM 1
 # endif
diff --git a/cipher/cast5-arm.S b/cipher/cast5-arm.S
index 57c89b7..7707ba0 100644
--- a/cipher/cast5-arm.S
+++ b/cipher/cast5-arm.S
@@ -20,7 +20,7 @@

 #include

-#if defined(__ARMEL__)
+#if defined(__ARMEL__) && !defined(__PIC__)
 #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS

 .text
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 115e1e6..bf4123e 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -54,7 +54,7 @@

 /* USE_ARM_ASM indicates whether to use ARM assembly code.
*/ #undef USE_ARM_ASM -#if defined(__ARMEL__) +#if defined(__ARMEL__) && !defined(__PIC__) # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS # define USE_ARM_ASM 1 # endif diff --git a/cipher/rijndael-arm.S b/cipher/rijndael-arm.S index cea8c51..f06be60 100644 --- a/cipher/rijndael-arm.S +++ b/cipher/rijndael-arm.S @@ -20,7 +20,7 @@ #include -#if defined(__ARMEL__) +#if defined(__ARMEL__) && !defined(__PIC__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text @@ -849,5 +849,5 @@ _gcry_aes_arm_decrypt_block: .long 0x6184cb7b, 0x00000055, 0x70b632d5, 0x00000021 .long 0x745c6c48, 0x0000000c, 0x4257b8d0, 0x0000007d -#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ +#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/ #endif /*__ARMEL__ */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 8019f0a..1ffc559 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -69,7 +69,7 @@ /* USE_ARM_ASM indicates whether to use ARM assembly code. */ #undef USE_ARM_ASM -#if defined(__ARMEL__) +#if defined(__ARMEL__) && !defined(__PIC__) # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS # define USE_ARM_ASM 1 # endif From jussi.kivilinna at iki.fi Tue May 20 19:55:56 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 20 May 2014 20:55:56 +0300 Subject: [PATCH] libgcrypt: disable some ARM assembly when building __PIC__ In-Reply-To: <20140519212018.GF31851@redacted.bos.redhat.com> References: <20140519212018.GF31851@redacted.bos.redhat.com> Message-ID: <537B972C.1070103@iki.fi> On 20.05.2014 00:20, Kyle McMartin wrote: > Hi, > > We're currently getting DT_TEXTREL set for libgcrypt.so.20... after some > investigation, it seems the camellia, cast5, and rijndael arm assembly > which was added since libgcrypt v1.5 is responsible. I'll spend some > cycles attempting to fix this when __PIC__, but for the moment, we might > as well disable them when building -fPIC. > > https://bugzilla.redhat.com/show_bug.cgi?id=1069792 Does the attached patch fix issues with __PIC__? -Jussi > > regards, Kyle > > diff --git a/cipher/camellia-arm.S b/cipher/camellia-arm.S > index 255a246..57ed361 100644 > --- a/cipher/camellia-arm.S > +++ b/cipher/camellia-arm.S > @@ -20,7 +20,7 @@ > > #include > > -#if defined(__ARMEL__) > +#if defined(__ARMEL__) && !defined(__PIC__) > #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS > > .text > @@ -609,5 +609,5 @@ _gcry_camellia_arm_decrypt_block: > .long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7 > .long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e > > -#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ > +#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/ > #endif /*__ARM_ARCH >= 6*/ > diff --git a/cipher/camellia.h b/cipher/camellia.h > index d0e3c18..c8f9556 100644 > --- a/cipher/camellia.h > +++ b/cipher/camellia.h > @@ -32,7 +32,7 @@ > #include > /* USE_ARM_ASM indicates whether to use ARM assembly code. */ > # undef USE_ARM_ASM > -# if defined(__ARMEL__) > +# if defined(__ARMEL__) && !defined(__PIC__) > # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS > # define USE_ARM_ASM 1 > # endif > diff --git a/cipher/cast5-arm.S b/cipher/cast5-arm.S > index 57c89b7..7707ba0 100644 > --- a/cipher/cast5-arm.S > +++ b/cipher/cast5-arm.S > @@ -20,7 +20,7 @@ > > #include > > -#if defined(__ARMEL__) > +#if defined(__ARMEL__) && !defined(__PIC__) > #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS > > .text > diff --git a/cipher/cast5.c b/cipher/cast5.c > index 115e1e6..bf4123e 100644 > --- a/cipher/cast5.c > +++ b/cipher/cast5.c > @@ -54,7 +54,7 @@ > > /* USE_ARM_ASM indicates whether to use ARM assembly code. 
*/ > #undef USE_ARM_ASM > -#if defined(__ARMEL__) > +#if defined(__ARMEL__) && !defined(__PIC__) > # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS > # define USE_ARM_ASM 1 > # endif > diff --git a/cipher/rijndael-arm.S b/cipher/rijndael-arm.S > index cea8c51..f06be60 100644 > --- a/cipher/rijndael-arm.S > +++ b/cipher/rijndael-arm.S > @@ -20,7 +20,7 @@ > > #include > > -#if defined(__ARMEL__) > +#if defined(__ARMEL__) && !defined(__PIC__) > #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS > > .text > @@ -849,5 +849,5 @@ _gcry_aes_arm_decrypt_block: > .long 0x6184cb7b, 0x00000055, 0x70b632d5, 0x00000021 > .long 0x745c6c48, 0x0000000c, 0x4257b8d0, 0x0000007d > > -#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ > +#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/ > #endif /*__ARMEL__ */ > diff --git a/cipher/rijndael.c b/cipher/rijndael.c > index 8019f0a..1ffc559 100644 > --- a/cipher/rijndael.c > +++ b/cipher/rijndael.c > @@ -69,7 +69,7 @@ > > /* USE_ARM_ASM indicates whether to use ARM assembly code. */ > #undef USE_ARM_ASM > -#if defined(__ARMEL__) > +#if defined(__ARMEL__) && !defined(__PIC__) > # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS > # define USE_ARM_ASM 1 > # endif > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > -------------- next part -------------- A non-text attachment was scrubbed... Name: 01-fix-arm-assembly-when-building.patch Type: text/x-patch Size: 4344 bytes Desc: not available URL: -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 713 bytes Desc: OpenPGP digital signature URL: From cvs at cvs.gnupg.org Tue May 20 21:10:00 2014 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Tue, 20 May 2014 21:10:00 +0200 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-81-g994c758 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 994c758d8f5471c7e9c38c2834742cca2502d35f (commit) from bf4943932dae95a0573b63bf32a9b9acd5a6ddf3 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 994c758d8f5471c7e9c38c2834742cca2502d35f Author: Jussi Kivilinna Date: Tue May 20 20:35:51 2014 +0300 Fix ARM assembly when building __PIC__ * cipher/camellia-arm.S (GET_DATA_POINTER): New. (_gcry_camellia_arm_encrypt_block): Use GET_DATA_POINTER. (_gcry_camellia_arm_decrypt_block): Ditto. * cipher/cast5-arm.S (GET_DATA_POINTER): New. (_gcry_cast5_arm_encrypt_block, _gcry_cast5_arm_decrypt_block) (_gcry_cast5_arm_enc_blk2, _gcry_cast5_arm_dec_blk2): Use GET_DATA_POINTER. * cipher/rijndael-arm.S (GET_DATA_POINTER): New. (_gcry_aes_arm_encrypt_block, _gcry_aes_arm_decrypt_block): Use GET_DATA_POINTER. * cipher/sha1-armv7-neon.S (GET_DATA_POINTER): New. (.LK_VEC): Move from .text to .data section. (_gcry_sha1_transform_armv7_neon): Use GET_DATA_POINTER. 
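
The idea behind GET_DATA_POINTER: with __PIC__ defined, a literal-pool load
like "ldr reg, =name" would embed an absolute address in the text section and
force a text relocation (the DT_TEXTREL reported above); the macro instead
computes the symbol's address at run time through the Global Offset Table.
Expanded by hand, it behaves like the following sketch ('mytable' and the
register choices are illustrative only, not part of the patch):

        @ PIC-safe load of the address of 'mytable' into r0 (r1 is scratch).
        @ In ARM state the pc reads as the current instruction address + 8,
        @ so the 'add' at label 3 below sees pc == 3f+8.
        ldr     r0, 1f                  @ r0 = &GOT - (3f+8)
        ldr     r1, 2f                  @ r1 = offset of mytable's slot in the GOT
        b       3f
1:      .word   _GLOBAL_OFFSET_TABLE_-(3f+8)
2:      .word   mytable(GOT)
3:      add     r0, pc, r0              @ r0 = run-time address of the GOT
        ldr     r0, [r0, r1]            @ r0 = &mytable, read from its GOT slot

For non-PIC builds the macro keeps the plain "ldr reg, =name" form, which is
a single literal-pool load and needs no GOT indirection.
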
-- Signed-off-by: Jussi Kivilinna diff --git a/cipher/camellia-arm.S b/cipher/camellia-arm.S index 255a246..a3d87d1 100644 --- a/cipher/camellia-arm.S +++ b/cipher/camellia-arm.S @@ -28,6 +28,19 @@ .syntax unified .arm +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + /* struct camellia_ctx: */ #define key_table 0 @@ -261,7 +274,7 @@ _gcry_camellia_arm_encrypt_block: */ push {%r1, %r4-%r11, %ip, %lr}; - ldr RTAB1, =.Lcamellia_sp1110; + GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3); mov RMASK, #0xff; add RTAB3, RTAB1, #(2 * 4); push {%r3}; @@ -309,7 +322,7 @@ _gcry_camellia_arm_decrypt_block: */ push {%r1, %r4-%r11, %ip, %lr}; - ldr RTAB1, =.Lcamellia_sp1110; + GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3); mov RMASK, #0xff; add RTAB3, RTAB1, #(2 * 4); mov RMASK, RMASK, lsl#4 /* byte mask */ diff --git a/cipher/cast5-arm.S b/cipher/cast5-arm.S index 57c89b7..76ddd2e 100644 --- a/cipher/cast5-arm.S +++ b/cipher/cast5-arm.S @@ -30,6 +30,19 @@ .extern _gcry_cast5_s1to4; +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + /* structure of crypto context */ #define Km 0 #define Kr (Km + (16 * 4)) @@ -260,7 +273,7 @@ _gcry_cast5_arm_encrypt_block: */ push {%r1, %r4-%r11, %ip, %lr}; - ldr Rs1, =_gcry_cast5_s1to4; + GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); add Rs2, Rs1, #(0x100*4); add Rs3, Rs1, #(0x100*4*2); @@ -306,7 +319,7 @@ _gcry_cast5_arm_decrypt_block: */ push {%r1, %r4-%r11, %ip, %lr}; - ldr Rs1, =_gcry_cast5_s1to4; + GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); add Rs2, Rs1, #(0x100 * 4); add Rs3, Rs1, #(0x100 * 4 * 2); @@ -500,7 +513,7 @@ _gcry_cast5_arm_enc_blk2: */ push {%lr}; - ldr Rs1, =_gcry_cast5_s1to4; + GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); add Rs2, Rs1, #(0x100 * 4); @@ -631,7 +644,7 @@ _gcry_cast5_arm_dec_blk2: * [RR0, RL0], [RR1, RL1]: dst */ - ldr Rs1, =_gcry_cast5_s1to4; + GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); add Rs2, Rs1, #(0x100 * 4); diff --git a/cipher/rijndael-arm.S b/cipher/rijndael-arm.S index cea8c51..6004ce8 100644 --- a/cipher/rijndael-arm.S +++ b/cipher/rijndael-arm.S @@ -28,6 +28,19 @@ .syntax unified .arm +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + /* register macros */ #define CTX %r0 #define RTAB %lr @@ -249,7 +262,7 @@ _gcry_aes_arm_encrypt_block: 2: sub %sp, #16; - ldr RTAB, =.LtableE0; + GET_DATA_POINTER(RTAB, .LtableE0, RMASK); str %r1, [%sp, #4]; /* dst */ mov RMASK, #0xff; @@ -503,7 +516,7 @@ _gcry_aes_arm_decrypt_block: 2: sub %sp, #16; - ldr RTAB, =.LtableD0; + GET_DATA_POINTER(RTAB, .LtableD0, RMASK); mov RMASK, #0xff; str %r1, [%sp, #4]; /* dst */ diff --git a/cipher/sha1-armv7-neon.S b/cipher/sha1-armv7-neon.S index 0c19025..95b677d 100644 --- a/cipher/sha1-armv7-neon.S +++ 
b/cipher/sha1-armv7-neon.S @@ -26,12 +26,25 @@ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_SHA1) -.text +.data .syntax unified .fpu neon .arm +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + /* Context structure */ #define state_h0 0 @@ -56,6 +69,8 @@ gcry_sha1_armv7_neon_K_VEC: .LK4: .long K4, K4, K4, K4 +.text + /* Register macros */ #define RSTATE r0 @@ -314,7 +329,7 @@ _gcry_sha1_transform_armv7_neon: vpush {q4-q7}; mov ROLDSTACK, sp; - ldr RK, =.LK_VEC; + GET_DATA_POINTER(RK, .LK_VEC, _a); /* Align stack. */ sub sp, #(16*4); ----------------------------------------------------------------------- Summary of changes: cipher/camellia-arm.S | 17 +++++++++++++++-- cipher/cast5-arm.S | 21 +++++++++++++++++---- cipher/rijndael-arm.S | 17 +++++++++++++++-- cipher/sha1-armv7-neon.S | 19 +++++++++++++++++-- 4 files changed, 64 insertions(+), 10 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Wed May 21 07:28:35 2014 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Wed, 21 May 2014 07:28:35 +0200 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-83-gf14fb5b Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via f14fb5b427b5159fcd9603d2b3cde936889cf430 (commit) via beb901575f0d6cd6a0a27506ebea9a725754d0cc (commit) from 994c758d8f5471c7e9c38c2834742cca2502d35f (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit f14fb5b427b5159fcd9603d2b3cde936889cf430 Author: Jussi Kivilinna Date: Wed May 21 08:30:30 2014 +0300 tests: add larger test-vectors for hash algorithms * tests/basic.c (check_digests): Add large test-vectors for MD5, SHA1, SHA224, SHA256, SHA384, RMD160, CRC32, TIGER1, WHIRLPOOL and GOSTR3411_94. -- Signed-off-by: Jussi Kivilinna diff --git a/tests/basic.c b/tests/basic.c index 2be6432..875b36c 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -4526,6 +4526,16 @@ check_digests (void) "\x90\x01\x50\x98\x3C\xD2\x4F\xB0\xD6\x96\x3F\x7D\x28\xE1\x7F\x72" }, { GCRY_MD_MD5, "message digest", "\xF9\x6B\x69\x7D\x7C\xB7\x93\x8D\x52\x5A\x2F\x31\xAA\xF1\x61\xD0" }, + { GCRY_MD_MD5, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser Gene" + "ral Public License for more details.", + "\xc4\x1a\x5c\x0b\x44\x5f\xba\x1a\xda\xbc\xc0\x38\x0e\x0c\x9e\x33" }, { GCRY_MD_SHA1, "abc", "\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E" "\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D" }, @@ -4536,6 +4546,17 @@ check_digests (void) { GCRY_MD_SHA1, "!" /* kludge for "a"*1000000 */ , "\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E" "\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F" }, + { GCRY_MD_SHA1, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser Gene" + "ral Public License for more details.", + "\xf5\xd9\xcb\x66\x91\xb4\x7a\x7c\x60\x35\xe2\x1c\x38\x26\x52\x13" + "\x8e\xd5\xe5\xdf" }, /* From RFC3874 */ { GCRY_MD_SHA224, "abc", "\x23\x09\x7d\x22\x34\x05\xd8\x22\x86\x42\xa4\x77\xbd\xa2\x55\xb3" @@ -4547,6 +4568,17 @@ check_digests (void) { GCRY_MD_SHA224, "!", "\x20\x79\x46\x55\x98\x0c\x91\xd8\xbb\xb4\xc1\xea\x97\x61\x8a\x4b" "\xf0\x3f\x42\x58\x19\x48\xb2\xee\x4e\xe7\xad\x67" }, + { GCRY_MD_SHA224, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser Gene" + "ral Public License for more details.", + "\x80\xf0\x60\x79\xb0\xe9\x65\xab\x8a\x76\xbf\x6e\x88\x64\x75\xe7" + "\xfd\xf0\xc2\x4c\xf6\xf2\xa6\x01\xed\x50\x71\x08" }, { GCRY_MD_SHA256, "abc", "\xba\x78\x16\xbf\x8f\x01\xcf\xea\x41\x41\x40\xde\x5d\xae\x22\x23" "\xb0\x03\x61\xa3\x96\x17\x7a\x9c\xb4\x10\xff\x61\xf2\x00\x15\xad" }, @@ -4557,10 +4589,33 @@ check_digests (void) { GCRY_MD_SHA256, "!", "\xcd\xc7\x6e\x5c\x99\x14\xfb\x92\x81\xa1\xc7\xe2\x84\xd7\x3e\x67" "\xf1\x80\x9a\x48\xa4\x97\x20\x0e\x04\x6d\x39\xcc\xc7\x11\x2c\xd0" }, + { GCRY_MD_SHA256, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser Gene" + "ral Public License for more details.", + "\xb0\x18\x70\x67\xb8\xac\x68\x50\xec\x95\x43\x77\xb5\x44\x5b\x0f" + "\x2e\xbd\x40\xc9\xdc\x2a\x2c\x33\x8b\x53\xeb\x3e\x9e\x01\xd7\x02" }, { GCRY_MD_SHA384, "abc", "\xcb\x00\x75\x3f\x45\xa3\x5e\x8b\xb5\xa0\x3d\x69\x9a\xc6\x50\x07" "\x27\x2c\x32\xab\x0e\xde\xd1\x63\x1a\x8b\x60\x5a\x43\xff\x5b\xed" "\x80\x86\x07\x2b\xa1\xe7\xcc\x23\x58\xba\xec\xa1\x34\xc8\x25\xa7" }, + { GCRY_MD_SHA384, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser Gene" + "ral Public License for more details.", + "\xe4\x6d\xb4\x28\x33\x77\x99\x49\x94\x0f\xcf\x87\xc2\x2f\x30\xd6" + "\x06\x24\x82\x9d\x80\x64\x8a\x07\xa1\x20\x8f\x5f\xf3\x85\xb3\xaa" + "\x39\xb8\x61\x00\xfc\x7f\x18\xc6\x82\x23\x4b\x45\xfa\xf1\xbc\x69" }, { GCRY_MD_SHA512, "abc", "\xDD\xAF\x35\xA1\x93\x61\x7A\xBA\xCC\x41\x73\x49\xAE\x20\x41\x31" "\x12\xE6\xFA\x4E\x89\xA9\x7E\xA2\x0A\x9E\xEE\xE6\x4B\x55\xD3\x9A" @@ -4591,8 +4646,29 @@ check_digests (void) { GCRY_MD_RMD160, "message digest", "\x5d\x06\x89\xef\x49\xd2\xfa\xe5\x72\xb8" "\x81\xb1\x23\xa8\x5f\xfa\x21\x59\x5f\x36" }, + { GCRY_MD_RMD160, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser Gene" + "ral Public License for more details.", + "\x06\x6d\x3c\x4e\xc9\xba\x89\x75\x16\x90\x96\x4e\xfd\x43\x07\xde" + "\x04\xca\x69\x6b" }, { GCRY_MD_CRC32, "", "\x00\x00\x00\x00" }, { GCRY_MD_CRC32, "foo", "\x8c\x73\x65\x21" }, + { GCRY_MD_CRC32, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser Gene" + "ral Public License for more details.", + "\x4A\x53\x7D\x67" }, { GCRY_MD_CRC32_RFC1510, "", "\x00\x00\x00\x00" }, { GCRY_MD_CRC32_RFC1510, "foo", "\x73\x32\xbc\x33" }, { GCRY_MD_CRC32_RFC1510, "test0123456789", "\xb8\x3e\x88\xd6" }, @@ -4680,6 +4756,17 @@ check_digests (void) { GCRY_MD_TIGER1, "!", "\x6D\xB0\xE2\x72\x9C\xBE\xAD\x93\xD7\x15\xC6\xA7" "\xD3\x63\x02\xE9\xB3\xCE\xE0\xD2\xBC\x31\x4B\x41" }, + { GCRY_MD_TIGER1, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser Gene" + "ral Public License for more details.", + "\x60\xee\xdf\x95\x39\xc8\x44\x94\x64\xdc\xdf\x3d\x2e\x1c\xe5\x79" + "\x6a\x95\xbd\x30\x68\x8c\x7e\xb8" }, { GCRY_MD_TIGER2, "", "\x44\x41\xBE\x75\xF6\x01\x87\x73\xC2\x06\xC2\x27" @@ -4741,6 +4828,19 @@ check_digests (void) "\x29\x05\x7F\xD8\x6B\x20\xBF\xD6\x2D\xEC\xA0\xF1\xCC\xEA\x4A\xF5" "\x1F\xC1\x54\x90\xED\xDC\x47\xAF\x32\xBB\x2B\x66\xC3\x4F\xF9\xAD" "\x8C\x60\x08\xAD\x67\x7F\x77\x12\x69\x53\xB2\x26\xE4\xED\x8B\x01" }, + { GCRY_MD_WHIRLPOOL, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser Gene" + "ral Public License for more details.", + "\xcd\x4a\xa4\xaf\xf6\x7f\xec\xce\xbb\x6c\xdf\x91\x96\xe1\xf3\xf6" + "\x78\xe2\x8e\x3a\x76\xcf\x06\xc7\xa1\x20\x7b\x81\x32\x60\xf7\x8e" + "\x68\x19\x62\x33\x4f\xe5\x0a\x24\xfb\x9e\x74\x03\x74\xe4\x61\x29" + "\x6f\xb3\x13\xe6\x7e\xc2\x88\x99\x9e\xfb\xe7\x9d\x11\x30\x89\xd2" }, { GCRY_MD_GOSTR3411_94, "This is message, length=32 bytes", "\xB1\xC4\x66\xD3\x75\x19\xB8\x2E\x83\x19\x81\x9F\xF3\x25\x95\xE0" @@ -4757,6 +4857,17 @@ check_digests (void) "!", "\x5C\x00\xCC\xC2\x73\x4C\xDD\x33\x32\xD3\xD4\x74\x95\x76\xE3\xC1" "\xA7\xDB\xAF\x0E\x7E\xA7\x4E\x9F\xA6\x02\x41\x3C\x90\xA1\x29\xFA" }, + { GCRY_MD_GOSTR3411_94, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser Gene" + "ral Public License for more details.", + "\x00\x0c\x85\xc8\x54\xd2\x9a\x6e\x47\x2e\xff\xa4\xa2\xe7\xd0\x2e" + "\x8a\xcc\x14\x53\xb4\x87\xc8\x5c\x95\x9a\x3e\x85\x8c\x7d\x6e\x0c" }, { GCRY_MD_STRIBOG512, "012345678901234567890123456789012345678901234567890123456789012", "\x1b\x54\xd0\x1a\x4a\xf5\xb9\xd5\xcc\x3d\x86\xd6\x8d\x28\x54\x62" commit beb901575f0d6cd6a0a27506ebea9a725754d0cc Author: Jussi Kivilinna Date: Wed May 21 08:30:30 2014 +0300 sha512: fix ARM/NEON implementation * cipher/sha512-armv7-neon.S (_gcry_sha512_transform_armv7_neon): Byte-swap RW67q and RW1011q correctly in multi-block loop. * tests/basic.c (check_digests): Add large test vector for SHA512. -- Patch fixes bug introduced to multi-block processing by commit df629ba53a6, "Improve performance of SHA-512/ARM/NEON implementation". Patch also adds multi-block test vector for SHA-512. Signed-off-by: Jussi Kivilinna diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index bb2cbb4..a9d1272 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -361,8 +361,8 @@ _gcry_sha512_transform_armv7_neon: rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ - vrev64.8 RW67q, RW67q; vrev64.8 RW89q, RW89q; + vrev64.8 RW1011q, RW1011q; #endif vld1.64 {RW12-RW15}, [%r1]!; vadd_rg_RT0(RA); diff --git a/tests/basic.c b/tests/basic.c index 9740919..2be6432 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -4566,6 +4566,19 @@ check_digests (void) "\x12\xE6\xFA\x4E\x89\xA9\x7E\xA2\x0A\x9E\xEE\xE6\x4B\x55\xD3\x9A" "\x21\x92\x99\x2A\x27\x4F\xC1\xA8\x36\xBA\x3C\x23\xA3\xFE\xEB\xBD" "\x45\x4D\x44\x23\x64\x3C\xE8\x0E\x2A\x9A\xC9\x4F\xA5\x4C\xA4\x9F" }, + { GCRY_MD_SHA512, + "Libgcrypt is free software; you can redistribute it and/or modif" + "y it under the terms of the GNU Lesser general Public License as" + " published by the Free Software Foundation; either version 2.1 o" + "f the License, or (at your option) any later version.\nLibgcrypt" + " is distributed in the hope that it will be useful, but WITHOUT " + "ANY WARRANTY; without even the implied warranty of MERCHANTABILI" + "TY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser Gene" + "ral Public License for more details.", + "\x72\x8c\xde\xd8\xe4\xd7\xb6\xa5\x0f\xde\x6b\x4d\x33\xaf\x15\x19" + "\xdd\xec\x62\x0f\xf7\x1a\x1e\x10\x32\x05\x02\xa6\xb0\x1f\x70\x37" + "\xbc\xd7\x15\xed\x71\x6c\x78\x20\xc8\x54\x87\xd0\x66\x6a\x17\x83" + "\x05\x61\x92\xbe\xcc\x8f\x3b\xbf\x11\x72\x22\x69\x23\x5b\x48\x5c" }, { GCRY_MD_RMD160, "", "\x9c\x11\x85\xa5\xc5\xe9\xfc\x54\x61\x28" "\x08\x97\x7e\xe8\xf5\x48\xb2\x25\x8d\x31" }, ----------------------------------------------------------------------- Summary of changes: cipher/sha512-armv7-neon.S | 2 +- tests/basic.c | 124 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 1 deletion(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From kmcmarti at redhat.com Tue May 20 20:27:24 2014 From: kmcmarti at redhat.com (Kyle McMartin) Date: Tue, 20 May 2014 14:27:24 -0400 Subject: [PATCH] libgcrypt: disable some ARM assembly when building __PIC__ In-Reply-To: <537B972C.1070103@iki.fi> References: <20140519212018.GF31851@redacted.bos.redhat.com> <537B972C.1070103@iki.fi> Message-ID: <20140520182724.GE4637@redacted.bos.redhat.com> On Tue, May 20, 2014 at 08:55:56PM +0300, Jussi Kivilinna wrote: > On 20.05.2014 00:20, Kyle McMartin wrote: > > Hi, > > > > We're currently getting DT_TEXTREL set for libgcrypt.so.20... after some > > investigation, it seems the camellia, cast5, and rijndael arm assembly > > which was added since libgcrypt v1.5 is responsible. I'll spend some > > cycles attempting to fix this when __PIC__, but for the moment, we might > > as well disable them when building -fPIC. > > > > https://bugzilla.redhat.com/show_bug.cgi?id=1069792 > > Does the attached patch fix issues with __PIC__? > > -Jussi Yup, no TEXTREL is present when I add that patch in our build. Thanks! Was afraid I'd need to do it. :) --Kyle > > > > > regards, Kyle > > From ddahl at nulltxt.se Fri May 30 01:23:09 2014 From: ddahl at nulltxt.se (David Dahl) Date: Thu, 29 May 2014 18:23:09 -0500 Subject: GCM mode and GCRY_CIPHER_AES256 Message-ID: <5387C15D.2010705@nulltxt.se> Hello, Please excuse my dumb question that comes with a bit of ignorance. I want to wrap a key generated with gcry_kdf_derive using AES256 + GCM mode: err = gcry_cipher_open(&handle, GCRY_CIPHER, GCRY_CIPHER_MODE, 0); returns with: "GCM algo 9, gcry_cipher_open failed: Invalid cipher mode" (I am trying to match an existing library that generates a key ring with SJCL, which uses GCM mode when wrapping the keys). After seeing this error, I re-read the docs and see that is says: "GCRY_CIPHER_MODE_CCM and GCRY_CIPHER_MODE_GCM modes will only work with block cipher algorithms which have the block size of 16 bytes" Doesn't AES 256 use a block size of 16 bytes? Please help me understand what I am doing wrong. I should mention that SJCL and gcrypt both seem to be using the same NIST test vectors for the GCM mode tests. Theoretically, I should be able to write a compatible set of functions with gcrypt to handle gcrypt <-> SJCL interop. 
Best Regards,

David

From wk at gnupg.org  Fri May 30 19:38:00 2014
From: wk at gnupg.org (Werner Koch)
Date: Fri, 30 May 2014 19:38:00 +0200
Subject: GCM mode and GCRY_CIPHER_AES256
In-Reply-To: <5387C15D.2010705@nulltxt.se> (David Dahl's message of "Thu, 29 May 2014 18:23:09 -0500")
References: <5387C15D.2010705@nulltxt.se>
Message-ID: <87r43bupzb.fsf@vigenere.g10code.de>

On Fri, 30 May 2014 01:23, ddahl at nulltxt.se said:

> err = gcry_cipher_open(&handle, GCRY_CIPHER, GCRY_CIPHER_MODE, 0);
>
> returns with: "GCM algo 9, gcry_cipher_open failed: Invalid cipher mode"

Please paste the actual code so that we can understand and replicate the
case.

> Doesn't AES 256 use a block size of 16 bytes?

The block length of AES is indeed 16 bytes.


Shalom-Salam,

   Werner

--
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.

From ddahl at nulltxt.se  Fri May 30 21:02:10 2014
From: ddahl at nulltxt.se (David Dahl)
Date: Fri, 30 May 2014 14:02:10 -0500
Subject: GCM mode and GCRY_CIPHER_AES256
In-Reply-To: <87r43bupzb.fsf@vigenere.g10code.de>
References: <5387C15D.2010705@nulltxt.se> <87r43bupzb.fsf@vigenere.g10code.de>
Message-ID: <5388D5B2.6090007@nulltxt.se>

Here is the code:

-----------------

#include "pbkdf2.h"

#define GCRY_CIPHER GCRY_CIPHER_AES256
#define GCRY_CIPHER_MODE GCRY_CIPHER_MODE_GCM
#define RNDM_BYTES_LENGTH 32
#define SALT_LENGTH 16

int
wrapKeyItem (char* privateKey, struct keyItem key,
             unsigned char name, struct wrappedKeyItem* out)
{
  size_t keyLength = gcry_cipher_get_algo_keylen(GCRY_CIPHER);
  size_t blkLength = gcry_cipher_get_algo_blklen(GCRY_CIPHER);
  size_t privateKeyLength = strlen(privateKey) + 1;
  char *encBuffer = malloc(privateKeyLength);
  gcry_error_t err = 0;

  // Create a handle
  gcry_cipher_hd_t handle;
  err = gcry_cipher_open(&handle, GCRY_CIPHER, GCRY_CIPHER_MODE, 0);
  if (err) {
    printf("GCM algo %d, gcry_cipher_open failed: %s\n",
           GCRY_CIPHER, gpg_strerror (err));
    printf("Error no: %d and message: %s\n", err, gcry_strerror(err));
    return 1;
  }

  // Set the key
  err = gcry_cipher_setkey(handle, key.key, keyLength);
  if (err) {
    printf("gcry_cipher_setkey failed.");
    printf("Error no: %d and message: %s\n", err, gcry_strerror(err));
    return 1;
  }

  // Set the IV; the number of random bytes generated must equal the
  // length passed to gcry_cipher_setiv
  unsigned char *iv;
  iv = gcry_random_bytes(blkLength, GCRY_STRONG_RANDOM);
  err = gcry_cipher_setiv(handle, iv, blkLength);
  if (err) {
    printf("gcry_cipher_setiv failed.");
    printf("Error no: %d and message: %s\n", err, gcry_strerror(err));
    return 1;
  }

  // Do encrypt; encBuffer receives raw binary ciphertext, so it is not
  // a printable C string
  err = gcry_cipher_encrypt(handle, encBuffer, privateKeyLength,
                            privateKey, privateKeyLength);
  if (err) {
    printf("gcry_cipher_encrypt failed.");
    printf("Error no: %d and message: %s\n", err, gcry_strerror(err));
    return 1;
  }

  out->ciphertext = encBuffer;
  out->iv = iv;
  out->name = name;

  // Free the handle; encBuffer and iv are handed to the caller through
  // 'out', so they must not be freed here
  gcry_cipher_close(handle);
  return 0;
}

On 05/30/2014 12:38 PM, Werner Koch wrote:
> On Fri, 30 May 2014 01:23, ddahl at nulltxt.se said:
>
>> err = gcry_cipher_open(&handle, GCRY_CIPHER, GCRY_CIPHER_MODE, 0);
>>
>> returns with: "GCM algo 9, gcry_cipher_open failed: Invalid cipher mode"
>
> Please paste the actual code so that we can understand and replicate the
> case.
>
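
For reference, below is a minimal, self-contained sketch of AES-256-GCM
encryption with libgcrypt in the shape discussed in this thread. It assumes
a libgcrypt from the 1.6 series or later, where GCRY_CIPHER_MODE_GCM and
gcry_cipher_gettag first appeared; older installations report the same
"Invalid cipher mode" error quoted above from gcry_cipher_open. The all-0x42
key is a placeholder for illustration (real code would derive the key, e.g.
with gcry_kdf_derive), and the conventional 12-byte GCM IV length is used:

#include <stdio.h>
#include <string.h>
#include <gcrypt.h>

int
main (void)
{
  gcry_cipher_hd_t hd;
  gcry_error_t err;
  unsigned char key[32];            /* AES-256 key.  */
  unsigned char iv[12];             /* 12-byte IV, the usual choice for GCM.  */
  unsigned char tag[16];            /* GCM authentication tag.  */
  unsigned char ct[64];             /* Ciphertext buffer (>= message length).  */
  const char *msg = "hello, GCM";
  size_t msglen = strlen (msg);

  /* libgcrypt must be initialized before use; requesting "1.6.0" makes
     the version check fail on installations that predate GCM support.  */
  if (!gcry_check_version ("1.6.0"))
    {
      fputs ("libgcrypt >= 1.6.0 required for GCM\n", stderr);
      return 1;
    }
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

  memset (key, 0x42, sizeof key);   /* Placeholder key for this sketch.  */
  gcry_randomize (iv, sizeof iv, GCRY_STRONG_RANDOM);  /* Fresh IV per message.  */

  err = gcry_cipher_open (&hd, GCRY_CIPHER_AES256, GCRY_CIPHER_MODE_GCM, 0);
  if (!err)
    err = gcry_cipher_setkey (hd, key, sizeof key);
  if (!err)
    err = gcry_cipher_setiv (hd, iv, sizeof iv);
  if (!err)
    err = gcry_cipher_encrypt (hd, ct, msglen, msg, msglen);
  if (!err)
    err = gcry_cipher_gettag (hd, tag, sizeof tag);  /* Keep the tag with the
                                                        ciphertext and IV.  */
  if (err)
    {
      fprintf (stderr, "libgcrypt error: %s\n", gcry_strerror (err));
      return 1;
    }
  gcry_cipher_close (hd);
  return 0;
}

For decryption the same key and IV are set again, gcry_cipher_decrypt is
called on the ciphertext, and gcry_cipher_checktag verifies the stored tag.
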