From horst.wente at posteo.de Tue Mar 2 00:02:14 2021 From: horst.wente at posteo.de (horst.wente at posteo.de) Date: Tue, 02 Mar 2021 00:02:14 +0100 Subject: libgcrypt-1.9.2: cipher/poly1305.c does not compile on i686/32 with gcc-4.7.3 ( --disable-asm does not work) Message-ID: Hello! The "#define ADD_1305_32" ( for "defined (__i386__) && __GNUC__ >= 4" ) leads to > poly1305.c: In function 'poly1305_blocks': > poly1305.c:424:7: error: can't find a register in class 'GENERAL_REGS' > while reloading 'asm' > poly1305.c:424:7: error: 'asm' operand has impossible constraints I suppose the template uses too many registers, so I redefined it and it seems to work for me ( all tests successful ). hth horst wente --- cipher/poly1305.c.orig 2021-01-28 13:53:52.000000000 +0100 +++ cipher/poly1305.c 2021-03-01 20:25:26.000000000 +0100 @@ -314,14 +314,19 @@ /* A += B (i386) */ #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \ - __asm__ ("addl %5, %0\n" \ - "adcl %6, %1\n" \ - "adcl %7, %2\n" \ - "adcl %8, %3\n" \ - "adcl %9, %4\n" \ - : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \ - : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \ - : "cc" ) + __asm__ ("clc\n" \ + "addl %5, %0\n" \ + "movl %6, %5\n" \ + "adcl %5, %1\n" \ + "movl %7, %5\n" \ + "adcl %5, %2\n" \ + "movl %8, %5\n" \ + "adcl %5, %3\n" \ + "movl %9, %5\n" \ + "adcl %5, %4\n" \ + : "+m" (A0), "+m" (A1), "+m" (A2), "+m" (A3), "+m" (A4) \ + : "r" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \ + : "cc" ) #endif /* __i386__ */ From jussi.kivilinna at iki.fi Wed Mar 3 18:07:03 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 3 Mar 2021 19:07:03 +0200 Subject: [PATCH 1/3] cipher-gcm-intel-pclmul: fix compiling with i386 gcc-4.7 Message-ID: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-intel-pclmul.c (gcm_lsh): Pass '*pconst' instead of 'pconst' to asm block. -- Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-intel-pclmul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 28165c65..334c89cd 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -397,7 +397,7 @@ static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs) "pxor %%xmm4, %%xmm2\n\t" "movdqu %%xmm2, (%[h])\n\t" : - : [pconst] "m" (pconst), + : [pconst] "m" (*pconst), [h] "r" ((byte *)h + hoffs) : "memory" ); } -- 2.27.0 From jussi.kivilinna at iki.fi Wed Mar 3 18:07:05 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 3 Mar 2021 19:07:05 +0200 Subject: [PATCH 3/3] poly1305: make --disable-asm work on x86, aarch64 and ppc In-Reply-To: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> References: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> Message-ID: <20210303170705.1614871-3-jussi.kivilinna@iki.fi> * cipher/poly1305.c [__aarch64__] (ADD_1305_64): Check for HAVE_CPU_ARCH_ARM. [__x86_64__] (ADD_1305_64): Check for HAVE_CPU_ARCH_X86. [__powerpc__] (ADD_1305_64): Check for HAVE_CPU_ARCH_PPC. [__i386__] (ADD_1305_32): Check for HAVE_CPU_ARCH_X86. 
-- Reported-by: Horst Wente Signed-off-by: Jussi Kivilinna --- cipher/poly1305.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cipher/poly1305.c b/cipher/poly1305.c index da8f4601..41e55e8d 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -105,7 +105,7 @@ static void poly1305_init (poly1305_context_t *ctx, #ifdef USE_MPI_64BIT -#if defined (__aarch64__) && __GNUC__ >= 4 +#if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4 /* A += B (armv8/aarch64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ @@ -118,7 +118,7 @@ static void poly1305_init (poly1305_context_t *ctx, #endif /* __aarch64__ */ -#if defined (__x86_64__) && __GNUC__ >= 4 +#if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4 /* A += B (x86-64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ @@ -131,7 +131,7 @@ static void poly1305_init (poly1305_context_t *ctx, #endif /* __x86_64__ */ -#if defined (__powerpc__) && __GNUC__ >= 4 +#if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4 /* A += B (ppc64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ @@ -310,7 +310,7 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */ -#if defined (__i386__) && __GNUC__ >= 5 +#if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 5 /* Note: ADD_1305_32 below does not compile on GCC-4.7 */ /* A += B (i386) */ -- 2.27.0 From jussi.kivilinna at iki.fi Wed Mar 3 18:07:04 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 3 Mar 2021 19:07:04 +0200 Subject: [PATCH 2/3] poly1305: fix compiling on i386 gcc-4.7 In-Reply-To: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> References: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> Message-ID: <20210303170705.1614871-2-jussi.kivilinna@iki.fi> * cipher/poly1305.c [__i386__]: Limit i386 variant of ADD_1305_32 to GCC-5 or newer. -- Reported-by: Horst Wente Signed-off-by: Jussi Kivilinna --- cipher/poly1305.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/poly1305.c b/cipher/poly1305.c index 6cb4d2b7..da8f4601 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -310,7 +310,8 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */ -#if defined (__i386__) && __GNUC__ >= 4 +#if defined (__i386__) && __GNUC__ >= 5 +/* Note: ADD_1305_32 below does not compile on GCC-4.7 */ /* A += B (i386) */ #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \ -- 2.27.0 From jussi.kivilinna at iki.fi Wed Mar 3 18:08:08 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 3 Mar 2021 19:08:08 +0200 Subject: libgcrypt-1.9.2: cipher/poly1305.c does not compile on i686/32 with gcc-4.7.3 ( --disable-asm does not work) In-Reply-To: References: Message-ID: <3a4a2a1e-1bb2-9707-908c-6b0fe007effe@iki.fi> On 2.3.2021 1.02, horst.wente--- via Gcrypt-devel wrote: > Hello! > > The "#define ADD_1305_32" ( for "defined (__i386__) && __GNUC__ >= 4" ) leads to > >> poly1305.c: In function 'poly1305_blocks': >> poly1305.c:424:7: error: can't find a register in class 'GENERAL_REGS' while > ??? reloading 'asm' >> poly1305.c:424:7: error: 'asm' operand has impossible constraints > > I suppose the template uses too many registers, so I redefined it and it seems to > work for me ( all tests successful ). > Thanks for reporting. I think it is better to just disable this macro for GCC-4.x as any changes to asm operand types cause performance drop with newer GCC. 
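To make that concrete, the effect of hiding the macro is roughly the following
(an illustrative sketch only, not the actual poly1305.c source; the generic
fallback shown here is a plain-C stand-in for the portable limb addition):

#if defined (__i386__) && __GNUC__ >= 5
/* fast i386 inline-asm ADD_1305_32 defined here */
#endif

#ifndef ADD_1305_32
/* portable stand-in: 5x32-bit limb addition with carry propagation */
# define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
    u64 carry = 0; \
    carry += (u64)(A0) + (B0); (A0) = (u32)carry; carry >>= 32; \
    carry += (u64)(A1) + (B1); (A1) = (u32)carry; carry >>= 32; \
    carry += (u64)(A2) + (B2); (A2) = (u32)carry; carry >>= 32; \
    carry += (u64)(A3) + (B3); (A3) = (u32)carry; carry >>= 32; \
    (A4) = (A4) + (B4) + (u32)carry; \
  } while (0)
#endif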
When ADD_1305_32 is not provided for a specific arch, the generic version is used.

-Jussi

> hth
> horst wente
>
>
> --- cipher/poly1305.c.orig      2021-01-28 13:53:52.000000000 +0100
> +++ cipher/poly1305.c   2021-03-01 20:25:26.000000000 +0100
> @@ -314,14 +314,19 @@
>
>  /* A += B (i386) */
>  #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
> -      __asm__ ("addl %5, %0\n" \
> -              "adcl %6, %1\n" \
> -              "adcl %7, %2\n" \
> -              "adcl %8, %3\n" \
> -              "adcl %9, %4\n" \
> -              : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
> -              : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
> -              : "cc" )
> +      __asm__ ("clc\n"         \
> +               "addl %5, %0\n" \
> +               "movl %6, %5\n" \
> +               "adcl %5, %1\n" \
> +               "movl %7, %5\n" \
> +               "adcl %5, %2\n" \
> +               "movl %8, %5\n" \
> +               "adcl %5, %3\n" \
> +               "movl %9, %5\n" \
> +               "adcl %5, %4\n" \
> +               : "+m" (A0), "+m" (A1), "+m" (A2), "+m" (A3), "+m" (A4) \
> +               : "r" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
> +               : "cc" )
>
>  #endif /* __i386__ */
>
>
> _______________________________________________
> Gcrypt-devel mailing list
> Gcrypt-devel at gnupg.org
> http://lists.gnupg.org/mailman/listinfo/gcrypt-devel
>

From jussi.kivilinna at iki.fi Sun Mar 7 17:44:07 2021
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 7 Mar 2021 18:44:07 +0200
Subject: [PATCH 1/4] VPMSUMD acceleration for GCM mode on PPC
Message-ID: <20210307164410.508295-1-jussi.kivilinna@iki.fi>

From: Shawn Landden

* cipher/Makefile.am: Add 'cipher-gcm-ppc.c'.
* cipher/cipher-gcm-ppc.c: New.
* cipher/cipher-gcm.c [GCM_USE_PPC_VPMSUM] (_gcry_ghash_setup_ppc_vpmsum)
(_gcry_ghash_ppc_vpmsum, ghash_setup_ppc_vpsum, ghash_ppc_vpmsum): New.
(setupM) [GCM_USE_PPC_VPMSUM]: Select ppc-vpmsum implementation if
HW feature "ppc-vcrypto" is available.
* cipher/cipher-internal.h (GCM_USE_PPC_VPMSUM): New.
(gcry_cipher_handle): Move 'ghash_fn' at end of 'gcm' block to align
'gcm_table' to 16 bytes.
* configure.ac: Add 'cipher-gcm-ppc.lo'.
* tests/basic.c (_check_gcm_cipher): New AES256 test vector.
* AUTHORS: Add 'CRYPTOGAMS'.
* LICENSES: Add original license to 3-clause-BSD section.
--

https://dev.gnupg.org/D501: 10-20X speed. However this Power 9 machine is faster than the last Power 9 benchmarks on the optimized versions, so while better than the last patch, it is not all due to the code.
Before: GCM enc | 4.23 ns/B 225.3 MiB/s - c/B GCM dec | 3.58 ns/B 266.2 MiB/s - c/B GCM auth | 3.34 ns/B 285.3 MiB/s - c/B After: GCM enc | 0.370 ns/B 2578 MiB/s - c/B GCM dec | 0.371 ns/B 2571 MiB/s - c/B GCM auth | 0.159 ns/B 6003 MiB/s - c/B Signed-off-by: Shawn Landden [jk: coding style fixes, Makefile.am integration, patch from Differential to git, commit changelog, fixed few compiler warnings] GnuPG-bug-id: 5040 Signed-off-by: Jussi Kivilinna --- AUTHORS | 1 + LICENSES | 43 +++- cipher/Makefile.am | 7 + cipher/cipher-gcm-ppc.c | 510 +++++++++++++++++++++++++++++++++++++++ cipher/cipher-gcm.c | 31 ++- cipher/cipher-internal.h | 18 +- configure.ac | 13 + tests/basic.c | 16 ++ 8 files changed, 634 insertions(+), 5 deletions(-) create mode 100644 cipher/cipher-gcm-ppc.c diff --git a/AUTHORS b/AUTHORS index f6bfcb85..f237baa5 100644 --- a/AUTHORS +++ b/AUTHORS @@ -30,6 +30,7 @@ List of Copyright holders Copyright (C) 1996-1999 Peter Gutmann, Paul Kendall, and Chris Wedgwood Copyright (C) 1996-2006 Peter Gutmann, Matt Thomlinson and Blake Coverett Copyright (C) 2003 Nikos Mavroyanopoulos + Copyright (c) 2006 CRYPTOGAMS Copyright (C) 2006-2007 NTT (Nippon Telegraph and Telephone Corporation) Copyright (C) 2012-2021 g10 Code GmbH Copyright (C) 2012 Simon Josefsson, Niels M?ller diff --git a/LICENSES b/LICENSES index 31f8eae8..bdd16aab 100644 --- a/LICENSES +++ b/LICENSES @@ -55,7 +55,6 @@ with any binary distributions derived from the GNU C Library. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #+end_quote - For files: - random/jitterentropy-base.c - random/jitterentropy.h @@ -100,6 +99,48 @@ with any binary distributions derived from the GNU C Library. * DAMAGE. #+end_quote + For files: + - cipher/cipher-gcm-ppc.c + +#+begin_quote + Copyright (c) 2006, CRYPTOGAMS by + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain copyright notices, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + * Neither the name of the CRYPTOGAMS nor the names of its + copyright holder and contributors may be used to endorse or + promote products derived from this software without specific + prior written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this + product may be distributed under the terms of the GNU General Public + License (GPL), in which case the provisions of the GPL apply INSTEAD OF + those given above. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#+end_quote + * X License For files: diff --git a/cipher/Makefile.am b/cipher/Makefile.am index e2100cf3..da8cc126 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -83,6 +83,7 @@ EXTRA_libcipher_la_SOURCES = \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-armv7-neon.S chacha20-aarch64.S \ chacha20-ppc.c chacha20-s390x.S \ + cipher-gcm-ppc.c \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ @@ -258,3 +259,9 @@ crc-ppc.o: $(srcdir)/crc-ppc.c Makefile crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile + `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile + `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c new file mode 100644 index 00000000..ed27ef15 --- /dev/null +++ b/cipher/cipher-gcm-ppc.c @@ -0,0 +1,510 @@ +/* cipher-gcm-ppc.c - Power 8 vpmsum accelerated Galois Counter Mode + * implementation + * Copyright (C) 2019 Shawn Landden + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * Based on GHASH implementation by Andy Polyakov from CRYPTOGAMS + * distribution (ppc/ghashp8-ppc.pl). Specifically, it uses his register + * allocation (which then defers to your compiler's register allocation), + * instead of re-implementing Gerald Estrin's Scheme of parallelized + * multiplication of polynomials, as I did not understand this algorithm at + * the time. + * + * Original copyright license follows: + * + * Copyright (c) 2006, CRYPTOGAMS by + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain copyright notices, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * * Neither the name of the CRYPTOGAMS nor the names of its + * copyright holder and contributors may be used to endorse or + * promote products derived from this software without specific + * prior written permission. + * + * ALTERNATIVELY, provided that this notice is retained in full, this + * product may be distributed under the terms of the GNU General Public + * License (GPL), in which case the provisions of the GPL apply INSTEAD OF + * those given above. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) + */ + +#include +#include +#include +#include +#include +#include + +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "./cipher-internal.h" + +#ifdef GCM_USE_PPC_VPMSUM + +#include + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE + +typedef vector unsigned char vector16x_u8; +typedef vector signed char vector16x_s8; +typedef vector unsigned long long vector2x_u64; +typedef vector unsigned long long block; + +static ASM_FUNC_ATTR_INLINE block +asm_vpmsumd(block a, block b) +{ + block r; + __asm__("vpmsumd %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (b)); + return r; +} + +static ASM_FUNC_ATTR_INLINE block +asm_swap_u64(block a) +{ + __asm__("xxswapd %x0, %x1" + : "=wa" (a) + : "wa" (a)); + return a; +} + +static ASM_FUNC_ATTR_INLINE block +asm_rot_block_left(block a) +{ + block zero = {0, 0}; + block mask = {2, 0}; + return __builtin_shuffle(a, zero, mask); +} + +static ASM_FUNC_ATTR_INLINE block +asm_rot_block_right(block a) +{ + block zero = {0, 0}; + block mask = {1, 2}; + return __builtin_shuffle(a, zero, mask); +} + +/* vsl is a slightly strange function in the way the shift is passed... 
*/ +static ASM_FUNC_ATTR_INLINE block +asm_ashl_128(block a, vector16x_u8 shift) +{ + block r; + __asm__("vsl %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (shift)); + return r; +} + +#define ALIGNED_LOAD(in_ptr) \ + (vec_aligned_ld (0, (const unsigned char *)(in_ptr))) + +static ASM_FUNC_ATTR_INLINE block +vec_aligned_ld(unsigned long offset, const unsigned char *ptr) +{ +#ifndef WORDS_BIGENDIAN + block vec; + __asm__ ("lvx %0,%1,%2\n\t" + : "=v" (vec) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); + return vec; +#else + return vec_vsx_ld (offset, ptr); +#endif +} + +#define STORE_TABLE(gcm_table, slot, vec) \ + vec_aligned_st (((block)vec), slot * 16, (unsigned char *)(gcm_table)); + + +static ASM_FUNC_ATTR_INLINE void +vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr) +{ +#ifndef WORDS_BIGENDIAN + __asm__ ("stvx %0,%1,%2\n\t" + : + : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); +#else + vec_vsx_st ((vector16x_u8)vec, offset, ptr); +#endif +} + +#define VEC_LOAD_BE(in_ptr, bswap_const) \ + (vec_load_be (0, (const unsigned char *)(in_ptr), bswap_const)) + +static ASM_FUNC_ATTR_INLINE block +vec_load_be(unsigned long offset, const unsigned char *ptr, + vector unsigned char be_bswap_const) +{ +#ifndef WORDS_BIGENDIAN + block vec; + /* GCC vec_vsx_ld is generating two instructions on little-endian. Use + * lxvw4x directly instead. */ + __asm__ ("lxvw4x %x0,%1,%2\n\t" + : "=wa" (vec) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); + __asm__ ("vperm %0,%1,%1,%2\n\t" + : "=v" (vec) + : "v" (vec), "v" (be_bswap_const)); + return vec; +#else + (void)be_bswap_const; + return vec_vsx_ld (offset, ptr); +#endif +} + +/* Power ghash based on papers: + "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega + "Intel? Carry-Less Multiplication Instruction and its Usage for Computing + the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. + + After saving the magic c2 constant and pre-formatted version of the key, + we pre-process the key for parallel hashing. This takes advantage of the + identity of addition over a galois field being identital to XOR, and thus + can be parellized (S 2.2, page 3). We multiply and add (galois field + versions) the key over multiple iterations and save the result. This can + later be galois added (XORed) with parallel processed input (Estrin's + Scheme). + + The ghash "key" is a salt. */ +void ASM_FUNC_ATTR +_gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) +{ + vector16x_u8 bswap_const = + { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; + vector16x_u8 c2 = + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 }; + block T0, T1, T2; + block C2, H, H1, H1l, H1h, H2, H2l, H2h; + block H3l, H3, H3h, H4l, H4, H4h, T3, T4; + vector16x_s8 most_sig_of_H, t7, carry; + vector16x_u8 one = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + + H = VEC_LOAD_BE(gcm_key, bswap_const); + most_sig_of_H = vec_splat((vector16x_s8)H, 15); + t7 = vec_splat_s8(7); + carry = most_sig_of_H >> t7; + carry &= c2; /* only interested in certain carries. 
*/ + H1 = asm_ashl_128(H, one); + H1 ^= (block)carry; /* complete the <<< 1 */ + + T1 = asm_swap_u64 (H1); + H1l = asm_rot_block_right (T1); + H1h = asm_rot_block_left (T1); + C2 = asm_rot_block_right ((block)c2); + + STORE_TABLE (gcm_table, 0, C2); + STORE_TABLE (gcm_table, 1, H1l); + STORE_TABLE (gcm_table, 2, T1); + STORE_TABLE (gcm_table, 3, H1h); + + /* pre-process coefficients for Gerald Estrin's scheme for parallel + * multiplication of polynomials + */ + H2l = asm_vpmsumd (H1l, H1); /* do not need to mask in + because 0 * anything -> 0 */ + H2 = asm_vpmsumd (T1, H1); + H2h = asm_vpmsumd (H1h, H1); + + /* reduce 1 */ + T0 = asm_vpmsumd (H2l, C2); + + H2l ^= asm_rot_block_left (H2);; + H2h ^= asm_rot_block_right (H2); + H2l = asm_swap_u64 (H2l); + H2l ^= T0; + /* reduce 2 */ + T0 = asm_swap_u64 (H2l); + H2l = asm_vpmsumd (H2l, C2); + H2 = H2l ^ H2h ^ T0; + + T2 = asm_swap_u64 (H2); + H2l = asm_rot_block_right (T2); + H2h = asm_rot_block_left (T2); + + STORE_TABLE (gcm_table, 4, H2l); + STORE_TABLE (gcm_table, 5, T2); + STORE_TABLE (gcm_table, 6, H2h); + + H3l = asm_vpmsumd (H2l, H1); + H4l = asm_vpmsumd (H2l, H2); + H3 = asm_vpmsumd (T2, H1); + H4 = asm_vpmsumd (T2, H2); + H3h = asm_vpmsumd (H2h, H1); + H4h = asm_vpmsumd (H2h, H2); + + T3 = asm_vpmsumd (H3l, C2); + T4 = asm_vpmsumd (H4l, C2); + + H3l ^= asm_rot_block_left (H3); + H3h ^= asm_rot_block_right (H3); + H4l ^= asm_rot_block_left (H4); + H4h ^= asm_rot_block_right (H4); + + H3 = asm_swap_u64 (H3l); + H4 = asm_swap_u64 (H4l); + + H3 ^= T3; + H4 ^= T4; + + /* We could have also b64 switched reduce and reduce2, however as we are + using the unrotated H and H2 above to vpmsum, this is marginally better. */ + T3 = asm_swap_u64 (H3); + T4 = asm_swap_u64 (H4); + + H3 = asm_vpmsumd (H3, C2); + H4 = asm_vpmsumd (H4, C2); + + T3 ^= H3h; + T4 ^= H4h; + H3 ^= T3; + H4 ^= T4; + H3 = asm_swap_u64 (H3); + H4 = asm_swap_u64 (H4); + + H3l = asm_rot_block_right (H3); + H3h = asm_rot_block_left (H3); + H4l = asm_rot_block_right (H4); + H4h = asm_rot_block_left (H4); + + STORE_TABLE (gcm_table, 7, H3l); + STORE_TABLE (gcm_table, 8, H3); + STORE_TABLE (gcm_table, 9, H3h); + STORE_TABLE (gcm_table, 10, H4l); + STORE_TABLE (gcm_table, 11, H4); + STORE_TABLE (gcm_table, 12, H4h); +} + +ASM_FUNC_ATTR_INLINE +block +vec_perm2(block l, block r, vector16x_u8 perm) { + block ret; + __asm__ ("vperm %0,%1,%2,%3\n\t" + : "=v" (ret) + : "v" (l), "v" (r), "v" (perm)); + return ret; +} + +void ASM_FUNC_ATTR +_gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, + const byte *const buf, const size_t nblocks) +{ + /* This const is strange, it is reversing the bytes, and also reversing + the u32s that get switched by lxvw4 and it also addresses bytes big-endian, + and is here due to lack of proper peep-hole optimization. 
*/ + vector16x_u8 bswap_const = + { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; + vector16x_u8 bswap_8_const = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl; + block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur; + size_t blocks_remaining = nblocks, off = 0; + size_t not_multiple_of_four; + block t0; + + cur = vec_load_be (0, result, bswap_const); + + c2 = vec_aligned_ld (0, gcm_table); + H0l = vec_aligned_ld (16, gcm_table); + H0m = vec_aligned_ld (32, gcm_table); + H0h = vec_aligned_ld (48, gcm_table); + + for (not_multiple_of_four = nblocks % 4; not_multiple_of_four; + not_multiple_of_four--) + { + in = vec_load_be (off, buf, bswap_const); + off += 16; + blocks_remaining--; + cur ^= in; + + Hl = asm_vpmsumd (cur, H0l); + Hm = asm_vpmsumd (cur, H0m); + Hh = asm_vpmsumd (cur, H0h); + + t0 = asm_vpmsumd (Hl, c2); + + Hl ^= asm_rot_block_left (Hm); + + Hm_right = asm_rot_block_right (Hm); + Hh ^= Hm_right; + Hl_rotate = asm_swap_u64 (Hl); + Hl_rotate ^= t0; + Hl = asm_swap_u64 (Hl_rotate); + Hl_rotate = asm_vpmsumd (Hl_rotate, c2); + Hl ^= Hh; + Hl ^= Hl_rotate; + + cur = Hl; + } + + if (blocks_remaining > 0) + { + vector16x_u8 hiperm = + { + 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 + }; + vector16x_u8 loperm = + { + 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, + 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8 + }; + block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate; + block H21l, H21h, merge_l, merge_h; + + H2m = vec_aligned_ld (48 + 32, gcm_table); + H3l = vec_aligned_ld (48 * 2 + 16, gcm_table); + H3m = vec_aligned_ld (48 * 2 + 32, gcm_table); + H3h = vec_aligned_ld (48 * 2 + 48, gcm_table); + H4l = vec_aligned_ld (48 * 3 + 16, gcm_table); + H4m = vec_aligned_ld (48 * 3 + 32, gcm_table); + H4h = vec_aligned_ld (48 * 3 + 48, gcm_table); + + in0 = vec_load_be (off, buf, bswap_const); + in1 = vec_load_be (off + 16, buf, bswap_const); + in2 = vec_load_be (off + 32, buf, bswap_const); + in3 = vec_load_be (off + 48, buf, bswap_const); + blocks_remaining -= 4; + off += 64; + + Xh = in0 ^ cur; + + Xl1 = asm_vpmsumd (in1, H3l); + Xm1 = asm_vpmsumd (in1, H3m); + Xh1 = asm_vpmsumd (in1, H3h); + + H21l = vec_perm2 (H2m, H0m, hiperm); + H21h = vec_perm2 (H2m, H0m, loperm); + merge_l = vec_perm2 (in2, in3, loperm); + merge_h = vec_perm2 (in2, in3, hiperm); + + Xm2 = asm_vpmsumd (in2, H2m); + Xl3 = asm_vpmsumd (merge_l, H21l); + Xm3 = asm_vpmsumd (in3, H0m); + Xh3 = asm_vpmsumd (merge_h, H21h); + + Xm2 ^= Xm1; + Xl3 ^= Xl1; + Xm3 ^= Xm2; + Xh3 ^= Xh1; + + /* Gerald Estrin's scheme for parallel multiplication of polynomials */ + for (;blocks_remaining > 0; blocks_remaining -= 4, off += 64) + { + in0 = vec_load_be (off, buf, bswap_const); + in1 = vec_load_be (off + 16, buf, bswap_const); + in2 = vec_load_be (off + 32, buf, bswap_const); + in3 = vec_load_be (off + 48, buf, bswap_const); + + Xl = asm_vpmsumd (Xh, H4l); + Xm = asm_vpmsumd (Xh, H4m); + Xh = asm_vpmsumd (Xh, H4h); + Xl1 = asm_vpmsumd (in1, H3l); + Xm1 = asm_vpmsumd (in1, H3m); + Xh1 = asm_vpmsumd (in1, H3h); + + Xl ^= Xl3; + Xm ^= Xm3; + Xh ^= Xh3; + merge_l = vec_perm2 (in2, in3, loperm); + merge_h = vec_perm2 (in2, in3, hiperm); + + t0 = asm_vpmsumd (Xl, c2); + Xl3 = asm_vpmsumd (merge_l, H21l); + Xh3 = asm_vpmsumd (merge_h, H21h); + + Xl ^= asm_rot_block_left (Xm); + Xh ^= asm_rot_block_right (Xm); + + Xl = asm_swap_u64 (Xl); + Xl ^= t0; + + Xl_rotate = asm_swap_u64 (Xl); + Xm2 = 
asm_vpmsumd (in2, H2m); + Xm3 = asm_vpmsumd (in3, H0m); + Xl = asm_vpmsumd (Xl, c2); + + Xl3 ^= Xl1; + Xh3 ^= Xh1; + Xh ^= in0; + Xm2 ^= Xm1; + Xh ^= Xl_rotate; + Xm3 ^= Xm2; + Xh ^= Xl; + } + + Xl = asm_vpmsumd (Xh, H4l); + Xm = asm_vpmsumd (Xh, H4m); + Xh = asm_vpmsumd (Xh, H4h); + + Xl ^= Xl3; + Xm ^= Xm3; + + t0 = asm_vpmsumd (Xl, c2); + + Xh ^= Xh3; + Xl ^= asm_rot_block_left (Xm); + Xh ^= asm_rot_block_right (Xm); + + Xl = asm_swap_u64 (Xl); + Xl ^= t0; + + Xl_rotate = asm_swap_u64 (Xl); + Xl = asm_vpmsumd (Xl, c2); + Xl_rotate ^= Xh; + Xl ^= Xl_rotate; + + cur = Xl; + } + + cur = (block)vec_perm ((vector16x_u8)cur, (vector16x_u8)cur, bswap_8_const); + STORE_TABLE (result, 0, cur); +} + +#endif /* GCM_USE_PPC_VPMSUM */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 7aad1277..598ea5fb 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -109,6 +109,28 @@ ghash_s390x_kimd (gcry_cipher_hd_t c, byte *result, const byte *buf, } #endif /* GCM_USE_S390X_CRYPTO*/ +#ifdef GCM_USE_PPC_VPMSUM +extern void _gcry_ghash_setup_ppc_vpmsum (void *gcm_table, void *gcm_key); + +/* result is 128-bits */ +extern unsigned int _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table, + const byte *buf, size_t nblocks); + +static void +ghash_setup_ppc_vpmsum (gcry_cipher_hd_t c) +{ + _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, c->u_mode.gcm.u_ghash_key.key); +} + +static unsigned int +ghash_ppc_vpmsum (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf, + nblocks); + return 0; +} +#endif /* GCM_USE_PPC_VPMSUM */ #ifdef GCM_USE_TABLES static struct @@ -543,7 +565,7 @@ static void setupM (gcry_cipher_hd_t c) { #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ - defined(GCM_USE_S390X_CRYPTO) + defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif @@ -572,6 +594,13 @@ setupM (gcry_cipher_hd_t c) ghash_setup_armv7_neon (c); } #endif +#ifdef GCM_USE_PPC_VPMSUM + else if (features & HWF_PPC_VCRYPTO) + { + c->u_mode.gcm.ghash_fn = ghash_ppc_vpmsum; + ghash_setup_ppc_vpmsum (c); + } +#endif #ifdef GCM_USE_S390X_CRYPTO else if (features & HWF_S390X_MSA) { diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 59b36ce7..1d62b11e 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -102,6 +102,18 @@ # define GCM_USE_S390X_CRYPTO 1 #endif /* GCM_USE_S390X_CRYPTO */ +/* GCM_USE_PPC_VPMSUM indicates whether to compile GCM with PPC Power 8 + * polynomial multiplication instruction. */ +#undef GCM_USE_PPC_VPMSUM +#if defined(GCM_USE_TABLES) +#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && defined(__powerpc64__) && \ + !defined(WORDS_BIGENDIAN) && defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && __GNUC__ >= 4 +# define GCM_USE_PPC_VPMSUM 1 +# define NEED_16BYTE_ALIGNED_CONTEXT 1 /* this also aligns gcm_table */ +#endif +#endif /* GCM_USE_PPC_VPMSUM */ + typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); @@ -322,9 +334,6 @@ struct gcry_cipher_handle unsigned char key[MAX_BLOCKSIZE]; } u_ghash_key; - /* GHASH implementation in use. */ - ghash_fn_t ghash_fn; - /* Pre-calculated table for GCM. */ #ifdef GCM_USE_TABLES #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) @@ -335,6 +344,9 @@ struct gcry_cipher_handle u32 gcm_table[8 * 16]; #endif #endif + + /* GHASH implementation in use. 
*/ + ghash_fn_t ghash_fn; } gcm; /* Mode specific storage for OCB mode. */ diff --git a/configure.ac b/configure.ac index 564d361b..fb1a7a70 100644 --- a/configure.ac +++ b/configure.ac @@ -3058,6 +3058,19 @@ case "$mpi_cpu_arch" in ;; esac +# Arch specific GCM implementations +case "${host}" in + powerpc64le-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; + powerpc64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; + powerpc-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; +esac + LIST_MEMBER(sm3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo" diff --git a/tests/basic.c b/tests/basic.c index 2b543846..9a7e33cc 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -3233,6 +3233,22 @@ _check_gcm_cipher (unsigned int step) "\x0f\xc0\xc3\xb7\x80\xf2\x44\x45\x2d\xa3\xeb\xf1\xc5\xd8\x2c\xde" "\xa2\x41\x89\x97\x20\x0e\xf8\x2e\x44\xae\x7e\x3f", "\xa4\x4a\x82\x66\xee\x1c\x8e\xb0\xc8\xb5\xd4\xcf\x5a\xe9\xf1\x9a" }, + { GCRY_CIPHER_AES256, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08", + "\xca\xfe\xba\xbe\xfa\xce\xdb\xad\xde\xca\xf8\x88", 12, + "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", 20, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39", + 60, + "\x52\x2d\xc1\xf0\x99\x56\x7d\x07\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" + "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9\x75\x98\xa2\xbd\x25\x55\xd1\xaa" + "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d\xa7\xb0\x8b\x10\x56\x82\x88\x38" + "\xc5\xf6\x1e\x63\x93\xba\x7a\x0a\xbc\xc9\xf6\x62", + "\x76\xfc\x6e\xce\x0f\x4e\x17\x68\xcd\xdf\x88\x53\xbb\x2d\x55\x1b" }, /* Test vectors for overflowing CTR. */ /* After setiv, ctr_low: 0xffffffff */ { GCRY_CIPHER_AES256, -- 2.27.0 From jussi.kivilinna at iki.fi Sun Mar 7 17:44:08 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 7 Mar 2021 18:44:08 +0200 Subject: [PATCH 2/4] cipher-gcm-ppc: tweak for better performance In-Reply-To: <20210307164410.508295-1-jussi.kivilinna@iki.fi> References: <20210307164410.508295-1-jussi.kivilinna@iki.fi> Message-ID: <20210307164410.508295-2-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-ppc.c (asm_xor, asm_mergelo, asm_mergehi) (vec_be_swap, vec_load_he, vec_store_he): New. (vec_load_be, vec_perm2, vec_aligned_st,vec_aligned_ld): Remove. (asm_vpmsumd, asm_swap_u64, vec_perm2, asm_rot_block_left) (asm_rot_block_right, asm_ashl_128, vec_aligned_ld) (vec_aligned_st, vec_load_be): Use 'asm volatile'. (_gcry_ghash_setup_ppc_vpmsum): Update 'bswap_const'. (_gcry_ghash_ppc_vpmsum): Update 'bswap_const'; Use 'asm_mergehi' and 'asm_mergelo' instead of vec_perm2; Use 'asm_xor' for fast path to enforce instruction ordering; Use 'vec_load_he' and 'vec_be_swap' for big-endian loads. 
-- Benchmark on POWER8 (3700Mhz): Before: | nanosecs/byte mebibytes/sec cycles/byte GMAC_AES | 0.169 ns/B 5647 MiB/s 0.625 c/B After (~13% faster): | nanosecs/byte mebibytes/sec cycles/byte GMAC_AES | 0.149 ns/B 6385 MiB/s 0.553 c/B Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-ppc.c | 336 ++++++++++++++++++++++------------------ 1 file changed, 185 insertions(+), 151 deletions(-) diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c index ed27ef15..2f60c09d 100644 --- a/cipher/cipher-gcm-ppc.c +++ b/cipher/cipher-gcm-ppc.c @@ -93,112 +93,157 @@ typedef vector signed char vector16x_s8; typedef vector unsigned long long vector2x_u64; typedef vector unsigned long long block; +static ASM_FUNC_ATTR_INLINE block +asm_xor(block a, block b) +{ + block r; + __asm__ volatile ("xxlxor %x0, %x1, %x2" + : "=wa" (r) + : "wa" (a), "wa" (b)); + return r; +} + static ASM_FUNC_ATTR_INLINE block asm_vpmsumd(block a, block b) { block r; - __asm__("vpmsumd %0, %1, %2" - : "=v" (r) - : "v" (a), "v" (b)); + __asm__ volatile ("vpmsumd %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (b)); return r; } static ASM_FUNC_ATTR_INLINE block asm_swap_u64(block a) { - __asm__("xxswapd %x0, %x1" - : "=wa" (a) - : "wa" (a)); - return a; + block r; + __asm__ volatile ("xxswapd %x0, %x1" + : "=wa" (r) + : "wa" (a)); + return r; } static ASM_FUNC_ATTR_INLINE block -asm_rot_block_left(block a) +asm_mergelo(block l, block r) { - block zero = {0, 0}; - block mask = {2, 0}; - return __builtin_shuffle(a, zero, mask); + block ret; + __asm__ volatile ("xxmrgld %x0, %x1, %x2\n\t" + : "=wa" (ret) + : "wa" (l), "wa" (r)); + return ret; } static ASM_FUNC_ATTR_INLINE block -asm_rot_block_right(block a) +asm_mergehi(block l, block r) { - block zero = {0, 0}; - block mask = {1, 2}; - return __builtin_shuffle(a, zero, mask); + block ret; + __asm__ volatile ("xxmrghd %x0, %x1, %x2\n\t" + : "=wa" (ret) + : "wa" (l), "wa" (r)); + return ret; } -/* vsl is a slightly strange function in the way the shift is passed... */ static ASM_FUNC_ATTR_INLINE block -asm_ashl_128(block a, vector16x_u8 shift) +asm_rot_block_left(block a) { block r; - __asm__("vsl %0, %1, %2" - : "=v" (r) - : "v" (a), "v" (shift)); + block zero = { 0, 0 }; + __asm__ volatile ("xxmrgld %x0, %x1, %x2" + : "=wa" (r) + : "wa" (a), "wa" (zero)); return r; } -#define ALIGNED_LOAD(in_ptr) \ - (vec_aligned_ld (0, (const unsigned char *)(in_ptr))) +static ASM_FUNC_ATTR_INLINE block +asm_rot_block_right(block a) +{ + block r; + block zero = { 0, 0 }; + __asm__ volatile ("xxsldwi %x0, %x2, %x1, 2" + : "=wa" (r) + : "wa" (a), "wa" (zero)); + return r; +} +/* vsl is a slightly strange function in the way the shift is passed... 
*/ static ASM_FUNC_ATTR_INLINE block -vec_aligned_ld(unsigned long offset, const unsigned char *ptr) +asm_ashl_128(block a, vector16x_u8 shift) { -#ifndef WORDS_BIGENDIAN - block vec; - __asm__ ("lvx %0,%1,%2\n\t" - : "=v" (vec) - : "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); - return vec; -#else - return vec_vsx_ld (offset, ptr); -#endif + block r; + __asm__ volatile ("vsl %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (shift)); + return r; } #define STORE_TABLE(gcm_table, slot, vec) \ - vec_aligned_st (((block)vec), slot * 16, (unsigned char *)(gcm_table)); - + vec_store_he (((block)vec), slot * 16, (unsigned char *)(gcm_table)); static ASM_FUNC_ATTR_INLINE void -vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr) +vec_store_he(block vec, unsigned long offset, unsigned char *ptr) { #ifndef WORDS_BIGENDIAN - __asm__ ("stvx %0,%1,%2\n\t" - : - : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); + /* GCC vec_vsx_ld is generating two instructions on little-endian. Use + * lxvd2x directly instead. */ +#if __GNUC__ >= 4 + if (__builtin_constant_p (offset) && offset == 0) + __asm__ volatile ("stxvd2x %x0, 0, %1\n\t" + : + : "wa" (vec), "r" ((uintptr_t)ptr) + : "memory", "r0"); + else +#endif + __asm__ volatile ("stxvd2x %x0, %1, %2\n\t" + : + : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); #else vec_vsx_st ((vector16x_u8)vec, offset, ptr); #endif } #define VEC_LOAD_BE(in_ptr, bswap_const) \ - (vec_load_be (0, (const unsigned char *)(in_ptr), bswap_const)) + vec_be_swap(vec_load_he (0, (const unsigned char *)(in_ptr)), bswap_const) static ASM_FUNC_ATTR_INLINE block -vec_load_be(unsigned long offset, const unsigned char *ptr, - vector unsigned char be_bswap_const) +vec_load_he(unsigned long offset, const unsigned char *ptr) { #ifndef WORDS_BIGENDIAN block vec; /* GCC vec_vsx_ld is generating two instructions on little-endian. Use - * lxvw4x directly instead. */ - __asm__ ("lxvw4x %x0,%1,%2\n\t" - : "=wa" (vec) - : "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); - __asm__ ("vperm %0,%1,%1,%2\n\t" - : "=v" (vec) - : "v" (vec), "v" (be_bswap_const)); + * lxvd2x directly instead. */ +#if __GNUC__ >= 4 + if (__builtin_constant_p (offset) && offset == 0) + __asm__ volatile ("lxvd2x %x0, 0, %1\n\t" + : "=wa" (vec) + : "r" ((uintptr_t)ptr) + : "memory", "r0"); + else +#endif + __asm__ volatile ("lxvd2x %x0, %1, %2\n\t" + : "=wa" (vec) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); return vec; #else - (void)be_bswap_const; return vec_vsx_ld (offset, ptr); #endif } +static ASM_FUNC_ATTR_INLINE block +vec_be_swap(block vec, vector16x_u8 be_bswap_const) +{ +#ifndef WORDS_BIGENDIAN + __asm__ volatile ("vperm %0, %1, %1, %2\n\t" + : "=v" (vec) + : "v" (vec), "v" (be_bswap_const)); +#else + (void)be_bswap_const; +#endif + return vec; +} + + /* Power ghash based on papers: "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega "Intel? 
Carry-Less Multiplication Instruction and its Usage for Computing @@ -216,15 +261,16 @@ vec_load_be(unsigned long offset, const unsigned char *ptr, void ASM_FUNC_ATTR _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) { - vector16x_u8 bswap_const = - { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; - vector16x_u8 c2 = + static const vector16x_u8 bswap_const = + { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; + static const vector16x_u8 c2 = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 }; + static const vector16x_u8 one = + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; block T0, T1, T2; block C2, H, H1, H1l, H1h, H2, H2l, H2h; block H3l, H3, H3h, H4l, H4, H4h, T3, T4; vector16x_s8 most_sig_of_H, t7, carry; - vector16x_u8 one = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; H = VEC_LOAD_BE(gcm_key, bswap_const); most_sig_of_H = vec_splat((vector16x_s8)H, 15); @@ -255,7 +301,7 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) /* reduce 1 */ T0 = asm_vpmsumd (H2l, C2); - H2l ^= asm_rot_block_left (H2);; + H2l ^= asm_rot_block_left (H2); H2h ^= asm_rot_block_right (H2); H2l = asm_swap_u64 (H2l); H2l ^= T0; @@ -321,45 +367,30 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) STORE_TABLE (gcm_table, 12, H4h); } -ASM_FUNC_ATTR_INLINE -block -vec_perm2(block l, block r, vector16x_u8 perm) { - block ret; - __asm__ ("vperm %0,%1,%2,%3\n\t" - : "=v" (ret) - : "v" (l), "v" (r), "v" (perm)); - return ret; -} - void ASM_FUNC_ATTR -_gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, - const byte *const buf, const size_t nblocks) +_gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table, + const byte *buf, const size_t nblocks) { - /* This const is strange, it is reversing the bytes, and also reversing - the u32s that get switched by lxvw4 and it also addresses bytes big-endian, - and is here due to lack of proper peep-hole optimization. 
*/ - vector16x_u8 bswap_const = - { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; - vector16x_u8 bswap_8_const = - { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + static const vector16x_u8 bswap_const = + { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl; block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur; - size_t blocks_remaining = nblocks, off = 0; + size_t blocks_remaining = nblocks; size_t not_multiple_of_four; block t0; - cur = vec_load_be (0, result, bswap_const); + cur = vec_be_swap (vec_load_he (0, result), bswap_const); - c2 = vec_aligned_ld (0, gcm_table); - H0l = vec_aligned_ld (16, gcm_table); - H0m = vec_aligned_ld (32, gcm_table); - H0h = vec_aligned_ld (48, gcm_table); + c2 = vec_load_he (0, gcm_table); + H0l = vec_load_he (16, gcm_table); + H0m = vec_load_he (32, gcm_table); + H0h = vec_load_he (48, gcm_table); for (not_multiple_of_four = nblocks % 4; not_multiple_of_four; not_multiple_of_four--) { - in = vec_load_be (off, buf, bswap_const); - off += 16; + in = vec_be_swap (vec_load_he (0, buf), bswap_const); + buf += 16; blocks_remaining--; cur ^= in; @@ -385,62 +416,64 @@ _gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, if (blocks_remaining > 0) { - vector16x_u8 hiperm = - { - 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 - }; - vector16x_u8 loperm = - { - 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, - 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8 - }; block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate; block H21l, H21h, merge_l, merge_h; - - H2m = vec_aligned_ld (48 + 32, gcm_table); - H3l = vec_aligned_ld (48 * 2 + 16, gcm_table); - H3m = vec_aligned_ld (48 * 2 + 32, gcm_table); - H3h = vec_aligned_ld (48 * 2 + 48, gcm_table); - H4l = vec_aligned_ld (48 * 3 + 16, gcm_table); - H4m = vec_aligned_ld (48 * 3 + 32, gcm_table); - H4h = vec_aligned_ld (48 * 3 + 48, gcm_table); - - in0 = vec_load_be (off, buf, bswap_const); - in1 = vec_load_be (off + 16, buf, bswap_const); - in2 = vec_load_be (off + 32, buf, bswap_const); - in3 = vec_load_be (off + 48, buf, bswap_const); - blocks_remaining -= 4; - off += 64; - - Xh = in0 ^ cur; + block t1, t2; + + H2m = vec_load_he (48 + 32, gcm_table); + H3l = vec_load_he (48 * 2 + 16, gcm_table); + H3m = vec_load_he (48 * 2 + 32, gcm_table); + H3h = vec_load_he (48 * 2 + 48, gcm_table); + H4l = vec_load_he (48 * 3 + 16, gcm_table); + H4m = vec_load_he (48 * 3 + 32, gcm_table); + H4h = vec_load_he (48 * 3 + 48, gcm_table); + + in0 = vec_load_he (0, buf); + in1 = vec_load_he (16, buf); + in2 = vec_load_he (32, buf); + in3 = vec_load_he (48, buf); + in0 = vec_be_swap(in0, bswap_const); + in1 = vec_be_swap(in1, bswap_const); + in2 = vec_be_swap(in2, bswap_const); + in3 = vec_be_swap(in3, bswap_const); + + Xh = asm_xor (in0, cur); Xl1 = asm_vpmsumd (in1, H3l); Xm1 = asm_vpmsumd (in1, H3m); Xh1 = asm_vpmsumd (in1, H3h); - H21l = vec_perm2 (H2m, H0m, hiperm); - H21h = vec_perm2 (H2m, H0m, loperm); - merge_l = vec_perm2 (in2, in3, loperm); - merge_h = vec_perm2 (in2, in3, hiperm); + H21l = asm_mergehi (H2m, H0m); + H21h = asm_mergelo (H2m, H0m); + merge_l = asm_mergelo (in2, in3); + merge_h = asm_mergehi (in2, in3); Xm2 = asm_vpmsumd (in2, H2m); Xl3 = asm_vpmsumd (merge_l, H21l); Xm3 = asm_vpmsumd (in3, H0m); Xh3 = asm_vpmsumd (merge_h, H21h); - Xm2 ^= Xm1; - Xl3 ^= Xl1; - Xm3 ^= Xm2; - Xh3 ^= Xh1; + Xm2 = asm_xor (Xm2, Xm1); + Xl3 = asm_xor (Xl3, Xl1); + 
Xm3 = asm_xor (Xm3, Xm2); + Xh3 = asm_xor (Xh3, Xh1); /* Gerald Estrin's scheme for parallel multiplication of polynomials */ - for (;blocks_remaining > 0; blocks_remaining -= 4, off += 64) + while (1) { - in0 = vec_load_be (off, buf, bswap_const); - in1 = vec_load_be (off + 16, buf, bswap_const); - in2 = vec_load_be (off + 32, buf, bswap_const); - in3 = vec_load_be (off + 48, buf, bswap_const); + buf += 64; + blocks_remaining -= 4; + if (!blocks_remaining) + break; + + in0 = vec_load_he (0, buf); + in1 = vec_load_he (16, buf); + in2 = vec_load_he (32, buf); + in3 = vec_load_he (48, buf); + in1 = vec_be_swap(in1, bswap_const); + in2 = vec_be_swap(in2, bswap_const); + in3 = vec_be_swap(in3, bswap_const); + in0 = vec_be_swap(in0, bswap_const); Xl = asm_vpmsumd (Xh, H4l); Xm = asm_vpmsumd (Xh, H4m); @@ -449,62 +482,63 @@ _gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, Xm1 = asm_vpmsumd (in1, H3m); Xh1 = asm_vpmsumd (in1, H3h); - Xl ^= Xl3; - Xm ^= Xm3; - Xh ^= Xh3; - merge_l = vec_perm2 (in2, in3, loperm); - merge_h = vec_perm2 (in2, in3, hiperm); + Xl = asm_xor (Xl, Xl3); + Xm = asm_xor (Xm, Xm3); + Xh = asm_xor (Xh, Xh3); + merge_l = asm_mergelo (in2, in3); + merge_h = asm_mergehi (in2, in3); t0 = asm_vpmsumd (Xl, c2); Xl3 = asm_vpmsumd (merge_l, H21l); Xh3 = asm_vpmsumd (merge_h, H21h); - Xl ^= asm_rot_block_left (Xm); - Xh ^= asm_rot_block_right (Xm); + t1 = asm_rot_block_left (Xm); + t2 = asm_rot_block_right (Xm); + Xl = asm_xor(Xl, t1); + Xh = asm_xor(Xh, t2); Xl = asm_swap_u64 (Xl); - Xl ^= t0; + Xl = asm_xor(Xl, t0); Xl_rotate = asm_swap_u64 (Xl); Xm2 = asm_vpmsumd (in2, H2m); Xm3 = asm_vpmsumd (in3, H0m); Xl = asm_vpmsumd (Xl, c2); - Xl3 ^= Xl1; - Xh3 ^= Xh1; - Xh ^= in0; - Xm2 ^= Xm1; - Xh ^= Xl_rotate; - Xm3 ^= Xm2; - Xh ^= Xl; + Xl3 = asm_xor (Xl3, Xl1); + Xh3 = asm_xor (Xh3, Xh1); + Xh = asm_xor (Xh, in0); + Xm2 = asm_xor (Xm2, Xm1); + Xh = asm_xor (Xh, Xl_rotate); + Xm3 = asm_xor (Xm3, Xm2); + Xh = asm_xor (Xh, Xl); } Xl = asm_vpmsumd (Xh, H4l); Xm = asm_vpmsumd (Xh, H4m); Xh = asm_vpmsumd (Xh, H4h); - Xl ^= Xl3; - Xm ^= Xm3; + Xl = asm_xor (Xl, Xl3); + Xm = asm_xor (Xm, Xm3); t0 = asm_vpmsumd (Xl, c2); - Xh ^= Xh3; - Xl ^= asm_rot_block_left (Xm); - Xh ^= asm_rot_block_right (Xm); + Xh = asm_xor (Xh, Xh3); + t1 = asm_rot_block_left (Xm); + t2 = asm_rot_block_right (Xm); + Xl = asm_xor (Xl, t1); + Xh = asm_xor (Xh, t2); Xl = asm_swap_u64 (Xl); - Xl ^= t0; + Xl = asm_xor (Xl, t0); Xl_rotate = asm_swap_u64 (Xl); Xl = asm_vpmsumd (Xl, c2); - Xl_rotate ^= Xh; - Xl ^= Xl_rotate; - - cur = Xl; + Xh = asm_xor (Xh, Xl_rotate); + cur = asm_xor (Xh, Xl); } - cur = (block)vec_perm ((vector16x_u8)cur, (vector16x_u8)cur, bswap_8_const); - STORE_TABLE (result, 0, cur); + vec_store_he (vec_be_swap (cur, bswap_const), 0, result); } #endif /* GCM_USE_PPC_VPMSUM */ -- 2.27.0 From jussi.kivilinna at iki.fi Sun Mar 7 17:44:09 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 7 Mar 2021 18:44:09 +0200 Subject: [PATCH 3/4] configure.ac: fix digest implementations going to cipher list In-Reply-To: <20210307164410.508295-1-jussi.kivilinna@iki.fi> References: <20210307164410.508295-1-jussi.kivilinna@iki.fi> Message-ID: <20210307164410.508295-3-jussi.kivilinna@iki.fi> * configure.ac: Add 'crc-arm*.lo', 'crc-ppc.lo', 'sha*-ppc.lo' to GCRYPT_DIGESTS instead of GCRYPT_CIPHERS. 
-- Signed-off-by: Jussi Kivilinna --- configure.ac | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/configure.ac b/configure.ac index fb1a7a70..3734cf7a 100644 --- a/configure.ac +++ b/configure.ac @@ -2837,17 +2837,17 @@ if test "$found" = "1" ; then ;; aarch64-*-*) # Build with the assembly implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-ce.lo" - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-aarch64-ce.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-armv8-ce.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo" ;; powerpc64-*-*) - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo" ;; powerpc-*-*) - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo" ;; esac fi @@ -2914,17 +2914,17 @@ if test "$found" = "1" ; then ;; powerpc64le-*-*) # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo" esac case "$mpi_cpu_arch" in @@ -2957,17 +2957,17 @@ if test "$found" = "1" ; then ;; powerpc64le-*-*) # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo" esac if test x"$neonsupport" = xyes ; then -- 2.27.0 From jussi.kivilinna at iki.fi Sun Mar 7 17:44:10 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 7 Mar 2021 18:44:10 +0200 Subject: [PATCH 4/4] Compile arch specific GCM implementations only on target arch In-Reply-To: <20210307164410.508295-1-jussi.kivilinna@iki.fi> References: <20210307164410.508295-1-jussi.kivilinna@iki.fi> Message-ID: <20210307164410.508295-4-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Move arch specific 'cipher-gcm-*.[cS]' files from libcipher_la_SOURCES to EXTRA_libcipher_la_SOURCES. * configure.ac: Add 'cipher-gcm-intel-pclmul.lo' and 'cipher-gcm-arm*.lo'. 
-- Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 6 +++--- configure.ac | 16 ++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cipher/Makefile.am b/cipher/Makefile.am index da8cc126..52a00aa9 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -48,8 +48,7 @@ libcipher_la_SOURCES = \ cipher-aeswrap.c \ cipher-ccm.c \ cipher-cmac.c \ - cipher-gcm.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ - cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ + cipher-gcm.c \ cipher-poly1305.c \ cipher-ocb.c \ cipher-xts.c \ @@ -83,7 +82,8 @@ EXTRA_libcipher_la_SOURCES = \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-armv7-neon.S chacha20-aarch64.S \ chacha20-ppc.c chacha20-s390x.S \ - cipher-gcm-ppc.c \ + cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ + cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ diff --git a/configure.ac b/configure.ac index 3734cf7a..44ffffa7 100644 --- a/configure.ac +++ b/configure.ac @@ -3060,14 +3060,18 @@ esac # Arch specific GCM implementations case "${host}" in - powerpc64le-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + i?86-*-* | x86_64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-intel-pclmul.lo" ;; - powerpc64-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + arm*-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv7-neon.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv8-aarch32-ce.lo" + ;; + aarch64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv8-aarch64-ce.lo" ;; - powerpc-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + powerpc64le-*-* | powerpc64-*-* | powerpc-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" ;; esac -- 2.27.0 From jussi.kivilinna at iki.fi Sat Mar 27 11:40:33 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Mar 2021 12:40:33 +0200 Subject: [PATCH] cipher-gcm-ppc: add big-endian support Message-ID: <20210327104033.70161-1-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-ppc.c (ALIGNED_16): New. (vec_store_he, vec_load_he): Remove WORDS_BIGENDIAN ifdef. (vec_dup_byte_elem): New. (_gcry_ghash_setup_ppc_vpmsum): Match function declaration with prototype in cipher-gcm.c; Load C2 with VEC_LOAD_BE; Use vec_dup_byte_elem; Align constants to 16 bytes. (_gcry_ghash_ppc_vpmsum): Match function declaration with prototype in cipher-gcm.c; Align constant to 16 bytes. * cipher/cipher-gcm.c (ghash_ppc_vpmsum): Return value from _gcry_ghash_ppc_vpmsum. * cipher/cipher-internal.h (GCM_USE_PPC_VPMSUM): Remove requirement for !WORDS_BIGENDIAN. 
-- Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-ppc.c | 45 +++++++++++++++++++++++----------------- cipher/cipher-gcm.c | 8 +++---- cipher/cipher-internal.h | 2 +- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c index 2f60c09d..4f75e95c 100644 --- a/cipher/cipher-gcm-ppc.c +++ b/cipher/cipher-gcm-ppc.c @@ -88,6 +88,8 @@ #define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ALIGNED_16 __attribute__ ((aligned (16))) + typedef vector unsigned char vector16x_u8; typedef vector signed char vector16x_s8; typedef vector unsigned long long vector2x_u64; @@ -182,7 +184,6 @@ asm_ashl_128(block a, vector16x_u8 shift) static ASM_FUNC_ATTR_INLINE void vec_store_he(block vec, unsigned long offset, unsigned char *ptr) { -#ifndef WORDS_BIGENDIAN /* GCC vec_vsx_ld is generating two instructions on little-endian. Use * lxvd2x directly instead. */ #if __GNUC__ >= 4 @@ -197,9 +198,6 @@ vec_store_he(block vec, unsigned long offset, unsigned char *ptr) : : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr) : "memory", "r0"); -#else - vec_vsx_st ((vector16x_u8)vec, offset, ptr); -#endif } #define VEC_LOAD_BE(in_ptr, bswap_const) \ @@ -208,7 +206,6 @@ vec_store_he(block vec, unsigned long offset, unsigned char *ptr) static ASM_FUNC_ATTR_INLINE block vec_load_he(unsigned long offset, const unsigned char *ptr) { -#ifndef WORDS_BIGENDIAN block vec; /* GCC vec_vsx_ld is generating two instructions on little-endian. Use * lxvd2x directly instead. */ @@ -225,9 +222,6 @@ vec_load_he(unsigned long offset, const unsigned char *ptr) : "r" (offset), "r" ((uintptr_t)ptr) : "memory", "r0"); return vec; -#else - return vec_vsx_ld (offset, ptr); -#endif } static ASM_FUNC_ATTR_INLINE block @@ -243,6 +237,15 @@ vec_be_swap(block vec, vector16x_u8 be_bswap_const) return vec; } +static ASM_FUNC_ATTR_INLINE block +vec_dup_byte_elem(block vec, int idx) +{ +#ifndef WORDS_BIGENDIAN + return (block)vec_splat((vector16x_s8)vec, idx); +#else + return (block)vec_splat((vector16x_s8)vec, (15 - idx) & 15); +#endif +} /* Power ghash based on papers: "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega @@ -259,31 +262,33 @@ vec_be_swap(block vec, vector16x_u8 be_bswap_const) The ghash "key" is a salt. */ void ASM_FUNC_ATTR -_gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) +_gcry_ghash_setup_ppc_vpmsum (void *gcm_table_arg, void *gcm_key) { - static const vector16x_u8 bswap_const = + static const vector16x_u8 bswap_const ALIGNED_16 = { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; - static const vector16x_u8 c2 = - { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 }; - static const vector16x_u8 one = + static const byte c2[16] ALIGNED_16 = + { 0xc2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; + static const vector16x_u8 one ALIGNED_16 = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + uint64_t *gcm_table = gcm_table_arg; block T0, T1, T2; block C2, H, H1, H1l, H1h, H2, H2l, H2h; block H3l, H3, H3h, H4l, H4, H4h, T3, T4; vector16x_s8 most_sig_of_H, t7, carry; H = VEC_LOAD_BE(gcm_key, bswap_const); - most_sig_of_H = vec_splat((vector16x_s8)H, 15); + C2 = VEC_LOAD_BE(c2, bswap_const); + most_sig_of_H = (vector16x_s8)vec_dup_byte_elem(H, 15); t7 = vec_splat_s8(7); carry = most_sig_of_H >> t7; - carry &= c2; /* only interested in certain carries. */ + carry &= (vector16x_s8)C2; /* only interested in certain carries. 
*/ H1 = asm_ashl_128(H, one); H1 ^= (block)carry; /* complete the <<< 1 */ T1 = asm_swap_u64 (H1); H1l = asm_rot_block_right (T1); H1h = asm_rot_block_left (T1); - C2 = asm_rot_block_right ((block)c2); + C2 = asm_rot_block_right (C2); STORE_TABLE (gcm_table, 0, C2); STORE_TABLE (gcm_table, 1, H1l); @@ -367,11 +372,11 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) STORE_TABLE (gcm_table, 12, H4h); } -void ASM_FUNC_ATTR -_gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table, +unsigned int ASM_FUNC_ATTR +_gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table, const byte *buf, const size_t nblocks) { - static const vector16x_u8 bswap_const = + static const vector16x_u8 bswap_const ALIGNED_16 = { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl; block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur; @@ -539,6 +544,8 @@ _gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table, } vec_store_he (vec_be_swap (cur, bswap_const), 0, result); + + return 0; } #endif /* GCM_USE_PPC_VPMSUM */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 598ea5fb..4ce85408 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -119,16 +119,16 @@ extern unsigned int _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table, static void ghash_setup_ppc_vpmsum (gcry_cipher_hd_t c) { - _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, c->u_mode.gcm.u_ghash_key.key); + _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, + c->u_mode.gcm.u_ghash_key.key); } static unsigned int ghash_ppc_vpmsum (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { - _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf, - nblocks); - return 0; + return _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf, + nblocks); } #endif /* GCM_USE_PPC_VPMSUM */ diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 1d62b11e..0e4a90fc 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -107,7 +107,7 @@ #undef GCM_USE_PPC_VPMSUM #if defined(GCM_USE_TABLES) #if defined(ENABLE_PPC_CRYPTO_SUPPORT) && defined(__powerpc64__) && \ - !defined(WORDS_BIGENDIAN) && defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && __GNUC__ >= 4 # define GCM_USE_PPC_VPMSUM 1 # define NEED_16BYTE_ALIGNED_CONTEXT 1 /* this also aligns gcm_table */ -- 2.27.0 From jussi.kivilinna at iki.fi Sat Mar 27 15:31:05 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Mar 2021 16:31:05 +0200 Subject: [PATCH] Use memset for short constant length wipememory2 Message-ID: <20210327143105.91148-1-jussi.kivilinna@iki.fi> * src/g10lib.h (fast_wipememory2_inline): New. (wipememory2): Use 'fast_wipememory2_inline', remove 'fast_wipememory2' use; Use _gcry_fast_wipememory* only when _len is not constant. (fast_wipememory_s, fast_wipememory2): Remove. -- Use of memset allows better code generation by compiler - for example, use of vector registers for memory clearing. 
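For readers unfamiliar with the idiom, a minimal standalone sketch of the pattern this change relies on is below (the function name is illustrative; the real change is the fast_wipememory2_inline macro in the g10lib.h diff that follows). The memset does the clearing, and the empty asm statement that consumes the pointer and clobbers "memory" keeps the compiler from discarding the memset as a dead store.

#include <string.h>

/* Sketch only: wipe a buffer without the stores being optimized away.
 * The asm block tells the compiler the buffer may still be read, so the
 * preceding memset cannot be eliminated as a dead store. */
static void
wipe_sketch (void *buf, size_t len)
{
  memset (buf, 0, len);
  __asm__ volatile ("\n" : : "r" (buf) : "memory");
}
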
Signed-off-by: Jussi Kivilinna --- src/g10lib.h | 59 ++++++++++++++++------------------------------------ 1 file changed, 18 insertions(+), 41 deletions(-) diff --git a/src/g10lib.h b/src/g10lib.h index b0b73852..bc6e378f 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -359,56 +359,33 @@ void __gcry_burn_stack (unsigned int bytes); __gcry_burn_stack_dummy (); } while(0) /* To avoid that a compiler optimizes certain memset calls away, these - macros may be used instead. For small constant length buffers, - memory wiping is inlined. For non-constant or large length buffers, - memory is wiped with memset through _gcry_fast_wipememory. */ + macros may be used instead. For constant length buffers, memory + wiping is inlined. For non-constant or large length buffers, + memory is wiped through _gcry_fast_wipememory. */ +#ifdef HAVE_GCC_ASM_VOLATILE_MEMORY +#define fast_wipememory2_inline(_ptr,_set,_len) do { \ + memset((_ptr), (_set), (_len)); \ + asm volatile ("\n" :: "r" (_ptr) : "memory"); \ + } while(0) +#else +#define fast_wipememory2_inline(_ptr,_set,_len) \ + _gcry_fast_wipememory2((void *)_ptr, _set, _len) +#endif #define wipememory2(_ptr,_set,_len) do { \ - if (!CONSTANT_P(_len) || _len > 64) { \ + if (!CONSTANT_P(_len)) { \ if (CONSTANT_P(_set) && (_set) == 0) \ - _gcry_fast_wipememory((void *)_ptr, _len); \ + _gcry_fast_wipememory((void *)(_ptr), (_len)); \ else \ - _gcry_fast_wipememory2((void *)_ptr, _set, _len); \ - } else {\ - volatile char *_vptr = (volatile char *)(_ptr); \ - size_t _vlen = (_len); \ - const unsigned char _vset = (_set); \ - fast_wipememory2(_vptr, _vset, _vlen); \ - while(_vlen) { *_vptr = (_vset); _vptr++; _vlen--; } \ + _gcry_fast_wipememory2((void *)(_ptr), (_set), (_len)); \ + } else { \ + fast_wipememory2_inline((void *)(_ptr), (_set), (_len)); \ } \ } while(0) -#define wipememory(_ptr,_len) wipememory2(_ptr,0,_len) +#define wipememory(_ptr,_len) wipememory2((_ptr),0,(_len)) void _gcry_fast_wipememory(void *ptr, size_t len); void _gcry_fast_wipememory2(void *ptr, int set, size_t len); -#if defined(HAVE_GCC_ATTRIBUTE_PACKED) && \ - defined(HAVE_GCC_ATTRIBUTE_ALIGNED) && \ - defined(HAVE_GCC_ATTRIBUTE_MAY_ALIAS) -typedef struct fast_wipememory_s -{ - u64 a; -} __attribute__((packed, aligned(1), may_alias)) fast_wipememory_t; -/* fast_wipememory may leave tail bytes unhandled, in which case tail bytes - are handled by wipememory. */ -# define fast_wipememory2(_vptr,_vset,_vlen) do { \ - fast_wipememory_t _vset_long; \ - if (_vlen < sizeof(fast_wipememory_t)) \ - break; \ - _vset_long.a = (_vset); \ - _vset_long.a *= U64_C(0x0101010101010101); \ - do { \ - volatile fast_wipememory_t *_vptr_long = \ - (volatile void *)_vptr; \ - _vptr_long->a = _vset_long.a; \ - _vlen -= sizeof(fast_wipememory_t); \ - _vptr += sizeof(fast_wipememory_t); \ - } while (_vlen >= sizeof(fast_wipememory_t)); \ - } while (0) -#else -# define fast_wipememory2(_vptr,_vset,_vlen) -#endif - - /* Digit predicates. */ #define digitp(p) (*(p) >= '0' && *(p) <= '9') -- 2.27.0 From jussi.kivilinna at iki.fi Sat Mar 27 15:32:44 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Mar 2021 16:32:44 +0200 Subject: [PATCH] Add XOF output support for internal _gcry_md_hash_buffers Message-ID: <20210327143244.91421-1-jussi.kivilinna@iki.fi> * cipher/ecc-eddsa.c (_gcry_ecc_eddsa_compute_h_d, _gcry_ecc_eddsa_sign) (_gcry_ecc_eddsa_verify): Use same _gcry_md_hash_buffers_extract code path for SHA512 and SHAKE256. * cipher/md.c (_gcry_md_hash_buffers): Rename to ... 
(_gcry_md_hash_buffers_extract): ... this; Add digestlen and handling for XOF algorithms (SHAKE128, SHAKE256). (_gcry_md_hash_buffers): New. * src/gcrypt-int.h (_gcry_md_hash_buffers_extract): New. -- Signed-off-by: Jussi Kivilinna --- cipher/ecc-eddsa.c | 440 +++++++++++++++++---------------------------- cipher/md.c | 51 ++++-- src/gcrypt-int.h | 4 + 3 files changed, 212 insertions(+), 283 deletions(-) diff --git a/cipher/ecc-eddsa.c b/cipher/ecc-eddsa.c index 2a1a8907..baea1bf5 100644 --- a/cipher/ecc-eddsa.c +++ b/cipher/ecc-eddsa.c @@ -500,7 +500,8 @@ _gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, mpi_ec_t ec) unsigned char *rawmpi = NULL; unsigned int rawmpilen; unsigned char *digest; - int hashalgo, b; + int hashalgo, b, digestlen; + gcry_buffer_t hvec[2]; *r_digest = NULL; @@ -511,11 +512,15 @@ _gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, mpi_ec_t ec) * For now, it's determine by the bit size of the field. */ if (ec->nbits == 255) - hashalgo = GCRY_MD_SHA512; + { + hashalgo = GCRY_MD_SHA512; + digestlen = 64; + } else if (ec->nbits == 448) { b++; hashalgo = GCRY_MD_SHAKE256; + digestlen = 2 * b; } else return GPG_ERR_NOT_IMPLEMENTED; @@ -533,35 +538,14 @@ _gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, mpi_ec_t ec) return gpg_err_code_from_syserror (); } - if (hashalgo == GCRY_MD_SHAKE256) - { - gcry_error_t err; - gcry_md_hd_t hd; + memset (hvec, 0, sizeof hvec); - err = _gcry_md_open (&hd, hashalgo, 0); - if (err) - rc = gcry_err_code (err); - else - { - _gcry_md_write (hd, rawmpi, rawmpilen); - _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b); - _gcry_md_close (hd); - rc = 0; - } - } - else - { - gcry_buffer_t hvec[2]; - - memset (hvec, 0, sizeof hvec); - - hvec[0].data = digest; - hvec[0].len = b > rawmpilen? b - rawmpilen : 0; - hvec[1].data = rawmpi; - hvec[1].len = rawmpilen; - rc = _gcry_md_hash_buffers (hashalgo, 0, digest, hvec, 2); - } + hvec[0].data = digest; + hvec[0].len = (hashalgo == GCRY_MD_SHA512 && b > rawmpilen) + ? b - rawmpilen : 0; + hvec[1].data = rawmpi; + hvec[1].len = rawmpilen; + rc = _gcry_md_hash_buffers_extract (hashalgo, 0, digest, digestlen, hvec, 2); xfree (rawmpi); if (rc) @@ -702,16 +686,29 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec, unsigned int encpklen; mpi_point_struct I; /* Intermediate value. 
*/ gcry_mpi_t a, x, y, r; - int b; + const char *dom; + int domlen, digestlen; + int b, i; unsigned char x_olen[2]; unsigned char prehashed_msg[64]; + gcry_buffer_t hvec[6]; + gcry_buffer_t hvec2[1]; b = (ec->nbits+7)/8; if (ec->nbits == 255) - ; + { + dom = DOM25519; + domlen = DOM25519_LEN; + digestlen = 64; + } else if (ec->nbits == 448) - b++; + { + b++; + dom = DOM448; + domlen = DOM448_LEN; + digestlen = 2 * b; + } else return GPG_ERR_NOT_IMPLEMENTED; @@ -751,98 +748,58 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec, if (DBG_CIPHER) log_printhex (" m", mbuf, mlen); - if (ctx->hash_algo == GCRY_MD_SHAKE256) + memset (hvec, 0, sizeof hvec); + i = 0; + + if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen || ec->nbits == 448) { - gcry_error_t err; - gcry_md_hd_t hd; + hvec[i].data = (void *)dom; + hvec[i].len = domlen; + i++; + x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); + x_olen[1] = ctx->labellen; + hvec[i].data = x_olen; + hvec[i].len = 2; + i++; + if (ctx->labellen) + { + hvec[i].data = ctx->label; + hvec[i].len = ctx->labellen; + i++; + } + } - err = _gcry_md_open (&hd, ctx->hash_algo, 0); - if (err) - rc = gcry_err_code (err); - else - { - _gcry_md_write (hd, DOM448, DOM448_LEN); - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - _gcry_md_write (hd, x_olen, 2); - if (ctx->labellen) - _gcry_md_write (hd, ctx->label, ctx->labellen); - _gcry_md_write (hd, digest+b, b); - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - gcry_md_hd_t hd2; + hvec[i].data = digest; + hvec[i].off = b; + hvec[i].len = b; + i++; + if ((ctx->flags & PUBKEY_FLAG_PREHASH)) + { + memset (hvec2, 0, sizeof hvec2); - err = _gcry_md_open (&hd2, ctx->hash_algo, 0); - if (err) - { - rc = gcry_err_code (err); - _gcry_md_close (hd); - goto leave; - } - _gcry_md_write (hd2, mbuf, mlen); - _gcry_md_ctl (hd2, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd2, GCRY_MD_SHAKE256, prehashed_msg, 64); - _gcry_md_close (hd2); - _gcry_md_write (hd, prehashed_msg, 64); - } - else - _gcry_md_write (hd, mbuf, mlen); - _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b); - _gcry_md_close (hd); - rc = 0; - } + hvec2[0].data = (char*)mbuf; + hvec2[0].len = mlen; + + _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, prehashed_msg, 64, + hvec2, 1); + hvec[i].data = (char*)prehashed_msg; + hvec[i].len = 64; } else { - gcry_buffer_t hvec[6]; - int i = 0; - - memset (hvec, 0, sizeof hvec); - - if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen) - { - hvec[i].data = (void *)DOM25519; - hvec[i].len = DOM25519_LEN; - i++; - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - hvec[i].data = x_olen; - hvec[i].len = 2; - i++; - if (ctx->labellen) - { - hvec[i].data = ctx->label; - hvec[i].len = ctx->labellen; - i++; - } - } - - hvec[i].data = digest; - hvec[i].off = b; - hvec[i].len = b; - i++; - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - _gcry_md_hash_buffer (ctx->hash_algo, prehashed_msg, mbuf, mlen); - hvec[i].data = (char*)prehashed_msg; - hvec[i].len = 64; - } - else - { - hvec[i].data = (char*)mbuf; - hvec[i].len = mlen; - } - i++; - rc = _gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i); + hvec[i].data = (char*)mbuf; + hvec[i].len = mlen; } + i++; + rc = _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, digest, digestlen, + hvec, i); if (rc) goto leave; - reverse_buffer (digest, 2*b); + reverse_buffer (digest, digestlen); if (DBG_CIPHER) - log_printhex (" r", digest, 2*b); - _gcry_mpi_set_buffer 
(r, digest, 2*b, 0); + log_printhex (" r", digest, digestlen); + _gcry_mpi_set_buffer (r, digest, digestlen, 0); mpi_mod (r, r, ec->n); _gcry_mpi_ec_mul_point (&I, r, ec->G, ec); if (DBG_CIPHER) @@ -855,80 +812,48 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec, if (DBG_CIPHER) log_printhex (" e_r", rawmpi, rawmpilen); - if (ctx->hash_algo == GCRY_MD_SHAKE256) + memset (hvec, 0, sizeof hvec); + i = 0; + + if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen || ec->nbits == 448) { - gcry_error_t err; - gcry_md_hd_t hd; + hvec[i].data = (void *)dom; + hvec[i].len = domlen; + i++; + x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); + x_olen[1] = ctx->labellen; + hvec[i].data = x_olen; + hvec[i].len = 2; + i++; + if (ctx->labellen) + { + hvec[i].data = ctx->label; + hvec[i].len = ctx->labellen; + i++; + } + } - err = _gcry_md_open (&hd, ctx->hash_algo, 0); - if (err) - rc = gcry_err_code (err); - else - { - _gcry_md_write (hd, DOM448, DOM448_LEN); - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - _gcry_md_write (hd, x_olen, 2); - if (ctx->labellen) - _gcry_md_write (hd, ctx->label, ctx->labellen); - _gcry_md_write (hd, rawmpi, rawmpilen); - _gcry_md_write (hd, encpk, encpklen); - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - _gcry_md_write (hd, prehashed_msg, 64); - else - _gcry_md_write (hd, mbuf, mlen); - _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b); - _gcry_md_close (hd); - rc = 0; - } + /* S = r + a * H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) mod n */ + hvec[i].data = rawmpi; /* (this is R) */ + hvec[i].len = rawmpilen; + i++; + hvec[i].data = encpk; + hvec[i].len = encpklen; + i++; + if ((ctx->flags & PUBKEY_FLAG_PREHASH)) + { + hvec[i].data = (char*)prehashed_msg; + hvec[i].len = 64; } else { - gcry_buffer_t hvec[6]; - int i = 0; - - memset (hvec, 0, sizeof hvec); - - if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen) - { - hvec[i].data = (void *)DOM25519; - hvec[i].len = DOM25519_LEN; - i++; - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - hvec[i].data = x_olen; - hvec[i].len = 2; - i++; - if (ctx->labellen) - { - hvec[i].data = ctx->label; - hvec[i].len = ctx->labellen; - i++; - } - } - - /* S = r + a * H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) mod n */ - hvec[i].data = rawmpi; /* (this is R) */ - hvec[i].len = rawmpilen; - i++; - hvec[i].data = encpk; - hvec[i].len = encpklen; - i++; - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - hvec[i].data = (char*)prehashed_msg; - hvec[i].len = 64; - } - else - { - hvec[i].data = (char*)mbuf; - hvec[i].len = mlen; - } - i++; - rc = _gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i); + hvec[i].data = (char*)mbuf; + hvec[i].len = mlen; } + i++; + rc = _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, digest, digestlen, + hvec, i); if (rc) goto leave; @@ -936,10 +861,10 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec, mpi_set_opaque (r_r, rawmpi, rawmpilen*8); rawmpi = NULL; - reverse_buffer (digest, 2*b); + reverse_buffer (digest, digestlen); if (DBG_CIPHER) - log_printhex (" H(R+)", digest, 2*b); - _gcry_mpi_set_buffer (s, digest, 2*b, 0); + log_printhex (" H(R+)", digest, digestlen); + _gcry_mpi_set_buffer (s, digest, digestlen, 0); mpi_mulm (s, s, a, ec->n); mpi_addm (s, s, r, ec->n); rc = eddsa_encodempi (s, ec->nbits, &rawmpi, &rawmpilen); @@ -985,8 +910,13 @@ _gcry_ecc_eddsa_verify (gcry_mpi_t input, mpi_ec_t ec, unsigned char digest[114]; gcry_mpi_t h, s; mpi_point_struct Ia, Ib; + 
const char *dom; + int domlen, digestlen; + int i; unsigned char x_olen[2]; unsigned char prehashed_msg[64]; + gcry_buffer_t hvec[6]; + gcry_buffer_t hvec2[1]; if (!mpi_is_opaque (input) || !mpi_is_opaque (r_in) || !mpi_is_opaque (s_in)) return GPG_ERR_INV_DATA; @@ -999,9 +929,18 @@ _gcry_ecc_eddsa_verify (gcry_mpi_t input, mpi_ec_t ec, b = (ec->nbits+7)/8; if (ec->nbits == 255) - ; + { + dom = DOM25519; + domlen = DOM25519_LEN; + digestlen = 64; + } else if (ec->nbits == 448) - b++; + { + b++; + dom = DOM448; + domlen = DOM448_LEN; + digestlen = 2 * b; + } else return GPG_ERR_NOT_IMPLEMENTED; @@ -1038,102 +977,61 @@ _gcry_ecc_eddsa_verify (gcry_mpi_t input, mpi_ec_t ec, goto leave; } - if (ctx->hash_algo == GCRY_MD_SHAKE256) + memset (hvec, 0, sizeof hvec); + i = 0; + + /* h = H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) */ + if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen || ec->nbits == 448) { - gcry_error_t err; - gcry_md_hd_t hd; + hvec[i].data = (void *)dom; + hvec[i].len = domlen; + i++; + x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); + x_olen[1] = ctx->labellen; + hvec[i].data = x_olen; + hvec[i].len = 2; + i++; + if (ctx->labellen) + { + hvec[i].data = ctx->label; + hvec[i].len = ctx->labellen; + i++; + } + } - err = _gcry_md_open (&hd, ctx->hash_algo, 0); - if (err) - rc = gcry_err_code (err); - else - { - _gcry_md_write (hd, DOM448, DOM448_LEN); - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - _gcry_md_write (hd, x_olen, 2); - if (ctx->labellen) - _gcry_md_write (hd, ctx->label, ctx->labellen); - _gcry_md_write (hd, rbuf, rlen); - _gcry_md_write (hd, encpk, encpklen); - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - gcry_md_hd_t hd2; + hvec[i].data = (char*)rbuf; + hvec[i].len = rlen; + i++; + hvec[i].data = encpk; + hvec[i].len = encpklen; + i++; + if ((ctx->flags & PUBKEY_FLAG_PREHASH)) + { + memset (hvec2, 0, sizeof hvec2); - err = _gcry_md_open (&hd2, ctx->hash_algo, 0); - if (err) - { - rc = gcry_err_code (err); - _gcry_md_close (hd); - goto leave; - } - _gcry_md_write (hd2, mbuf, mlen); - _gcry_md_ctl (hd2, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd2, GCRY_MD_SHAKE256, prehashed_msg, 64); - _gcry_md_close (hd2); - _gcry_md_write (hd, prehashed_msg, 64); - } - else - _gcry_md_write (hd, mbuf, mlen); - _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b); - _gcry_md_close (hd); - rc = 0; - } + hvec2[0].data = (char*)mbuf; + hvec2[0].len = mlen; + + _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, prehashed_msg, 64, + hvec2, 1); + hvec[i].data = (char*)prehashed_msg; + hvec[i].len = 64; } else { - gcry_buffer_t hvec[6]; - int i = 0; - - memset (hvec, 0, sizeof hvec); - - /* h = H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) */ - if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen) - { - hvec[i].data = (void *)DOM25519; - hvec[i].len = DOM25519_LEN; - i++; - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - hvec[i].data = x_olen; - hvec[i].len = 2; - i++; - if (ctx->labellen) - { - hvec[i].data = ctx->label; - hvec[i].len = ctx->labellen; - i++; - } - } - - hvec[i].data = (char*)rbuf; - hvec[i].len = rlen; - i++; - hvec[i].data = encpk; - hvec[i].len = encpklen; - i++; - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - _gcry_md_hash_buffer (ctx->hash_algo, prehashed_msg, mbuf, mlen); - hvec[i].data = (char*)prehashed_msg; - hvec[i].len = 64; - } - else - { - hvec[i].data = (char*)mbuf; - hvec[i].len = mlen; - } - i++; - rc = 
_gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i); + hvec[i].data = (char*)mbuf; + hvec[i].len = mlen; } + i++; + rc = _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, digest, digestlen, + hvec, i); if (rc) goto leave; - reverse_buffer (digest, 2*b); + reverse_buffer (digest, digestlen); if (DBG_CIPHER) - log_printhex (" H(R+)", digest, 2*b); - _gcry_mpi_set_buffer (h, digest, 2*b, 0); + log_printhex (" H(R+)", digest, digestlen); + _gcry_mpi_set_buffer (h, digest, digestlen, 0); /* According to the paper the best way for verification is: encodepoint(sG - h?Q) = encodepoint(r) diff --git a/cipher/md.c b/cipher/md.c index efb7376a..87979059 100644 --- a/cipher/md.c +++ b/cipher/md.c @@ -1251,11 +1251,15 @@ _gcry_md_hash_buffer (int algo, void *digest, used as the key. On success 0 is returned and resulting hash or HMAC is stored at - DIGEST which must have been provided by the caller with an - appropriate length. */ + DIGEST. DIGESTLEN may be given as -1, in which case DIGEST must + have been provided by the caller with an appropriate length. + DIGESTLEN may also be the appropriate length or, in case of XOF + algorithms, DIGESTLEN indicates number bytes to extract from XOF + to DIGEST. */ gpg_err_code_t -_gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, - const gcry_buffer_t *iov, int iovcnt) +_gcry_md_hash_buffers_extract (int algo, unsigned int flags, void *digest, + int digestlen, const gcry_buffer_t *iov, + int iovcnt) { gcry_md_spec_t *spec; int hmac; @@ -1287,6 +1291,11 @@ _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, } } + if (spec->mdlen > 0 && digestlen != -1 && digestlen != spec->mdlen) + return GPG_ERR_DIGEST_ALGO; + if (spec->mdlen == 0 && digestlen == -1) + return GPG_ERR_DIGEST_ALGO; + if (!hmac && spec->hash_buffers) { spec->hash_buffers (digest, iov, iovcnt); @@ -1297,13 +1306,6 @@ _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, normal functions. */ gcry_md_hd_t h; gpg_err_code_t rc; - int dlen; - - /* Detect SHAKE128 like algorithms which we can't use because - * our API does not allow for a variable length digest. */ - dlen = md_digest_length (algo); - if (!dlen) - return GPG_ERR_DIGEST_ALGO; rc = md_open (&h, algo, (hmac? GCRY_MD_FLAG_HMAC:0)); if (rc) @@ -1324,7 +1326,10 @@ _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, for (;iovcnt; iov++, iovcnt--) md_write (h, (const char*)iov[0].data + iov[0].off, iov[0].len); md_final (h); - memcpy (digest, md_read (h, algo), dlen); + if (spec->mdlen > 0) + memcpy (digest, md_read (h, algo), spec->mdlen); + else if (digestlen > 0) + md_extract (h, algo, digest, digestlen); md_close (h); } @@ -1332,6 +1337,28 @@ _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, } +/* Shortcut function to hash multiple buffers with a given algo. In + contrast to gcry_md_hash_buffer, this function returns an error on + invalid arguments or on other problems; disabled algorithms are + _not_ ignored but flagged as an error. + + The data to sign is taken from the array IOV which has IOVCNT items. + + The only supported flag in FLAGS is GCRY_MD_FLAG_HMAC which turns + this function into a HMAC function; the first item in IOV is then + used as the key. + + On success 0 is returned and resulting hash or HMAC is stored at + DIGEST which must have been provided by the caller with an + appropriate length. 
*/ +gpg_err_code_t +_gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, + const gcry_buffer_t *iov, int iovcnt) +{ + return _gcry_md_hash_buffers_extract(algo, flags, digest, -1, iov, iovcnt); +} + + static int md_get_algo (gcry_md_hd_t a) { diff --git a/src/gcrypt-int.h b/src/gcrypt-int.h index 086953d7..d8b6d407 100644 --- a/src/gcrypt-int.h +++ b/src/gcrypt-int.h @@ -128,6 +128,10 @@ gpg_err_code_t _gcry_md_extract (gcry_md_hd_t hd, int algo, void *buffer, size_t length); void _gcry_md_hash_buffer (int algo, void *digest, const void *buffer, size_t length); +gpg_err_code_t _gcry_md_hash_buffers_extract (int algo, unsigned int flags, + void *digest, int digestlen, + const gcry_buffer_t *iov, + int iovcnt); gpg_err_code_t _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, const gcry_buffer_t *iov, int iovcnt); -- 2.27.0 From jcb62281 at gmail.com Sun Mar 28 04:05:20 2021 From: jcb62281 at gmail.com (Jacob Bachmeyer) Date: Sat, 27 Mar 2021 21:05:20 -0500 Subject: [PATCH] Use memset for short constant length wipememory2 In-Reply-To: <20210327143105.91148-1-jussi.kivilinna@iki.fi> References: <20210327143105.91148-1-jussi.kivilinna@iki.fi> Message-ID: <605FE460.60209@gmail.com> Jussi Kivilinna wrote: > Use of memset allows better code generation by compiler - > for example, use of vector registers for memory clearing. > Are you sure that this is a good idea? The comment indicates that the original purpose of not using memset was to ensure that a compiler cannot optimize the call away. -- Jacob From jussi.kivilinna at iki.fi Sun Mar 28 15:09:08 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 28 Mar 2021 16:09:08 +0300 Subject: [PATCH] Use memset for short constant length wipememory2 In-Reply-To: <605FE460.60209@gmail.com> References: <20210327143105.91148-1-jussi.kivilinna@iki.fi> <605FE460.60209@gmail.com> Message-ID: On 28.3.2021 5.05, Jacob Bachmeyer via Gcrypt-devel wrote: > Jussi Kivilinna wrote: >> Use of memset allows better code generation by compiler - >> for example, use of vector registers for memory clearing. > > > Are you sure that this is a good idea?? The comment indicates that the original purpose of not using memset was to ensure that a compiler cannot optimize the call away. > New approach uses inline assembly memory barrier to prevent optimizing away preceding memset: memset(ptr_memory_to_wipe, 0, memory_length); asm volatile ("\n"::"r"(ptr_memory_to_wipe):"memory"); I'll update the comment to better explain this. -Jussi From jussi.kivilinna at iki.fi Tue Mar 30 17:56:24 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 30 Mar 2021 18:56:24 +0300 Subject: [PATCH v2] Use memset for constant length wipememory2 Message-ID: <20210330155624.852568-1-jussi.kivilinna@iki.fi> * src/g10lib.h (fast_wipememory2_inline): New. (wipememory2): Use 'fast_wipememory2_inline', remove 'fast_wipememory2' use; Use _gcry_fast_wipememory* when _len or _set is not constant. (fast_wipememory_s, fast_wipememory2): Remove. -- Use of memset allows better code generation by compiler - for example, use of vector registers for memory clearing. 
Dead store elimination of memset by compiler optimization is avoided by using assembly block after memset: memset(ptr_mem_wipe, 0, constant_mem_len); asm volatile ("\n" :: "r" (ptr_mem_wipe) : "memory"); Signed-off-by: Jussi Kivilinna --- src/g10lib.h | 62 +++++++++++++++++----------------------------------- 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/src/g10lib.h b/src/g10lib.h index b0b73852..fb288a30 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -358,57 +358,35 @@ void __gcry_burn_stack (unsigned int bytes); do { __gcry_burn_stack (bytes); \ __gcry_burn_stack_dummy (); } while(0) -/* To avoid that a compiler optimizes certain memset calls away, these - macros may be used instead. For small constant length buffers, - memory wiping is inlined. For non-constant or large length buffers, - memory is wiped with memset through _gcry_fast_wipememory. */ +/* To avoid that a compiler optimizes certain memset calls away, this + macro may be used instead. For constant length buffers, memory + wiping is inlined. Dead store elimination of inlined memset is + avoided here by using assembly block after memset. For non-constant + length buffers, memory is wiped through _gcry_fast_wipememory. */ +#ifdef HAVE_GCC_ASM_VOLATILE_MEMORY +#define fast_wipememory2_inline(_ptr,_set,_len) do { \ + memset((_ptr), (_set), (_len)); \ + asm volatile ("\n" :: "r" (_ptr) : "memory"); \ + } while(0) +#else +#define fast_wipememory2_inline(_ptr,_set,_len) \ + _gcry_fast_wipememory2((void *)_ptr, _set, _len) +#endif #define wipememory2(_ptr,_set,_len) do { \ - if (!CONSTANT_P(_len) || _len > 64) { \ + if (!CONSTANT_P(_len) || !CONSTANT_P(_set)) { \ if (CONSTANT_P(_set) && (_set) == 0) \ - _gcry_fast_wipememory((void *)_ptr, _len); \ + _gcry_fast_wipememory((void *)(_ptr), (_len)); \ else \ - _gcry_fast_wipememory2((void *)_ptr, _set, _len); \ - } else {\ - volatile char *_vptr = (volatile char *)(_ptr); \ - size_t _vlen = (_len); \ - const unsigned char _vset = (_set); \ - fast_wipememory2(_vptr, _vset, _vlen); \ - while(_vlen) { *_vptr = (_vset); _vptr++; _vlen--; } \ + _gcry_fast_wipememory2((void *)(_ptr), (_set), (_len)); \ + } else { \ + fast_wipememory2_inline((void *)(_ptr), (_set), (_len)); \ } \ } while(0) -#define wipememory(_ptr,_len) wipememory2(_ptr,0,_len) +#define wipememory(_ptr,_len) wipememory2((_ptr),0,(_len)) void _gcry_fast_wipememory(void *ptr, size_t len); void _gcry_fast_wipememory2(void *ptr, int set, size_t len); -#if defined(HAVE_GCC_ATTRIBUTE_PACKED) && \ - defined(HAVE_GCC_ATTRIBUTE_ALIGNED) && \ - defined(HAVE_GCC_ATTRIBUTE_MAY_ALIAS) -typedef struct fast_wipememory_s -{ - u64 a; -} __attribute__((packed, aligned(1), may_alias)) fast_wipememory_t; -/* fast_wipememory may leave tail bytes unhandled, in which case tail bytes - are handled by wipememory. */ -# define fast_wipememory2(_vptr,_vset,_vlen) do { \ - fast_wipememory_t _vset_long; \ - if (_vlen < sizeof(fast_wipememory_t)) \ - break; \ - _vset_long.a = (_vset); \ - _vset_long.a *= U64_C(0x0101010101010101); \ - do { \ - volatile fast_wipememory_t *_vptr_long = \ - (volatile void *)_vptr; \ - _vptr_long->a = _vset_long.a; \ - _vlen -= sizeof(fast_wipememory_t); \ - _vptr += sizeof(fast_wipememory_t); \ - } while (_vlen >= sizeof(fast_wipememory_t)); \ - } while (0) -#else -# define fast_wipememory2(_vptr,_vset,_vlen) -#endif - - /* Digit predicates. 
*/ #define digitp(p) (*(p) >= '0' && *(p) <= '9') -- 2.27.0 From guidovranken at gmail.com Wed Mar 31 20:42:23 2021 From: guidovranken at gmail.com (Guido Vranken) Date: Wed, 31 Mar 2021 20:42:23 +0200 Subject: CMAC + SERPENT/IDEA/RC2 buffer overflow/crash with oversized key Message-ID: In the program below, each of the three calls to cmac() causes a different crash (use AddressSanitizer to be sure). I think the correct approach is to make gcry_mac_setkey() return an error code if the key has an inappropriate size.

#include <string.h>
#include <gcrypt.h>

#define CF_CHECK_EQ(expr, res) if ( (expr) != (res) ) { goto end; }

static void cmac(const int mac, const int keysize) {
    unsigned char key[keysize];
    memset(key, 0, keysize);

    gcry_mac_hd_t h;
    CF_CHECK_EQ(gcry_mac_open(&h, mac, 0, NULL), GPG_ERR_NO_ERROR);
    CF_CHECK_EQ(gcry_mac_setkey(h, key, keysize), GPG_ERR_NO_ERROR);

end:
    /* noret */
    gcry_mac_close(h);
}

int main(void) {
    cmac(GCRY_MAC_CMAC_SERPENT, 64);
    cmac(GCRY_MAC_CMAC_IDEA, 32);
    cmac(GCRY_MAC_CMAC_RFC2268, 256);
    return 0;
}
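Until gcry_mac_setkey() itself rejects bad key sizes, a caller-side guard is possible. The sketch below is illustrative only, not the in-library fix the report asks for; mac_setkey_checked is a made-up helper. It conservatively accepts only the algorithm's default key length as reported by gcry_mac_get_algo_keylen() and fails with GPG_ERR_INV_KEYLEN otherwise.

#include <stdio.h>
#include <string.h>
#include <gcrypt.h>

/* Illustrative caller-side guard: refuse key sizes other than the
 * algorithm's default key length before calling gcry_mac_setkey(). */
static gcry_error_t
mac_setkey_checked (gcry_mac_hd_t h, int algo, const void *key, size_t keylen)
{
  size_t expected = gcry_mac_get_algo_keylen (algo);

  if (expected == 0 || keylen != expected)
    return gcry_error (GPG_ERR_INV_KEYLEN);

  return gcry_mac_setkey (h, key, keylen);
}

int
main (void)
{
  unsigned char key[64] = { 0 };
  gcry_mac_hd_t h;

  gcry_check_version (NULL);

  if (gcry_mac_open (&h, GCRY_MAC_CMAC_SERPENT, 0, NULL))
    return 1;

  /* 64 bytes is larger than any Serpent key, so this fails cleanly
   * instead of overrunning the key schedule. */
  if (mac_setkey_checked (h, GCRY_MAC_CMAC_SERPENT, key, sizeof key))
    fprintf (stderr, "oversized key rejected\n");

  gcry_mac_close (h);
  return 0;
}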