From horst.wente at posteo.de Tue Mar 2 00:02:14 2021 From: horst.wente at posteo.de (horst.wente at posteo.de) Date: Tue, 02 Mar 2021 00:02:14 +0100 Subject: libgcrypt-1.9.2: cipher/poly1305.c does not compile on i686/32 with gcc-4.7.3 ( --disable-asm does not work) Message-ID: Hello! The "#define ADD_1305_32" ( for "defined (__i386__) && __GNUC__ >= 4" ) leads to > poly1305.c: In function 'poly1305_blocks': > poly1305.c:424:7: error: can't find a register in class 'GENERAL_REGS' > while reloading 'asm' > poly1305.c:424:7: error: 'asm' operand has impossible constraints I suppose the template uses too many registers, so I redefined it and it seems to work for me ( all tests successful ). hth horst wente --- cipher/poly1305.c.orig 2021-01-28 13:53:52.000000000 +0100 +++ cipher/poly1305.c 2021-03-01 20:25:26.000000000 +0100 @@ -314,14 +314,19 @@ /* A += B (i386) */ #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \ - __asm__ ("addl %5, %0\n" \ - "adcl %6, %1\n" \ - "adcl %7, %2\n" \ - "adcl %8, %3\n" \ - "adcl %9, %4\n" \ - : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \ - : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \ - : "cc" ) + __asm__ ("clc\n" \ + "addl %5, %0\n" \ + "movl %6, %5\n" \ + "adcl %5, %1\n" \ + "movl %7, %5\n" \ + "adcl %5, %2\n" \ + "movl %8, %5\n" \ + "adcl %5, %3\n" \ + "movl %9, %5\n" \ + "adcl %5, %4\n" \ + : "+m" (A0), "+m" (A1), "+m" (A2), "+m" (A3), "+m" (A4) \ + : "r" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \ + : "cc" ) #endif /* __i386__ */ From jussi.kivilinna at iki.fi Wed Mar 3 18:07:03 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 3 Mar 2021 19:07:03 +0200 Subject: [PATCH 1/3] cipher-gcm-intel-pclmul: fix compiling with i386 gcc-4.7 Message-ID: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-intel-pclmul.c (gcm_lsh): Pass '*pconst' instead of 'pconst' to asm block. -- Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-intel-pclmul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 28165c65..334c89cd 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -397,7 +397,7 @@ static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs) "pxor %%xmm4, %%xmm2\n\t" "movdqu %%xmm2, (%[h])\n\t" : - : [pconst] "m" (pconst), + : [pconst] "m" (*pconst), [h] "r" ((byte *)h + hoffs) : "memory" ); } -- 2.27.0 From jussi.kivilinna at iki.fi Wed Mar 3 18:07:05 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 3 Mar 2021 19:07:05 +0200 Subject: [PATCH 3/3] poly1305: make --disable-asm work on x86, aarch64 and ppc In-Reply-To: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> References: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> Message-ID: <20210303170705.1614871-3-jussi.kivilinna@iki.fi> * cipher/poly1305.c [__aarch64__] (ADD_1305_64): Check for HAVE_CPU_ARCH_ARM. [__x86_64__] (ADD_1305_64): Check for HAVE_CPU_ARCH_X86. [__powerpc__] (ADD_1305_64): Check for HAVE_CPU_ARCH_PPC. [__i386__] (ADD_1305_32): Check for HAVE_CPU_ARCH_X86. 
-- Reported-by: Horst Wente Signed-off-by: Jussi Kivilinna --- cipher/poly1305.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cipher/poly1305.c b/cipher/poly1305.c index da8f4601..41e55e8d 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -105,7 +105,7 @@ static void poly1305_init (poly1305_context_t *ctx, #ifdef USE_MPI_64BIT -#if defined (__aarch64__) && __GNUC__ >= 4 +#if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4 /* A += B (armv8/aarch64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ @@ -118,7 +118,7 @@ static void poly1305_init (poly1305_context_t *ctx, #endif /* __aarch64__ */ -#if defined (__x86_64__) && __GNUC__ >= 4 +#if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4 /* A += B (x86-64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ @@ -131,7 +131,7 @@ static void poly1305_init (poly1305_context_t *ctx, #endif /* __x86_64__ */ -#if defined (__powerpc__) && __GNUC__ >= 4 +#if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4 /* A += B (ppc64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ @@ -310,7 +310,7 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */ -#if defined (__i386__) && __GNUC__ >= 5 +#if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 5 /* Note: ADD_1305_32 below does not compile on GCC-4.7 */ /* A += B (i386) */ -- 2.27.0 From jussi.kivilinna at iki.fi Wed Mar 3 18:07:04 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 3 Mar 2021 19:07:04 +0200 Subject: [PATCH 2/3] poly1305: fix compiling on i386 gcc-4.7 In-Reply-To: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> References: <20210303170705.1614871-1-jussi.kivilinna@iki.fi> Message-ID: <20210303170705.1614871-2-jussi.kivilinna@iki.fi> * cipher/poly1305.c [__i386__]: Limit i386 variant of ADD_1305_32 to GCC-5 or newer. -- Reported-by: Horst Wente Signed-off-by: Jussi Kivilinna --- cipher/poly1305.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/poly1305.c b/cipher/poly1305.c index 6cb4d2b7..da8f4601 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -310,7 +310,8 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */ -#if defined (__i386__) && __GNUC__ >= 4 +#if defined (__i386__) && __GNUC__ >= 5 +/* Note: ADD_1305_32 below does not compile on GCC-4.7 */ /* A += B (i386) */ #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \ -- 2.27.0 From jussi.kivilinna at iki.fi Wed Mar 3 18:08:08 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 3 Mar 2021 19:08:08 +0200 Subject: libgcrypt-1.9.2: cipher/poly1305.c does not compile on i686/32 with gcc-4.7.3 ( --disable-asm does not work) In-Reply-To: References: Message-ID: <3a4a2a1e-1bb2-9707-908c-6b0fe007effe@iki.fi> On 2.3.2021 1.02, horst.wente--- via Gcrypt-devel wrote: > Hello! > > The "#define ADD_1305_32" ( for "defined (__i386__) && __GNUC__ >= 4" ) leads to > >> poly1305.c: In function 'poly1305_blocks': >> poly1305.c:424:7: error: can't find a register in class 'GENERAL_REGS' while > ??? reloading 'asm' >> poly1305.c:424:7: error: 'asm' operand has impossible constraints > > I suppose the template uses too many registers, so I redefined it and it seems to > work for me ( all tests successful ). > Thanks for reporting. I think it is better to just disable this macro for GCC-4.x as any changes to asm operand types cause performance drop with newer GCC. 
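To make that concrete, the effect of hiding the macro is roughly the following
(an illustrative sketch only, not the actual poly1305.c source; the generic
fallback shown here is a plain-C stand-in for the portable limb addition):

#if defined (__i386__) && __GNUC__ >= 5
/* fast i386 inline-asm ADD_1305_32 defined here */
#endif

#ifndef ADD_1305_32
/* portable stand-in: 5x32-bit limb addition with carry propagation */
# define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
    u64 carry = 0; \
    carry += (u64)(A0) + (B0); (A0) = (u32)carry; carry >>= 32; \
    carry += (u64)(A1) + (B1); (A1) = (u32)carry; carry >>= 32; \
    carry += (u64)(A2) + (B2); (A2) = (u32)carry; carry >>= 32; \
    carry += (u64)(A3) + (B3); (A3) = (u32)carry; carry >>= 32; \
    (A4) = (A4) + (B4) + (u32)carry; \
  } while (0)
#endif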
When ADD_1305_32 is not provided for a specific arch, the generic version is used.

-Jussi

> hth
> horst wente
>
>
> --- cipher/poly1305.c.orig      2021-01-28 13:53:52.000000000 +0100
> +++ cipher/poly1305.c   2021-03-01 20:25:26.000000000 +0100
> @@ -314,14 +314,19 @@
>
>  /* A += B (i386) */
>  #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
> -      __asm__ ("addl %5, %0\n" \
> -              "adcl %6, %1\n" \
> -              "adcl %7, %2\n" \
> -              "adcl %8, %3\n" \
> -              "adcl %9, %4\n" \
> -              : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
> -              : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
> -              : "cc" )
> +      __asm__ ("clc\n"         \
> +               "addl %5, %0\n" \
> +               "movl %6, %5\n" \
> +               "adcl %5, %1\n" \
> +               "movl %7, %5\n" \
> +               "adcl %5, %2\n" \
> +               "movl %8, %5\n" \
> +               "adcl %5, %3\n" \
> +               "movl %9, %5\n" \
> +               "adcl %5, %4\n" \
> +               : "+m" (A0), "+m" (A1), "+m" (A2), "+m" (A3), "+m" (A4) \
> +               : "r" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
> +               : "cc" )
>
>  #endif /* __i386__ */
>
>
> _______________________________________________
> Gcrypt-devel mailing list
> Gcrypt-devel at gnupg.org
> http://lists.gnupg.org/mailman/listinfo/gcrypt-devel
>

From jussi.kivilinna at iki.fi Sun Mar 7 17:44:07 2021
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 7 Mar 2021 18:44:07 +0200
Subject: [PATCH 1/4] VPMSUMD acceleration for GCM mode on PPC
Message-ID: <20210307164410.508295-1-jussi.kivilinna@iki.fi>

From: Shawn Landden

* cipher/Makefile.am: Add 'cipher-gcm-ppc.c'.
* cipher/cipher-gcm-ppc.c: New.
* cipher/cipher-gcm.c [GCM_USE_PPC_VPMSUM] (_gcry_ghash_setup_ppc_vpmsum)
(_gcry_ghash_ppc_vpmsum, ghash_setup_ppc_vpsum, ghash_ppc_vpmsum): New.
(setupM) [GCM_USE_PPC_VPMSUM]: Select ppc-vpmsum implementation if
HW feature "ppc-vcrypto" is available.
* cipher/cipher-internal.h (GCM_USE_PPC_VPMSUM): New.
(gcry_cipher_handle): Move 'ghash_fn' at end of 'gcm' block to align
'gcm_table' to 16 bytes.
* configure.ac: Add 'cipher-gcm-ppc.lo'.
* tests/basic.c (_check_gcm_cipher): New AES256 test vector.
* AUTHORS: Add 'CRYPTOGAMS'.
* LICENSES: Add original license to 3-clause-BSD section.
--

https://dev.gnupg.org/D501: 10-20X speed. However this Power 9 machine is faster than the last Power 9 benchmarks on the optimized versions, so while better than the last patch, it is not all due to the code.
Before: GCM enc | 4.23 ns/B 225.3 MiB/s - c/B GCM dec | 3.58 ns/B 266.2 MiB/s - c/B GCM auth | 3.34 ns/B 285.3 MiB/s - c/B After: GCM enc | 0.370 ns/B 2578 MiB/s - c/B GCM dec | 0.371 ns/B 2571 MiB/s - c/B GCM auth | 0.159 ns/B 6003 MiB/s - c/B Signed-off-by: Shawn Landden [jk: coding style fixes, Makefile.am integration, patch from Differential to git, commit changelog, fixed few compiler warnings] GnuPG-bug-id: 5040 Signed-off-by: Jussi Kivilinna --- AUTHORS | 1 + LICENSES | 43 +++- cipher/Makefile.am | 7 + cipher/cipher-gcm-ppc.c | 510 +++++++++++++++++++++++++++++++++++++++ cipher/cipher-gcm.c | 31 ++- cipher/cipher-internal.h | 18 +- configure.ac | 13 + tests/basic.c | 16 ++ 8 files changed, 634 insertions(+), 5 deletions(-) create mode 100644 cipher/cipher-gcm-ppc.c diff --git a/AUTHORS b/AUTHORS index f6bfcb85..f237baa5 100644 --- a/AUTHORS +++ b/AUTHORS @@ -30,6 +30,7 @@ List of Copyright holders Copyright (C) 1996-1999 Peter Gutmann, Paul Kendall, and Chris Wedgwood Copyright (C) 1996-2006 Peter Gutmann, Matt Thomlinson and Blake Coverett Copyright (C) 2003 Nikos Mavroyanopoulos + Copyright (c) 2006 CRYPTOGAMS Copyright (C) 2006-2007 NTT (Nippon Telegraph and Telephone Corporation) Copyright (C) 2012-2021 g10 Code GmbH Copyright (C) 2012 Simon Josefsson, Niels M?ller diff --git a/LICENSES b/LICENSES index 31f8eae8..bdd16aab 100644 --- a/LICENSES +++ b/LICENSES @@ -55,7 +55,6 @@ with any binary distributions derived from the GNU C Library. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #+end_quote - For files: - random/jitterentropy-base.c - random/jitterentropy.h @@ -100,6 +99,48 @@ with any binary distributions derived from the GNU C Library. * DAMAGE. #+end_quote + For files: + - cipher/cipher-gcm-ppc.c + +#+begin_quote + Copyright (c) 2006, CRYPTOGAMS by + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain copyright notices, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + * Neither the name of the CRYPTOGAMS nor the names of its + copyright holder and contributors may be used to endorse or + promote products derived from this software without specific + prior written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this + product may be distributed under the terms of the GNU General Public + License (GPL), in which case the provisions of the GPL apply INSTEAD OF + those given above. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#+end_quote + * X License For files: diff --git a/cipher/Makefile.am b/cipher/Makefile.am index e2100cf3..da8cc126 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -83,6 +83,7 @@ EXTRA_libcipher_la_SOURCES = \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-armv7-neon.S chacha20-aarch64.S \ chacha20-ppc.c chacha20-s390x.S \ + cipher-gcm-ppc.c \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ @@ -258,3 +259,9 @@ crc-ppc.o: $(srcdir)/crc-ppc.c Makefile crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile + `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile + `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c new file mode 100644 index 00000000..ed27ef15 --- /dev/null +++ b/cipher/cipher-gcm-ppc.c @@ -0,0 +1,510 @@ +/* cipher-gcm-ppc.c - Power 8 vpmsum accelerated Galois Counter Mode + * implementation + * Copyright (C) 2019 Shawn Landden + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * Based on GHASH implementation by Andy Polyakov from CRYPTOGAMS + * distribution (ppc/ghashp8-ppc.pl). Specifically, it uses his register + * allocation (which then defers to your compiler's register allocation), + * instead of re-implementing Gerald Estrin's Scheme of parallelized + * multiplication of polynomials, as I did not understand this algorithm at + * the time. + * + * Original copyright license follows: + * + * Copyright (c) 2006, CRYPTOGAMS by + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain copyright notices, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * * Neither the name of the CRYPTOGAMS nor the names of its + * copyright holder and contributors may be used to endorse or + * promote products derived from this software without specific + * prior written permission. + * + * ALTERNATIVELY, provided that this notice is retained in full, this + * product may be distributed under the terms of the GNU General Public + * License (GPL), in which case the provisions of the GPL apply INSTEAD OF + * those given above. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) + */ + +#include +#include +#include +#include +#include +#include + +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "./cipher-internal.h" + +#ifdef GCM_USE_PPC_VPMSUM + +#include + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE + +typedef vector unsigned char vector16x_u8; +typedef vector signed char vector16x_s8; +typedef vector unsigned long long vector2x_u64; +typedef vector unsigned long long block; + +static ASM_FUNC_ATTR_INLINE block +asm_vpmsumd(block a, block b) +{ + block r; + __asm__("vpmsumd %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (b)); + return r; +} + +static ASM_FUNC_ATTR_INLINE block +asm_swap_u64(block a) +{ + __asm__("xxswapd %x0, %x1" + : "=wa" (a) + : "wa" (a)); + return a; +} + +static ASM_FUNC_ATTR_INLINE block +asm_rot_block_left(block a) +{ + block zero = {0, 0}; + block mask = {2, 0}; + return __builtin_shuffle(a, zero, mask); +} + +static ASM_FUNC_ATTR_INLINE block +asm_rot_block_right(block a) +{ + block zero = {0, 0}; + block mask = {1, 2}; + return __builtin_shuffle(a, zero, mask); +} + +/* vsl is a slightly strange function in the way the shift is passed... 
*/ +static ASM_FUNC_ATTR_INLINE block +asm_ashl_128(block a, vector16x_u8 shift) +{ + block r; + __asm__("vsl %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (shift)); + return r; +} + +#define ALIGNED_LOAD(in_ptr) \ + (vec_aligned_ld (0, (const unsigned char *)(in_ptr))) + +static ASM_FUNC_ATTR_INLINE block +vec_aligned_ld(unsigned long offset, const unsigned char *ptr) +{ +#ifndef WORDS_BIGENDIAN + block vec; + __asm__ ("lvx %0,%1,%2\n\t" + : "=v" (vec) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); + return vec; +#else + return vec_vsx_ld (offset, ptr); +#endif +} + +#define STORE_TABLE(gcm_table, slot, vec) \ + vec_aligned_st (((block)vec), slot * 16, (unsigned char *)(gcm_table)); + + +static ASM_FUNC_ATTR_INLINE void +vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr) +{ +#ifndef WORDS_BIGENDIAN + __asm__ ("stvx %0,%1,%2\n\t" + : + : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); +#else + vec_vsx_st ((vector16x_u8)vec, offset, ptr); +#endif +} + +#define VEC_LOAD_BE(in_ptr, bswap_const) \ + (vec_load_be (0, (const unsigned char *)(in_ptr), bswap_const)) + +static ASM_FUNC_ATTR_INLINE block +vec_load_be(unsigned long offset, const unsigned char *ptr, + vector unsigned char be_bswap_const) +{ +#ifndef WORDS_BIGENDIAN + block vec; + /* GCC vec_vsx_ld is generating two instructions on little-endian. Use + * lxvw4x directly instead. */ + __asm__ ("lxvw4x %x0,%1,%2\n\t" + : "=wa" (vec) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); + __asm__ ("vperm %0,%1,%1,%2\n\t" + : "=v" (vec) + : "v" (vec), "v" (be_bswap_const)); + return vec; +#else + (void)be_bswap_const; + return vec_vsx_ld (offset, ptr); +#endif +} + +/* Power ghash based on papers: + "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega + "Intel? Carry-Less Multiplication Instruction and its Usage for Computing + the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. + + After saving the magic c2 constant and pre-formatted version of the key, + we pre-process the key for parallel hashing. This takes advantage of the + identity of addition over a galois field being identital to XOR, and thus + can be parellized (S 2.2, page 3). We multiply and add (galois field + versions) the key over multiple iterations and save the result. This can + later be galois added (XORed) with parallel processed input (Estrin's + Scheme). + + The ghash "key" is a salt. */ +void ASM_FUNC_ATTR +_gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) +{ + vector16x_u8 bswap_const = + { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; + vector16x_u8 c2 = + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 }; + block T0, T1, T2; + block C2, H, H1, H1l, H1h, H2, H2l, H2h; + block H3l, H3, H3h, H4l, H4, H4h, T3, T4; + vector16x_s8 most_sig_of_H, t7, carry; + vector16x_u8 one = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + + H = VEC_LOAD_BE(gcm_key, bswap_const); + most_sig_of_H = vec_splat((vector16x_s8)H, 15); + t7 = vec_splat_s8(7); + carry = most_sig_of_H >> t7; + carry &= c2; /* only interested in certain carries. 
*/ + H1 = asm_ashl_128(H, one); + H1 ^= (block)carry; /* complete the <<< 1 */ + + T1 = asm_swap_u64 (H1); + H1l = asm_rot_block_right (T1); + H1h = asm_rot_block_left (T1); + C2 = asm_rot_block_right ((block)c2); + + STORE_TABLE (gcm_table, 0, C2); + STORE_TABLE (gcm_table, 1, H1l); + STORE_TABLE (gcm_table, 2, T1); + STORE_TABLE (gcm_table, 3, H1h); + + /* pre-process coefficients for Gerald Estrin's scheme for parallel + * multiplication of polynomials + */ + H2l = asm_vpmsumd (H1l, H1); /* do not need to mask in + because 0 * anything -> 0 */ + H2 = asm_vpmsumd (T1, H1); + H2h = asm_vpmsumd (H1h, H1); + + /* reduce 1 */ + T0 = asm_vpmsumd (H2l, C2); + + H2l ^= asm_rot_block_left (H2);; + H2h ^= asm_rot_block_right (H2); + H2l = asm_swap_u64 (H2l); + H2l ^= T0; + /* reduce 2 */ + T0 = asm_swap_u64 (H2l); + H2l = asm_vpmsumd (H2l, C2); + H2 = H2l ^ H2h ^ T0; + + T2 = asm_swap_u64 (H2); + H2l = asm_rot_block_right (T2); + H2h = asm_rot_block_left (T2); + + STORE_TABLE (gcm_table, 4, H2l); + STORE_TABLE (gcm_table, 5, T2); + STORE_TABLE (gcm_table, 6, H2h); + + H3l = asm_vpmsumd (H2l, H1); + H4l = asm_vpmsumd (H2l, H2); + H3 = asm_vpmsumd (T2, H1); + H4 = asm_vpmsumd (T2, H2); + H3h = asm_vpmsumd (H2h, H1); + H4h = asm_vpmsumd (H2h, H2); + + T3 = asm_vpmsumd (H3l, C2); + T4 = asm_vpmsumd (H4l, C2); + + H3l ^= asm_rot_block_left (H3); + H3h ^= asm_rot_block_right (H3); + H4l ^= asm_rot_block_left (H4); + H4h ^= asm_rot_block_right (H4); + + H3 = asm_swap_u64 (H3l); + H4 = asm_swap_u64 (H4l); + + H3 ^= T3; + H4 ^= T4; + + /* We could have also b64 switched reduce and reduce2, however as we are + using the unrotated H and H2 above to vpmsum, this is marginally better. */ + T3 = asm_swap_u64 (H3); + T4 = asm_swap_u64 (H4); + + H3 = asm_vpmsumd (H3, C2); + H4 = asm_vpmsumd (H4, C2); + + T3 ^= H3h; + T4 ^= H4h; + H3 ^= T3; + H4 ^= T4; + H3 = asm_swap_u64 (H3); + H4 = asm_swap_u64 (H4); + + H3l = asm_rot_block_right (H3); + H3h = asm_rot_block_left (H3); + H4l = asm_rot_block_right (H4); + H4h = asm_rot_block_left (H4); + + STORE_TABLE (gcm_table, 7, H3l); + STORE_TABLE (gcm_table, 8, H3); + STORE_TABLE (gcm_table, 9, H3h); + STORE_TABLE (gcm_table, 10, H4l); + STORE_TABLE (gcm_table, 11, H4); + STORE_TABLE (gcm_table, 12, H4h); +} + +ASM_FUNC_ATTR_INLINE +block +vec_perm2(block l, block r, vector16x_u8 perm) { + block ret; + __asm__ ("vperm %0,%1,%2,%3\n\t" + : "=v" (ret) + : "v" (l), "v" (r), "v" (perm)); + return ret; +} + +void ASM_FUNC_ATTR +_gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, + const byte *const buf, const size_t nblocks) +{ + /* This const is strange, it is reversing the bytes, and also reversing + the u32s that get switched by lxvw4 and it also addresses bytes big-endian, + and is here due to lack of proper peep-hole optimization. 
*/ + vector16x_u8 bswap_const = + { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; + vector16x_u8 bswap_8_const = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl; + block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur; + size_t blocks_remaining = nblocks, off = 0; + size_t not_multiple_of_four; + block t0; + + cur = vec_load_be (0, result, bswap_const); + + c2 = vec_aligned_ld (0, gcm_table); + H0l = vec_aligned_ld (16, gcm_table); + H0m = vec_aligned_ld (32, gcm_table); + H0h = vec_aligned_ld (48, gcm_table); + + for (not_multiple_of_four = nblocks % 4; not_multiple_of_four; + not_multiple_of_four--) + { + in = vec_load_be (off, buf, bswap_const); + off += 16; + blocks_remaining--; + cur ^= in; + + Hl = asm_vpmsumd (cur, H0l); + Hm = asm_vpmsumd (cur, H0m); + Hh = asm_vpmsumd (cur, H0h); + + t0 = asm_vpmsumd (Hl, c2); + + Hl ^= asm_rot_block_left (Hm); + + Hm_right = asm_rot_block_right (Hm); + Hh ^= Hm_right; + Hl_rotate = asm_swap_u64 (Hl); + Hl_rotate ^= t0; + Hl = asm_swap_u64 (Hl_rotate); + Hl_rotate = asm_vpmsumd (Hl_rotate, c2); + Hl ^= Hh; + Hl ^= Hl_rotate; + + cur = Hl; + } + + if (blocks_remaining > 0) + { + vector16x_u8 hiperm = + { + 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 + }; + vector16x_u8 loperm = + { + 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, + 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8 + }; + block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate; + block H21l, H21h, merge_l, merge_h; + + H2m = vec_aligned_ld (48 + 32, gcm_table); + H3l = vec_aligned_ld (48 * 2 + 16, gcm_table); + H3m = vec_aligned_ld (48 * 2 + 32, gcm_table); + H3h = vec_aligned_ld (48 * 2 + 48, gcm_table); + H4l = vec_aligned_ld (48 * 3 + 16, gcm_table); + H4m = vec_aligned_ld (48 * 3 + 32, gcm_table); + H4h = vec_aligned_ld (48 * 3 + 48, gcm_table); + + in0 = vec_load_be (off, buf, bswap_const); + in1 = vec_load_be (off + 16, buf, bswap_const); + in2 = vec_load_be (off + 32, buf, bswap_const); + in3 = vec_load_be (off + 48, buf, bswap_const); + blocks_remaining -= 4; + off += 64; + + Xh = in0 ^ cur; + + Xl1 = asm_vpmsumd (in1, H3l); + Xm1 = asm_vpmsumd (in1, H3m); + Xh1 = asm_vpmsumd (in1, H3h); + + H21l = vec_perm2 (H2m, H0m, hiperm); + H21h = vec_perm2 (H2m, H0m, loperm); + merge_l = vec_perm2 (in2, in3, loperm); + merge_h = vec_perm2 (in2, in3, hiperm); + + Xm2 = asm_vpmsumd (in2, H2m); + Xl3 = asm_vpmsumd (merge_l, H21l); + Xm3 = asm_vpmsumd (in3, H0m); + Xh3 = asm_vpmsumd (merge_h, H21h); + + Xm2 ^= Xm1; + Xl3 ^= Xl1; + Xm3 ^= Xm2; + Xh3 ^= Xh1; + + /* Gerald Estrin's scheme for parallel multiplication of polynomials */ + for (;blocks_remaining > 0; blocks_remaining -= 4, off += 64) + { + in0 = vec_load_be (off, buf, bswap_const); + in1 = vec_load_be (off + 16, buf, bswap_const); + in2 = vec_load_be (off + 32, buf, bswap_const); + in3 = vec_load_be (off + 48, buf, bswap_const); + + Xl = asm_vpmsumd (Xh, H4l); + Xm = asm_vpmsumd (Xh, H4m); + Xh = asm_vpmsumd (Xh, H4h); + Xl1 = asm_vpmsumd (in1, H3l); + Xm1 = asm_vpmsumd (in1, H3m); + Xh1 = asm_vpmsumd (in1, H3h); + + Xl ^= Xl3; + Xm ^= Xm3; + Xh ^= Xh3; + merge_l = vec_perm2 (in2, in3, loperm); + merge_h = vec_perm2 (in2, in3, hiperm); + + t0 = asm_vpmsumd (Xl, c2); + Xl3 = asm_vpmsumd (merge_l, H21l); + Xh3 = asm_vpmsumd (merge_h, H21h); + + Xl ^= asm_rot_block_left (Xm); + Xh ^= asm_rot_block_right (Xm); + + Xl = asm_swap_u64 (Xl); + Xl ^= t0; + + Xl_rotate = asm_swap_u64 (Xl); + Xm2 = 
asm_vpmsumd (in2, H2m); + Xm3 = asm_vpmsumd (in3, H0m); + Xl = asm_vpmsumd (Xl, c2); + + Xl3 ^= Xl1; + Xh3 ^= Xh1; + Xh ^= in0; + Xm2 ^= Xm1; + Xh ^= Xl_rotate; + Xm3 ^= Xm2; + Xh ^= Xl; + } + + Xl = asm_vpmsumd (Xh, H4l); + Xm = asm_vpmsumd (Xh, H4m); + Xh = asm_vpmsumd (Xh, H4h); + + Xl ^= Xl3; + Xm ^= Xm3; + + t0 = asm_vpmsumd (Xl, c2); + + Xh ^= Xh3; + Xl ^= asm_rot_block_left (Xm); + Xh ^= asm_rot_block_right (Xm); + + Xl = asm_swap_u64 (Xl); + Xl ^= t0; + + Xl_rotate = asm_swap_u64 (Xl); + Xl = asm_vpmsumd (Xl, c2); + Xl_rotate ^= Xh; + Xl ^= Xl_rotate; + + cur = Xl; + } + + cur = (block)vec_perm ((vector16x_u8)cur, (vector16x_u8)cur, bswap_8_const); + STORE_TABLE (result, 0, cur); +} + +#endif /* GCM_USE_PPC_VPMSUM */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 7aad1277..598ea5fb 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -109,6 +109,28 @@ ghash_s390x_kimd (gcry_cipher_hd_t c, byte *result, const byte *buf, } #endif /* GCM_USE_S390X_CRYPTO*/ +#ifdef GCM_USE_PPC_VPMSUM +extern void _gcry_ghash_setup_ppc_vpmsum (void *gcm_table, void *gcm_key); + +/* result is 128-bits */ +extern unsigned int _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table, + const byte *buf, size_t nblocks); + +static void +ghash_setup_ppc_vpmsum (gcry_cipher_hd_t c) +{ + _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, c->u_mode.gcm.u_ghash_key.key); +} + +static unsigned int +ghash_ppc_vpmsum (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf, + nblocks); + return 0; +} +#endif /* GCM_USE_PPC_VPMSUM */ #ifdef GCM_USE_TABLES static struct @@ -543,7 +565,7 @@ static void setupM (gcry_cipher_hd_t c) { #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ - defined(GCM_USE_S390X_CRYPTO) + defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif @@ -572,6 +594,13 @@ setupM (gcry_cipher_hd_t c) ghash_setup_armv7_neon (c); } #endif +#ifdef GCM_USE_PPC_VPMSUM + else if (features & HWF_PPC_VCRYPTO) + { + c->u_mode.gcm.ghash_fn = ghash_ppc_vpmsum; + ghash_setup_ppc_vpmsum (c); + } +#endif #ifdef GCM_USE_S390X_CRYPTO else if (features & HWF_S390X_MSA) { diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 59b36ce7..1d62b11e 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -102,6 +102,18 @@ # define GCM_USE_S390X_CRYPTO 1 #endif /* GCM_USE_S390X_CRYPTO */ +/* GCM_USE_PPC_VPMSUM indicates whether to compile GCM with PPC Power 8 + * polynomial multiplication instruction. */ +#undef GCM_USE_PPC_VPMSUM +#if defined(GCM_USE_TABLES) +#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && defined(__powerpc64__) && \ + !defined(WORDS_BIGENDIAN) && defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && __GNUC__ >= 4 +# define GCM_USE_PPC_VPMSUM 1 +# define NEED_16BYTE_ALIGNED_CONTEXT 1 /* this also aligns gcm_table */ +#endif +#endif /* GCM_USE_PPC_VPMSUM */ + typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); @@ -322,9 +334,6 @@ struct gcry_cipher_handle unsigned char key[MAX_BLOCKSIZE]; } u_ghash_key; - /* GHASH implementation in use. */ - ghash_fn_t ghash_fn; - /* Pre-calculated table for GCM. */ #ifdef GCM_USE_TABLES #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) @@ -335,6 +344,9 @@ struct gcry_cipher_handle u32 gcm_table[8 * 16]; #endif #endif + + /* GHASH implementation in use. 
*/ + ghash_fn_t ghash_fn; } gcm; /* Mode specific storage for OCB mode. */ diff --git a/configure.ac b/configure.ac index 564d361b..fb1a7a70 100644 --- a/configure.ac +++ b/configure.ac @@ -3058,6 +3058,19 @@ case "$mpi_cpu_arch" in ;; esac +# Arch specific GCM implementations +case "${host}" in + powerpc64le-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; + powerpc64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; + powerpc-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; +esac + LIST_MEMBER(sm3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo" diff --git a/tests/basic.c b/tests/basic.c index 2b543846..9a7e33cc 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -3233,6 +3233,22 @@ _check_gcm_cipher (unsigned int step) "\x0f\xc0\xc3\xb7\x80\xf2\x44\x45\x2d\xa3\xeb\xf1\xc5\xd8\x2c\xde" "\xa2\x41\x89\x97\x20\x0e\xf8\x2e\x44\xae\x7e\x3f", "\xa4\x4a\x82\x66\xee\x1c\x8e\xb0\xc8\xb5\xd4\xcf\x5a\xe9\xf1\x9a" }, + { GCRY_CIPHER_AES256, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08", + "\xca\xfe\xba\xbe\xfa\xce\xdb\xad\xde\xca\xf8\x88", 12, + "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", 20, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39", + 60, + "\x52\x2d\xc1\xf0\x99\x56\x7d\x07\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" + "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9\x75\x98\xa2\xbd\x25\x55\xd1\xaa" + "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d\xa7\xb0\x8b\x10\x56\x82\x88\x38" + "\xc5\xf6\x1e\x63\x93\xba\x7a\x0a\xbc\xc9\xf6\x62", + "\x76\xfc\x6e\xce\x0f\x4e\x17\x68\xcd\xdf\x88\x53\xbb\x2d\x55\x1b" }, /* Test vectors for overflowing CTR. */ /* After setiv, ctr_low: 0xffffffff */ { GCRY_CIPHER_AES256, -- 2.27.0 From jussi.kivilinna at iki.fi Sun Mar 7 17:44:08 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 7 Mar 2021 18:44:08 +0200 Subject: [PATCH 2/4] cipher-gcm-ppc: tweak for better performance In-Reply-To: <20210307164410.508295-1-jussi.kivilinna@iki.fi> References: <20210307164410.508295-1-jussi.kivilinna@iki.fi> Message-ID: <20210307164410.508295-2-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-ppc.c (asm_xor, asm_mergelo, asm_mergehi) (vec_be_swap, vec_load_he, vec_store_he): New. (vec_load_be, vec_perm2, vec_aligned_st,vec_aligned_ld): Remove. (asm_vpmsumd, asm_swap_u64, vec_perm2, asm_rot_block_left) (asm_rot_block_right, asm_ashl_128, vec_aligned_ld) (vec_aligned_st, vec_load_be): Use 'asm volatile'. (_gcry_ghash_setup_ppc_vpmsum): Update 'bswap_const'. (_gcry_ghash_ppc_vpmsum): Update 'bswap_const'; Use 'asm_mergehi' and 'asm_mergelo' instead of vec_perm2; Use 'asm_xor' for fast path to enforce instruction ordering; Use 'vec_load_he' and 'vec_be_swap' for big-endian loads. 
-- Benchmark on POWER8 (3700Mhz): Before: | nanosecs/byte mebibytes/sec cycles/byte GMAC_AES | 0.169 ns/B 5647 MiB/s 0.625 c/B After (~13% faster): | nanosecs/byte mebibytes/sec cycles/byte GMAC_AES | 0.149 ns/B 6385 MiB/s 0.553 c/B Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-ppc.c | 336 ++++++++++++++++++++++------------------ 1 file changed, 185 insertions(+), 151 deletions(-) diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c index ed27ef15..2f60c09d 100644 --- a/cipher/cipher-gcm-ppc.c +++ b/cipher/cipher-gcm-ppc.c @@ -93,112 +93,157 @@ typedef vector signed char vector16x_s8; typedef vector unsigned long long vector2x_u64; typedef vector unsigned long long block; +static ASM_FUNC_ATTR_INLINE block +asm_xor(block a, block b) +{ + block r; + __asm__ volatile ("xxlxor %x0, %x1, %x2" + : "=wa" (r) + : "wa" (a), "wa" (b)); + return r; +} + static ASM_FUNC_ATTR_INLINE block asm_vpmsumd(block a, block b) { block r; - __asm__("vpmsumd %0, %1, %2" - : "=v" (r) - : "v" (a), "v" (b)); + __asm__ volatile ("vpmsumd %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (b)); return r; } static ASM_FUNC_ATTR_INLINE block asm_swap_u64(block a) { - __asm__("xxswapd %x0, %x1" - : "=wa" (a) - : "wa" (a)); - return a; + block r; + __asm__ volatile ("xxswapd %x0, %x1" + : "=wa" (r) + : "wa" (a)); + return r; } static ASM_FUNC_ATTR_INLINE block -asm_rot_block_left(block a) +asm_mergelo(block l, block r) { - block zero = {0, 0}; - block mask = {2, 0}; - return __builtin_shuffle(a, zero, mask); + block ret; + __asm__ volatile ("xxmrgld %x0, %x1, %x2\n\t" + : "=wa" (ret) + : "wa" (l), "wa" (r)); + return ret; } static ASM_FUNC_ATTR_INLINE block -asm_rot_block_right(block a) +asm_mergehi(block l, block r) { - block zero = {0, 0}; - block mask = {1, 2}; - return __builtin_shuffle(a, zero, mask); + block ret; + __asm__ volatile ("xxmrghd %x0, %x1, %x2\n\t" + : "=wa" (ret) + : "wa" (l), "wa" (r)); + return ret; } -/* vsl is a slightly strange function in the way the shift is passed... */ static ASM_FUNC_ATTR_INLINE block -asm_ashl_128(block a, vector16x_u8 shift) +asm_rot_block_left(block a) { block r; - __asm__("vsl %0, %1, %2" - : "=v" (r) - : "v" (a), "v" (shift)); + block zero = { 0, 0 }; + __asm__ volatile ("xxmrgld %x0, %x1, %x2" + : "=wa" (r) + : "wa" (a), "wa" (zero)); return r; } -#define ALIGNED_LOAD(in_ptr) \ - (vec_aligned_ld (0, (const unsigned char *)(in_ptr))) +static ASM_FUNC_ATTR_INLINE block +asm_rot_block_right(block a) +{ + block r; + block zero = { 0, 0 }; + __asm__ volatile ("xxsldwi %x0, %x2, %x1, 2" + : "=wa" (r) + : "wa" (a), "wa" (zero)); + return r; +} +/* vsl is a slightly strange function in the way the shift is passed... 
*/ static ASM_FUNC_ATTR_INLINE block -vec_aligned_ld(unsigned long offset, const unsigned char *ptr) +asm_ashl_128(block a, vector16x_u8 shift) { -#ifndef WORDS_BIGENDIAN - block vec; - __asm__ ("lvx %0,%1,%2\n\t" - : "=v" (vec) - : "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); - return vec; -#else - return vec_vsx_ld (offset, ptr); -#endif + block r; + __asm__ volatile ("vsl %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (shift)); + return r; } #define STORE_TABLE(gcm_table, slot, vec) \ - vec_aligned_st (((block)vec), slot * 16, (unsigned char *)(gcm_table)); - + vec_store_he (((block)vec), slot * 16, (unsigned char *)(gcm_table)); static ASM_FUNC_ATTR_INLINE void -vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr) +vec_store_he(block vec, unsigned long offset, unsigned char *ptr) { #ifndef WORDS_BIGENDIAN - __asm__ ("stvx %0,%1,%2\n\t" - : - : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); + /* GCC vec_vsx_ld is generating two instructions on little-endian. Use + * lxvd2x directly instead. */ +#if __GNUC__ >= 4 + if (__builtin_constant_p (offset) && offset == 0) + __asm__ volatile ("stxvd2x %x0, 0, %1\n\t" + : + : "wa" (vec), "r" ((uintptr_t)ptr) + : "memory", "r0"); + else +#endif + __asm__ volatile ("stxvd2x %x0, %1, %2\n\t" + : + : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); #else vec_vsx_st ((vector16x_u8)vec, offset, ptr); #endif } #define VEC_LOAD_BE(in_ptr, bswap_const) \ - (vec_load_be (0, (const unsigned char *)(in_ptr), bswap_const)) + vec_be_swap(vec_load_he (0, (const unsigned char *)(in_ptr)), bswap_const) static ASM_FUNC_ATTR_INLINE block -vec_load_be(unsigned long offset, const unsigned char *ptr, - vector unsigned char be_bswap_const) +vec_load_he(unsigned long offset, const unsigned char *ptr) { #ifndef WORDS_BIGENDIAN block vec; /* GCC vec_vsx_ld is generating two instructions on little-endian. Use - * lxvw4x directly instead. */ - __asm__ ("lxvw4x %x0,%1,%2\n\t" - : "=wa" (vec) - : "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); - __asm__ ("vperm %0,%1,%1,%2\n\t" - : "=v" (vec) - : "v" (vec), "v" (be_bswap_const)); + * lxvd2x directly instead. */ +#if __GNUC__ >= 4 + if (__builtin_constant_p (offset) && offset == 0) + __asm__ volatile ("lxvd2x %x0, 0, %1\n\t" + : "=wa" (vec) + : "r" ((uintptr_t)ptr) + : "memory", "r0"); + else +#endif + __asm__ volatile ("lxvd2x %x0, %1, %2\n\t" + : "=wa" (vec) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); return vec; #else - (void)be_bswap_const; return vec_vsx_ld (offset, ptr); #endif } +static ASM_FUNC_ATTR_INLINE block +vec_be_swap(block vec, vector16x_u8 be_bswap_const) +{ +#ifndef WORDS_BIGENDIAN + __asm__ volatile ("vperm %0, %1, %1, %2\n\t" + : "=v" (vec) + : "v" (vec), "v" (be_bswap_const)); +#else + (void)be_bswap_const; +#endif + return vec; +} + + /* Power ghash based on papers: "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega "Intel? 
Carry-Less Multiplication Instruction and its Usage for Computing @@ -216,15 +261,16 @@ vec_load_be(unsigned long offset, const unsigned char *ptr, void ASM_FUNC_ATTR _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) { - vector16x_u8 bswap_const = - { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; - vector16x_u8 c2 = + static const vector16x_u8 bswap_const = + { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; + static const vector16x_u8 c2 = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 }; + static const vector16x_u8 one = + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; block T0, T1, T2; block C2, H, H1, H1l, H1h, H2, H2l, H2h; block H3l, H3, H3h, H4l, H4, H4h, T3, T4; vector16x_s8 most_sig_of_H, t7, carry; - vector16x_u8 one = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; H = VEC_LOAD_BE(gcm_key, bswap_const); most_sig_of_H = vec_splat((vector16x_s8)H, 15); @@ -255,7 +301,7 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) /* reduce 1 */ T0 = asm_vpmsumd (H2l, C2); - H2l ^= asm_rot_block_left (H2);; + H2l ^= asm_rot_block_left (H2); H2h ^= asm_rot_block_right (H2); H2l = asm_swap_u64 (H2l); H2l ^= T0; @@ -321,45 +367,30 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) STORE_TABLE (gcm_table, 12, H4h); } -ASM_FUNC_ATTR_INLINE -block -vec_perm2(block l, block r, vector16x_u8 perm) { - block ret; - __asm__ ("vperm %0,%1,%2,%3\n\t" - : "=v" (ret) - : "v" (l), "v" (r), "v" (perm)); - return ret; -} - void ASM_FUNC_ATTR -_gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, - const byte *const buf, const size_t nblocks) +_gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table, + const byte *buf, const size_t nblocks) { - /* This const is strange, it is reversing the bytes, and also reversing - the u32s that get switched by lxvw4 and it also addresses bytes big-endian, - and is here due to lack of proper peep-hole optimization. 
*/ - vector16x_u8 bswap_const = - { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; - vector16x_u8 bswap_8_const = - { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + static const vector16x_u8 bswap_const = + { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl; block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur; - size_t blocks_remaining = nblocks, off = 0; + size_t blocks_remaining = nblocks; size_t not_multiple_of_four; block t0; - cur = vec_load_be (0, result, bswap_const); + cur = vec_be_swap (vec_load_he (0, result), bswap_const); - c2 = vec_aligned_ld (0, gcm_table); - H0l = vec_aligned_ld (16, gcm_table); - H0m = vec_aligned_ld (32, gcm_table); - H0h = vec_aligned_ld (48, gcm_table); + c2 = vec_load_he (0, gcm_table); + H0l = vec_load_he (16, gcm_table); + H0m = vec_load_he (32, gcm_table); + H0h = vec_load_he (48, gcm_table); for (not_multiple_of_four = nblocks % 4; not_multiple_of_four; not_multiple_of_four--) { - in = vec_load_be (off, buf, bswap_const); - off += 16; + in = vec_be_swap (vec_load_he (0, buf), bswap_const); + buf += 16; blocks_remaining--; cur ^= in; @@ -385,62 +416,64 @@ _gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, if (blocks_remaining > 0) { - vector16x_u8 hiperm = - { - 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 - }; - vector16x_u8 loperm = - { - 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, - 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8 - }; block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate; block H21l, H21h, merge_l, merge_h; - - H2m = vec_aligned_ld (48 + 32, gcm_table); - H3l = vec_aligned_ld (48 * 2 + 16, gcm_table); - H3m = vec_aligned_ld (48 * 2 + 32, gcm_table); - H3h = vec_aligned_ld (48 * 2 + 48, gcm_table); - H4l = vec_aligned_ld (48 * 3 + 16, gcm_table); - H4m = vec_aligned_ld (48 * 3 + 32, gcm_table); - H4h = vec_aligned_ld (48 * 3 + 48, gcm_table); - - in0 = vec_load_be (off, buf, bswap_const); - in1 = vec_load_be (off + 16, buf, bswap_const); - in2 = vec_load_be (off + 32, buf, bswap_const); - in3 = vec_load_be (off + 48, buf, bswap_const); - blocks_remaining -= 4; - off += 64; - - Xh = in0 ^ cur; + block t1, t2; + + H2m = vec_load_he (48 + 32, gcm_table); + H3l = vec_load_he (48 * 2 + 16, gcm_table); + H3m = vec_load_he (48 * 2 + 32, gcm_table); + H3h = vec_load_he (48 * 2 + 48, gcm_table); + H4l = vec_load_he (48 * 3 + 16, gcm_table); + H4m = vec_load_he (48 * 3 + 32, gcm_table); + H4h = vec_load_he (48 * 3 + 48, gcm_table); + + in0 = vec_load_he (0, buf); + in1 = vec_load_he (16, buf); + in2 = vec_load_he (32, buf); + in3 = vec_load_he (48, buf); + in0 = vec_be_swap(in0, bswap_const); + in1 = vec_be_swap(in1, bswap_const); + in2 = vec_be_swap(in2, bswap_const); + in3 = vec_be_swap(in3, bswap_const); + + Xh = asm_xor (in0, cur); Xl1 = asm_vpmsumd (in1, H3l); Xm1 = asm_vpmsumd (in1, H3m); Xh1 = asm_vpmsumd (in1, H3h); - H21l = vec_perm2 (H2m, H0m, hiperm); - H21h = vec_perm2 (H2m, H0m, loperm); - merge_l = vec_perm2 (in2, in3, loperm); - merge_h = vec_perm2 (in2, in3, hiperm); + H21l = asm_mergehi (H2m, H0m); + H21h = asm_mergelo (H2m, H0m); + merge_l = asm_mergelo (in2, in3); + merge_h = asm_mergehi (in2, in3); Xm2 = asm_vpmsumd (in2, H2m); Xl3 = asm_vpmsumd (merge_l, H21l); Xm3 = asm_vpmsumd (in3, H0m); Xh3 = asm_vpmsumd (merge_h, H21h); - Xm2 ^= Xm1; - Xl3 ^= Xl1; - Xm3 ^= Xm2; - Xh3 ^= Xh1; + Xm2 = asm_xor (Xm2, Xm1); + Xl3 = asm_xor (Xl3, Xl1); + 
Xm3 = asm_xor (Xm3, Xm2); + Xh3 = asm_xor (Xh3, Xh1); /* Gerald Estrin's scheme for parallel multiplication of polynomials */ - for (;blocks_remaining > 0; blocks_remaining -= 4, off += 64) + while (1) { - in0 = vec_load_be (off, buf, bswap_const); - in1 = vec_load_be (off + 16, buf, bswap_const); - in2 = vec_load_be (off + 32, buf, bswap_const); - in3 = vec_load_be (off + 48, buf, bswap_const); + buf += 64; + blocks_remaining -= 4; + if (!blocks_remaining) + break; + + in0 = vec_load_he (0, buf); + in1 = vec_load_he (16, buf); + in2 = vec_load_he (32, buf); + in3 = vec_load_he (48, buf); + in1 = vec_be_swap(in1, bswap_const); + in2 = vec_be_swap(in2, bswap_const); + in3 = vec_be_swap(in3, bswap_const); + in0 = vec_be_swap(in0, bswap_const); Xl = asm_vpmsumd (Xh, H4l); Xm = asm_vpmsumd (Xh, H4m); @@ -449,62 +482,63 @@ _gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, Xm1 = asm_vpmsumd (in1, H3m); Xh1 = asm_vpmsumd (in1, H3h); - Xl ^= Xl3; - Xm ^= Xm3; - Xh ^= Xh3; - merge_l = vec_perm2 (in2, in3, loperm); - merge_h = vec_perm2 (in2, in3, hiperm); + Xl = asm_xor (Xl, Xl3); + Xm = asm_xor (Xm, Xm3); + Xh = asm_xor (Xh, Xh3); + merge_l = asm_mergelo (in2, in3); + merge_h = asm_mergehi (in2, in3); t0 = asm_vpmsumd (Xl, c2); Xl3 = asm_vpmsumd (merge_l, H21l); Xh3 = asm_vpmsumd (merge_h, H21h); - Xl ^= asm_rot_block_left (Xm); - Xh ^= asm_rot_block_right (Xm); + t1 = asm_rot_block_left (Xm); + t2 = asm_rot_block_right (Xm); + Xl = asm_xor(Xl, t1); + Xh = asm_xor(Xh, t2); Xl = asm_swap_u64 (Xl); - Xl ^= t0; + Xl = asm_xor(Xl, t0); Xl_rotate = asm_swap_u64 (Xl); Xm2 = asm_vpmsumd (in2, H2m); Xm3 = asm_vpmsumd (in3, H0m); Xl = asm_vpmsumd (Xl, c2); - Xl3 ^= Xl1; - Xh3 ^= Xh1; - Xh ^= in0; - Xm2 ^= Xm1; - Xh ^= Xl_rotate; - Xm3 ^= Xm2; - Xh ^= Xl; + Xl3 = asm_xor (Xl3, Xl1); + Xh3 = asm_xor (Xh3, Xh1); + Xh = asm_xor (Xh, in0); + Xm2 = asm_xor (Xm2, Xm1); + Xh = asm_xor (Xh, Xl_rotate); + Xm3 = asm_xor (Xm3, Xm2); + Xh = asm_xor (Xh, Xl); } Xl = asm_vpmsumd (Xh, H4l); Xm = asm_vpmsumd (Xh, H4m); Xh = asm_vpmsumd (Xh, H4h); - Xl ^= Xl3; - Xm ^= Xm3; + Xl = asm_xor (Xl, Xl3); + Xm = asm_xor (Xm, Xm3); t0 = asm_vpmsumd (Xl, c2); - Xh ^= Xh3; - Xl ^= asm_rot_block_left (Xm); - Xh ^= asm_rot_block_right (Xm); + Xh = asm_xor (Xh, Xh3); + t1 = asm_rot_block_left (Xm); + t2 = asm_rot_block_right (Xm); + Xl = asm_xor (Xl, t1); + Xh = asm_xor (Xh, t2); Xl = asm_swap_u64 (Xl); - Xl ^= t0; + Xl = asm_xor (Xl, t0); Xl_rotate = asm_swap_u64 (Xl); Xl = asm_vpmsumd (Xl, c2); - Xl_rotate ^= Xh; - Xl ^= Xl_rotate; - - cur = Xl; + Xh = asm_xor (Xh, Xl_rotate); + cur = asm_xor (Xh, Xl); } - cur = (block)vec_perm ((vector16x_u8)cur, (vector16x_u8)cur, bswap_8_const); - STORE_TABLE (result, 0, cur); + vec_store_he (vec_be_swap (cur, bswap_const), 0, result); } #endif /* GCM_USE_PPC_VPMSUM */ -- 2.27.0 From jussi.kivilinna at iki.fi Sun Mar 7 17:44:09 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 7 Mar 2021 18:44:09 +0200 Subject: [PATCH 3/4] configure.ac: fix digest implementations going to cipher list In-Reply-To: <20210307164410.508295-1-jussi.kivilinna@iki.fi> References: <20210307164410.508295-1-jussi.kivilinna@iki.fi> Message-ID: <20210307164410.508295-3-jussi.kivilinna@iki.fi> * configure.ac: Add 'crc-arm*.lo', 'crc-ppc.lo', 'sha*-ppc.lo' to GCRYPT_DIGESTS instead of GCRYPT_CIPHERS. 
-- Signed-off-by: Jussi Kivilinna --- configure.ac | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/configure.ac b/configure.ac index fb1a7a70..3734cf7a 100644 --- a/configure.ac +++ b/configure.ac @@ -2837,17 +2837,17 @@ if test "$found" = "1" ; then ;; aarch64-*-*) # Build with the assembly implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-ce.lo" - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-aarch64-ce.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-armv8-ce.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo" ;; powerpc64-*-*) - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo" ;; powerpc-*-*) - GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo" ;; esac fi @@ -2914,17 +2914,17 @@ if test "$found" = "1" ; then ;; powerpc64le-*-*) # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo" esac case "$mpi_cpu_arch" in @@ -2957,17 +2957,17 @@ if test "$found" = "1" ; then ;; powerpc64le-*-*) # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo" esac if test x"$neonsupport" = xyes ; then -- 2.27.0 From jussi.kivilinna at iki.fi Sun Mar 7 17:44:10 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 7 Mar 2021 18:44:10 +0200 Subject: [PATCH 4/4] Compile arch specific GCM implementations only on target arch In-Reply-To: <20210307164410.508295-1-jussi.kivilinna@iki.fi> References: <20210307164410.508295-1-jussi.kivilinna@iki.fi> Message-ID: <20210307164410.508295-4-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Move arch specific 'cipher-gcm-*.[cS]' files from libcipher_la_SOURCES to EXTRA_libcipher_la_SOURCES. * configure.ac: Add 'cipher-gcm-intel-pclmul.lo' and 'cipher-gcm-arm*.lo'. 
-- Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 6 +++--- configure.ac | 16 ++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cipher/Makefile.am b/cipher/Makefile.am index da8cc126..52a00aa9 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -48,8 +48,7 @@ libcipher_la_SOURCES = \ cipher-aeswrap.c \ cipher-ccm.c \ cipher-cmac.c \ - cipher-gcm.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ - cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ + cipher-gcm.c \ cipher-poly1305.c \ cipher-ocb.c \ cipher-xts.c \ @@ -83,7 +82,8 @@ EXTRA_libcipher_la_SOURCES = \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-armv7-neon.S chacha20-aarch64.S \ chacha20-ppc.c chacha20-s390x.S \ - cipher-gcm-ppc.c \ + cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ + cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ diff --git a/configure.ac b/configure.ac index 3734cf7a..44ffffa7 100644 --- a/configure.ac +++ b/configure.ac @@ -3060,14 +3060,18 @@ esac # Arch specific GCM implementations case "${host}" in - powerpc64le-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + i?86-*-* | x86_64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-intel-pclmul.lo" ;; - powerpc64-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + arm*-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv7-neon.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv8-aarch32-ce.lo" + ;; + aarch64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv8-aarch64-ce.lo" ;; - powerpc-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + powerpc64le-*-* | powerpc64-*-* | powerpc-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" ;; esac -- 2.27.0 From jussi.kivilinna at iki.fi Sat Mar 27 11:40:33 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Mar 2021 12:40:33 +0200 Subject: [PATCH] cipher-gcm-ppc: add big-endian support Message-ID: <20210327104033.70161-1-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-ppc.c (ALIGNED_16): New. (vec_store_he, vec_load_he): Remove WORDS_BIGENDIAN ifdef. (vec_dup_byte_elem): New. (_gcry_ghash_setup_ppc_vpmsum): Match function declaration with prototype in cipher-gcm.c; Load C2 with VEC_LOAD_BE; Use vec_dup_byte_elem; Align constants to 16 bytes. (_gcry_ghash_ppc_vpmsum): Match function declaration with prototype in cipher-gcm.c; Align constant to 16 bytes. * cipher/cipher-gcm.c (ghash_ppc_vpmsum): Return value from _gcry_ghash_ppc_vpmsum. * cipher/cipher-internal.h (GCM_USE_PPC_VPMSUM): Remove requirement for !WORDS_BIGENDIAN. 
-- Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-ppc.c | 45 +++++++++++++++++++++++----------------- cipher/cipher-gcm.c | 8 +++---- cipher/cipher-internal.h | 2 +- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c index 2f60c09d..4f75e95c 100644 --- a/cipher/cipher-gcm-ppc.c +++ b/cipher/cipher-gcm-ppc.c @@ -88,6 +88,8 @@ #define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ALIGNED_16 __attribute__ ((aligned (16))) + typedef vector unsigned char vector16x_u8; typedef vector signed char vector16x_s8; typedef vector unsigned long long vector2x_u64; @@ -182,7 +184,6 @@ asm_ashl_128(block a, vector16x_u8 shift) static ASM_FUNC_ATTR_INLINE void vec_store_he(block vec, unsigned long offset, unsigned char *ptr) { -#ifndef WORDS_BIGENDIAN /* GCC vec_vsx_ld is generating two instructions on little-endian. Use * lxvd2x directly instead. */ #if __GNUC__ >= 4 @@ -197,9 +198,6 @@ vec_store_he(block vec, unsigned long offset, unsigned char *ptr) : : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr) : "memory", "r0"); -#else - vec_vsx_st ((vector16x_u8)vec, offset, ptr); -#endif } #define VEC_LOAD_BE(in_ptr, bswap_const) \ @@ -208,7 +206,6 @@ vec_store_he(block vec, unsigned long offset, unsigned char *ptr) static ASM_FUNC_ATTR_INLINE block vec_load_he(unsigned long offset, const unsigned char *ptr) { -#ifndef WORDS_BIGENDIAN block vec; /* GCC vec_vsx_ld is generating two instructions on little-endian. Use * lxvd2x directly instead. */ @@ -225,9 +222,6 @@ vec_load_he(unsigned long offset, const unsigned char *ptr) : "r" (offset), "r" ((uintptr_t)ptr) : "memory", "r0"); return vec; -#else - return vec_vsx_ld (offset, ptr); -#endif } static ASM_FUNC_ATTR_INLINE block @@ -243,6 +237,15 @@ vec_be_swap(block vec, vector16x_u8 be_bswap_const) return vec; } +static ASM_FUNC_ATTR_INLINE block +vec_dup_byte_elem(block vec, int idx) +{ +#ifndef WORDS_BIGENDIAN + return (block)vec_splat((vector16x_s8)vec, idx); +#else + return (block)vec_splat((vector16x_s8)vec, (15 - idx) & 15); +#endif +} /* Power ghash based on papers: "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega @@ -259,31 +262,33 @@ vec_be_swap(block vec, vector16x_u8 be_bswap_const) The ghash "key" is a salt. */ void ASM_FUNC_ATTR -_gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) +_gcry_ghash_setup_ppc_vpmsum (void *gcm_table_arg, void *gcm_key) { - static const vector16x_u8 bswap_const = + static const vector16x_u8 bswap_const ALIGNED_16 = { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; - static const vector16x_u8 c2 = - { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 }; - static const vector16x_u8 one = + static const byte c2[16] ALIGNED_16 = + { 0xc2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; + static const vector16x_u8 one ALIGNED_16 = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + uint64_t *gcm_table = gcm_table_arg; block T0, T1, T2; block C2, H, H1, H1l, H1h, H2, H2l, H2h; block H3l, H3, H3h, H4l, H4, H4h, T3, T4; vector16x_s8 most_sig_of_H, t7, carry; H = VEC_LOAD_BE(gcm_key, bswap_const); - most_sig_of_H = vec_splat((vector16x_s8)H, 15); + C2 = VEC_LOAD_BE(c2, bswap_const); + most_sig_of_H = (vector16x_s8)vec_dup_byte_elem(H, 15); t7 = vec_splat_s8(7); carry = most_sig_of_H >> t7; - carry &= c2; /* only interested in certain carries. */ + carry &= (vector16x_s8)C2; /* only interested in certain carries. 
*/ H1 = asm_ashl_128(H, one); H1 ^= (block)carry; /* complete the <<< 1 */ T1 = asm_swap_u64 (H1); H1l = asm_rot_block_right (T1); H1h = asm_rot_block_left (T1); - C2 = asm_rot_block_right ((block)c2); + C2 = asm_rot_block_right (C2); STORE_TABLE (gcm_table, 0, C2); STORE_TABLE (gcm_table, 1, H1l); @@ -367,11 +372,11 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) STORE_TABLE (gcm_table, 12, H4h); } -void ASM_FUNC_ATTR -_gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table, +unsigned int ASM_FUNC_ATTR +_gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table, const byte *buf, const size_t nblocks) { - static const vector16x_u8 bswap_const = + static const vector16x_u8 bswap_const ALIGNED_16 = { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl; block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur; @@ -539,6 +544,8 @@ _gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table, } vec_store_he (vec_be_swap (cur, bswap_const), 0, result); + + return 0; } #endif /* GCM_USE_PPC_VPMSUM */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 598ea5fb..4ce85408 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -119,16 +119,16 @@ extern unsigned int _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table, static void ghash_setup_ppc_vpmsum (gcry_cipher_hd_t c) { - _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, c->u_mode.gcm.u_ghash_key.key); + _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, + c->u_mode.gcm.u_ghash_key.key); } static unsigned int ghash_ppc_vpmsum (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { - _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf, - nblocks); - return 0; + return _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf, + nblocks); } #endif /* GCM_USE_PPC_VPMSUM */ diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 1d62b11e..0e4a90fc 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -107,7 +107,7 @@ #undef GCM_USE_PPC_VPMSUM #if defined(GCM_USE_TABLES) #if defined(ENABLE_PPC_CRYPTO_SUPPORT) && defined(__powerpc64__) && \ - !defined(WORDS_BIGENDIAN) && defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && __GNUC__ >= 4 # define GCM_USE_PPC_VPMSUM 1 # define NEED_16BYTE_ALIGNED_CONTEXT 1 /* this also aligns gcm_table */ -- 2.27.0 From jussi.kivilinna at iki.fi Sat Mar 27 15:31:05 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Mar 2021 16:31:05 +0200 Subject: [PATCH] Use memset for short constant length wipememory2 Message-ID: <20210327143105.91148-1-jussi.kivilinna@iki.fi> * src/g10lib.h (fast_wipememory2_inline): New. (wipememory2): Use 'fast_wipememory2_inline', remove 'fast_wipememory2' use; Use _gcry_fast_wipememory* only when _len is not constant. (fast_wipememory_s, fast_wipememory2): Remove. -- Use of memset allows better code generation by compiler - for example, use of vector registers for memory clearing. 
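For readers unfamiliar with the idiom, a minimal standalone sketch of the pattern this change relies on is below (the function name is illustrative; the real change is the fast_wipememory2_inline macro in the g10lib.h diff that follows). The memset does the clearing, and the empty asm statement that consumes the pointer and clobbers "memory" keeps the compiler from discarding the memset as a dead store.

#include <string.h>

/* Sketch only: wipe a buffer without the stores being optimized away.
 * The asm block tells the compiler the buffer may still be read, so the
 * preceding memset cannot be eliminated as a dead store. */
static void
wipe_sketch (void *buf, size_t len)
{
  memset (buf, 0, len);
  __asm__ volatile ("\n" : : "r" (buf) : "memory");
}
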
Signed-off-by: Jussi Kivilinna --- src/g10lib.h | 59 ++++++++++++++++------------------------------------ 1 file changed, 18 insertions(+), 41 deletions(-) diff --git a/src/g10lib.h b/src/g10lib.h index b0b73852..bc6e378f 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -359,56 +359,33 @@ void __gcry_burn_stack (unsigned int bytes); __gcry_burn_stack_dummy (); } while(0) /* To avoid that a compiler optimizes certain memset calls away, these - macros may be used instead. For small constant length buffers, - memory wiping is inlined. For non-constant or large length buffers, - memory is wiped with memset through _gcry_fast_wipememory. */ + macros may be used instead. For constant length buffers, memory + wiping is inlined. For non-constant or large length buffers, + memory is wiped through _gcry_fast_wipememory. */ +#ifdef HAVE_GCC_ASM_VOLATILE_MEMORY +#define fast_wipememory2_inline(_ptr,_set,_len) do { \ + memset((_ptr), (_set), (_len)); \ + asm volatile ("\n" :: "r" (_ptr) : "memory"); \ + } while(0) +#else +#define fast_wipememory2_inline(_ptr,_set,_len) \ + _gcry_fast_wipememory2((void *)_ptr, _set, _len) +#endif #define wipememory2(_ptr,_set,_len) do { \ - if (!CONSTANT_P(_len) || _len > 64) { \ + if (!CONSTANT_P(_len)) { \ if (CONSTANT_P(_set) && (_set) == 0) \ - _gcry_fast_wipememory((void *)_ptr, _len); \ + _gcry_fast_wipememory((void *)(_ptr), (_len)); \ else \ - _gcry_fast_wipememory2((void *)_ptr, _set, _len); \ - } else {\ - volatile char *_vptr = (volatile char *)(_ptr); \ - size_t _vlen = (_len); \ - const unsigned char _vset = (_set); \ - fast_wipememory2(_vptr, _vset, _vlen); \ - while(_vlen) { *_vptr = (_vset); _vptr++; _vlen--; } \ + _gcry_fast_wipememory2((void *)(_ptr), (_set), (_len)); \ + } else { \ + fast_wipememory2_inline((void *)(_ptr), (_set), (_len)); \ } \ } while(0) -#define wipememory(_ptr,_len) wipememory2(_ptr,0,_len) +#define wipememory(_ptr,_len) wipememory2((_ptr),0,(_len)) void _gcry_fast_wipememory(void *ptr, size_t len); void _gcry_fast_wipememory2(void *ptr, int set, size_t len); -#if defined(HAVE_GCC_ATTRIBUTE_PACKED) && \ - defined(HAVE_GCC_ATTRIBUTE_ALIGNED) && \ - defined(HAVE_GCC_ATTRIBUTE_MAY_ALIAS) -typedef struct fast_wipememory_s -{ - u64 a; -} __attribute__((packed, aligned(1), may_alias)) fast_wipememory_t; -/* fast_wipememory may leave tail bytes unhandled, in which case tail bytes - are handled by wipememory. */ -# define fast_wipememory2(_vptr,_vset,_vlen) do { \ - fast_wipememory_t _vset_long; \ - if (_vlen < sizeof(fast_wipememory_t)) \ - break; \ - _vset_long.a = (_vset); \ - _vset_long.a *= U64_C(0x0101010101010101); \ - do { \ - volatile fast_wipememory_t *_vptr_long = \ - (volatile void *)_vptr; \ - _vptr_long->a = _vset_long.a; \ - _vlen -= sizeof(fast_wipememory_t); \ - _vptr += sizeof(fast_wipememory_t); \ - } while (_vlen >= sizeof(fast_wipememory_t)); \ - } while (0) -#else -# define fast_wipememory2(_vptr,_vset,_vlen) -#endif - - /* Digit predicates. */ #define digitp(p) (*(p) >= '0' && *(p) <= '9') -- 2.27.0 From jussi.kivilinna at iki.fi Sat Mar 27 15:32:44 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Mar 2021 16:32:44 +0200 Subject: [PATCH] Add XOF output support for internal _gcry_md_hash_buffers Message-ID: <20210327143244.91421-1-jussi.kivilinna@iki.fi> * cipher/ecc-eddsa.c (_gcry_ecc_eddsa_compute_h_d, _gcry_ecc_eddsa_sign) (_gcry_ecc_eddsa_verify): Use same _gcry_md_hash_buffers_extract code path for SHA512 and SHAKE256. * cipher/md.c (_gcry_md_hash_buffers): Rename to ... 
(_gcry_md_hash_buffers_extract): ... this; Add digestlen and handling for XOF algorithms (SHAKE128, SHAKE256). (_gcry_md_hash_buffers): New. * src/gcrypt-int.h (_gcry_md_hash_buffers_extract): New. -- Signed-off-by: Jussi Kivilinna --- cipher/ecc-eddsa.c | 440 +++++++++++++++++---------------------------- cipher/md.c | 51 ++++-- src/gcrypt-int.h | 4 + 3 files changed, 212 insertions(+), 283 deletions(-) diff --git a/cipher/ecc-eddsa.c b/cipher/ecc-eddsa.c index 2a1a8907..baea1bf5 100644 --- a/cipher/ecc-eddsa.c +++ b/cipher/ecc-eddsa.c @@ -500,7 +500,8 @@ _gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, mpi_ec_t ec) unsigned char *rawmpi = NULL; unsigned int rawmpilen; unsigned char *digest; - int hashalgo, b; + int hashalgo, b, digestlen; + gcry_buffer_t hvec[2]; *r_digest = NULL; @@ -511,11 +512,15 @@ _gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, mpi_ec_t ec) * For now, it's determine by the bit size of the field. */ if (ec->nbits == 255) - hashalgo = GCRY_MD_SHA512; + { + hashalgo = GCRY_MD_SHA512; + digestlen = 64; + } else if (ec->nbits == 448) { b++; hashalgo = GCRY_MD_SHAKE256; + digestlen = 2 * b; } else return GPG_ERR_NOT_IMPLEMENTED; @@ -533,35 +538,14 @@ _gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, mpi_ec_t ec) return gpg_err_code_from_syserror (); } - if (hashalgo == GCRY_MD_SHAKE256) - { - gcry_error_t err; - gcry_md_hd_t hd; + memset (hvec, 0, sizeof hvec); - err = _gcry_md_open (&hd, hashalgo, 0); - if (err) - rc = gcry_err_code (err); - else - { - _gcry_md_write (hd, rawmpi, rawmpilen); - _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b); - _gcry_md_close (hd); - rc = 0; - } - } - else - { - gcry_buffer_t hvec[2]; - - memset (hvec, 0, sizeof hvec); - - hvec[0].data = digest; - hvec[0].len = b > rawmpilen? b - rawmpilen : 0; - hvec[1].data = rawmpi; - hvec[1].len = rawmpilen; - rc = _gcry_md_hash_buffers (hashalgo, 0, digest, hvec, 2); - } + hvec[0].data = digest; + hvec[0].len = (hashalgo == GCRY_MD_SHA512 && b > rawmpilen) + ? b - rawmpilen : 0; + hvec[1].data = rawmpi; + hvec[1].len = rawmpilen; + rc = _gcry_md_hash_buffers_extract (hashalgo, 0, digest, digestlen, hvec, 2); xfree (rawmpi); if (rc) @@ -702,16 +686,29 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec, unsigned int encpklen; mpi_point_struct I; /* Intermediate value. 
*/ gcry_mpi_t a, x, y, r; - int b; + const char *dom; + int domlen, digestlen; + int b, i; unsigned char x_olen[2]; unsigned char prehashed_msg[64]; + gcry_buffer_t hvec[6]; + gcry_buffer_t hvec2[1]; b = (ec->nbits+7)/8; if (ec->nbits == 255) - ; + { + dom = DOM25519; + domlen = DOM25519_LEN; + digestlen = 64; + } else if (ec->nbits == 448) - b++; + { + b++; + dom = DOM448; + domlen = DOM448_LEN; + digestlen = 2 * b; + } else return GPG_ERR_NOT_IMPLEMENTED; @@ -751,98 +748,58 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec, if (DBG_CIPHER) log_printhex (" m", mbuf, mlen); - if (ctx->hash_algo == GCRY_MD_SHAKE256) + memset (hvec, 0, sizeof hvec); + i = 0; + + if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen || ec->nbits == 448) { - gcry_error_t err; - gcry_md_hd_t hd; + hvec[i].data = (void *)dom; + hvec[i].len = domlen; + i++; + x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); + x_olen[1] = ctx->labellen; + hvec[i].data = x_olen; + hvec[i].len = 2; + i++; + if (ctx->labellen) + { + hvec[i].data = ctx->label; + hvec[i].len = ctx->labellen; + i++; + } + } - err = _gcry_md_open (&hd, ctx->hash_algo, 0); - if (err) - rc = gcry_err_code (err); - else - { - _gcry_md_write (hd, DOM448, DOM448_LEN); - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - _gcry_md_write (hd, x_olen, 2); - if (ctx->labellen) - _gcry_md_write (hd, ctx->label, ctx->labellen); - _gcry_md_write (hd, digest+b, b); - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - gcry_md_hd_t hd2; + hvec[i].data = digest; + hvec[i].off = b; + hvec[i].len = b; + i++; + if ((ctx->flags & PUBKEY_FLAG_PREHASH)) + { + memset (hvec2, 0, sizeof hvec2); - err = _gcry_md_open (&hd2, ctx->hash_algo, 0); - if (err) - { - rc = gcry_err_code (err); - _gcry_md_close (hd); - goto leave; - } - _gcry_md_write (hd2, mbuf, mlen); - _gcry_md_ctl (hd2, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd2, GCRY_MD_SHAKE256, prehashed_msg, 64); - _gcry_md_close (hd2); - _gcry_md_write (hd, prehashed_msg, 64); - } - else - _gcry_md_write (hd, mbuf, mlen); - _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b); - _gcry_md_close (hd); - rc = 0; - } + hvec2[0].data = (char*)mbuf; + hvec2[0].len = mlen; + + _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, prehashed_msg, 64, + hvec2, 1); + hvec[i].data = (char*)prehashed_msg; + hvec[i].len = 64; } else { - gcry_buffer_t hvec[6]; - int i = 0; - - memset (hvec, 0, sizeof hvec); - - if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen) - { - hvec[i].data = (void *)DOM25519; - hvec[i].len = DOM25519_LEN; - i++; - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - hvec[i].data = x_olen; - hvec[i].len = 2; - i++; - if (ctx->labellen) - { - hvec[i].data = ctx->label; - hvec[i].len = ctx->labellen; - i++; - } - } - - hvec[i].data = digest; - hvec[i].off = b; - hvec[i].len = b; - i++; - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - _gcry_md_hash_buffer (ctx->hash_algo, prehashed_msg, mbuf, mlen); - hvec[i].data = (char*)prehashed_msg; - hvec[i].len = 64; - } - else - { - hvec[i].data = (char*)mbuf; - hvec[i].len = mlen; - } - i++; - rc = _gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i); + hvec[i].data = (char*)mbuf; + hvec[i].len = mlen; } + i++; + rc = _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, digest, digestlen, + hvec, i); if (rc) goto leave; - reverse_buffer (digest, 2*b); + reverse_buffer (digest, digestlen); if (DBG_CIPHER) - log_printhex (" r", digest, 2*b); - _gcry_mpi_set_buffer 
(r, digest, 2*b, 0); + log_printhex (" r", digest, digestlen); + _gcry_mpi_set_buffer (r, digest, digestlen, 0); mpi_mod (r, r, ec->n); _gcry_mpi_ec_mul_point (&I, r, ec->G, ec); if (DBG_CIPHER) @@ -855,80 +812,48 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec, if (DBG_CIPHER) log_printhex (" e_r", rawmpi, rawmpilen); - if (ctx->hash_algo == GCRY_MD_SHAKE256) + memset (hvec, 0, sizeof hvec); + i = 0; + + if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen || ec->nbits == 448) { - gcry_error_t err; - gcry_md_hd_t hd; + hvec[i].data = (void *)dom; + hvec[i].len = domlen; + i++; + x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); + x_olen[1] = ctx->labellen; + hvec[i].data = x_olen; + hvec[i].len = 2; + i++; + if (ctx->labellen) + { + hvec[i].data = ctx->label; + hvec[i].len = ctx->labellen; + i++; + } + } - err = _gcry_md_open (&hd, ctx->hash_algo, 0); - if (err) - rc = gcry_err_code (err); - else - { - _gcry_md_write (hd, DOM448, DOM448_LEN); - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - _gcry_md_write (hd, x_olen, 2); - if (ctx->labellen) - _gcry_md_write (hd, ctx->label, ctx->labellen); - _gcry_md_write (hd, rawmpi, rawmpilen); - _gcry_md_write (hd, encpk, encpklen); - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - _gcry_md_write (hd, prehashed_msg, 64); - else - _gcry_md_write (hd, mbuf, mlen); - _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b); - _gcry_md_close (hd); - rc = 0; - } + /* S = r + a * H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) mod n */ + hvec[i].data = rawmpi; /* (this is R) */ + hvec[i].len = rawmpilen; + i++; + hvec[i].data = encpk; + hvec[i].len = encpklen; + i++; + if ((ctx->flags & PUBKEY_FLAG_PREHASH)) + { + hvec[i].data = (char*)prehashed_msg; + hvec[i].len = 64; } else { - gcry_buffer_t hvec[6]; - int i = 0; - - memset (hvec, 0, sizeof hvec); - - if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen) - { - hvec[i].data = (void *)DOM25519; - hvec[i].len = DOM25519_LEN; - i++; - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - hvec[i].data = x_olen; - hvec[i].len = 2; - i++; - if (ctx->labellen) - { - hvec[i].data = ctx->label; - hvec[i].len = ctx->labellen; - i++; - } - } - - /* S = r + a * H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) mod n */ - hvec[i].data = rawmpi; /* (this is R) */ - hvec[i].len = rawmpilen; - i++; - hvec[i].data = encpk; - hvec[i].len = encpklen; - i++; - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - hvec[i].data = (char*)prehashed_msg; - hvec[i].len = 64; - } - else - { - hvec[i].data = (char*)mbuf; - hvec[i].len = mlen; - } - i++; - rc = _gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i); + hvec[i].data = (char*)mbuf; + hvec[i].len = mlen; } + i++; + rc = _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, digest, digestlen, + hvec, i); if (rc) goto leave; @@ -936,10 +861,10 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec, mpi_set_opaque (r_r, rawmpi, rawmpilen*8); rawmpi = NULL; - reverse_buffer (digest, 2*b); + reverse_buffer (digest, digestlen); if (DBG_CIPHER) - log_printhex (" H(R+)", digest, 2*b); - _gcry_mpi_set_buffer (s, digest, 2*b, 0); + log_printhex (" H(R+)", digest, digestlen); + _gcry_mpi_set_buffer (s, digest, digestlen, 0); mpi_mulm (s, s, a, ec->n); mpi_addm (s, s, r, ec->n); rc = eddsa_encodempi (s, ec->nbits, &rawmpi, &rawmpilen); @@ -985,8 +910,13 @@ _gcry_ecc_eddsa_verify (gcry_mpi_t input, mpi_ec_t ec, unsigned char digest[114]; gcry_mpi_t h, s; mpi_point_struct Ia, Ib; + 
const char *dom; + int domlen, digestlen; + int i; unsigned char x_olen[2]; unsigned char prehashed_msg[64]; + gcry_buffer_t hvec[6]; + gcry_buffer_t hvec2[1]; if (!mpi_is_opaque (input) || !mpi_is_opaque (r_in) || !mpi_is_opaque (s_in)) return GPG_ERR_INV_DATA; @@ -999,9 +929,18 @@ _gcry_ecc_eddsa_verify (gcry_mpi_t input, mpi_ec_t ec, b = (ec->nbits+7)/8; if (ec->nbits == 255) - ; + { + dom = DOM25519; + domlen = DOM25519_LEN; + digestlen = 64; + } else if (ec->nbits == 448) - b++; + { + b++; + dom = DOM448; + domlen = DOM448_LEN; + digestlen = 2 * b; + } else return GPG_ERR_NOT_IMPLEMENTED; @@ -1038,102 +977,61 @@ _gcry_ecc_eddsa_verify (gcry_mpi_t input, mpi_ec_t ec, goto leave; } - if (ctx->hash_algo == GCRY_MD_SHAKE256) + memset (hvec, 0, sizeof hvec); + i = 0; + + /* h = H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) */ + if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen || ec->nbits == 448) { - gcry_error_t err; - gcry_md_hd_t hd; + hvec[i].data = (void *)dom; + hvec[i].len = domlen; + i++; + x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); + x_olen[1] = ctx->labellen; + hvec[i].data = x_olen; + hvec[i].len = 2; + i++; + if (ctx->labellen) + { + hvec[i].data = ctx->label; + hvec[i].len = ctx->labellen; + i++; + } + } - err = _gcry_md_open (&hd, ctx->hash_algo, 0); - if (err) - rc = gcry_err_code (err); - else - { - _gcry_md_write (hd, DOM448, DOM448_LEN); - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - _gcry_md_write (hd, x_olen, 2); - if (ctx->labellen) - _gcry_md_write (hd, ctx->label, ctx->labellen); - _gcry_md_write (hd, rbuf, rlen); - _gcry_md_write (hd, encpk, encpklen); - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - gcry_md_hd_t hd2; + hvec[i].data = (char*)rbuf; + hvec[i].len = rlen; + i++; + hvec[i].data = encpk; + hvec[i].len = encpklen; + i++; + if ((ctx->flags & PUBKEY_FLAG_PREHASH)) + { + memset (hvec2, 0, sizeof hvec2); - err = _gcry_md_open (&hd2, ctx->hash_algo, 0); - if (err) - { - rc = gcry_err_code (err); - _gcry_md_close (hd); - goto leave; - } - _gcry_md_write (hd2, mbuf, mlen); - _gcry_md_ctl (hd2, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd2, GCRY_MD_SHAKE256, prehashed_msg, 64); - _gcry_md_close (hd2); - _gcry_md_write (hd, prehashed_msg, 64); - } - else - _gcry_md_write (hd, mbuf, mlen); - _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0); - _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b); - _gcry_md_close (hd); - rc = 0; - } + hvec2[0].data = (char*)mbuf; + hvec2[0].len = mlen; + + _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, prehashed_msg, 64, + hvec2, 1); + hvec[i].data = (char*)prehashed_msg; + hvec[i].len = 64; } else { - gcry_buffer_t hvec[6]; - int i = 0; - - memset (hvec, 0, sizeof hvec); - - /* h = H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) */ - if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen) - { - hvec[i].data = (void *)DOM25519; - hvec[i].len = DOM25519_LEN; - i++; - x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH); - x_olen[1] = ctx->labellen; - hvec[i].data = x_olen; - hvec[i].len = 2; - i++; - if (ctx->labellen) - { - hvec[i].data = ctx->label; - hvec[i].len = ctx->labellen; - i++; - } - } - - hvec[i].data = (char*)rbuf; - hvec[i].len = rlen; - i++; - hvec[i].data = encpk; - hvec[i].len = encpklen; - i++; - if ((ctx->flags & PUBKEY_FLAG_PREHASH)) - { - _gcry_md_hash_buffer (ctx->hash_algo, prehashed_msg, mbuf, mlen); - hvec[i].data = (char*)prehashed_msg; - hvec[i].len = 64; - } - else - { - hvec[i].data = (char*)mbuf; - hvec[i].len = mlen; - } - i++; - rc = 
_gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i); + hvec[i].data = (char*)mbuf; + hvec[i].len = mlen; } + i++; + rc = _gcry_md_hash_buffers_extract (ctx->hash_algo, 0, digest, digestlen, + hvec, i); if (rc) goto leave; - reverse_buffer (digest, 2*b); + reverse_buffer (digest, digestlen); if (DBG_CIPHER) - log_printhex (" H(R+)", digest, 2*b); - _gcry_mpi_set_buffer (h, digest, 2*b, 0); + log_printhex (" H(R+)", digest, digestlen); + _gcry_mpi_set_buffer (h, digest, digestlen, 0); /* According to the paper the best way for verification is: encodepoint(sG - h?Q) = encodepoint(r) diff --git a/cipher/md.c b/cipher/md.c index efb7376a..87979059 100644 --- a/cipher/md.c +++ b/cipher/md.c @@ -1251,11 +1251,15 @@ _gcry_md_hash_buffer (int algo, void *digest, used as the key. On success 0 is returned and resulting hash or HMAC is stored at - DIGEST which must have been provided by the caller with an - appropriate length. */ + DIGEST. DIGESTLEN may be given as -1, in which case DIGEST must + have been provided by the caller with an appropriate length. + DIGESTLEN may also be the appropriate length or, in case of XOF + algorithms, DIGESTLEN indicates number bytes to extract from XOF + to DIGEST. */ gpg_err_code_t -_gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, - const gcry_buffer_t *iov, int iovcnt) +_gcry_md_hash_buffers_extract (int algo, unsigned int flags, void *digest, + int digestlen, const gcry_buffer_t *iov, + int iovcnt) { gcry_md_spec_t *spec; int hmac; @@ -1287,6 +1291,11 @@ _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, } } + if (spec->mdlen > 0 && digestlen != -1 && digestlen != spec->mdlen) + return GPG_ERR_DIGEST_ALGO; + if (spec->mdlen == 0 && digestlen == -1) + return GPG_ERR_DIGEST_ALGO; + if (!hmac && spec->hash_buffers) { spec->hash_buffers (digest, iov, iovcnt); @@ -1297,13 +1306,6 @@ _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, normal functions. */ gcry_md_hd_t h; gpg_err_code_t rc; - int dlen; - - /* Detect SHAKE128 like algorithms which we can't use because - * our API does not allow for a variable length digest. */ - dlen = md_digest_length (algo); - if (!dlen) - return GPG_ERR_DIGEST_ALGO; rc = md_open (&h, algo, (hmac? GCRY_MD_FLAG_HMAC:0)); if (rc) @@ -1324,7 +1326,10 @@ _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, for (;iovcnt; iov++, iovcnt--) md_write (h, (const char*)iov[0].data + iov[0].off, iov[0].len); md_final (h); - memcpy (digest, md_read (h, algo), dlen); + if (spec->mdlen > 0) + memcpy (digest, md_read (h, algo), spec->mdlen); + else if (digestlen > 0) + md_extract (h, algo, digest, digestlen); md_close (h); } @@ -1332,6 +1337,28 @@ _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, } +/* Shortcut function to hash multiple buffers with a given algo. In + contrast to gcry_md_hash_buffer, this function returns an error on + invalid arguments or on other problems; disabled algorithms are + _not_ ignored but flagged as an error. + + The data to sign is taken from the array IOV which has IOVCNT items. + + The only supported flag in FLAGS is GCRY_MD_FLAG_HMAC which turns + this function into a HMAC function; the first item in IOV is then + used as the key. + + On success 0 is returned and resulting hash or HMAC is stored at + DIGEST which must have been provided by the caller with an + appropriate length. 
*/ +gpg_err_code_t +_gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, + const gcry_buffer_t *iov, int iovcnt) +{ + return _gcry_md_hash_buffers_extract(algo, flags, digest, -1, iov, iovcnt); +} + + static int md_get_algo (gcry_md_hd_t a) { diff --git a/src/gcrypt-int.h b/src/gcrypt-int.h index 086953d7..d8b6d407 100644 --- a/src/gcrypt-int.h +++ b/src/gcrypt-int.h @@ -128,6 +128,10 @@ gpg_err_code_t _gcry_md_extract (gcry_md_hd_t hd, int algo, void *buffer, size_t length); void _gcry_md_hash_buffer (int algo, void *digest, const void *buffer, size_t length); +gpg_err_code_t _gcry_md_hash_buffers_extract (int algo, unsigned int flags, + void *digest, int digestlen, + const gcry_buffer_t *iov, + int iovcnt); gpg_err_code_t _gcry_md_hash_buffers (int algo, unsigned int flags, void *digest, const gcry_buffer_t *iov, int iovcnt); -- 2.27.0 From jcb62281 at gmail.com Sun Mar 28 04:05:20 2021 From: jcb62281 at gmail.com (Jacob Bachmeyer) Date: Sat, 27 Mar 2021 21:05:20 -0500 Subject: [PATCH] Use memset for short constant length wipememory2 In-Reply-To: <20210327143105.91148-1-jussi.kivilinna@iki.fi> References: <20210327143105.91148-1-jussi.kivilinna@iki.fi> Message-ID: <605FE460.60209@gmail.com> Jussi Kivilinna wrote: > Use of memset allows better code generation by compiler - > for example, use of vector registers for memory clearing. > Are you sure that this is a good idea? The comment indicates that the original purpose of not using memset was to ensure that a compiler cannot optimize the call away. -- Jacob From jussi.kivilinna at iki.fi Sun Mar 28 15:09:08 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 28 Mar 2021 16:09:08 +0300 Subject: [PATCH] Use memset for short constant length wipememory2 In-Reply-To: <605FE460.60209@gmail.com> References: <20210327143105.91148-1-jussi.kivilinna@iki.fi> <605FE460.60209@gmail.com> Message-ID: On 28.3.2021 5.05, Jacob Bachmeyer via Gcrypt-devel wrote: > Jussi Kivilinna wrote: >> Use of memset allows better code generation by compiler - >> for example, use of vector registers for memory clearing. > > > Are you sure that this is a good idea?? The comment indicates that the original purpose of not using memset was to ensure that a compiler cannot optimize the call away. > New approach uses inline assembly memory barrier to prevent optimizing away preceding memset: memset(ptr_memory_to_wipe, 0, memory_length); asm volatile ("\n"::"r"(ptr_memory_to_wipe):"memory"); I'll update the comment to better explain this. -Jussi From jussi.kivilinna at iki.fi Tue Mar 30 17:56:24 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 30 Mar 2021 18:56:24 +0300 Subject: [PATCH v2] Use memset for constant length wipememory2 Message-ID: <20210330155624.852568-1-jussi.kivilinna@iki.fi> * src/g10lib.h (fast_wipememory2_inline): New. (wipememory2): Use 'fast_wipememory2_inline', remove 'fast_wipememory2' use; Use _gcry_fast_wipememory* when _len or _set is not constant. (fast_wipememory_s, fast_wipememory2): Remove. -- Use of memset allows better code generation by compiler - for example, use of vector registers for memory clearing. 
Dead store elimination of memset by compiler optimization is avoided by using assembly block after memset: memset(ptr_mem_wipe, 0, constant_mem_len); asm volatile ("\n" :: "r" (ptr_mem_wipe) : "memory"); Signed-off-by: Jussi Kivilinna --- src/g10lib.h | 62 +++++++++++++++++----------------------------------- 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/src/g10lib.h b/src/g10lib.h index b0b73852..fb288a30 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -358,57 +358,35 @@ void __gcry_burn_stack (unsigned int bytes); do { __gcry_burn_stack (bytes); \ __gcry_burn_stack_dummy (); } while(0) -/* To avoid that a compiler optimizes certain memset calls away, these - macros may be used instead. For small constant length buffers, - memory wiping is inlined. For non-constant or large length buffers, - memory is wiped with memset through _gcry_fast_wipememory. */ +/* To avoid that a compiler optimizes certain memset calls away, this + macro may be used instead. For constant length buffers, memory + wiping is inlined. Dead store elimination of inlined memset is + avoided here by using assembly block after memset. For non-constant + length buffers, memory is wiped through _gcry_fast_wipememory. */ +#ifdef HAVE_GCC_ASM_VOLATILE_MEMORY +#define fast_wipememory2_inline(_ptr,_set,_len) do { \ + memset((_ptr), (_set), (_len)); \ + asm volatile ("\n" :: "r" (_ptr) : "memory"); \ + } while(0) +#else +#define fast_wipememory2_inline(_ptr,_set,_len) \ + _gcry_fast_wipememory2((void *)_ptr, _set, _len) +#endif #define wipememory2(_ptr,_set,_len) do { \ - if (!CONSTANT_P(_len) || _len > 64) { \ + if (!CONSTANT_P(_len) || !CONSTANT_P(_set)) { \ if (CONSTANT_P(_set) && (_set) == 0) \ - _gcry_fast_wipememory((void *)_ptr, _len); \ + _gcry_fast_wipememory((void *)(_ptr), (_len)); \ else \ - _gcry_fast_wipememory2((void *)_ptr, _set, _len); \ - } else {\ - volatile char *_vptr = (volatile char *)(_ptr); \ - size_t _vlen = (_len); \ - const unsigned char _vset = (_set); \ - fast_wipememory2(_vptr, _vset, _vlen); \ - while(_vlen) { *_vptr = (_vset); _vptr++; _vlen--; } \ + _gcry_fast_wipememory2((void *)(_ptr), (_set), (_len)); \ + } else { \ + fast_wipememory2_inline((void *)(_ptr), (_set), (_len)); \ } \ } while(0) -#define wipememory(_ptr,_len) wipememory2(_ptr,0,_len) +#define wipememory(_ptr,_len) wipememory2((_ptr),0,(_len)) void _gcry_fast_wipememory(void *ptr, size_t len); void _gcry_fast_wipememory2(void *ptr, int set, size_t len); -#if defined(HAVE_GCC_ATTRIBUTE_PACKED) && \ - defined(HAVE_GCC_ATTRIBUTE_ALIGNED) && \ - defined(HAVE_GCC_ATTRIBUTE_MAY_ALIAS) -typedef struct fast_wipememory_s -{ - u64 a; -} __attribute__((packed, aligned(1), may_alias)) fast_wipememory_t; -/* fast_wipememory may leave tail bytes unhandled, in which case tail bytes - are handled by wipememory. */ -# define fast_wipememory2(_vptr,_vset,_vlen) do { \ - fast_wipememory_t _vset_long; \ - if (_vlen < sizeof(fast_wipememory_t)) \ - break; \ - _vset_long.a = (_vset); \ - _vset_long.a *= U64_C(0x0101010101010101); \ - do { \ - volatile fast_wipememory_t *_vptr_long = \ - (volatile void *)_vptr; \ - _vptr_long->a = _vset_long.a; \ - _vlen -= sizeof(fast_wipememory_t); \ - _vptr += sizeof(fast_wipememory_t); \ - } while (_vlen >= sizeof(fast_wipememory_t)); \ - } while (0) -#else -# define fast_wipememory2(_vptr,_vset,_vlen) -#endif - - /* Digit predicates. 
*/ #define digitp(p) (*(p) >= '0' && *(p) <= '9') -- 2.27.0 From guidovranken at gmail.com Wed Mar 31 20:42:23 2021 From: guidovranken at gmail.com (Guido Vranken) Date: Wed, 31 Mar 2021 20:42:23 +0200 Subject: CMAC + SERPENT/IDEA/RC2 buffer overflow/crash with oversized key Message-ID: In the program below, each of the three calls to cmac() causes a different crash (use AddressSanitizer to be sure). I think the correct approach is to make gcry_mac_setkey() return an error code if the key has an inappropriate size.

#include <string.h>
#include <gcrypt.h>

#define CF_CHECK_EQ(expr, res) if ( (expr) != (res) ) { goto end; }

static void cmac(const int mac, const int keysize) {
    unsigned char key[keysize];
    memset(key, 0, keysize);

    gcry_mac_hd_t h;
    CF_CHECK_EQ(gcry_mac_open(&h, mac, 0, NULL), GPG_ERR_NO_ERROR);
    CF_CHECK_EQ(gcry_mac_setkey(h, key, keysize), GPG_ERR_NO_ERROR);

end:
    /* noret */
    gcry_mac_close(h);
}

int main(void) {
    cmac(GCRY_MAC_CMAC_SERPENT, 64);
    cmac(GCRY_MAC_CMAC_IDEA, 32);
    cmac(GCRY_MAC_CMAC_RFC2268, 256);
    return 0;
}
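Until gcry_mac_setkey() itself rejects bad key sizes, a caller-side guard is possible. The sketch below is illustrative only, not the in-library fix the report asks for; mac_setkey_checked is a made-up helper. It conservatively accepts only the algorithm's default key length as reported by gcry_mac_get_algo_keylen() and fails with GPG_ERR_INV_KEYLEN otherwise.

#include <stdio.h>
#include <string.h>
#include <gcrypt.h>

/* Illustrative caller-side guard: refuse key sizes other than the
 * algorithm's default key length before calling gcry_mac_setkey(). */
static gcry_error_t
mac_setkey_checked (gcry_mac_hd_t h, int algo, const void *key, size_t keylen)
{
  size_t expected = gcry_mac_get_algo_keylen (algo);

  if (expected == 0 || keylen != expected)
    return gcry_error (GPG_ERR_INV_KEYLEN);

  return gcry_mac_setkey (h, key, keylen);
}

int
main (void)
{
  unsigned char key[64] = { 0 };
  gcry_mac_hd_t h;

  gcry_check_version (NULL);

  if (gcry_mac_open (&h, GCRY_MAC_CMAC_SERPENT, 0, NULL))
    return 1;

  /* 64 bytes is larger than any Serpent key, so this fails cleanly
   * instead of overrunning the key schedule. */
  if (mac_setkey_checked (h, GCRY_MAC_CMAC_SERPENT, key, sizeof key))
    fprintf (stderr, "oversized key rejected\n");

  gcry_mac_close (h);
  return 0;
}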