[PATCH 2/7] sha512: add Intel SHA512 extension implementation

Sun Jun 28 14:37:36 CEST 2026

* cipher/Makefile.am: Add 'sha512-intel-shaext.c'; Add instrumentation
option munging for 'sha512-intel-shaext.o' and 'sha512-intel-shaext.lo'.
* cipher/sha512-intel-shaext.c: New.
* cipher/sha512.c (USE_SHA512_INTEL): New.
[USE_SHA512_INTEL] (_gcry_sha512_transform_intel_shaext)
(do_sha512_transform_intel_shaext): New.
(sha512_init_common) [USE_SHA512_INTEL]: Use Intel SHA512 accelerated
implementation if HW feature available.
* configure.ac (gcry_cv_gcc_inline_asm_sha512)
(HAVE_GCC_INLINE_ASM_SHA512): New.
(GCRYPT_ASM_DIGESTS) [x86]: Add 'sha512-intel-shaext.lo'.
* doc/gcrypt.texi: Add "intel-sha512" to HW features list.
* src/g10lib.h (HWF_INTEL_SHA512): New.
* src/hwf-x86.c (get_cpuid_subleaf): New, based on 'get_cpuid'.
(get_cpuid): Implement as wrapper for 'get_cpuid_subleaf'.
(detect_x86_gnuc): Add Intel SHA512 detection.
* src/hwfeatures.c (hwflist): Add "intel-sha512".
--

Implementation has been tested with Intel SDE (both i386 and x86-64),
not yet on real hardware.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am           |   7 +
 cipher/sha512-intel-shaext.c | 298 +++++++++++++++++++++++++++++++++++
 cipher/sha512.c              |  29 ++++
 configure.ac                 |  35 ++++
 doc/gcrypt.texi              |   1 +
 src/g10lib.h                 |   1 +
 src/hwf-x86.c                | 117 ++++++++------
 src/hwfeatures.c             |   1 +
 8 files changed, 445 insertions(+), 44 deletions(-)
 create mode 100644 cipher/sha512-intel-shaext.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 11bb19d7..0abbfea6 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -149,6 +149,7 @@ EXTRA_libcipher_la_SOURCES = \
 	sha256-intel-shaext.c sha256-ppc.c sha256-riscv-zvknha-zvkb.c \
 	sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
 	sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
+	sha512-intel-shaext.c \
 	sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \
 	sha512-ppc.c sha512-riscv-zvknhb-zvkb.c sha512-ssse3-i386.c \
 	sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
@@ -242,6 +243,12 @@ sha256-ssse3-i386.o: $(srcdir)/sha256-ssse3-i386.c Makefile
 sha256-ssse3-i386.lo: $(srcdir)/sha256-ssse3-i386.c Makefile
 	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
 
+sha512-intel-shaext.o: $(srcdir)/sha512-intel-shaext.c Makefile
+	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha512-intel-shaext.lo: $(srcdir)/sha512-intel-shaext.c Makefile
+	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
 crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
 	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
 
diff --git a/cipher/sha512-intel-shaext.c b/cipher/sha512-intel-shaext.c
new file mode 100644
index 00000000..d991abad
--- /dev/null
+++ b/cipher/sha512-intel-shaext.c
@@ -0,0 +1,298 @@
+/* sha512-intel-shaext.c - SHA512 accelerated with Intel SHA512 extension.
+ * Copyright (C) 2026 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+
+#if defined(HAVE_GCC_INLINE_ASM_SHA512) && \
+    defined(USE_SHA512) && defined(ENABLE_SHAEXT_SUPPORT)
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+#  pragma GCC target("no-sse")
+#endif
+#if __clang__
+#  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
+
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+
+/* Two macros to be called prior and after the use of SHA512
+  instructions.  There should be no external function calls between
+  the use of these macros.  Their purpose is to make sure that the
+  SSE/AVX registers are cleared and won't reveal any information about
+  the key or the data.  */
+#ifdef __WIN64__
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define shaext_prepare_variable char win64tmp[2*16]
+# define shaext_prepare_variable_size sizeof(win64tmp)
+# define shaext_prepare()                                               \
+   do { asm volatile ("movdqu %%xmm6, 0*16(%0)\n"                       \
+		      "movdqu %%xmm7, 1*16(%0)\n"                       \
+		      :                                                 \
+		      : "r" (&win64tmp[0])                              \
+		      : "memory");                                      \
+  } while (0)
+# define shaext_cleanup(tmp0,tmp1) /* no vzeroall: XMM8-15 callee-saved */ \
+   do { asm volatile ("vpxor %%ymm0, %%ymm0, %%ymm0\n\t"                \
+		      "vmovdqu %%ymm0, (%0)\n\t" /* wipe 32 bytes at tmp0 */ \
+		      "vmovdqu %%ymm0, (%1)\n\t" /* wipe 32 bytes at tmp1 */ \
+		      "vmovdqa %%ymm0, %%ymm1\n\tvmovdqa %%ymm0, %%ymm2\n\t" \
+		      "vmovdqa %%ymm0, %%ymm3\n\tvmovdqa %%ymm0, %%ymm4\n\t" \
+		      "vmovdqa %%ymm0, %%ymm5\n\t"                      \
+		      "vmovdqu 0*16(%2), %%xmm6\n\tvmovdqu 1*16(%2), %%xmm7\n\t" \
+		      "vzeroupper\n\t" /* upper YMM halves are volatile */ \
+		      : : "r" (tmp0), "r" (tmp1), "r" (&win64tmp[0]) : "memory"); \
+  } while (0)
+#else
+# define shaext_prepare_variable
+# define shaext_prepare_variable_size 0
+# define shaext_prepare() do { } while (0)
+# define shaext_cleanup(tmp0,tmp1)                                      \
+   do { asm volatile ("vpxor %%ymm0, %%ymm0, %%ymm0\n\t"                \
+		      "vmovdqu %%ymm0, (%0)\n\t"                        \
+		      "vmovdqu %%ymm0, (%1)\n\t"                        \
+		      "vzeroall\n\t"                                    \
+		      :                                                 \
+		      : "r" (tmp0), "r" (tmp1)                          \
+		      : "memory");                                      \
+  } while (0)
+#endif
+
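+/* Transform nblks*128 bytes (nblks*16 64-bit words) at DATA, updating
+ * STATE in place using the 80 round constants in K.  Always returns 0;
+ * STATE is left untouched when NBLKS is zero. */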
+unsigned int ASM_FUNC_ATTR
+_gcry_sha512_transform_intel_shaext(u64 state[8], const unsigned char *data,
+				    size_t nblks, const u64 k[80])
+{
+  static const unsigned char bshuf_mask[16] __attribute__ ((aligned (16))) =
+    { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  char save_buf[2 * 32 + 31];
+  char *abef_save;
+  char *cdgh_save;
+  shaext_prepare_variable;
+
+  if (nblks == 0)
+    return 0;
+
+  shaext_prepare ();
+
+  asm volatile ("" : "=r" (abef_save) : "0" (save_buf) : "memory");
+  abef_save = abef_save + (-(uintptr_t)abef_save & 31);
+  cdgh_save = abef_save + 32;
+
+  /* Load state.  State is {a,b,c,d} and {e,f,g,h} in memory, repack to
+     ABEF (YMM elem order {f,e,b,a}) in YMM1 and CDGH ({h,g,d,c}) in YMM2. */
+  asm volatile ("vpshufd $0x4e, 0*32(%[state]), %%ymm3\n\t" /* {b,a,d,c} */
+		"vpshufd $0x4e, 1*32(%[state]), %%ymm4\n\t" /* {f,e,h,g} */
+		"vperm2i128 $0x02, %%ymm4, %%ymm3, %%ymm1\n\t" /* {f,e,b,a} */
+		"vperm2i128 $0x13, %%ymm4, %%ymm3, %%ymm2\n\t" /* {h,g,d,c} */
+		:
+		: [state] "r" (state)
+		: "memory" );
+
+  /* Load message */
+  asm volatile ("vbroadcasti128 %[mask], %%ymm7\n\t"
+		"vmovdqu 0*32(%[data]), %%ymm3\n\t"
+		"vmovdqu 1*32(%[data]), %%ymm4\n\t"
+		"vmovdqu 2*32(%[data]), %%ymm5\n\t"
+		"vmovdqu 3*32(%[data]), %%ymm6\n\t"
+		"vpshufb %%ymm7, %%ymm3, %%ymm3\n\t"
+		"vpshufb %%ymm7, %%ymm4, %%ymm4\n\t"
+		"vpshufb %%ymm7, %%ymm5, %%ymm5\n\t"
+		"vpshufb %%ymm7, %%ymm6, %%ymm6\n\t"
+		:
+		: [data] "r" (data), [mask] "m" (*bshuf_mask)
+		: "memory" );
+  data += 128;
+
+  do
+    {
+      /* Save state */
+      asm volatile ("vmovdqa %%ymm1, (%[abef_save])\n\t"
+		    "vmovdqa %%ymm2, (%[cdgh_save])\n\t"
+		    :
+		    : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+		    : "memory" );
+
+
+      /* Rounds 0..3 */
+      asm volatile ("vpaddq 32*0(%[k]), %%ymm3, %%ymm0\n\t"
+		    "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+		    "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+		    "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+		    :
+		    : [k] "r" (k)
+		    : "memory" );
+
+      /* Rounds 4..7 */
+      asm volatile ("vpaddq 32*1(%[k]), %%ymm4, %%ymm0\n\t"
+		    "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+		    "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+		    "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+		    "vsha512msg1 %%xmm4, %%ymm3\n\t"
+		    :
+		    : [k] "r" (k)
+		    : "memory" );
+
+      /* Rounds 8..11 */
+      asm volatile ("vpaddq 32*2(%[k]), %%ymm5, %%ymm0\n\t"
+		    "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+		    "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+		    "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+		    "vsha512msg1 %%xmm5, %%ymm4\n\t"
+		    :
+		    : [k] "r" (k)
+		    : "memory" );
+
+#define ROUND(gr, MSG0, MSG1, MSG2, MSG3) \
+      asm volatile ("vpaddq 32*" #gr "(%[k]), %%ymm" #MSG0 ", %%ymm0\n\t" \
+		    "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t" \
+		    "vperm2i128 $0x21, %%ymm" #MSG0 ", %%ymm" #MSG3 ", %%ymm7\n\t" \
+		    "vpalignr $8, %%ymm" #MSG3 ", %%ymm7, %%ymm7\n\t" \
+		    "vpaddq %%ymm7, %%ymm" #MSG1 ", %%ymm" #MSG1 "\n\t" \
+		    "vsha512msg2 %%ymm" #MSG0 ", %%ymm" #MSG1 "\n\t" \
+		    "vextracti128 $1, %%ymm0, %%xmm0\n\t" \
+		    "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t" \
+		    "vsha512msg1 %%xmm" #MSG0 ", %%ymm" #MSG3 "\n\t" \
+		    : \
+		    : [k] "r" (k) \
+		    : "memory" )
+
+      /* Rounds 12..15 to 64..67 (message schedule for W[16..79]). */
+      ROUND(3, 6, 3, 4, 5);
+      ROUND(4, 3, 4, 5, 6);
+      ROUND(5, 4, 5, 6, 3);
+      ROUND(6, 5, 6, 3, 4);
+      ROUND(7, 6, 3, 4, 5);
+      ROUND(8, 3, 4, 5, 6);
+      ROUND(9, 4, 5, 6, 3);
+      ROUND(10, 5, 6, 3, 4);
+      ROUND(11, 6, 3, 4, 5);
+      ROUND(12, 3, 4, 5, 6);
+      ROUND(13, 4, 5, 6, 3);
+      ROUND(14, 5, 6, 3, 4);
+      ROUND(15, 6, 3, 4, 5);
+      ROUND(16, 3, 4, 5, 6);
+
+      if (--nblks == 0)
+	break;
+
+/* Final two message groups: finalize MSG1 but no further message schedule. */
+#define ROUND_FINAL(gr, MSG0, MSG1, MSG3) \
+      asm volatile ("vpaddq 32*" #gr "(%[k]), %%ymm" #MSG0 ", %%ymm0\n\t" \
+		    "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t" \
+		    "vperm2i128 $0x21, %%ymm" #MSG0 ", %%ymm" #MSG3 ", %%ymm7\n\t" \
+		    "vpalignr $8, %%ymm" #MSG3 ", %%ymm7, %%ymm7\n\t" \
+		    "vpaddq %%ymm7, %%ymm" #MSG1 ", %%ymm" #MSG1 "\n\t" \
+		    "vsha512msg2 %%ymm" #MSG0 ", %%ymm" #MSG1 "\n\t" \
+		    "vextracti128 $1, %%ymm0, %%xmm0\n\t" \
+		    "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t" \
+		    : \
+		    : [k] "r" (k) \
+		    : "memory" )
+
+      /* Rounds 68..71 */
+      ROUND_FINAL(17, 4, 5, 3);
+
+      asm volatile ("vmovdqu 0*32(%[data]), %%ymm3\n\t"
+		    :
+		    : [data] "r" (data)
+		    : "memory" );
+
+      /* Rounds 72..75 */
+      ROUND_FINAL(18, 5, 6, 4);
+
+      asm volatile ("vbroadcasti128 %[mask], %%ymm7\n\t" /* Reload mask */
+		    "vmovdqu 1*32(%[data]), %%ymm4\n\t"
+		    "vpshufb %%ymm7, %%ymm3, %%ymm3\n\t"
+		    :
+		    : [data] "r" (data), [mask] "m" (*bshuf_mask)
+		    : "memory" );
+
+      /* Rounds 76..79 */
+      asm volatile ("vpaddq 32*19(%[k]), %%ymm6, %%ymm0\n\t"
+		      "vmovdqu 2*32(%[data]), %%ymm5\n\t"
+		    "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+                      "vmovdqu 3*32(%[data]), %%ymm6\n\t"
+		      "vpshufb %%ymm7, %%ymm4, %%ymm4\n\t"
+		    "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+		      "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t"
+		    "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+		      "vpshufb %%ymm7, %%ymm6, %%ymm6\n\t"
+		    :
+		    : [k] "r" (k), [data] "r" (data)
+		    : "memory" );
+
+      data += 128;
+
+      /* Merge states */
+      asm volatile ("vpaddq (%[abef_save]), %%ymm1, %%ymm1\n\t"
+		    "vpaddq (%[cdgh_save]), %%ymm2, %%ymm2\n\t"
+		    :
+		    : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+		    : "memory" );
+    }
+  while (1);
+
+  /* Rounds 68..71 */
+  ROUND_FINAL(17, 4, 5, 3);
+  /* Rounds 72..75 */
+  ROUND_FINAL(18, 5, 6, 4);
+
+  /* Rounds 76..79 */
+  asm volatile ("vpaddq 32*19(%[k]), %%ymm6, %%ymm0\n\t"
+		"vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+		"vextracti128 $1, %%ymm0, %%xmm0\n\t"
+		"vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+		:
+		: [k] "r" (k)
+		: "memory" );
+
+  /* Merge states */
+  asm volatile ("vpaddq (%[abef_save]), %%ymm1, %%ymm1\n\t"
+		"vpaddq (%[cdgh_save]), %%ymm2, %%ymm2\n\t"
+		:
+		: [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+		: "memory" );
+
+  /* Store state.  ABEF=YMM1 ({f,e,b,a}), CDGH=YMM2 ({h,g,d,c}). */
+  asm volatile ("vperm2i128 $0x31, %%ymm2, %%ymm1, %%ymm3\n\t" /* {b,a,d,c} */
+		"vperm2i128 $0x20, %%ymm2, %%ymm1, %%ymm4\n\t" /* {f,e,h,g} */
+		"vpshufd $0x4e, %%ymm3, %%ymm3\n\t"            /* {a,b,c,d} */
+		"vpshufd $0x4e, %%ymm4, %%ymm4\n\t"            /* {e,f,g,h} */
+		"vmovdqu %%ymm3, 0*32(%[state])\n\t"
+		"vmovdqu %%ymm4, 1*32(%[state])\n\t"
+		:
+		: [state] "r" (state)
+		: "memory" );
+
+  shaext_cleanup (abef_save, cdgh_save);
+  return 0;
+}
+
+#if __clang__
+#  pragma clang attribute pop
+#endif
+
+#endif /* HAVE_GCC_INLINE_ASM_SHA512 */
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 51bf6641..024a45ea 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -133,6 +133,15 @@
 #endif
 
 
+/* USE_SHA512_INTEL indicates whether to compile with Intel SHA512 code. */
+#undef USE_SHA512_INTEL
+#if defined(HAVE_GCC_INLINE_ASM_SHA512) && \
+    defined(USE_SHA512) && \
+    defined(ENABLE_SHAEXT_SUPPORT)
+# define USE_SHA512_INTEL 1
+#endif
+
+
 /* USE_SSSE3_I386 indicates whether to compile with Intel SSSE3/i386 code. */
 #undef USE_SSSE3_I386
 #if defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4 && \
@@ -345,6 +354,22 @@ do_sha512_transform_amd64_avx512(void *ctx, const unsigned char *data,
 }
 #endif
 
+#ifdef USE_SHA512_INTEL
+/* Does not need ASM_FUNC_ABI */
+unsigned int _gcry_sha512_transform_intel_shaext(u64 state[8],
+                                                 const unsigned char *input_data,
+                                                 size_t num_blks,
+                                                 const u64 k[]);
+
+static unsigned int
+do_sha512_transform_intel_shaext(void *ctx, const unsigned char *data,
+                                 size_t nblks)
+{
+  SHA512_CONTEXT *hd = ctx;
+  return _gcry_sha512_transform_intel_shaext (hd->state.h, data, nblks, k);
+}
+#endif
+
 #ifdef USE_SSSE3_I386
 unsigned int _gcry_sha512_transform_i386_ssse3(u64 state[8],
 					       const unsigned char *input_data,
@@ -508,6 +533,10 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags)
   if ((features & HWF_INTEL_SSSE3) != 0)
     ctx->bctx.bwrite = do_sha512_transform_i386_ssse3;
 #endif
+#ifdef USE_SHA512_INTEL
+  if ((features & HWF_INTEL_SHA512) && (features & HWF_INTEL_AVX2))
+    ctx->bctx.bwrite = do_sha512_transform_intel_shaext;
+#endif
 #ifdef USE_RISCV_V_CRYPTO
   if ((features & HWF_RISCV_IMAFDC)
       && (features & HWF_RISCV_B)      /* Mandatory in RVA23U64 */
diff --git a/configure.ac b/configure.ac
index b41f67f4..c11bc3b6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1584,6 +1584,34 @@ if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then
 fi
 
 
+#
+# Check whether GCC inline assembler supports Intel SHA512 instructions.
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports Intel SHA512 instructions],
+       [gcry_cv_gcc_inline_asm_sha512],
+       [if test "$mpi_cpu_arch" != "x86" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_sha512="n/a"
+        else
+          gcry_cv_gcc_inline_asm_sha512=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[void a(void) {
+              __asm__("vsha512msg1 %%xmm1, %%ymm3\n\t":::"cc");
+              __asm__("vsha512msg2 %%ymm1, %%ymm3\n\t":::"cc");
+              __asm__("vsha512rnds2 %%xmm0, %%ymm1, %%ymm3\n\t":::"cc");
+              __asm__("vperm2i128 \$0x21, %%ymm1, %%ymm2, %%ymm3\n\t":::"cc");
+              __asm__("vpalignr \$8, %%ymm1, %%ymm2, %%ymm3\n\t":::"cc");
+              __asm__("vpermq \$0x1b, %%ymm1, %%ymm3\n\t":::"cc");
+              __asm__("vextracti128 \$1, %%ymm0, %%xmm1\n\t":::"cc");
+            }]], [ a(); ] )],
+          [gcry_cv_gcc_inline_asm_sha512=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_sha512" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_SHA512,1,
+     [Defined if inline assembler supports Intel SHA512 instructions])
+fi
+
+
 #
 # Check whether GCC inline assembler supports SSE4.1 instructions.
 #
@@ -4042,6 +4070,13 @@ if test "$found" = "1" ; then
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-riscv-zvknhb-zvkb.lo"
       ;;
    esac
+
+   case "$mpi_cpu_arch" in
+     x86)
+       # Build with the Intel SHA512 implementation
+       GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-intel-shaext.lo"
+     ;;
+   esac
 fi
 
 LIST_MEMBER(sha3, $enabled_digests)
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index e2b8223a..58887b8d 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -586,6 +586,7 @@ are
 @item intel-vaes-vpclmul
 @item intel-avx512
 @item intel-gfni
+@item intel-sha512
 @item arm-neon
 @item arm-aes
 @item arm-sha1
diff --git a/src/g10lib.h b/src/g10lib.h
index bb735e77..6abc5f5b 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -239,6 +239,7 @@ char **_gcry_strtokenize (const char *string, const char *delim);
 #define HWF_INTEL_VAES_VPCLMUL  (1 << 16)
 #define HWF_INTEL_AVX512        (1 << 17)
 #define HWF_INTEL_GFNI          (1 << 18)
+#define HWF_INTEL_SHA512        (1 << 19)
 
 #elif defined(HAVE_CPU_ARCH_ARM)
 
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index e2c9af0c..e3e144d0 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -95,8 +95,9 @@ is_cpuid_available(void)
 }
 
 static void
-get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
-          unsigned int *ecx, unsigned int *edx)
+get_cpuid_subleaf(unsigned int in, unsigned int subleaf,
+		  unsigned int *eax, unsigned int *ebx,
+		  unsigned int *ecx, unsigned int *edx)
 {
   unsigned int regs[4];
 
@@ -105,7 +106,7 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
      "cpuid\n\t"
      "xchgl %%ebx, %1\n\t"     /* Restore GOT register. */
      : "=a" (regs[0]), "=D" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
-     : "0" (in), "1" (0), "2" (0), "3" (0)
+     : "0" (in), "1" (0), "2" (subleaf), "3" (0)
      : "cc"
      );
 
@@ -119,6 +120,13 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
     *edx = regs[3];
 }
 
+static void
+get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
+          unsigned int *ecx, unsigned int *edx)
+{
+  get_cpuid_subleaf(in, 0, eax, ebx, ecx, edx);
+}
+
 #if defined(ENABLE_AVX_SUPPORT) || defined(ENABLE_AVX2_SUPPORT)
 static unsigned int
 get_xgetbv(void)
@@ -148,15 +156,16 @@ is_cpuid_available(void)
 }
 
 static void
-get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
-          unsigned int *ecx, unsigned int *edx)
+get_cpuid_subleaf(unsigned int in, unsigned int subleaf,
+		  unsigned int *eax, unsigned int *ebx,
+		  unsigned int *ecx, unsigned int *edx)
 {
   unsigned int regs[4];
 
   asm volatile
     ("cpuid\n\t"
      : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
-     : "0" (in), "1" (0), "2" (0), "3" (0)
+     : "0" (in), "1" (0), "2" (subleaf), "3" (0)
      : "cc"
      );
 
@@ -170,6 +179,13 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
     *edx = regs[3];
 }
 
+static void
+get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
+          unsigned int *ecx, unsigned int *edx)
+{
+  get_cpuid_subleaf(in, 0, eax, ebx, ecx, edx);
+}
+
 #if defined(ENABLE_AVX_SUPPORT) || defined(ENABLE_AVX2_SUPPORT)
 static unsigned int
 get_xgetbv(void)
@@ -228,33 +244,35 @@ detect_x86_gnuc (
 #ifdef ENABLE_PADLOCK_SUPPORT
   else if (!strcmp (vendor_id.c, "CentaurHauls"))
     {
+      unsigned int via_feat, via_feat2;
+
       /* This is a VIA CPU.  Check what PadLock features we have.  */
 
       /* Check for extended centaur (EAX).  */
-      get_cpuid(0xC0000000, &features, NULL, NULL, NULL);
+      get_cpuid(0xC0000000, &via_feat, NULL, NULL, NULL);
 
       /* Has extended centaur features? */
-      if (features > 0xC0000000)
+      if (via_feat > 0xC0000000)
         {
            /* Ask for the extended feature flags (EDX). */
-           get_cpuid(0xC0000001, NULL, NULL, NULL, &features);
+           get_cpuid(0xC0000001, NULL, NULL, NULL, &via_feat2);
 
            /* Test bits 2 and 3 to see whether the RNG exists and is enabled. */
-           if ((features & 0x0C) == 0x0C)
+           if ((via_feat2 & 0x0C) == 0x0C)
              result |= HWF_PADLOCK_RNG;
 
            /* Test bits 6 and 7 to see whether the ACE exists and is enabled. */
-           if ((features & 0xC0) == 0xC0)
+           if ((via_feat2 & 0xC0) == 0xC0)
              result |= HWF_PADLOCK_AES;
 
            /* Test bits 10 and 11 to see whether the PHE exists and is
               enabled.  */
-           if ((features & 0xC00) == 0xC00)
+           if ((via_feat2 & 0xC00) == 0xC00)
              result |= HWF_PADLOCK_SHA;
 
            /* Test bits 12 and 13 to see whether the MONTMUL exists and is
               enabled.  */
-           if ((features & 0x3000) == 0x3000)
+           if ((via_feat2 & 0x3000) == 0x3000)
              result |= HWF_PADLOCK_MMUL;
         }
     }
@@ -337,28 +355,30 @@ detect_x86_gnuc (
   has_sse3 = !!(features & 0x00000001);
   if (max_cpuid_level >= 7 && has_sse3)
     {
+      unsigned int intel_feat, intel_feat2, intel_feat3;
+
       /* Get CPUID:7 contains further Intel feature flags. */
-      get_cpuid(7, NULL, &features, &features2, NULL);
+      get_cpuid(7, NULL, &intel_feat, &intel_feat2, NULL);
 
       /* Test bit 8 for BMI2.  */
-      if (features & 0x00000100)
+      if (intel_feat & 0x00000100)
           result |= HWF_INTEL_BMI2;
 
 #ifdef ENABLE_AVX2_SUPPORT
       /* Test bit 5 for AVX2.  */
-      if (features & 0x00000020)
+      if (intel_feat & 0x00000020)
         if (os_supports_avx_avx2_registers)
           result |= HWF_INTEL_AVX2;
 #endif /*ENABLE_AVX_SUPPORT*/
 
       /* Test bit 29 for SHA Extensions. */
-      if (features & (1 << 29))
+      if (intel_feat & (1 << 29))
         result |= HWF_INTEL_SHAEXT;
 
 #if defined(ENABLE_AVX2_SUPPORT) && defined(ENABLE_AESNI_SUPPORT) && \
     defined(ENABLE_PCLMUL_SUPPORT)
-      /* Test features2 bit 9 for VAES and features2 bit 10 for VPCLMULDQD */
-      if ((features2 & 0x00000200) && (features2 & 0x00000400))
+      /* Test intel_feat2 bit 9 for VAES and intel_feat2 bit 10 for VPCLMULQDQ */
+      if ((intel_feat2 & 0x00000200) && (intel_feat2 & 0x00000400))
         result |= HWF_INTEL_VAES_VPCLMUL;
 #endif
 
@@ -367,44 +387,53 @@ detect_x86_gnuc (
        * supporting CPUs are new enough not to suffer from reduced clock
        * frequencies when AVX512 is used, which was issue on early AVX512
        * capable CPUs.
-       *  - AVX512F (features bit 16)
-       *  - AVX512DQ (features bit 17)
-       *  - AVX512IFMA (features bit 21)
-       *  - AVX512CD (features bit 28)
-       *  - AVX512BW (features bit 30)
-       *  - AVX512VL (features bit 31)
-       *  - AVX512_VBMI (features2 bit 1)
-       *  - AVX512_VBMI2 (features2 bit 6)
-       *  - AVX512_VNNI (features2 bit 11)
-       *  - AVX512_BITALG (features2 bit 12)
-       *  - AVX512_VPOPCNTDQ (features2 bit 14)
+       *  - AVX512F (intel_feat bit 16)
+       *  - AVX512DQ (intel_feat bit 17)
+       *  - AVX512IFMA (intel_feat bit 21)
+       *  - AVX512CD (intel_feat bit 28)
+       *  - AVX512BW (intel_feat bit 30)
+       *  - AVX512VL (intel_feat bit 31)
+       *  - AVX512_VBMI (intel_feat2 bit 1)
+       *  - AVX512_VBMI2 (intel_feat2 bit 6)
+       *  - AVX512_VNNI (intel_feat2 bit 11)
+       *  - AVX512_BITALG (intel_feat2 bit 12)
+       *  - AVX512_VPOPCNTDQ (intel_feat2 bit 14)
        */
       if (os_supports_avx512_registers
-	  && (features & (1 << 16))
-	  && (features & (1 << 17))
-	  && (features & (1 << 21))
-	  && (features & (1 << 28))
-	  && (features & (1 << 30))
-	  && (features & (1U << 31))
-	  && (features2 & (1 << 1))
-	  && (features2 & (1 << 6))
-	  && (features2 & (1 << 11))
-	  && (features2 & (1 << 12))
-	  && (features2 & (1 << 14)))
+	  && (intel_feat & (1 << 16))
+	  && (intel_feat & (1 << 17))
+	  && (intel_feat & (1 << 21))
+	  && (intel_feat & (1 << 28))
+	  && (intel_feat & (1 << 30))
+	  && (intel_feat & (1U << 31))
+	  && (intel_feat2 & (1 << 1))
+	  && (intel_feat2 & (1 << 6))
+	  && (intel_feat2 & (1 << 11))
+	  && (intel_feat2 & (1 << 12))
+	  && (intel_feat2 & (1 << 14)))
 	result |= HWF_INTEL_AVX512;
 #endif
 
       /* Test features2 bit 6 for GFNI (Galois field new instructions).
        * These instructions are available for SSE/AVX/AVX2/AVX512. */
-      if (features2 & (1 << 6))
+      if (intel_feat2 & (1 << 6))
         result |= HWF_INTEL_GFNI;
+
+      /* Get CPUID:7 sub-leaf 1 for further Intel feature flags. */
+      get_cpuid_subleaf(7, 1, &intel_feat3, NULL, NULL, NULL);
+
+      /* Test bit 0 for Intel SHA512 instructions. */
+      if ((intel_feat3 & (1 << 0)) && os_supports_avx_avx2_registers)
+        result |= HWF_INTEL_SHA512;
     }
 
   /* Check additional feature flags. */
   if (max_cpuid_level >= 0x21 && has_sse3)
     {
-      get_cpuid(0x21, &features, NULL, NULL, NULL);
-      if (features & (1 << 23))
+      unsigned int amd_feat;
+
+      get_cpuid(0x21, &amd_feat, NULL, NULL, NULL);
+      if (amd_feat & (1 << 23))
 	{
 	  has_avx512bmm = 1;
 	}
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 7d32c4de..4c13ce96 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -84,6 +84,7 @@ static struct
     { HWF_INTEL_VAES_VPCLMUL,  "intel-vaes-vpclmul" },
     { HWF_INTEL_AVX512,        "intel-avx512" },
     { HWF_INTEL_GFNI,          "intel-gfni" },
+    { HWF_INTEL_SHA512,        "intel-sha512" },
     /* Following removed HW feature strings are kept for API compatibility. */
     { 0,                       "intel-fast-vpgather" },
 #elif defined(HAVE_CPU_ARCH_ARM)
-- 
2.53.0