[PATCH 2/7] sha512: add Intel SHA512 extension implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Jun 28 14:37:36 CEST 2026
* cipher/Makefile.am: Add 'sha512-intel-shaext.c'; Add instrumentation
option munging for 'sha512-intel-shaext.o' and 'sha512-intel-shaext.lo'.
* cipher/sha512-intel-shaext.c: New.
* cipher/sha512.c (USE_SHA512_INTEL): New.
[USE_SHA512_INTEL] (_gcry_sha512_transform_intel_shaext)
(do_sha512_transform_intel_shaext): New.
(sha512_init_common) [USE_SHA512_INTEL]: Use Intel SHA512 accelerated
implementation if HW feature available.
* configure.ac (gcry_cv_gcc_inline_asm_sha512)
(HAVE_GCC_INLINE_ASM_SHA512): New.
(GCRYPT_ASM_DIGESTS) [x86]: Add 'sha512-intel-shaext.lo'.
* doc/gcrypt.texi: Add "intel-sha512" to HW features list.
* src/g10lib.h (HWF_INTEL_SHA512): New.
* src/hwf-x86.c (get_cpuid_subleaf): New, based on 'get_cpuid'.
(get_cpuid): Implement as wrapper for 'get_cpuid_subleaf'.
(detect_x86_gnuc): Add Intel SHA512 detection.
* src/hwfeatures.c (hwflist): Add "intel-sha512".
--
Implementation has been tested with Intel SDE (both i386 and x86-64),
not yet on real hardware.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 7 +
cipher/sha512-intel-shaext.c | 298 +++++++++++++++++++++++++++++++++++
cipher/sha512.c | 29 ++++
configure.ac | 35 ++++
doc/gcrypt.texi | 1 +
src/g10lib.h | 1 +
src/hwf-x86.c | 117 ++++++++------
src/hwfeatures.c | 1 +
8 files changed, 445 insertions(+), 44 deletions(-)
create mode 100644 cipher/sha512-intel-shaext.c
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 11bb19d7..0abbfea6 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -149,6 +149,7 @@ EXTRA_libcipher_la_SOURCES = \
sha256-intel-shaext.c sha256-ppc.c sha256-riscv-zvknha-zvkb.c \
sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
+ sha512-intel-shaext.c \
sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \
sha512-ppc.c sha512-riscv-zvknhb-zvkb.c sha512-ssse3-i386.c \
sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
@@ -242,6 +243,12 @@ sha256-ssse3-i386.o: $(srcdir)/sha256-ssse3-i386.c Makefile
sha256-ssse3-i386.lo: $(srcdir)/sha256-ssse3-i386.c Makefile
`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+sha512-intel-shaext.o: $(srcdir)/sha512-intel-shaext.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha512-intel-shaext.lo: $(srcdir)/sha512-intel-shaext.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
`echo $(COMPILE) -c $< | $(instrumentation_munging) `
diff --git a/cipher/sha512-intel-shaext.c b/cipher/sha512-intel-shaext.c
new file mode 100644
index 00000000..d991abad
--- /dev/null
+++ b/cipher/sha512-intel-shaext.c
@@ -0,0 +1,298 @@
+/* sha512-intel-shaext.c - SHA512 accelerated with Intel SHA512 extension.
+ * Copyright (C) 2026 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+
+#if defined(HAVE_GCC_INLINE_ASM_SHA512) && \
+ defined(USE_SHA512) && defined(ENABLE_SHAEXT_SUPPORT)
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+#if __clang__
+# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
+
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+
+/* Two macros to be called prior and after the use of SHA512
+ instructions. There should be no external function calls between
+ the use of these macros. Their purpose is to make sure that the
+ SSE registers are cleared and won't reveal any information about
+ the key or the data. */
+#ifdef __WIN64__
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define shaext_prepare_variable char win64tmp[2*16]
+# define shaext_prepare_variable_size sizeof(win64tmp)
+# define shaext_prepare() \
+ do { asm volatile ("movdqu %%xmm6, 0*16(%0)\n" \
+ "movdqu %%xmm7, 1*16(%0)\n" \
+ : \
+ : "r" (&win64tmp[0]) \
+ : "memory"); \
+ } while (0)
+# define shaext_cleanup(tmp0,tmp1) \
+  do { asm volatile ("vpxor %%ymm0, %%ymm0, %%ymm0\n\t" \
+                     "vmovdqu %%ymm0, (%0)\n\t" "vmovdqu %%ymm0, (%1)\n\t" \
+                     /* No vzeroall: it would wipe callee-saved XMM8-XMM15. */ \
+                     "vmovdqa %%ymm0, %%ymm1\n\t" "vmovdqa %%ymm0, %%ymm2\n\t" \
+                     "vmovdqa %%ymm0, %%ymm3\n\t" "vmovdqa %%ymm0, %%ymm4\n\t" \
+                     "vmovdqa %%ymm0, %%ymm5\n\t" "vzeroupper\n\t" \
+                     "movdqu 0*16(%2), %%xmm6\n\t" "movdqu 1*16(%2), %%xmm7\n\t" \
+                     : : "r" (tmp0), "r" (tmp1), "r" (&win64tmp[0]) \
+                     : "memory"); \
+  } while (0)
+#else
+# define shaext_prepare_variable
+# define shaext_prepare_variable_size 0
+# define shaext_prepare() do { } while (0)
+# define shaext_cleanup(tmp0,tmp1) \
+ do { asm volatile ("vpxor %%ymm0, %%ymm0, %%ymm0\n\t" \
+ "vmovdqu %%ymm0, (%0)\n\t" \
+ "vmovdqu %%ymm0, (%1)\n\t" \
+ "vzeroall\n\t" \
+ : \
+ : "r" (tmp0), "r" (tmp1) \
+ : "memory"); \
+ } while (0)
+#endif
+
+/*
+ * Transform nblks*128 bytes (nblks*16 64-bit words) at DATA.
+ */
+unsigned int ASM_FUNC_ATTR
+_gcry_sha512_transform_intel_shaext(u64 state[8], const unsigned char *data,
+ size_t nblks, const u64 k[80])
+{
+ static const unsigned char bshuf_mask[16] __attribute__ ((aligned (16))) =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ char save_buf[2 * 32 + 31];
+ char *abef_save;
+ char *cdgh_save;
+ shaext_prepare_variable;
+
+ if (nblks == 0)
+ return 0;
+
+ shaext_prepare ();
+
+ asm volatile ("" : "=r" (abef_save) : "0" (save_buf) : "memory");
+ abef_save = abef_save + (-(uintptr_t)abef_save & 31);
+ cdgh_save = abef_save + 32;
+
+ /* Load state. State is {a,b,c,d} and {e,f,g,h} in memory, repack to
+ ABEF (YMM elem order {f,e,b,a}) in YMM1 and CDGH ({h,g,d,c}) in YMM2. */
+ asm volatile ("vpshufd $0x4e, 0*32(%[state]), %%ymm3\n\t" /* {b,a,d,c} */
+ "vpshufd $0x4e, 1*32(%[state]), %%ymm4\n\t" /* {f,e,h,g} */
+ "vperm2i128 $0x02, %%ymm4, %%ymm3, %%ymm1\n\t" /* {f,e,b,a} */
+ "vperm2i128 $0x13, %%ymm4, %%ymm3, %%ymm2\n\t" /* {h,g,d,c} */
+ :
+ : [state] "r" (state)
+ : "memory" );
+
+ /* Load message */
+ asm volatile ("vbroadcasti128 %[mask], %%ymm7\n\t"
+ "vmovdqu 0*32(%[data]), %%ymm3\n\t"
+ "vmovdqu 1*32(%[data]), %%ymm4\n\t"
+ "vmovdqu 2*32(%[data]), %%ymm5\n\t"
+ "vmovdqu 3*32(%[data]), %%ymm6\n\t"
+ "vpshufb %%ymm7, %%ymm3, %%ymm3\n\t"
+ "vpshufb %%ymm7, %%ymm4, %%ymm4\n\t"
+ "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t"
+ "vpshufb %%ymm7, %%ymm6, %%ymm6\n\t"
+ :
+ : [data] "r" (data), [mask] "m" (*bshuf_mask)
+ : "memory" );
+ data += 128;
+
+ do
+ {
+ /* Save state */
+ asm volatile ("vmovdqa %%ymm1, (%[abef_save])\n\t"
+ "vmovdqa %%ymm2, (%[cdgh_save])\n\t"
+ :
+ : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+ : "memory" );
+
+
+ /* Rounds 0..3 */
+ asm volatile ("vpaddq 32*0(%[k]), %%ymm3, %%ymm0\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+ "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+ :
+ : [k] "r" (k)
+ : "memory" );
+
+ /* Rounds 4..7 */
+ asm volatile ("vpaddq 32*1(%[k]), %%ymm4, %%ymm0\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+ "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+ "vsha512msg1 %%xmm4, %%ymm3\n\t"
+ :
+ : [k] "r" (k)
+ : "memory" );
+
+ /* Rounds 8..11 */
+ asm volatile ("vpaddq 32*2(%[k]), %%ymm5, %%ymm0\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+ "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+ "vsha512msg1 %%xmm5, %%ymm4\n\t"
+ :
+ : [k] "r" (k)
+ : "memory" );
+
+#define ROUND(gr, MSG0, MSG1, MSG2, MSG3) \
+ asm volatile ("vpaddq 32*" #gr "(%[k]), %%ymm" #MSG0 ", %%ymm0\n\t" \
+ "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t" \
+ "vperm2i128 $0x21, %%ymm" #MSG0 ", %%ymm" #MSG3 ", %%ymm7\n\t" \
+ "vpalignr $8, %%ymm" #MSG3 ", %%ymm7, %%ymm7\n\t" \
+ "vpaddq %%ymm7, %%ymm" #MSG1 ", %%ymm" #MSG1 "\n\t" \
+ "vsha512msg2 %%ymm" #MSG0 ", %%ymm" #MSG1 "\n\t" \
+ "vextracti128 $1, %%ymm0, %%xmm0\n\t" \
+ "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t" \
+ "vsha512msg1 %%xmm" #MSG0 ", %%ymm" #MSG3 "\n\t" \
+ : \
+ : [k] "r" (k) \
+ : "memory" )
+
+ /* Rounds 12..15 to 64..67 (message schedule for W[16..79]). */
+ ROUND(3, 6, 3, 4, 5);
+ ROUND(4, 3, 4, 5, 6);
+ ROUND(5, 4, 5, 6, 3);
+ ROUND(6, 5, 6, 3, 4);
+ ROUND(7, 6, 3, 4, 5);
+ ROUND(8, 3, 4, 5, 6);
+ ROUND(9, 4, 5, 6, 3);
+ ROUND(10, 5, 6, 3, 4);
+ ROUND(11, 6, 3, 4, 5);
+ ROUND(12, 3, 4, 5, 6);
+ ROUND(13, 4, 5, 6, 3);
+ ROUND(14, 5, 6, 3, 4);
+ ROUND(15, 6, 3, 4, 5);
+ ROUND(16, 3, 4, 5, 6);
+
+ if (--nblks == 0)
+ break;
+
+/* Final two message groups: finalize MSG1 but no further message schedule. */
+#define ROUND_FINAL(gr, MSG0, MSG1, MSG3) \
+ asm volatile ("vpaddq 32*" #gr "(%[k]), %%ymm" #MSG0 ", %%ymm0\n\t" \
+ "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t" \
+ "vperm2i128 $0x21, %%ymm" #MSG0 ", %%ymm" #MSG3 ", %%ymm7\n\t" \
+ "vpalignr $8, %%ymm" #MSG3 ", %%ymm7, %%ymm7\n\t" \
+ "vpaddq %%ymm7, %%ymm" #MSG1 ", %%ymm" #MSG1 "\n\t" \
+ "vsha512msg2 %%ymm" #MSG0 ", %%ymm" #MSG1 "\n\t" \
+ "vextracti128 $1, %%ymm0, %%xmm0\n\t" \
+ "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t" \
+ : \
+ : [k] "r" (k) \
+ : "memory" )
+
+ /* Rounds 68..71 */
+ ROUND_FINAL(17, 4, 5, 3);
+
+ asm volatile ("vmovdqu 0*32(%[data]), %%ymm3\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+
+ /* Rounds 72..75 */
+ ROUND_FINAL(18, 5, 6, 4);
+
+ asm volatile ("vbroadcasti128 %[mask], %%ymm7\n\t" /* Reload mask */
+ "vmovdqu 1*32(%[data]), %%ymm4\n\t"
+ "vpshufb %%ymm7, %%ymm3, %%ymm3\n\t"
+ :
+ : [data] "r" (data), [mask] "m" (*bshuf_mask)
+ : "memory" );
+
+ /* Rounds 76..79 */
+ asm volatile ("vpaddq 32*19(%[k]), %%ymm6, %%ymm0\n\t"
+ "vmovdqu 2*32(%[data]), %%ymm5\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+ "vmovdqu 3*32(%[data]), %%ymm6\n\t"
+ "vpshufb %%ymm7, %%ymm4, %%ymm4\n\t"
+ "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+ "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+ "vpshufb %%ymm7, %%ymm6, %%ymm6\n\t"
+ :
+ : [k] "r" (k), [data] "r" (data)
+ : "memory" );
+
+ data += 128;
+
+ /* Merge states */
+ asm volatile ("vpaddq (%[abef_save]), %%ymm1, %%ymm1\n\t"
+ "vpaddq (%[cdgh_save]), %%ymm2, %%ymm2\n\t"
+ :
+ : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+ : "memory" );
+ }
+ while (1);
+
+ /* Rounds 68..71 */
+ ROUND_FINAL(17, 4, 5, 3);
+ /* Rounds 72..75 */
+ ROUND_FINAL(18, 5, 6, 4);
+
+ /* Rounds 76..79 */
+ asm volatile ("vpaddq 32*19(%[k]), %%ymm6, %%ymm0\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm1, %%ymm2\n\t"
+ "vextracti128 $1, %%ymm0, %%xmm0\n\t"
+ "vsha512rnds2 %%xmm0, %%ymm2, %%ymm1\n\t"
+ :
+ : [k] "r" (k)
+ : "memory" );
+
+ /* Merge states */
+ asm volatile ("vpaddq (%[abef_save]), %%ymm1, %%ymm1\n\t"
+ "vpaddq (%[cdgh_save]), %%ymm2, %%ymm2\n\t"
+ :
+ : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+ : "memory" );
+
+ /* Store state. ABEF=YMM1 ({f,e,b,a}), CDGH=YMM2 ({h,g,d,c}). */
+ asm volatile ("vperm2i128 $0x31, %%ymm2, %%ymm1, %%ymm3\n\t" /* {b,a,d,c} */
+ "vperm2i128 $0x20, %%ymm2, %%ymm1, %%ymm4\n\t" /* {f,e,h,g} */
+ "vpshufd $0x4e, %%ymm3, %%ymm3\n\t" /* {a,b,c,d} */
+ "vpshufd $0x4e, %%ymm4, %%ymm4\n\t" /* {e,f,g,h} */
+ "vmovdqu %%ymm3, 0*32(%[state])\n\t"
+ "vmovdqu %%ymm4, 1*32(%[state])\n\t"
+ :
+ : [state] "r" (state)
+ : "memory" );
+
+ shaext_cleanup (abef_save, cdgh_save);
+ return 0;
+}
+
+#if __clang__
+# pragma clang attribute pop
+#endif
+
+#endif /* HAVE_GCC_INLINE_ASM_SHA512 */
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 51bf6641..024a45ea 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -133,6 +133,15 @@
#endif
+/* USE_SHA512_INTEL indicates whether to compile with Intel SHA512 code. */
+#undef USE_SHA512_INTEL
+#if defined(HAVE_GCC_INLINE_ASM_SHA512) && \
+ defined(USE_SHA512) && \
+ defined(ENABLE_SHAEXT_SUPPORT)
+# define USE_SHA512_INTEL 1
+#endif
+
+
/* USE_SSSE3_I386 indicates whether to compile with Intel SSSE3/i386 code. */
#undef USE_SSSE3_I386
#if defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4 && \
@@ -345,6 +354,22 @@ do_sha512_transform_amd64_avx512(void *ctx, const unsigned char *data,
}
#endif
+#ifdef USE_SHA512_INTEL
+/* Does not need ASM_FUNC_ABI */
+unsigned int _gcry_sha512_transform_intel_shaext(u64 state[8],
+ const unsigned char *input_data,
+ size_t num_blks,
+ const u64 k[]);
+
+static unsigned int
+do_sha512_transform_intel_shaext(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_intel_shaext (hd->state.h, data, nblks, k);
+}
+#endif
+
#ifdef USE_SSSE3_I386
unsigned int _gcry_sha512_transform_i386_ssse3(u64 state[8],
const unsigned char *input_data,
@@ -508,6 +533,10 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags)
if ((features & HWF_INTEL_SSSE3) != 0)
ctx->bctx.bwrite = do_sha512_transform_i386_ssse3;
#endif
+#ifdef USE_SHA512_INTEL
+ if ((features & HWF_INTEL_SHA512) && (features & HWF_INTEL_AVX2))
+ ctx->bctx.bwrite = do_sha512_transform_intel_shaext;
+#endif
#ifdef USE_RISCV_V_CRYPTO
if ((features & HWF_RISCV_IMAFDC)
&& (features & HWF_RISCV_B) /* Mandatory in RVA23U64 */
diff --git a/configure.ac b/configure.ac
index b41f67f4..c11bc3b6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1584,6 +1584,34 @@ if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then
fi
+#
+# Check whether GCC inline assembler supports Intel SHA512 instructions.
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports Intel SHA512 instructions],
+ [gcry_cv_gcc_inline_asm_sha512],
+ [if test "$mpi_cpu_arch" != "x86" ||
+ test "$try_asm_modules" != "yes" ; then
+ gcry_cv_gcc_inline_asm_sha512="n/a"
+ else
+ gcry_cv_gcc_inline_asm_sha512=no
+ AC_LINK_IFELSE([AC_LANG_PROGRAM(
+ [[void a(void) {
+ __asm__("vsha512msg1 %%xmm1, %%ymm3\n\t":::"cc");
+ __asm__("vsha512msg2 %%ymm1, %%ymm3\n\t":::"cc");
+ __asm__("vsha512rnds2 %%xmm0, %%ymm1, %%ymm3\n\t":::"cc");
+ __asm__("vperm2i128 \$0x21, %%ymm1, %%ymm2, %%ymm3\n\t":::"cc");
+ __asm__("vpalignr \$8, %%ymm1, %%ymm2, %%ymm3\n\t":::"cc");
+ __asm__("vpermq \$0x1b, %%ymm1, %%ymm3\n\t":::"cc");
+ __asm__("vextracti128 \$1, %%ymm0, %%xmm1\n\t":::"cc");
+ }]], [ a(); ] )],
+ [gcry_cv_gcc_inline_asm_sha512=yes])
+ fi])
+if test "$gcry_cv_gcc_inline_asm_sha512" = "yes" ; then
+ AC_DEFINE(HAVE_GCC_INLINE_ASM_SHA512,1,
+ [Defined if inline assembler supports Intel SHA512 instructions])
+fi
+
+
#
# Check whether GCC inline assembler supports SSE4.1 instructions.
#
@@ -4042,6 +4070,13 @@ if test "$found" = "1" ; then
GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-riscv-zvknhb-zvkb.lo"
;;
esac
+
+ case "$mpi_cpu_arch" in
+ x86)
+ # Build with the Intel SHA512 implementation
+ GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-intel-shaext.lo"
+ ;;
+ esac
fi
LIST_MEMBER(sha3, $enabled_digests)
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index e2b8223a..58887b8d 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -586,6 +586,7 @@ are
@item intel-vaes-vpclmul
@item intel-avx512
@item intel-gfni
+@item intel-sha512
@item arm-neon
@item arm-aes
@item arm-sha1
diff --git a/src/g10lib.h b/src/g10lib.h
index bb735e77..6abc5f5b 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -239,6 +239,7 @@ char **_gcry_strtokenize (const char *string, const char *delim);
#define HWF_INTEL_VAES_VPCLMUL (1 << 16)
#define HWF_INTEL_AVX512 (1 << 17)
#define HWF_INTEL_GFNI (1 << 18)
+#define HWF_INTEL_SHA512 (1 << 19)
#elif defined(HAVE_CPU_ARCH_ARM)
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index e2c9af0c..e3e144d0 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -95,8 +95,9 @@ is_cpuid_available(void)
}
static void
-get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
+get_cpuid_subleaf(unsigned int in, unsigned int subleaf,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
{
unsigned int regs[4];
@@ -105,7 +106,7 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
"cpuid\n\t"
"xchgl %%ebx, %1\n\t" /* Restore GOT register. */
: "=a" (regs[0]), "=D" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
- : "0" (in), "1" (0), "2" (0), "3" (0)
+ : "0" (in), "1" (0), "2" (subleaf), "3" (0)
: "cc"
);
@@ -119,6 +120,13 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
*edx = regs[3];
}
+static void
+get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ get_cpuid_subleaf(in, 0, eax, ebx, ecx, edx);
+}
+
#if defined(ENABLE_AVX_SUPPORT) || defined(ENABLE_AVX2_SUPPORT)
static unsigned int
get_xgetbv(void)
@@ -148,15 +156,16 @@ is_cpuid_available(void)
}
static void
-get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
+get_cpuid_subleaf(unsigned int in, unsigned int subleaf,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
{
unsigned int regs[4];
asm volatile
("cpuid\n\t"
: "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
- : "0" (in), "1" (0), "2" (0), "3" (0)
+ : "0" (in), "1" (0), "2" (subleaf), "3" (0)
: "cc"
);
@@ -170,6 +179,13 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
*edx = regs[3];
}
+static void
+get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ get_cpuid_subleaf(in, 0, eax, ebx, ecx, edx);
+}
+
#if defined(ENABLE_AVX_SUPPORT) || defined(ENABLE_AVX2_SUPPORT)
static unsigned int
get_xgetbv(void)
@@ -228,33 +244,35 @@ detect_x86_gnuc (
#ifdef ENABLE_PADLOCK_SUPPORT
else if (!strcmp (vendor_id.c, "CentaurHauls"))
{
+ unsigned int via_feat, via_feat2;
+
/* This is a VIA CPU. Check what PadLock features we have. */
/* Check for extended centaur (EAX). */
- get_cpuid(0xC0000000, &features, NULL, NULL, NULL);
+ get_cpuid(0xC0000000, &via_feat, NULL, NULL, NULL);
/* Has extended centaur features? */
- if (features > 0xC0000000)
+ if (via_feat > 0xC0000000)
{
/* Ask for the extended feature flags (EDX). */
- get_cpuid(0xC0000001, NULL, NULL, NULL, &features);
+ get_cpuid(0xC0000001, NULL, NULL, NULL, &via_feat2);
/* Test bits 2 and 3 to see whether the RNG exists and is enabled. */
- if ((features & 0x0C) == 0x0C)
+ if ((via_feat2 & 0x0C) == 0x0C)
result |= HWF_PADLOCK_RNG;
/* Test bits 6 and 7 to see whether the ACE exists and is enabled. */
- if ((features & 0xC0) == 0xC0)
+ if ((via_feat2 & 0xC0) == 0xC0)
result |= HWF_PADLOCK_AES;
/* Test bits 10 and 11 to see whether the PHE exists and is
enabled. */
- if ((features & 0xC00) == 0xC00)
+ if ((via_feat2 & 0xC00) == 0xC00)
result |= HWF_PADLOCK_SHA;
/* Test bits 12 and 13 to see whether the MONTMUL exists and is
enabled. */
- if ((features & 0x3000) == 0x3000)
+ if ((via_feat2 & 0x3000) == 0x3000)
result |= HWF_PADLOCK_MMUL;
}
}
@@ -337,28 +355,30 @@ detect_x86_gnuc (
has_sse3 = !!(features & 0x00000001);
if (max_cpuid_level >= 7 && has_sse3)
{
+ unsigned int intel_feat, intel_feat2, intel_feat3;
+
/* Get CPUID:7 contains further Intel feature flags. */
- get_cpuid(7, NULL, &features, &features2, NULL);
+ get_cpuid(7, NULL, &intel_feat, &intel_feat2, NULL);
/* Test bit 8 for BMI2. */
- if (features & 0x00000100)
+ if (intel_feat & 0x00000100)
result |= HWF_INTEL_BMI2;
#ifdef ENABLE_AVX2_SUPPORT
/* Test bit 5 for AVX2. */
- if (features & 0x00000020)
+ if (intel_feat & 0x00000020)
if (os_supports_avx_avx2_registers)
result |= HWF_INTEL_AVX2;
#endif /*ENABLE_AVX_SUPPORT*/
/* Test bit 29 for SHA Extensions. */
- if (features & (1 << 29))
+ if (intel_feat & (1 << 29))
result |= HWF_INTEL_SHAEXT;
#if defined(ENABLE_AVX2_SUPPORT) && defined(ENABLE_AESNI_SUPPORT) && \
defined(ENABLE_PCLMUL_SUPPORT)
- /* Test features2 bit 9 for VAES and features2 bit 10 for VPCLMULDQD */
- if ((features2 & 0x00000200) && (features2 & 0x00000400))
+ /* Test intel_feat2 bit 9 for VAES and intel_feat2 bit 10 for VPCLMULDQD */
+ if ((intel_feat2 & 0x00000200) && (intel_feat2 & 0x00000400))
result |= HWF_INTEL_VAES_VPCLMUL;
#endif
@@ -367,44 +387,53 @@ detect_x86_gnuc (
* supporting CPUs are new enough not to suffer from reduced clock
* frequencies when AVX512 is used, which was issue on early AVX512
* capable CPUs.
- * - AVX512F (features bit 16)
- * - AVX512DQ (features bit 17)
- * - AVX512IFMA (features bit 21)
- * - AVX512CD (features bit 28)
- * - AVX512BW (features bit 30)
- * - AVX512VL (features bit 31)
- * - AVX512_VBMI (features2 bit 1)
- * - AVX512_VBMI2 (features2 bit 6)
- * - AVX512_VNNI (features2 bit 11)
- * - AVX512_BITALG (features2 bit 12)
- * - AVX512_VPOPCNTDQ (features2 bit 14)
+ * - AVX512F (intel_feat bit 16)
+ * - AVX512DQ (intel_feat bit 17)
+ * - AVX512IFMA (intel_feat bit 21)
+ * - AVX512CD (intel_feat bit 28)
+ * - AVX512BW (intel_feat bit 30)
+ * - AVX512VL (intel_feat bit 31)
+ * - AVX512_VBMI (intel_feat2 bit 1)
+ * - AVX512_VBMI2 (intel_feat2 bit 6)
+ * - AVX512_VNNI (intel_feat2 bit 11)
+ * - AVX512_BITALG (intel_feat2 bit 12)
+ * - AVX512_VPOPCNTDQ (intel_feat2 bit 14)
*/
if (os_supports_avx512_registers
- && (features & (1 << 16))
- && (features & (1 << 17))
- && (features & (1 << 21))
- && (features & (1 << 28))
- && (features & (1 << 30))
- && (features & (1U << 31))
- && (features2 & (1 << 1))
- && (features2 & (1 << 6))
- && (features2 & (1 << 11))
- && (features2 & (1 << 12))
- && (features2 & (1 << 14)))
+ && (intel_feat & (1 << 16))
+ && (intel_feat & (1 << 17))
+ && (intel_feat & (1 << 21))
+ && (intel_feat & (1 << 28))
+ && (intel_feat & (1 << 30))
+ && (intel_feat & (1U << 31))
+ && (intel_feat2 & (1 << 1))
+ && (intel_feat2 & (1 << 6))
+ && (intel_feat2 & (1 << 11))
+ && (intel_feat2 & (1 << 12))
+ && (intel_feat2 & (1 << 14)))
result |= HWF_INTEL_AVX512;
#endif
/* Test features2 bit 6 for GFNI (Galois field new instructions).
* These instructions are available for SSE/AVX/AVX2/AVX512. */
- if (features2 & (1 << 6))
+ if (intel_feat2 & (1 << 6))
result |= HWF_INTEL_GFNI;
+
+ /* Get CPUID:7 sub-leaf 1 for further Intel feature flags. */
+ get_cpuid_subleaf(7, 1, &intel_feat3, NULL, NULL, NULL);
+
+ /* Test bit 0 for Intel SHA512 instructions. */
+ if ((intel_feat3 & (1 << 0)) && os_supports_avx_avx2_registers)
+ result |= HWF_INTEL_SHA512;
}
/* Check additional feature flags. */
if (max_cpuid_level >= 0x21 && has_sse3)
{
- get_cpuid(0x21, &features, NULL, NULL, NULL);
- if (features & (1 << 23))
+ unsigned int amd_feat;
+
+ get_cpuid(0x21, &amd_feat, NULL, NULL, NULL);
+ if (amd_feat & (1 << 23))
{
has_avx512bmm = 1;
}
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 7d32c4de..4c13ce96 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -84,6 +84,7 @@ static struct
{ HWF_INTEL_VAES_VPCLMUL, "intel-vaes-vpclmul" },
{ HWF_INTEL_AVX512, "intel-avx512" },
{ HWF_INTEL_GFNI, "intel-gfni" },
+ { HWF_INTEL_SHA512, "intel-sha512" },
/* Following removed HW feature strings are kept for API compatibility. */
{ 0, "intel-fast-vpgather" },
#elif defined(HAVE_CPU_ARCH_ARM)
--
2.53.0
More information about the Gcrypt-devel
mailing list