[PATCH] blake2: avoid AVX/AVX2/AVX512 when CPU has high vector inst latency
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Dec 21 17:21:40 CET 2025
* cipher/blake2.c (blake2b_init_ctx, blake2s_init_ctx): Disable
AVX/AVX2/AVX512 implementation if integer vector latency is higher
than 1.
* src/g10lib.h (_gcry_get_hwf_int_vector_latency): New.
* src/hwf-common.h (_gcry_hwf_detect_x86): Add 'int_vector_latency'.
* src/hwf-x86.c (detect_x86_gnuc): Detect Zen5 and add
'int_vector_latency'.
(_gcry_hwf_detect_x86): Add 'int_vector_latency'.
* src/hwfeatures.c (hwf_int_vector_latency)
(_gcry_get_hwf_int_vector_latency): New.
--
The Blake2s/Blake2b AVX/AVX2/AVX512 implementations are slower than the
generic C implementation if the CPU has an integer vector latency higher
than 1 (for example, AMD Zen5 has an int-vector latency of 2). Therefore
add detection of the integer vector latency for x86 CPUs and use
generic C for Blake2 when the latency is greater than 1.
Generic C with AMD Zen5:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
BLAKE2B_512 | 0.473 ns/B 2016 MiB/s 2.72 c/B 5750
BLAKE2S_256 | 0.798 ns/B 1195 MiB/s 4.59 c/B 5750
AVX512 with AMD Zen5:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
BLAKE2B_512 | 0.923 ns/B 1033 MiB/s 5.31 c/B 5750
BLAKE2S_256 | 1.42 ns/B 672.4 MiB/s 8.15 c/B 5749
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/blake2.c | 12 ++++++++----
src/g10lib.h | 1 +
src/hwf-common.h | 2 +-
src/hwf-x86.c | 38 ++++++++++++++++++++++++++++++++++----
src/hwfeatures.c | 15 ++++++++++++++-
5 files changed, 58 insertions(+), 10 deletions(-)
diff --git a/cipher/blake2.c b/cipher/blake2.c
index 1a04fbd8..2fce448d 100644
--- a/cipher/blake2.c
+++ b/cipher/blake2.c
@@ -484,17 +484,19 @@ static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
{
BLAKE2B_CONTEXT *c = ctx;
unsigned int features = _gcry_get_hw_features ();
+ unsigned int int_vec_lat = _gcry_get_hwf_int_vector_latency ();
+ (void)int_vec_lat;
(void)features;
(void)flags;
memset (c, 0, sizeof (*c));
#ifdef USE_AVX2
- c->use_avx2 = !!(features & HWF_INTEL_AVX2);
+ c->use_avx2 = !!(features & HWF_INTEL_AVX2) && (int_vec_lat <= 1);
#endif
#ifdef USE_AVX512
- c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+ c->use_avx512 = !!(features & HWF_INTEL_AVX512) && (int_vec_lat <= 1);
#endif
c->outlen = dbits / 8;
@@ -821,17 +823,19 @@ static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
{
BLAKE2S_CONTEXT *c = ctx;
unsigned int features = _gcry_get_hw_features ();
+ unsigned int int_vec_lat = _gcry_get_hwf_int_vector_latency ();
+ (void)int_vec_lat;
(void)features;
(void)flags;
memset (c, 0, sizeof (*c));
#ifdef USE_AVX
- c->use_avx = !!(features & HWF_INTEL_AVX);
+ c->use_avx = !!(features & HWF_INTEL_AVX) && (int_vec_lat <= 1);
#endif
#ifdef USE_AVX512
- c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+ c->use_avx512 = !!(features & HWF_INTEL_AVX512) && (int_vec_lat <= 1);
#endif
c->outlen = dbits / 8;
diff --git a/src/g10lib.h b/src/g10lib.h
index bb735e77..c229d717 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -292,6 +292,7 @@ gpg_err_code_t _gcry_disable_hw_feature (const char *name);
void _gcry_detect_hw_features (void);
unsigned int _gcry_get_hw_features (void);
const char *_gcry_enum_hw_features (int idx, unsigned int *r_feature);
+int _gcry_get_hwf_int_vector_latency (void);
const char *_gcry_get_sysconfdir (void);
diff --git a/src/hwf-common.h b/src/hwf-common.h
index 749ff040..ef9ffdf9 100644
--- a/src/hwf-common.h
+++ b/src/hwf-common.h
@@ -20,7 +20,7 @@
#ifndef HWF_COMMON_H
#define HWF_COMMON_H
-unsigned int _gcry_hwf_detect_x86 (void);
+unsigned int _gcry_hwf_detect_x86 (int *int_vector_latency);
unsigned int _gcry_hwf_detect_arm (void);
unsigned int _gcry_hwf_detect_ppc (void);
unsigned int _gcry_hwf_detect_s390x (void);
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 54af1c83..f056641c 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -184,7 +184,9 @@ get_xgetbv(void)
#ifdef HAS_X86_CPUID
static unsigned int
-detect_x86_gnuc (void)
+detect_x86_gnuc (
+ int *int_vector_latency
+)
{
union
{
@@ -198,10 +200,15 @@ detect_x86_gnuc (void)
unsigned int fms, family, model;
unsigned int result = 0;
unsigned int is_amd_cpu = 0;
+ unsigned int has_avx512bmm = 0;
+ unsigned int has_sse3 = 0;
(void)os_supports_avx_avx2_registers;
(void)os_supports_avx512_registers;
+ /* Assume integer vector latency of 1 by default. */
+ *int_vector_latency = 1;
+
if (!is_cpuid_available())
return 0;
@@ -320,7 +327,8 @@ detect_x86_gnuc (void)
* too high max_cpuid_level, so don't check level 7 if processor does not
* support SSE3 (as cpuid:7 contains only features for newer processors).
* Source: http://www.sandpile.org/x86/cpuid.htm */
- if (max_cpuid_level >= 7 && (features & 0x00000001))
+ has_sse3 = !!(features & 0x00000001);
+ if (max_cpuid_level >= 7 && has_sse3)
{
/* Get CPUID:7 contains further Intel feature flags. */
get_cpuid(7, NULL, &features, &features2, NULL);
@@ -385,6 +393,16 @@ detect_x86_gnuc (void)
result |= HWF_INTEL_GFNI;
}
+ /* Check additional feature flags. */
+ if (max_cpuid_level >= 0x21 && has_sse3)
+ {
+ get_cpuid(0x21, &features, NULL, NULL, NULL);
+ if (features & (1 << 23))
+ {
+ has_avx512bmm = 1;
+ }
+ }
+
if ((result & HWF_INTEL_CPU) && family == 6)
{
/* These Intel Core processor models have SHLD/SHRD instruction that
@@ -413,6 +431,12 @@ detect_x86_gnuc (void)
}
}
+ if (is_amd_cpu && (family == 0x1a) && !has_avx512bmm)
+ {
+ /* Zen5 has integer vector instruction latency of 2. */
+ *int_vector_latency = 2;
+ }
+
#ifdef ENABLE_FORCE_SOFT_HWFEATURES
/* Soft HW features mark functionality that is available on all systems
* but not feasible to use because of slow HW implementation. */
@@ -428,6 +452,9 @@ detect_x86_gnuc (void)
* only for those Intel processors that benefit from the SHLD
* instruction. Enabled here unconditionally as requested. */
result |= HWF_INTEL_FAST_SHLD;
+
+ /* Assume that integer vector instructions have minimum latency. */
+ *int_vector_latency = 0;
#endif
return result;
@@ -436,11 +463,14 @@ detect_x86_gnuc (void)
unsigned int
-_gcry_hwf_detect_x86 (void)
+_gcry_hwf_detect_x86 (
+ int *int_vector_latency
+)
{
#if defined (HAS_X86_CPUID)
- return detect_x86_gnuc ();
+ return detect_x86_gnuc (int_vector_latency);
#else
+ *int_vector_latency = 0;
return 0;
#endif
}
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 1b107e63..1c3b8034 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -135,6 +135,9 @@ static unsigned int disabled_hw_features;
available. */
static unsigned int hw_features;
+/* Latency for simple integer vector instructions. */
+static int hwf_int_vector_latency;
+
static const char *
@@ -204,6 +207,14 @@ _gcry_get_hw_features (void)
}
+/* Return latency for integer vector instructions. */
+int
+_gcry_get_hwf_int_vector_latency (void)
+{
+ return hwf_int_vector_latency;
+}
+
+
/* Enumerate all features. The caller is expected to start with an
IDX of 0 and then increment IDX until NULL is returned. */
const char *
@@ -283,9 +294,11 @@ _gcry_detect_hw_features (void)
parse_hwf_deny_file ();
+ hwf_int_vector_latency = -1;
+
#if defined (HAVE_CPU_ARCH_X86)
{
- hw_features = _gcry_hwf_detect_x86 ();
+ hw_features = _gcry_hwf_detect_x86 (&hwf_int_vector_latency);
}
#elif defined (HAVE_CPU_ARCH_ARM)
{
--
2.51.0
More information about the Gcrypt-devel
mailing list