[PATCH] blake2: avoid AVX/AVX2/AVX512 when CPU has high vector inst latency

Sun Dec 21 17:21:40 CET 2025

* cipher/blake2.c (blake2b_init_ctx, blake2s_init_ctx): Disable
AVX/AVX2/AVX512 implementations if integer vector latency is higher
than 1.
* src/g10lib.h (_gcry_get_hwf_int_vector_latency): New.
* src/hwf-common.h (_gcry_hwf_detect_x86): Add 'int_vector_latency'.
* src/hwf-x86.c (detect_x86_gnuc): Detect Zen5 and add
'int_vector_latency'.
(_gcry_hwf_detect_x86): Add 'int_vector_latency'.
* src/hwfeatures.c (hwf_int_vector_latency)
(_gcry_get_hwf_int_vector_latency): New.
--

Blake2s/Blake2b AVX/AVX2/AVX512 implementations are slower than the
generic C implementation if the CPU has an integer vector latency
higher than 1 (for example, AMD Zen5 has an int-vector latency of 2).
Therefore add detection of integer vector latency for x86 CPUs and
use generic C for Blake2 when the latency is greater than 1.

Generic C with AMD Zen5:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 BLAKE2B_512    |     0.473 ns/B      2016 MiB/s      2.72 c/B      5750
 BLAKE2S_256    |     0.798 ns/B      1195 MiB/s      4.59 c/B      5750

AVX512 with AMD Zen5:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 BLAKE2B_512    |     0.923 ns/B      1033 MiB/s      5.31 c/B      5750
 BLAKE2S_256    |      1.42 ns/B     672.4 MiB/s      8.15 c/B      5749

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/blake2.c  | 12 ++++++++----
 src/g10lib.h     |  1 +
 src/hwf-common.h |  2 +-
 src/hwf-x86.c    | 38 ++++++++++++++++++++++++++++++++++----
 src/hwfeatures.c | 15 ++++++++++++++-
 5 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/cipher/blake2.c b/cipher/blake2.c
index 1a04fbd8..2fce448d 100644
--- a/cipher/blake2.c
+++ b/cipher/blake2.c
@@ -484,17 +484,19 @@ static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
 {
   BLAKE2B_CONTEXT *c = ctx;
   unsigned int features = _gcry_get_hw_features ();
+  unsigned int int_vec_lat = _gcry_get_hwf_int_vector_latency ();
 
+  (void)int_vec_lat;
   (void)features;
   (void)flags;
 
   memset (c, 0, sizeof (*c));
 
 #ifdef USE_AVX2
-  c->use_avx2 = !!(features & HWF_INTEL_AVX2);
+  c->use_avx2 = !!(features & HWF_INTEL_AVX2) && (int_vec_lat <= 1);
 #endif
 #ifdef USE_AVX512
-  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512) && (int_vec_lat <= 1);
 #endif
 
   c->outlen = dbits / 8;
@@ -821,17 +823,19 @@ static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
 {
   BLAKE2S_CONTEXT *c = ctx;
   unsigned int features = _gcry_get_hw_features ();
+  unsigned int int_vec_lat = _gcry_get_hwf_int_vector_latency ();
 
+  (void)int_vec_lat;
   (void)features;
   (void)flags;
 
   memset (c, 0, sizeof (*c));
 
 #ifdef USE_AVX
-  c->use_avx = !!(features & HWF_INTEL_AVX);
+  c->use_avx = !!(features & HWF_INTEL_AVX) && (int_vec_lat <= 1);
 #endif
 #ifdef USE_AVX512
-  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512) && (int_vec_lat <= 1);
 #endif
 
   c->outlen = dbits / 8;
diff --git a/src/g10lib.h b/src/g10lib.h
index bb735e77..c229d717 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -292,6 +292,7 @@ gpg_err_code_t _gcry_disable_hw_feature (const char *name);
 void _gcry_detect_hw_features (void);
 unsigned int _gcry_get_hw_features (void);
 const char *_gcry_enum_hw_features (int idx, unsigned int *r_feature);
+int _gcry_get_hwf_int_vector_latency (void);
 
 const char *_gcry_get_sysconfdir (void);
 
diff --git a/src/hwf-common.h b/src/hwf-common.h
index 749ff040..ef9ffdf9 100644
--- a/src/hwf-common.h
+++ b/src/hwf-common.h
@@ -20,7 +20,7 @@
 #ifndef HWF_COMMON_H
 #define HWF_COMMON_H
 
-unsigned int _gcry_hwf_detect_x86 (void);
+unsigned int _gcry_hwf_detect_x86 (int *int_vector_latency);
 unsigned int _gcry_hwf_detect_arm (void);
 unsigned int _gcry_hwf_detect_ppc (void);
 unsigned int _gcry_hwf_detect_s390x (void);
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 54af1c83..f056641c 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -184,7 +184,9 @@ get_xgetbv(void)
 
 #ifdef HAS_X86_CPUID
 static unsigned int
-detect_x86_gnuc (void)
+detect_x86_gnuc (
+  int *int_vector_latency
+)
 {
   union
   {
@@ -198,10 +200,15 @@ detect_x86_gnuc (void)
   unsigned int fms, family, model;
   unsigned int result = 0;
   unsigned int is_amd_cpu = 0;
+  unsigned int has_avx512bmm = 0;
+  unsigned int has_sse3 = 0;
 
   (void)os_supports_avx_avx2_registers;
   (void)os_supports_avx512_registers;
 
+  /* Assume integer vector latency of 1 by default. */
+  *int_vector_latency = 1;
+
   if (!is_cpuid_available())
     return 0;
 
@@ -320,7 +327,8 @@ detect_x86_gnuc (void)
    * too high max_cpuid_level, so don't check level 7 if processor does not
    * support SSE3 (as cpuid:7 contains only features for newer processors).
    * Source: http://www.sandpile.org/x86/cpuid.htm  */
-  if (max_cpuid_level >= 7 && (features & 0x00000001))
+  has_sse3 = !!(features & 0x00000001);
+  if (max_cpuid_level >= 7 && has_sse3)
     {
       /* Get CPUID:7 contains further Intel feature flags. */
       get_cpuid(7, NULL, &features, &features2, NULL);
@@ -385,6 +393,16 @@ detect_x86_gnuc (void)
         result |= HWF_INTEL_GFNI;
     }
 
+  /* Check additional feature flags. */
+  if (max_cpuid_level >= 0x21 && has_sse3)
+    {
+      get_cpuid(0x21, &features, NULL, NULL, NULL);
+      if (features & (1 << 23))
+	{
+	  has_avx512bmm = 1;
+	}
+    }
+
   if ((result & HWF_INTEL_CPU) && family == 6)
     {
       /* These Intel Core processor models have SHLD/SHRD instruction that
@@ -413,6 +431,12 @@ detect_x86_gnuc (void)
 	}
     }
 
+  if (is_amd_cpu && (family == 0x1a) && !has_avx512bmm)
+    {
+      /* Zen5 has integer vector instruction latency of 2. */
+      *int_vector_latency = 2;
+    }
+
 #ifdef ENABLE_FORCE_SOFT_HWFEATURES
   /* Soft HW features mark functionality that is available on all systems
    * but not feasible to use because of slow HW implementation. */
@@ -428,6 +452,9 @@ detect_x86_gnuc (void)
    * only for those Intel processors that benefit from the SHLD
    * instruction. Enabled here unconditionally as requested. */
   result |= HWF_INTEL_FAST_SHLD;
+
+  /* Assume that integer vector instructions have minimum latency. */
+  *int_vector_latency = 0;
 #endif
 
   return result;
@@ -436,11 +463,14 @@ detect_x86_gnuc (void)
 
 
 unsigned int
-_gcry_hwf_detect_x86 (void)
+_gcry_hwf_detect_x86 (
+  int *int_vector_latency
+)
 {
 #if defined (HAS_X86_CPUID)
-  return detect_x86_gnuc ();
+  return detect_x86_gnuc (int_vector_latency);
 #else
+  *int_vector_latency = 0;
   return 0;
 #endif
 }
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 1b107e63..1c3b8034 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -135,6 +135,9 @@ static unsigned int disabled_hw_features;
    available. */
 static unsigned int hw_features;
 
+/* Latency for simple integer vector instructions. */
+static int hwf_int_vector_latency;
+
 
 
 static const char *
@@ -204,6 +207,14 @@ _gcry_get_hw_features (void)
 }
 
 
+/* Return latency for integer vector instructions. */
+int
+_gcry_get_hwf_int_vector_latency (void)
+{
+  return hwf_int_vector_latency;
+}
+
+
 /* Enumerate all features.  The caller is expected to start with an
    IDX of 0 and then increment IDX until NULL is returned.  */
 const char *
@@ -283,9 +294,11 @@ _gcry_detect_hw_features (void)
 
   parse_hwf_deny_file ();
 
+  hwf_int_vector_latency = -1;
+
 #if defined (HAVE_CPU_ARCH_X86)
   {
-    hw_features = _gcry_hwf_detect_x86 ();
+    hw_features = _gcry_hwf_detect_x86 (&hwf_int_vector_latency);
   }
 #elif defined (HAVE_CPU_ARCH_ARM)
   {
-- 
2.51.0