[PATCH 7/7] sm4: deduplicate OCB bulk dispatch using function pointers

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Jun 28 14:37:41 CEST 2026


* cipher/sm4.c (ocb_crypt_fn_t, ocb_auth_fn_t) [ASM_FUNC_ABI]: New.
(_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [ASM_FUNC_ABI]: Select the
bulk implementation through a function pointer and share the
8/16/32-block processing loops.
--

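The bulk OCB encrypt/decrypt and authentication paths repeated the same
block-chunk processing loop for each implementation. Select the
implementation's bulk function through a function pointer instead, so
the loops are written only once. The per-implementation feature checks
now run in the reverse of their previous order, so that when several
implementations are enabled the last matching check overrides the
function pointers set by the earlier ones.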

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/sm4.c | 338 ++++++++++++++++-----------------------------------
 1 file changed, 103 insertions(+), 235 deletions(-)

diff --git a/cipher/sm4.c b/cipher/sm4.c
index 37e1a6c3..3c60dafc 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -172,6 +172,16 @@ static size_t _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
 typedef bulk_crypt_fn_t crypt_blk1_16_fn_t;
 
+#ifdef ASM_FUNC_ABI
+typedef void (*ocb_crypt_fn_t)(const u32 *rk_enc, unsigned char *out,
+			       const unsigned char *in, unsigned char *offset,
+			       unsigned char *checksum,
+			       const u64 Ls[]) ASM_FUNC_ABI;
+typedef void (*ocb_auth_fn_t)(const u32 *rk_enc, const unsigned char *abuf,
+			      unsigned char *offset, unsigned char *checksum,
+			      const u64 Ls[]) ASM_FUNC_ABI;
+#endif
+
 typedef struct
 {
   u32 rkey_enc[32];
@@ -1934,79 +1944,71 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 blkn = c->u_mode.ocb.data_nblocks;
+  u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
   int burn_stack_depth = 0;
+#ifdef ASM_FUNC_ABI
+  int bulk_func_available = 0;
+  ocb_crypt_fn_t crypt_blk32 = NULL;
+  ocb_crypt_fn_t crypt_blk16 = NULL;
+  ocb_crypt_fn_t crypt_blk8 = NULL;
 
-#ifdef USE_INTEL_SM4_AVX512
-  if (ctx->use_intel_sm4_avx512)
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
     {
-      u64 Ls[32];
-      u64 *l;
-
-      if (nblocks >= 32)
-	{
-          l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
-
-	  /* Process data in 32 block chunks. */
-	  while (nblocks >= 32)
-	    {
-	      blkn += 32;
-	      *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 32);
-
-	      if (encrypt)
-		_gcry_sm4_intel_avx512_ocb_enc_blk32 (ctx->rkey_enc, outbuf,
-                                                      inbuf, c->u_iv.iv,
-                                                      c->u_ctr.ctr, Ls);
-	      else
-		_gcry_sm4_intel_avx512_ocb_dec_blk32 (ctx->rkey_dec, outbuf,
-                                                      inbuf, c->u_iv.iv,
-                                                      c->u_ctr.ctr, Ls);
-
-	      nblocks -= 32;
-	      outbuf += 32 * 16;
-	      inbuf += 32 * 16;
-	    }
-	}
+      bulk_func_available = 1;
+      crypt_blk8 = encrypt ? _gcry_sm4_aesni_avx_ocb_enc :
+			     _gcry_sm4_aesni_avx_ocb_dec;
+    }
+#endif
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      bulk_func_available = 1;
+      crypt_blk16 = encrypt ? _gcry_sm4_aesni_avx2_ocb_enc :
+			      _gcry_sm4_aesni_avx2_ocb_dec;
+    }
+#endif
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    {
+      bulk_func_available = 1;
+      crypt_blk16 = encrypt ? _gcry_sm4_gfni_avx2_ocb_enc :
+			      _gcry_sm4_gfni_avx2_ocb_dec;
+    }
+#endif
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      bulk_func_available = 1;
+      crypt_blk16 = encrypt ? _gcry_sm4_gfni_avx512_ocb_enc :
+			      _gcry_sm4_gfni_avx512_ocb_dec;
+      crypt_blk32 = encrypt ? _gcry_sm4_gfni_avx512_ocb_enc_blk32 :
+			      _gcry_sm4_gfni_avx512_ocb_dec_blk32;
     }
 #endif
-
 #ifdef USE_INTEL_SM4_AVX2
   if (ctx->use_intel_sm4_avx2)
     {
-      u64 Ls[16];
-      u64 *l;
-
-      if (nblocks >= 16)
-	{
-          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
-	  /* Process data in 16 block chunks. */
-	  while (nblocks >= 16)
-	    {
-	      blkn += 16;
-	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
-	      if (encrypt)
-		_gcry_sm4_intel_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
-					     c->u_iv.iv, c->u_ctr.ctr, Ls);
-	      else
-		_gcry_sm4_intel_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
-					     c->u_iv.iv, c->u_ctr.ctr, Ls);
-
-	      nblocks -= 16;
-	      outbuf += 16 * 16;
-	      inbuf += 16 * 16;
-	    }
-	}
+      bulk_func_available = 1;
+      crypt_blk16 = encrypt ? _gcry_sm4_intel_avx2_ocb_enc :
+			      _gcry_sm4_intel_avx2_ocb_dec;
+    }
+#endif
+#ifdef USE_INTEL_SM4_AVX512
+  if (ctx->use_intel_sm4_avx512)
+    {
+      bulk_func_available = 1;
+      crypt_blk32 = encrypt ? _gcry_sm4_intel_avx512_ocb_enc_blk32 :
+			      _gcry_sm4_intel_avx512_ocb_dec_blk32;
     }
 #endif
 
-#ifdef USE_GFNI_AVX512
-  if (ctx->use_gfni_avx512)
+  if (bulk_func_available)
     {
       u64 Ls[32];
       u64 *l;
 
-      if (nblocks >= 32)
+      if (crypt_blk32 != NULL && nblocks >= 32)
 	{
           l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
 
@@ -2016,14 +2018,8 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 	      blkn += 32;
 	      *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 32);
 
-	      if (encrypt)
-		_gcry_sm4_gfni_avx512_ocb_enc_blk32 (ctx->rkey_enc, outbuf,
-                                                     inbuf, c->u_iv.iv,
-                                                     c->u_ctr.ctr, Ls);
-	      else
-		_gcry_sm4_gfni_avx512_ocb_dec_blk32 (ctx->rkey_dec, outbuf,
-                                                     inbuf, c->u_iv.iv,
-                                                     c->u_ctr.ctr, Ls);
+	      crypt_blk32 (rk, outbuf, inbuf, c->u_iv.iv,
+			   c->u_ctr.ctr, Ls);
 
 	      nblocks -= 32;
 	      outbuf += 32 * 16;
@@ -2031,66 +2027,7 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 	    }
 	}
 
-      if (nblocks >= 16)
-	{
-          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
-	  /* Process data in 16 block chunks. */
-	  blkn += 16;
-	  *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
-	  if (encrypt)
-	    _gcry_sm4_gfni_avx512_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
-					  c->u_iv.iv, c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_sm4_gfni_avx512_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
-					  c->u_iv.iv, c->u_ctr.ctr, Ls);
-
-	  nblocks -= 16;
-	  outbuf += 16 * 16;
-	  inbuf += 16 * 16;
-	}
-    }
-#endif
-
-#ifdef USE_GFNI_AVX2
-  if (ctx->use_gfni_avx2)
-    {
-      u64 Ls[16];
-      u64 *l;
-
-      if (nblocks >= 16)
-	{
-          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
-	  /* Process data in 16 block chunks. */
-	  while (nblocks >= 16)
-	    {
-	      blkn += 16;
-	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
-	      if (encrypt)
-		_gcry_sm4_gfni_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
-					    c->u_iv.iv, c->u_ctr.ctr, Ls);
-	      else
-		_gcry_sm4_gfni_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
-					    c->u_iv.iv, c->u_ctr.ctr, Ls);
-
-	      nblocks -= 16;
-	      outbuf += 16 * 16;
-	      inbuf += 16 * 16;
-	    }
-	}
-    }
-#endif
-
-#ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
-    {
-      u64 Ls[16];
-      u64 *l;
-
-      if (nblocks >= 16)
+      if (crypt_blk16 != NULL && nblocks >= 16)
 	{
           l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
@@ -2100,28 +2037,16 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 	      blkn += 16;
 	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
 
-	      if (encrypt)
-		_gcry_sm4_aesni_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
-					     c->u_iv.iv, c->u_ctr.ctr, Ls);
-	      else
-		_gcry_sm4_aesni_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
-					     c->u_iv.iv, c->u_ctr.ctr, Ls);
+	      crypt_blk16 (rk, outbuf, inbuf, c->u_iv.iv,
+			   c->u_ctr.ctr, Ls);
 
 	      nblocks -= 16;
 	      outbuf += 16 * 16;
 	      inbuf += 16 * 16;
 	    }
 	}
-    }
-#endif
-
-#ifdef USE_AESNI_AVX
-  if (ctx->use_aesni_avx)
-    {
-      u64 Ls[8];
-      u64 *l;
 
-      if (nblocks >= 8)
+      if (crypt_blk8 != NULL && nblocks >= 8)
 	{
           l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
 
@@ -2131,12 +2056,8 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 	      blkn += 8;
 	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
 
-	      if (encrypt)
-		_gcry_sm4_aesni_avx_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
-					    c->u_iv.iv, c->u_ctr.ctr, Ls);
-	      else
-		_gcry_sm4_aesni_avx_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
-					    c->u_iv.iv, c->u_ctr.ctr, Ls);
+	      crypt_blk8 (rk, outbuf, inbuf, c->u_iv.iv,
+			  c->u_ctr.ctr, Ls);
 
 	      nblocks -= 8;
 	      outbuf += 8 * 16;
@@ -2150,7 +2071,6 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   if (nblocks)
     {
       crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
-      u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
       unsigned char tmpbuf[16 * 16];
       unsigned int tmp_used = 16;
       size_t nburn;
@@ -2179,95 +2099,53 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
   const unsigned char *abuf = abuf_arg;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
   int burn_stack_depth = 0;
+#ifdef ASM_FUNC_ABI
+  int bulk_func_available = 0;
+  ocb_auth_fn_t auth_blk16 = NULL;
+  ocb_auth_fn_t auth_blk8 = NULL;
 
-#ifdef USE_INTEL_SM4_AVX2
-  if (ctx->use_intel_sm4_avx2)
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
     {
-      u64 Ls[16];
-      u64 *l;
-
-      if (nblocks >= 16)
-	{
-          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
-	  /* Process data in 16 block chunks. */
-	  while (nblocks >= 16)
-	    {
-	      blkn += 16;
-	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
-	      _gcry_sm4_intel_avx2_ocb_auth(ctx->rkey_enc, abuf,
-					    c->u_mode.ocb.aad_offset,
-					    c->u_mode.ocb.aad_sum, Ls);
-
-	      nblocks -= 16;
-	      abuf += 16 * 16;
-	    }
-	}
+      bulk_func_available = 1;
+      auth_blk8 = _gcry_sm4_aesni_avx_ocb_auth;
     }
 #endif
-
-#ifdef USE_GFNI_AVX512
-  if (ctx->use_gfni_avx512)
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
     {
-      u64 Ls[16];
-      u64 *l;
-
-      if (nblocks >= 16)
-        {
-          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
-          /* Process data in 16 block chunks. */
-          while (nblocks >= 16)
-            {
-              blkn += 16;
-              *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 16);
-
-              _gcry_sm4_gfni_avx512_ocb_auth (ctx->rkey_enc, abuf,
-                                              c->u_mode.ocb.aad_offset,
-                                              c->u_mode.ocb.aad_sum, Ls);
-
-              nblocks -= 16;
-              abuf += 16 * 16;
-            }
-        }
+      bulk_func_available = 1;
+      auth_blk16 = _gcry_sm4_aesni_avx2_ocb_auth;
     }
 #endif
-
 #ifdef USE_GFNI_AVX2
   if (ctx->use_gfni_avx2)
     {
-      u64 Ls[16];
-      u64 *l;
-
-      if (nblocks >= 16)
-	{
-          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
-	  /* Process data in 16 block chunks. */
-	  while (nblocks >= 16)
-	    {
-	      blkn += 16;
-	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
-	      _gcry_sm4_gfni_avx2_ocb_auth(ctx->rkey_enc, abuf,
-					   c->u_mode.ocb.aad_offset,
-					   c->u_mode.ocb.aad_sum, Ls);
-
-	      nblocks -= 16;
-	      abuf += 16 * 16;
-	    }
-	}
+      bulk_func_available = 1;
+      auth_blk16 = _gcry_sm4_gfni_avx2_ocb_auth;
+    }
+#endif
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      bulk_func_available = 1;
+      auth_blk16 = _gcry_sm4_gfni_avx512_ocb_auth;
+    }
+#endif
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    {
+      bulk_func_available = 1;
+      auth_blk16 = _gcry_sm4_intel_avx2_ocb_auth;
     }
 #endif
 
-#ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
+  if (bulk_func_available)
     {
       u64 Ls[16];
       u64 *l;
 
-      if (nblocks >= 16)
+      if (auth_blk16 != NULL && nblocks >= 16)
 	{
           l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
@@ -2277,24 +2155,15 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 	      blkn += 16;
 	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
 
-	      _gcry_sm4_aesni_avx2_ocb_auth(ctx->rkey_enc, abuf,
-					    c->u_mode.ocb.aad_offset,
-					    c->u_mode.ocb.aad_sum, Ls);
+	      auth_blk16(ctx->rkey_enc, abuf, c->u_mode.ocb.aad_offset,
+			 c->u_mode.ocb.aad_sum, Ls);
 
 	      nblocks -= 16;
 	      abuf += 16 * 16;
 	    }
 	}
-    }
-#endif
 
-#ifdef USE_AESNI_AVX
-  if (ctx->use_aesni_avx)
-    {
-      u64 Ls[8];
-      u64 *l;
-
-      if (nblocks >= 8)
+      if (auth_blk8 != NULL && nblocks >= 8)
 	{
           l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
 
@@ -2304,9 +2173,8 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 	      blkn += 8;
 	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
 
-	      _gcry_sm4_aesni_avx_ocb_auth(ctx->rkey_enc, abuf,
-					   c->u_mode.ocb.aad_offset,
-					   c->u_mode.ocb.aad_sum, Ls);
+	      auth_blk8(ctx->rkey_enc, abuf, c->u_mode.ocb.aad_offset,
+			c->u_mode.ocb.aad_sum, Ls);
 
 	      nblocks -= 8;
 	      abuf += 8 * 16;
-- 
2.53.0




More information about the Gcrypt-devel mailing list