[PATCH 7/7] sm4: deduplicate OCB bulk dispatch using function pointers
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Jun 28 14:37:41 CEST 2026
* cipher/sm4.c (ocb_crypt_fn_t, ocb_auth_fn_t) [ASM_FUNC_ABI]: New.
(_gcry_sm4_aesni_avx_ocb_enc, _gcry_sm4_aesni_avx_ocb_dec)
(_gcry_sm4_aesni_avx_ocb_auth, _gcry_sm4_aesni_avx2_ocb_enc)
(_gcry_sm4_aesni_avx2_ocb_dec, _gcry_sm4_aesni_avx2_ocb_auth)
(_gcry_sm4_intel_avx2_ocb_enc, _gcry_sm4_intel_avx2_ocb_dec)
(_gcry_sm4_intel_avx2_ocb_auth, _gcry_sm4_intel_avx512_ocb_enc_blk32)
(_gcry_sm4_intel_avx512_ocb_dec_blk32, _gcry_sm4_gfni_avx2_ocb_enc)
(_gcry_sm4_gfni_avx2_ocb_dec, _gcry_sm4_gfni_avx2_ocb_auth)
(_gcry_sm4_gfni_avx512_ocb_enc, _gcry_sm4_gfni_avx512_ocb_dec)
(_gcry_sm4_gfni_avx512_ocb_auth, _gcry_sm4_gfni_avx512_ocb_enc_blk32)
(_gcry_sm4_gfni_avx512_ocb_dec_blk32): Make 'Ls' an unsized array
parameter.
(_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [ASM_FUNC_ABI]: Select the
bulk implementation through function pointers and share the
block-chunk processing loops (8/16/32 blocks for crypt, 8/16 blocks
for auth).
--
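The bulk OCB encrypt/decrypt and authentication paths repeated the same
block-chunk processing loop for each implementation. Select each
implementation's bulk functions through function pointers instead, so
that the loop for each chunk size is written only once. The pointers
are assigned in a fixed order; when several implementation flags are
set, the last assignment for a given chunk size is the one that is used.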
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/sm4.c | 338 ++++++++++++++++-----------------------------------
1 file changed, 103 insertions(+), 235 deletions(-)
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 37e1a6c3..3c60dafc 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -172,6 +172,16 @@ static size_t _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
typedef bulk_crypt_fn_t crypt_blk1_16_fn_t;
+#ifdef ASM_FUNC_ABI
+typedef void (*ocb_crypt_fn_t)(const u32 *rk_enc, unsigned char *out,
+ const unsigned char *in, unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[]) ASM_FUNC_ABI;
+typedef void (*ocb_auth_fn_t)(const u32 *rk_enc, const unsigned char *abuf,
+ unsigned char *offset, unsigned char *checksum,
+ const u64 Ls[]) ASM_FUNC_ABI;
+#endif
+
typedef struct
{
u32 rkey_enc[32];
@@ -1934,79 +1944,71 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 blkn = c->u_mode.ocb.data_nblocks;
+ u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
int burn_stack_depth = 0;
+#ifdef ASM_FUNC_ABI
+ int bulk_func_available = 0;
+ ocb_crypt_fn_t crypt_blk32 = NULL;
+ ocb_crypt_fn_t crypt_blk16 = NULL;
+ ocb_crypt_fn_t crypt_blk8 = NULL;
-#ifdef USE_INTEL_SM4_AVX512
- if (ctx->use_intel_sm4_avx512)
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
{
- u64 Ls[32];
- u64 *l;
-
- if (nblocks >= 32)
- {
- l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
-
- /* Process data in 32 block chunks. */
- while (nblocks >= 32)
- {
- blkn += 32;
- *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 32);
-
- if (encrypt)
- _gcry_sm4_intel_avx512_ocb_enc_blk32 (ctx->rkey_enc, outbuf,
- inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_sm4_intel_avx512_ocb_dec_blk32 (ctx->rkey_dec, outbuf,
- inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
-
- nblocks -= 32;
- outbuf += 32 * 16;
- inbuf += 32 * 16;
- }
- }
+ bulk_func_available = 1;
+ crypt_blk8 = encrypt ? _gcry_sm4_aesni_avx_ocb_enc :
+ _gcry_sm4_aesni_avx_ocb_dec;
+ }
+#endif
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ bulk_func_available = 1;
+ crypt_blk16 = encrypt ? _gcry_sm4_aesni_avx2_ocb_enc :
+ _gcry_sm4_aesni_avx2_ocb_dec;
+ }
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ bulk_func_available = 1;
+ crypt_blk16 = encrypt ? _gcry_sm4_gfni_avx2_ocb_enc :
+ _gcry_sm4_gfni_avx2_ocb_dec;
+ }
+#endif
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ bulk_func_available = 1;
+ crypt_blk16 = encrypt ? _gcry_sm4_gfni_avx512_ocb_enc :
+ _gcry_sm4_gfni_avx512_ocb_dec;
+ crypt_blk32 = encrypt ? _gcry_sm4_gfni_avx512_ocb_enc_blk32 :
+ _gcry_sm4_gfni_avx512_ocb_dec_blk32;
}
#endif
-
#ifdef USE_INTEL_SM4_AVX2
if (ctx->use_intel_sm4_avx2)
{
- u64 Ls[16];
- u64 *l;
-
- if (nblocks >= 16)
- {
- l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- blkn += 16;
- *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
- if (encrypt)
- _gcry_sm4_intel_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
- else
- _gcry_sm4_intel_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
-
- nblocks -= 16;
- outbuf += 16 * 16;
- inbuf += 16 * 16;
- }
- }
+ bulk_func_available = 1;
+ crypt_blk16 = encrypt ? _gcry_sm4_intel_avx2_ocb_enc :
+ _gcry_sm4_intel_avx2_ocb_dec;
+ }
+#endif
+#ifdef USE_INTEL_SM4_AVX512
+ if (ctx->use_intel_sm4_avx512)
+ {
+ bulk_func_available = 1;
+ crypt_blk32 = encrypt ? _gcry_sm4_intel_avx512_ocb_enc_blk32 :
+ _gcry_sm4_intel_avx512_ocb_dec_blk32;
}
#endif
-#ifdef USE_GFNI_AVX512
- if (ctx->use_gfni_avx512)
+ if (bulk_func_available)
{
u64 Ls[32];
u64 *l;
- if (nblocks >= 32)
+ if (crypt_blk32 != NULL && nblocks >= 32)
{
l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
@@ -2016,14 +2018,8 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
blkn += 32;
*l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 32);
- if (encrypt)
- _gcry_sm4_gfni_avx512_ocb_enc_blk32 (ctx->rkey_enc, outbuf,
- inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_sm4_gfni_avx512_ocb_dec_blk32 (ctx->rkey_dec, outbuf,
- inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
+ crypt_blk32 (rk, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
nblocks -= 32;
outbuf += 32 * 16;
@@ -2031,66 +2027,7 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
}
}
- if (nblocks >= 16)
- {
- l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
- /* Process data in 16 block chunks. */
- blkn += 16;
- *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
- if (encrypt)
- _gcry_sm4_gfni_avx512_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
- else
- _gcry_sm4_gfni_avx512_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
-
- nblocks -= 16;
- outbuf += 16 * 16;
- inbuf += 16 * 16;
- }
- }
-#endif
-
-#ifdef USE_GFNI_AVX2
- if (ctx->use_gfni_avx2)
- {
- u64 Ls[16];
- u64 *l;
-
- if (nblocks >= 16)
- {
- l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- blkn += 16;
- *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
- if (encrypt)
- _gcry_sm4_gfni_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
- else
- _gcry_sm4_gfni_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
-
- nblocks -= 16;
- outbuf += 16 * 16;
- inbuf += 16 * 16;
- }
- }
- }
-#endif
-
-#ifdef USE_AESNI_AVX2
- if (ctx->use_aesni_avx2)
- {
- u64 Ls[16];
- u64 *l;
-
- if (nblocks >= 16)
+ if (crypt_blk16 != NULL && nblocks >= 16)
{
l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
@@ -2100,28 +2037,16 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
blkn += 16;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
- if (encrypt)
- _gcry_sm4_aesni_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
- else
- _gcry_sm4_aesni_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
+ crypt_blk16 (rk, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
nblocks -= 16;
outbuf += 16 * 16;
inbuf += 16 * 16;
}
}
- }
-#endif
-
-#ifdef USE_AESNI_AVX
- if (ctx->use_aesni_avx)
- {
- u64 Ls[8];
- u64 *l;
- if (nblocks >= 8)
+ if (crypt_blk8 != NULL && nblocks >= 8)
{
l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
@@ -2131,12 +2056,8 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
blkn += 8;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
- if (encrypt)
- _gcry_sm4_aesni_avx_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
- else
- _gcry_sm4_aesni_avx_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
- c->u_iv.iv, c->u_ctr.ctr, Ls);
+ crypt_blk8 (rk, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
nblocks -= 8;
outbuf += 8 * 16;
@@ -2150,7 +2071,6 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
if (nblocks)
{
crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
- u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
@@ -2179,95 +2099,53 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
const unsigned char *abuf = abuf_arg;
u64 blkn = c->u_mode.ocb.aad_nblocks;
int burn_stack_depth = 0;
+#ifdef ASM_FUNC_ABI
+ int bulk_func_available = 0;
+ ocb_auth_fn_t auth_blk16 = NULL;
+ ocb_auth_fn_t auth_blk8 = NULL;
-#ifdef USE_INTEL_SM4_AVX2
- if (ctx->use_intel_sm4_avx2)
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
{
- u64 Ls[16];
- u64 *l;
-
- if (nblocks >= 16)
- {
- l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- blkn += 16;
- *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
- _gcry_sm4_intel_avx2_ocb_auth(ctx->rkey_enc, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
-
- nblocks -= 16;
- abuf += 16 * 16;
- }
- }
+ bulk_func_available = 1;
+ auth_blk8 = _gcry_sm4_aesni_avx_ocb_auth;
}
#endif
-
-#ifdef USE_GFNI_AVX512
- if (ctx->use_gfni_avx512)
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
{
- u64 Ls[16];
- u64 *l;
-
- if (nblocks >= 16)
- {
- l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- blkn += 16;
- *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 16);
-
- _gcry_sm4_gfni_avx512_ocb_auth (ctx->rkey_enc, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
-
- nblocks -= 16;
- abuf += 16 * 16;
- }
- }
+ bulk_func_available = 1;
+ auth_blk16 = _gcry_sm4_aesni_avx2_ocb_auth;
}
#endif
-
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
{
- u64 Ls[16];
- u64 *l;
-
- if (nblocks >= 16)
- {
- l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
-
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- blkn += 16;
- *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
-
- _gcry_sm4_gfni_avx2_ocb_auth(ctx->rkey_enc, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
-
- nblocks -= 16;
- abuf += 16 * 16;
- }
- }
+ bulk_func_available = 1;
+ auth_blk16 = _gcry_sm4_gfni_avx2_ocb_auth;
+ }
+#endif
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ bulk_func_available = 1;
+ auth_blk16 = _gcry_sm4_gfni_avx512_ocb_auth;
+ }
+#endif
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ {
+ bulk_func_available = 1;
+ auth_blk16 = _gcry_sm4_intel_avx2_ocb_auth;
}
#endif
-#ifdef USE_AESNI_AVX2
- if (ctx->use_aesni_avx2)
+ if (bulk_func_available)
{
u64 Ls[16];
u64 *l;
- if (nblocks >= 16)
+ if (auth_blk16 != NULL && nblocks >= 16)
{
l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
@@ -2277,24 +2155,15 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
blkn += 16;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
- _gcry_sm4_aesni_avx2_ocb_auth(ctx->rkey_enc, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ auth_blk16(ctx->rkey_enc, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
nblocks -= 16;
abuf += 16 * 16;
}
}
- }
-#endif
-#ifdef USE_AESNI_AVX
- if (ctx->use_aesni_avx)
- {
- u64 Ls[8];
- u64 *l;
-
- if (nblocks >= 8)
+ if (auth_blk8 != NULL && nblocks >= 8)
{
l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
@@ -2304,9 +2173,8 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
blkn += 8;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
- _gcry_sm4_aesni_avx_ocb_auth(ctx->rkey_enc, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ auth_blk8(ctx->rkey_enc, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
nblocks -= 8;
abuf += 8 * 16;
--
2.53.0
More information about the Gcrypt-devel
mailing list