From jussi.kivilinna at iki.fi Sat Jul 4 07:37:32 2026 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 4 Jul 2026 08:37:32 +0300 Subject: [PATCH 1/2] sm4: tail-call ARM and PPC crypt_blk1_x helpers Message-ID: <20260704053733.3618370-1-jussi.kivilinna@iki.fi> * cipher/sm4-aarch64.S (sm4_aarch64_crypt_blk1_4) (_gcry_sm4_aarch64_crypt_blk1_8): Return zero stack burn size. * cipher/sm4-armv8-aarch64-ce.S (sm4_armv8_ce_crypt_blk1_4) (_gcry_sm4_armv8_ce_crypt_blk1_8): Likewise. * cipher/sm4-armv9-aarch64-sve-ce.S (_gcry_sm4_armv9_sve_ce_crypt): Likewise. * cipher/sm4-ppc.c (sm4_ppc_crypt_blk1_16, _gcry_sm4_ppc8le_crypt_blk1_16) (_gcry_sm4_ppc9le_crypt_blk1_16): Likewise. * cipher/sm4.c (_gcry_sm4_aarch64_crypt_blk1_8) (_gcry_sm4_armv8_ce_crypt_blk1_8, _gcry_sm4_armv9_sve_ce_crypt) (_gcry_sm4_ppc8le_crypt_blk1_16, _gcry_sm4_ppc9le_crypt_blk1_16): Change declared return type to 'unsigned int'. (sm4_aarch64_crypt_blk1_16, sm4_armv8_ce_crypt_blk1_16) (sm4_armv9_sve_ce_crypt_blk1_16, sm4_ppc8le_crypt_blk1_16) (sm4_ppc9le_crypt_blk1_16): Tail-call the underlying blk1_x helper. -- These wrappers called the helper and then returned a constant zero, which prevented the compiler from emitting a tail call. The helpers now return the zero stack burn size themselves, so each wrapper can return the helper's result directly. 
Signed-off-by: Jussi Kivilinna --- cipher/sm4-aarch64.S | 2 ++ cipher/sm4-armv8-aarch64-ce.S | 2 ++ cipher/sm4-armv9-aarch64-sve-ce.S | 1 + cipher/sm4-ppc.c | 14 +++++----- cipher/sm4.c | 43 ++++++++++++++----------------- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/cipher/sm4-aarch64.S b/cipher/sm4-aarch64.S index bab4b4df..c71bf3ca 100644 --- a/cipher/sm4-aarch64.S +++ b/cipher/sm4-aarch64.S @@ -222,6 +222,7 @@ sm4_aarch64_crypt_blk1_4: .Lblk4_store_output_done: VPOP_ABI; + mov x0, #0; ret_spec_stop; CFI_ENDPROC(); ELF(.size sm4_aarch64_crypt_blk1_4,.-sm4_aarch64_crypt_blk1_4;) @@ -396,6 +397,7 @@ _gcry_sm4_aarch64_crypt_blk1_8: CFI_ADJUST_CFA_OFFSET(-16); CFI_RESTORE(x29); CFI_RESTORE(x30); + mov x0, #0; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aarch64_crypt_blk1_8,.-_gcry_sm4_aarch64_crypt_blk1_8;) diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S index 01f3df92..5917ad5b 100644 --- a/cipher/sm4-armv8-aarch64-ce.S +++ b/cipher/sm4-armv8-aarch64-ce.S @@ -335,6 +335,7 @@ sm4_armv8_ce_crypt_blk1_4: st1 {v3.16b}, [x1]; .Lblk4_store_output_done: + mov x0, #0; ret_spec_stop; CFI_ENDPROC(); ELF(.size sm4_armv8_ce_crypt_blk1_4,.-sm4_armv8_ce_crypt_blk1_4;) @@ -385,6 +386,7 @@ _gcry_sm4_armv8_ce_crypt_blk1_8: .Lblk8_store_output_done: CLEAR_ALL_REGS(); + mov x0, #0; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_armv8_ce_crypt_blk1_8,.-_gcry_sm4_armv8_ce_crypt_blk1_8;) diff --git a/cipher/sm4-armv9-aarch64-sve-ce.S b/cipher/sm4-armv9-aarch64-sve-ce.S index 7367cd28..ba9bde84 100644 --- a/cipher/sm4-armv9-aarch64-sve-ce.S +++ b/cipher/sm4-armv9-aarch64-sve-ce.S @@ -444,6 +444,7 @@ _gcry_sm4_armv9_sve_ce_crypt: cbnz x3, .Lcrypt_tail; .Lcrypt_end: + mov x0, #0; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_armv9_sve_ce_crypt,.-_gcry_sm4_armv9_sve_ce_crypt;) diff --git a/cipher/sm4-ppc.c b/cipher/sm4-ppc.c index 2b26c39d..9d9227c0 100644 --- a/cipher/sm4-ppc.c +++ b/cipher/sm4-ppc.c @@ -293,7 +293,7 @@ 
sm4_ppc_crypt_blk1_4(u32 *rk, byte *out, const byte *in, size_t nblks) vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16); } -static ASM_FUNC_ATTR_INLINE void +static ASM_FUNC_ATTR_INLINE unsigned int sm4_ppc_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks) { if (nblks >= 16) @@ -321,25 +321,27 @@ sm4_ppc_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks) } clear_vec_regs(); + + return 0; } -ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P8 void +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P8 unsigned int _gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks) { - sm4_ppc_crypt_blk1_16(rk, out, in, nblks); + return sm4_ppc_crypt_blk1_16(rk, out, in, nblks); } -ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P9 void +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P9 unsigned int _gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks) { #ifdef HAVE_FUNC_ATTR_TARGET /* Inline for POWER9 target optimization. */ - sm4_ppc_crypt_blk1_16(rk, out, in, nblks); + return sm4_ppc_crypt_blk1_16(rk, out, in, nblks); #else /* Target selecting not working, just call the other noinline function. 
*/ - _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, nblks); + return _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, nblks); #endif } diff --git a/cipher/sm4.c b/cipher/sm4.c index 165a66b2..cc98b676 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -635,9 +635,9 @@ extern void _gcry_sm4_aarch64_cfb_dec(const u32 *rk_enc, byte *out, byte *iv, size_t nblocks); -extern void _gcry_sm4_aarch64_crypt_blk1_8(u32 *rk, byte *out, - const byte *in, - size_t num_blocks); +extern unsigned int _gcry_sm4_aarch64_crypt_blk1_8(u32 *rk, byte *out, + const byte *in, + size_t num_blocks); static inline unsigned int sm4_aarch64_crypt_blk1_16(void *rk, byte *out, const byte *in, @@ -651,8 +651,7 @@ sm4_aarch64_crypt_blk1_16(void *rk, byte *out, const byte *in, num_blks -= 8; } - _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, num_blks); - return 0; + return _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, num_blks); } #endif /* USE_AARCH64_SIMD */ @@ -686,9 +685,9 @@ extern void _gcry_sm4_armv8_ce_xts_crypt(const u32 *rk, byte *out, byte *tweak, size_t nblocks); -extern void _gcry_sm4_armv8_ce_crypt_blk1_8(u32 *rk, byte *out, - const byte *in, - size_t num_blocks); +extern unsigned int _gcry_sm4_armv8_ce_crypt_blk1_8(u32 *rk, byte *out, + const byte *in, + size_t num_blocks); static inline unsigned int sm4_armv8_ce_crypt_blk1_16(void *rk, byte *out, const byte *in, @@ -702,16 +701,15 @@ sm4_armv8_ce_crypt_blk1_16(void *rk, byte *out, const byte *in, num_blks -= 8; } - _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, num_blks); - return 0; + return _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, num_blks); } #endif /* USE_ARM_CE */ #ifdef USE_ARM_SVE_CE -extern void _gcry_sm4_armv9_sve_ce_crypt(u32 *rk, byte *out, - const byte *in, - size_t nblocks); +extern unsigned int _gcry_sm4_armv9_sve_ce_crypt(u32 *rk, byte *out, + const byte *in, + size_t nblocks); extern void _gcry_sm4_armv9_sve_ce_ctr_enc(const u32 *rk_enc, byte *out, const byte *in, @@ -732,32 +730,31 @@ static inline unsigned int 
sm4_armv9_sve_ce_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks) { - _gcry_sm4_armv9_sve_ce_crypt(rk, out, in, num_blks); - return 0; + return _gcry_sm4_armv9_sve_ce_crypt(rk, out, in, num_blks); } extern unsigned int _gcry_sm4_armv9_sve_get_vl(void); #endif /* USE_ARM_SVE_CE */ #ifdef USE_PPC_CRYPTO -extern void _gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in, - size_t num_blks); +extern unsigned int _gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, + const byte *in, + size_t num_blks); -extern void _gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in, - size_t num_blks); +extern unsigned int _gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, + const byte *in, + size_t num_blks); static inline unsigned int sm4_ppc8le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks) { - _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, num_blks); - return 0; + return _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, num_blks); } static inline unsigned int sm4_ppc9le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks) { - _gcry_sm4_ppc9le_crypt_blk1_16(rk, out, in, num_blks); - return 0; + return _gcry_sm4_ppc9le_crypt_blk1_16(rk, out, in, num_blks); } #endif /* USE_PPC_CRYPTO */ -- 2.53.0 From jussi.kivilinna at iki.fi Sat Jul 4 07:37:33 2026 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 4 Jul 2026 08:37:33 +0300 Subject: [PATCH 2/2] cipher: limit table prefetch to the look-up table arrays In-Reply-To: <20260704053733.3618370-1-jussi.kivilinna@iki.fi> References: <20260704053733.3618370-1-jussi.kivilinna@iki.fi> Message-ID: <20260704053733.3618370-2-jussi.kivilinna@iki.fi> * cipher/aria.c (prefetch_sboxes): Compute unshare counter once and prefetch only look-up arrays. * cipher/cipher-gcm.c (do_prefetch_tables): Likewise. * cipher/rijndael.c (prefetch_enc, prefetch_dec): Likewise. * cipher/sm4.c (prefetch_sbox_table): Likewise. 
* cipher/rijndael-tables.h (dec_tables): Rename 'inv_sbox' field to 'inv_sboxT'. -- The 'inv_sbox' macro expands to a 'dec_tables' member access, so the struct field is renamed to 'inv_sboxT' to let prefetch_dec refer to it directly in sizeof() without the macro expanding inside the member name. Signed-off-by: Jussi Kivilinna --- cipher/aria.c | 7 ++++--- cipher/cipher-gcm.c | 7 ++++--- cipher/rijndael-tables.h | 4 ++-- cipher/rijndael.c | 15 +++++++++------ cipher/sm4.c | 7 ++++--- 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/cipher/aria.c b/cipher/aria.c index 26546a63..bb67ed03 100644 --- a/cipher/aria.c +++ b/cipher/aria.c @@ -627,11 +627,12 @@ prefetch_sboxes(void) * of look-up table are shared between processes. Modifying counters also * causes checksums for pages to change and hint same-page merging algorithm * that these pages are frequently changing. */ - sboxes.counter_head++; - sboxes.counter_tail++; + u32 counter = sboxes.counter_head + 1; + sboxes.counter_head = counter; + sboxes.counter_tail = counter; /* Prefetch look-up tables to cache. */ - prefetch_table((const void *)&sboxes, sizeof(sboxes)); + prefetch_table((const void *)&sboxes.s1[0], sizeof(sboxes.s1) * 4); } diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 2db371b8..1627cd1c 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -255,12 +255,13 @@ do_prefetch_tables (const void *gcmM, size_t gcmM_size) * of look-up table are shared between processes. Modifying counters also * causes checksums for pages to change and hint same-page merging algorithm * that these pages are frequently changing. */ - gcm_table.counter_head++; - gcm_table.counter_tail++; + u32 counter = gcm_table.counter_head + 1; + gcm_table.counter_head = counter; + gcm_table.counter_tail = counter; /* Prefetch look-up tables to cache. 
*/ prefetch_table(gcmM, gcmM_size); - prefetch_table(&gcm_table, sizeof(gcm_table)); + prefetch_table(&gcm_table.R, sizeof(gcm_table.R)); } #ifdef GCM_TABLES_USE_U64 diff --git a/cipher/rijndael-tables.h b/cipher/rijndael-tables.h index e46ce08c..52b9518f 100644 --- a/cipher/rijndael-tables.h +++ b/cipher/rijndael-tables.h @@ -107,7 +107,7 @@ static struct volatile u32 counter_head; u32 cacheline_align[64 / 4 - 1]; u32 T[256]; - byte inv_sbox[256]; + byte inv_sboxT[256]; volatile u32 counter_tail; } dec_tables ATTR_ALIGNED_64 = { @@ -217,4 +217,4 @@ static struct }; #define decT dec_tables.T -#define inv_sbox dec_tables.inv_sbox +#define inv_sbox dec_tables.inv_sboxT diff --git a/cipher/rijndael.c b/cipher/rijndael.c index f3daf35a..645c0e2f 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -597,11 +597,12 @@ static void prefetch_enc(void) * of look-up table are shared between processes. Modifying counters also * causes checksums for pages to change and hint same-page merging algorithm * that these pages are frequently changing. */ - enc_tables.counter_head++; - enc_tables.counter_tail++; + u32 counter = enc_tables.counter_head + 1; + enc_tables.counter_head = counter; + enc_tables.counter_tail = counter; /* Prefetch look-up tables to cache. */ - prefetch_table((const void *)&enc_tables, sizeof(enc_tables)); + prefetch_table((const void *)&enc_tables.T[0], sizeof(enc_tables.T)); } static void prefetch_dec(void) @@ -610,11 +611,13 @@ static void prefetch_dec(void) * of look-up table are shared between processes. Modifying counters also * causes checksums for pages to change and hint same-page merging algorithm * that these pages are frequently changing. */ - dec_tables.counter_head++; - dec_tables.counter_tail++; + u32 counter = dec_tables.counter_head + 1; + dec_tables.counter_head = counter; + dec_tables.counter_tail = counter; /* Prefetch look-up tables to cache. 
*/ - prefetch_table((const void *)&dec_tables, sizeof(dec_tables)); + prefetch_table((const void *)&dec_tables.T[0], + sizeof(dec_tables.T) + sizeof(dec_tables.inv_sboxT)); } diff --git a/cipher/sm4.c b/cipher/sm4.c index cc98b676..70b4901c 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -760,14 +760,15 @@ sm4_ppc9le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks) static inline void prefetch_sbox_table(void) { - const volatile byte *vtab = (void *)&sbox_table; + const volatile byte *vtab = (void *)&sbox_table.S[0]; /* Modify counters to trigger copy-on-write and unsharing if physical pages * of look-up table are shared between processes. Modifying counters also * causes checksums for pages to change and hint same-page merging algorithm * that these pages are frequently changing. */ - sbox_table.counter_head++; - sbox_table.counter_tail++; + u32 counter = sbox_table.counter_head + 1; + sbox_table.counter_head = counter; + sbox_table.counter_tail = counter; /* Prefetch look-up table to cache. */ (void)vtab[0 * 32]; -- 2.53.0