[PATCH 4/4] camellia-gfni-avx512: add 1-block constant-time implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Dec 21 11:58:49 CET 2025
* cipher/camellia-gfni-avx512-amd64.S
(_gcry_camellia_gfni_avx512_enc_blk1)
(_gcry_camellia_gfni_avx512_dec_blk1): New.
* cipher/camellia-glue.c [USE_GFNI_AVX512]
(_gcry_camellia_gfni_avx512_enc_blk1)
(_gcry_camellia_gfni_avx512_dec_blk1): New prototypes.
(camellia_decrypt, camellia_encrypt) [USE_GFNI_AVX512]: Use
GFNI/AVX512 1-block implementation if supported by CPU.
--
Benchmark on Intel (tigerlake):
Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC enc | 5.57 ns/B 171.3 MiB/s 22.77 c/B 4090
CFB enc | 5.57 ns/B 171.2 MiB/s 22.79 c/B 4090
After (~27% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC enc | 4.36 ns/B 218.9 MiB/s 17.82 c/B 4090
CFB enc | 4.35 ns/B 219.1 MiB/s 17.80 c/B 4090
Benchmark on AMD Ryzen 9 9950X3D (zen5):
Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC enc | 3.15 ns/B 302.8 MiB/s 18.10 c/B 5747
CFB enc | 3.18 ns/B 300.0 MiB/s 18.27 c/B 5748
After (~13% slower):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC enc | 3.58 ns/B 266.7 MiB/s 20.55 c/B 5746±5
CFB enc | 3.58 ns/B 266.7 MiB/s 20.55 c/B 5748
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-gfni-avx512-amd64.S | 236 +++++++++++++++++++++++++++-
cipher/camellia-glue.c | 26 +++
2 files changed, 261 insertions(+), 1 deletion(-)
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
index 643eed3e..22ae43d9 100644
--- a/cipher/camellia-gfni-avx512-amd64.S
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -1,6 +1,6 @@
/* camellia-gfni-avx512-amd64.S - GFNI/AVX512 implementation of Camellia
*
- * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2022-2023,2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -692,6 +692,21 @@ ELF(.type _gcry_camellia_gfni_avx512__constants,@object;)
.Lbige_addb_16:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+.align 16
+/* Shuffling constants for AVX512+GFNI 1-way variant. */
+.Lsp1mask_swap32_gfni:
+ .byte 0xff, 0x04, 0x04, 0x04, 0xff, 0x04, 0x04, 0x04
+ .byte 0xff, 0x03, 0x03, 0x03, 0x03, 0xff, 0xff, 0x03
+.Lsp2mask_swap32_gfni:
+ .byte 0x07, 0x07, 0x07, 0xff, 0x07, 0x07, 0x07, 0xff
+ .byte 0x02, 0x02, 0x02, 0xff, 0xff, 0xff, 0x02, 0x02
+.Lsp3mask_swap32_gfni:
+ .byte 0x06, 0x06, 0xff, 0x06, 0x06, 0x06, 0xff, 0x06
+ .byte 0x01, 0x01, 0xff, 0x01, 0xff, 0x01, 0x01, 0xff
+.Lsp4mask_swap32_gfni:
+ .byte 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff
+ .byte 0x05, 0xff, 0x05, 0x05, 0x05, 0xff, 0x05, 0x05
+
ELF(.size _gcry_camellia_gfni_avx512__constants,.-_gcry_camellia_gfni_avx512__constants;)
.text
@@ -1630,5 +1645,224 @@ _gcry_camellia_gfni_avx512_dec_blk64:
CFI_ENDPROC();
ELF(.size _gcry_camellia_gfni_avx512_dec_blk64,.-_gcry_camellia_gfni_avx512_dec_blk64;)
+/**********************************************************************
+ 1-block non-parallel camellia
+ **********************************************************************/
+
+/* Camellia F function, AVX512+GFNI version */
+#define camellia_f_gfni(ab, cd, x1, x2, x3, x4, \
+ pre_filter_bitmatrix_s123, pre_filter_bitmatrix_s4, \
+ post_filter_bitmatrix_s14, post_filter_bitmatrix_s2, \
+ post_filter_bitmatrix_s3, sp1mask, sp2mask, sp3mask, \
+ sp4mask) \
+ /* camellia sboxes s1, s2, s3, s4 */ \
+ vgf2p8affineqb $(pre_filter_constant_s1234), \
+ pre_filter_bitmatrix_s4, ab, x4; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), \
+ post_filter_bitmatrix_s14, x4, x4; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), \
+ pre_filter_bitmatrix_s123, ab, x1; \
+ vgf2p8affineinvqb $(post_filter_constant_s2), \
+ post_filter_bitmatrix_s2, x1, x2; \
+ vgf2p8affineinvqb $(post_filter_constant_s3), \
+ post_filter_bitmatrix_s3, x1, x3; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), \
+ post_filter_bitmatrix_s14, x1, x1; \
+ \
+ /* permutation */ \
+ vpshufb sp4mask, x4, x4; \
+ vpshufb sp2mask, x2, x2; \
+ vpshufb sp3mask, x3, x3; \
+ vpshufb sp1mask, x1, x1; \
+ vpxor x4, x2, x2; \
+ vpternlogd $0x96, x3, x1, x2; \
+ vpsrldq $8, x2, x1; \
+ \
+ /* output xor */ \
+ vpternlogd $0x96, x2, x1, cd;
+
+#define preload_camellia_f_consts() \
+ vmovq .Lpre_filter_bitmatrix_s123 rRIP, %xmm14; \
+ vmovq .Lpre_filter_bitmatrix_s4 rRIP, %xmm13; \
+ vmovq .Lpost_filter_bitmatrix_s14 rRIP, %xmm12; \
+ vmovq .Lpost_filter_bitmatrix_s2 rRIP, %xmm11; \
+ vmovq .Lpost_filter_bitmatrix_s3 rRIP, %xmm10; \
+ vmovdqa .Lsp1mask_swap32_gfni rRIP, %xmm9; \
+ vmovdqa .Lsp2mask_swap32_gfni rRIP, %xmm8; \
+ vmovdqa .Lsp3mask_swap32_gfni rRIP, %xmm7; \
+ vmovdqa .Lsp4mask_swap32_gfni rRIP, %xmm6;
+
+#define do_camellia_f(ab, cd, t0, t1, t2, t3) \
+ camellia_f_gfni(ab, cd, t0, t1, t2, t3, \
+ %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, \
+ %xmm9, %xmm8, %xmm7, %xmm6);
+
+#define preload_camellia_key_consts() \
+ kxnorb %k1, %k1, %k1; \
+ vmovdqa .Lpack_bswap rRIP, %xmm15; \
+ kshiftrb $7, %k1, %k1;
+
+#define add_roundkey_blk1(cd, key) \
+ vpxorq key, cd, cd{%k1}{z};
+
+#define do_fls_blk1(ll, lr, rl, rr, t0, t1, kll, klr, krl, krr) \
+ vpternlogd $0x1E, krr, rr, rl{%k1}{z}; \
+ vpandd kll, ll, t0{%k1}{z}; \
+ vpandd krl, rl, t1{%k1}{z}; \
+ vprold $1, t0, t0; \
+ vprold $1, t1, t1; \
+ vpxor t0, lr, lr; \
+ vpxor t1, rr, rr; \
+ vpternlogd $0x1E, klr, lr, ll{%k1}{z};
+
+#define clear_regs_blk1() \
+ kxorq %k1, %k1, %k1; \
+ vzeroall;
+
+#define roundsm_blk1(ab, cd, t0, t1, t2, t3, key) \
+ add_roundkey_blk1(cd, key); \
+ do_camellia_f(ab, cd, t0, t1, t2, t3);
+
+#define two_roundsm_blk1(ab, cd, t0, t1, t2, t3, i, dir, ctx) \
+ roundsm_blk1(ab, cd, t0, t1, t2, t3, \
+ (key_table + (i) * 8)(ctx)); \
+ roundsm_blk1(cd, ab, t0, t1, t2, t3, \
+ (key_table + ((i) + (dir)) * 8)(ctx));
+
+#define enc_rounds_blk1(ab, cd, t0, t1, t2, t3, i, ctx) \
+ two_roundsm_blk1(ab, cd, t0, t1, t2, t3, (i) + 2, 1, ctx); \
+ two_roundsm_blk1(ab, cd, t0, t1, t2, t3, (i) + 4, 1, ctx); \
+ two_roundsm_blk1(ab, cd, t0, t1, t2, t3, (i) + 6, 1, ctx);
+
+#define dec_rounds_blk1(ab, cd, t0, t1, t2, t3, i, ctx) \
+ two_roundsm_blk1(ab, cd, t0, t1, t2, t3, (i) + 7, -1, ctx); \
+ two_roundsm_blk1(ab, cd, t0, t1, t2, t3, (i) + 5, -1, ctx); \
+ two_roundsm_blk1(ab, cd, t0, t1, t2, t3, (i) + 3, -1, ctx);
+
+#define fls_blk1(ll_lr, rl_rr, tmp_lr, tmp_rr, t0, t1, kll, klr, krl, krr) \
+ vpsrlq $32, rl_rr, tmp_rr; \
+ vpsrlq $32, ll_lr, tmp_lr; \
+ do_fls_blk1(ll_lr, tmp_lr, rl_rr, tmp_rr, t0, t1, kll, klr, krl, krr); \
+ vpunpckldq tmp_lr, ll_lr, ll_lr; \
+ vpunpckldq tmp_rr, rl_rr, rl_rr;
+
+#define inpack_blk1(ab, cd, src, key) \
+ vmovq 0(src), ab; \
+ vmovq 8(src), cd; \
+ vpshufb %xmm15, ab, ab; \
+ vpshufb %xmm15, cd, cd; \
+ add_roundkey_blk1(ab, key);
+
+#define outunpack_blk1(ab, cd, dst, key) \
+ add_roundkey_blk1(cd, key); \
+ vpshufb %xmm15, ab, ab; \
+ vpshufb %xmm15, cd, cd; \
+ vmovq ab, 8(dst); \
+ vmovq cd, 0(dst);
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_enc_blk1
+ELF(.type _gcry_camellia_gfni_avx512_enc_blk1,@function;)
+
+_gcry_camellia_gfni_avx512_enc_blk1:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 bytes)
+ * %rdx: src (16 bytes)
+ */
+ CFI_STARTPROC();
+ spec_stop_avx512;
+
+ preload_camellia_key_consts();
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack_blk1(%xmm0, %xmm1, %rdx, (key_table)(CTX));
+
+ preload_camellia_f_consts();
+
+ leaq (-8 * 8)(CTX, %r8, 8), %r8;
+
+.align 16
+.Lenc_loop_blk1:
+ enc_rounds_blk1(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, 0, CTX);
+
+ cmpq %r8, CTX;
+ je .Lenc_done_blk1;
+ leaq (8 * 8)(CTX), CTX;
+
+ fls_blk1(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5,
+ ((key_table) + 0)(CTX),
+ ((key_table) + 4)(CTX),
+ ((key_table) + 8)(CTX),
+ ((key_table) + 12)(CTX));
+
+ jmp .Lenc_loop_blk1;
+
+.align 16
+.Lenc_done_blk1:
+ outunpack_blk1(%xmm0, %xmm1, %rsi, ((key_table) + 8 * 8)(CTX));
+
+ clear_regs_blk1();
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_enc_blk1,.-_gcry_camellia_gfni_avx512_enc_blk1;)
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_dec_blk1
+ELF(.type _gcry_camellia_gfni_avx512_dec_blk1,@function;)
+
+_gcry_camellia_gfni_avx512_dec_blk1:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 bytes)
+ * %rdx: src (16 bytes)
+ */
+ CFI_STARTPROC();
+ spec_stop_avx512;
+
+ preload_camellia_key_consts();
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack_blk1(%xmm0, %xmm1, %rdx, (key_table)(CTX, %r8, 8));
+
+ preload_camellia_f_consts();
+
+ leaq (-8 * 8)(CTX, %r8, 8), %rax;
+
+.align 16
+.Ldec_loop_blk1:
+ dec_rounds_blk1(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, 0, %rax);
+
+ cmpq CTX, %rax;
+ je .Ldec_done_blk1;
+
+ fls_blk1(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5,
+ ((key_table) + 8)(%rax),
+ ((key_table) + 12)(%rax),
+ ((key_table) + 0)(%rax),
+ ((key_table) + 4)(%rax));
+
+ leaq (-8 * 8)(%rax), %rax;
+ jmp .Ldec_loop_blk1;
+
+.align 16
+.Ldec_done_blk1:
+ outunpack_blk1(%xmm0, %xmm1, %rsi, (key_table)(CTX));
+
+ clear_regs_blk1();
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_dec_blk1,.-_gcry_camellia_gfni_avx512_dec_blk1;)
+
#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT) */
#endif /* __x86_64 */
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 5051a305..78ff22b9 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -427,6 +427,16 @@ extern void _gcry_camellia_gfni_avx512_dec_blk64(const CAMELLIA_context *ctx,
const unsigned char *in)
ASM_FUNC_ABI;
+extern void _gcry_camellia_gfni_avx512_enc_blk1(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in)
+ ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_dec_blk1(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in)
+ ASM_FUNC_ABI;
+
/* Stack not used by AVX512 implementation. */
static const int avx512_burn_stack_depth = 0;
#endif
@@ -715,6 +725,14 @@ camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
{
CAMELLIA_context *ctx=c;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ _gcry_camellia_gfni_avx512_enc_blk1(ctx, outbuf, inbuf);
+ return 0;
+ }
+#endif
+
Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
#define CAMELLIA_encrypt_stack_burn_size \
@@ -732,6 +750,14 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
{
CAMELLIA_context *ctx=c;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ _gcry_camellia_gfni_avx512_dec_blk1(ctx, outbuf, inbuf);
+ return 0;
+ }
+#endif
+
Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
#define CAMELLIA_decrypt_stack_burn_size \
--
2.51.0
More information about the Gcrypt-devel
mailing list