[PATCH 1/4] camellia-aesni-avx: optimize camellia_f used for key setup
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Dec 21 11:58:46 CET 2025
* cipher/camellia-aesni-avx-amd64.S (filter_8bit_3op): New.
(filter_8bit): Refactor.
(transpose_8x8b): Remove.
(camellia_f, camellia_f_core): Refactor.
(.Lsbox4_input_mask): Remove.
(__camellia_avx_setup128, __camellia_avx_setup256): Adjust for new
'camellia_f'.
--
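For reference, a plain 64-bit sketch of the Camellia F-function that the reworked
'camellia_f'/'camellia_f_core' macros vectorize is given below, following the Camellia
specification (RFC 3713). This is only an illustrative C sketch, not the AVX code; SBOX1
is assumed to be the standard Camellia s-box table and is not part of this patch. It shows
where the byte rotations come from: s2 and s3 need only an output rotation of s1, while s4
needs an input rotation. In the refactored macro the explicit input-rotation sequence on
.Lsbox4_input_mask is replaced by the dedicated s4 prefilter (.Lpre_tf_lo_s4/.Lpre_tf_hi_s4)
and a second vaesenclast, so that prefilter + vaesenclast + postfilter handle all four
s-box variants.

  #include <stdint.h>

  /* Assumed: the standard Camellia SBOX1 table from the specification
   * (RFC 3713); provided elsewhere, not part of this patch. */
  extern const uint8_t SBOX1[256];

  static inline uint8_t rotl8(uint8_t v, unsigned n)
  {
    return (uint8_t)((v << n) | (v >> (8 - n)));
  }

  /* Plain Camellia F-function: F(AB, k) = P(S(AB ^ k)). Illustrative only. */
  static uint64_t camellia_f_ref(uint64_t ab, uint64_t key)
  {
    uint64_t x = ab ^ key;
    uint8_t t[8], y[8];
    int i;

    for (i = 0; i < 8; i++)
      t[i] = (uint8_t)(x >> (56 - 8 * i));      /* t[0] is the MSB */

    /* S-layer: s2, s3 and s4 are all derived from s1:
     *   s2(v) = rotl8(s1(v), 1)   (output rotation)
     *   s3(v) = rotl8(s1(v), 7)   (output rotation)
     *   s4(v) = s1(rotl8(v, 1))   (input rotation)
     * The AVX macro computes s1 via affine prefilter + vaesenclast +
     * affine postfilter; the s4 input rotation is covered by the
     * separate .Lpre_tf_lo_s4/.Lpre_tf_hi_s4 prefilter instead of
     * rotating the input bytes with .Lsbox4_input_mask. */
    t[0] = SBOX1[t[0]];
    t[1] = rotl8(SBOX1[t[1]], 1);
    t[2] = rotl8(SBOX1[t[2]], 7);
    t[3] = SBOX1[rotl8(t[3], 1)];
    t[4] = rotl8(SBOX1[t[4]], 1);
    t[5] = rotl8(SBOX1[t[5]], 7);
    t[6] = SBOX1[rotl8(t[6], 1)];
    t[7] = SBOX1[t[7]];

    /* P-layer (byte diffusion), as in the specification. */
    y[0] = t[0] ^ t[2] ^ t[3] ^ t[5] ^ t[6] ^ t[7];
    y[1] = t[0] ^ t[1] ^ t[3] ^ t[4] ^ t[6] ^ t[7];
    y[2] = t[0] ^ t[1] ^ t[2] ^ t[4] ^ t[5] ^ t[7];
    y[3] = t[1] ^ t[2] ^ t[3] ^ t[4] ^ t[5] ^ t[6];
    y[4] = t[0] ^ t[1] ^ t[5] ^ t[6] ^ t[7];
    y[5] = t[1] ^ t[2] ^ t[4] ^ t[6] ^ t[7];
    y[6] = t[2] ^ t[3] ^ t[4] ^ t[5] ^ t[7];
    y[7] = t[0] ^ t[3] ^ t[4] ^ t[5] ^ t[6];

    x = 0;
    for (i = 0; i < 8; i++)
      x = (x << 8) | y[i];
    return x;
  }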
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-aesni-avx-amd64.S | 183 ++++++++++++------------------
1 file changed, 73 insertions(+), 110 deletions(-)
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 76e62ea8..e21a8468 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1,6 +1,6 @@
/* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher
*
- * Copyright (C) 2013-2015,2020,2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015,2020,2023,2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -39,14 +39,17 @@
/**********************************************************************
helper macros
**********************************************************************/
-#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
- vpand x, mask4bit, tmp0; \
- vpandn x, mask4bit, x; \
- vpsrld $4, x, x; \
+#define filter_8bit_3op(out, in, lo_t, hi_t, mask4bit, tmp0) \
+ vpand in, mask4bit, tmp0; \
+ vpandn in, mask4bit, out; \
+ vpsrld $4, out, out; \
\
vpshufb tmp0, lo_t, tmp0; \
- vpshufb x, hi_t, x; \
- vpxor tmp0, x, x;
+ vpshufb out, hi_t, out; \
+ vpxor tmp0, out, out;
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ filter_8bit_3op(x, x, lo_t, hi_t, mask4bit, tmp0);
/**********************************************************************
16-way camellia
@@ -450,65 +453,6 @@
vmovdqu st1, b1; \
/* does not adjust output bytes inside vectors */
-#define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \
- vpunpcklbw a, b, t0; \
- vpunpckhbw a, b, b; \
- \
- vpunpcklbw c, d, t1; \
- vpunpckhbw c, d, d; \
- \
- vpunpcklbw e, f, t2; \
- vpunpckhbw e, f, f; \
- \
- vpunpcklbw g, h, t3; \
- vpunpckhbw g, h, h; \
- \
- vpunpcklwd t0, t1, g; \
- vpunpckhwd t0, t1, t0; \
- \
- vpunpcklwd b, d, t1; \
- vpunpckhwd b, d, e; \
- \
- vpunpcklwd t2, t3, c; \
- vpunpckhwd t2, t3, t2; \
- \
- vpunpcklwd f, h, t3; \
- vpunpckhwd f, h, b; \
- \
- vpunpcklwd e, b, t4; \
- vpunpckhwd e, b, b; \
- \
- vpunpcklwd t1, t3, e; \
- vpunpckhwd t1, t3, f; \
- \
- vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \
- \
- vpunpcklwd g, c, d; \
- vpunpckhwd g, c, c; \
- \
- vpunpcklwd t0, t2, t1; \
- vpunpckhwd t0, t2, h; \
- \
- vpunpckhqdq b, h, a; \
- vpshufb t3, a, a; \
- vpunpcklqdq b, h, b; \
- vpshufb t3, b, b; \
- \
- vpunpckhqdq e, d, g; \
- vpshufb t3, g, g; \
- vpunpcklqdq e, d, h; \
- vpshufb t3, h, h; \
- \
- vpunpckhqdq f, c, e; \
- vpshufb t3, e, e; \
- vpunpcklqdq f, c, f; \
- vpshufb t3, f, f; \
- \
- vpunpckhqdq t4, t1, c; \
- vpshufb t3, c, c; \
- vpunpcklqdq t4, t1, d; \
- vpshufb t3, d, d;
-
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, rio, key) \
@@ -1830,63 +1774,86 @@ _gcry_camellia_aesni_avx_ocb_auth:
CFI_ENDPROC();
ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;)
-/*
- * IN:
- * ab: 64-bit AB state
- * cd: 64-bit CD state
- */
-#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \
- _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \
- vmovq key, t0; \
- vpxor x, x, t3; \
- \
- vpxor ab, t0, x; \
+/* Camellia F function, AVX+AESNI version */
+#define camellia_f_core(ab, x, t0, t1, t2, t3, t4, \
+ inv_shift_row_n_s2n3_shuffle, \
+ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, \
+ sp1mask, sp2mask, sp3mask, sp4mask, fn_out_xor, \
+ out_xor_dst) \
\
/* \
* S-function with AES subbytes \
*/ \
\
- /* input rotation for sbox4 (<<< 1) */ \
- vpand x, sbox4mask, t0; \
- vpandn x, sbox4mask, x; \
- vpaddw t0, t0, t1; \
- vpsrlw $7, t0, t0; \
- vpor t0, t1, t0; \
- vpand sbox4mask, t0, t0; \
- vpor t0, x, x; \
+ vmovdqa .Lpre_tf_lo_s4(%rip), t0; \
+ vmovdqa .Lpre_tf_hi_s4(%rip), t1; \
+ vpxor t3, t3, t3; \
+ \
+ /* prefilter sboxes s1,s2,s3 */ \
+ filter_8bit_3op(t4, ab, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
\
- vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \
- vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \
+ /* prefilter sbox s4 */ \
+ filter_8bit_3op(x, ab, t0, t1, _0f0f0f0fmask, t2); \
\
- /* prefilter sboxes */ \
- filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
+ vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
+ vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
\
- /* AES subbytes + AES shift rows + AES inv shift rows */ \
+ /* AES subbytes + AES shift rows */ \
+ vaesenclast t3, t4, t4; \
vaesenclast t3, x, x; \
\
- /* postfilter sboxes */ \
+ /* postfilter sboxes s1,s2,s3 */ \
+ filter_8bit(t4, t0, t1, _0f0f0f0fmask, t2); \
+ \
+ /* postfilter sbox s4 */ \
filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \
\
+ /* Unpack 8-bit fields in lower 64-bits of XMM to */ \
+ /* 16-bit fields in full 128-bit XMM. This is to allow faster */ \
+ /* byte rotation for s2&s3 as SSE/AVX lacks native byte */ \
+ /* shift/rotation instructions. */ \
+ vpshufb inv_shift_row_n_s2n3_shuffle, t4, t1; \
+ \
+ vpshufb sp1mask, t4, t4; \
+ vpshufb sp4mask, x, x; \
+ \
/* output rotation for sbox2 (<<< 1) */ \
/* output rotation for sbox3 (>>> 1) */ \
- vpshufb inv_shift_row, x, t1; \
- vpshufb .Lsp0044440444044404mask rRIP, x, t4; \
- vpshufb .Lsp1110111010011110mask rRIP, x, x; \
vpaddb t1, t1, t2; \
vpsrlw $7, t1, t0; \
vpsllw $7, t1, t3; \
vpor t0, t2, t0; \
vpsrlw $1, t1, t1; \
- vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \
+ vpshufb sp2mask, t0, t0; \
vpor t1, t3, t1; \
\
vpxor x, t4, t4; \
- vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \
+ vpshufb sp3mask, t1, t1; \
vpxor t4, t0, t0; \
vpxor t1, t0, t0; \
vpsrldq $8, t0, x; \
+ fn_out_xor(t0, x, out_xor_dst);
+
+#define camellia_f_xor_x(t0, x, _) \
vpxor t0, x, x;
+/*
+ * IN:
+ * ab: 64-bit AB state
+ * cd: 64-bit CD state
+ */
+#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, \
+ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \
+ vmovq key, t0; \
+ vpxor ab, t0, x; \
+ camellia_f_core(x, x, t0, t1, t2, t3, t4, inv_shift_row, \
+ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, \
+ .Lsp1110111010011110mask rRIP, \
+ .Lsp0222022222000222mask rRIP, \
+ .Lsp3033303303303033mask rRIP, \
+ .Lsp0044440444044404mask rRIP, \
+ camellia_f_xor_x, _);
+
#define vec_rol128(in, out, nrol, t0) \
vpshufd $0x4e, in, out; \
vpsllq $(nrol), in, t0; \
@@ -1920,8 +1887,6 @@ _camellia_aesni_avx_keysetup_data:
.Lsp3033303303303033mask:
.long 0x04ff0404, 0x04ff0404;
.long 0xff0a0aff, 0x0aff0a0a;
-.Lsbox4_input_mask:
- .byte 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00;
.Lsigma1:
.long 0x3BCC908B, 0xA09E667F;
.Lsigma2:
@@ -1953,7 +1918,6 @@ __camellia_avx_setup128:
vpshufb .Lbswap128_mask rRIP, KL128, KL128;
vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
- vmovq .Lsbox4_input_mask rRIP, %xmm12;
vbroadcastss .L0f0f0f0f rRIP, %xmm13;
vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;
@@ -1968,18 +1932,18 @@ __camellia_avx_setup128:
camellia_f(%xmm2, %xmm4, %xmm1,
%xmm5, %xmm6, %xmm7, %xmm8,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
vpxor %xmm4, %xmm3, %xmm3;
camellia_f(%xmm3, %xmm2, %xmm1,
%xmm5, %xmm6, %xmm7, %xmm8,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
camellia_f(%xmm2, %xmm3, %xmm1,
%xmm5, %xmm6, %xmm7, %xmm8,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
vpxor %xmm4, %xmm3, %xmm3;
camellia_f(%xmm3, %xmm4, %xmm1,
%xmm5, %xmm6, %xmm7, %xmm8,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
vpslldq $8, %xmm3, %xmm3;
vpxor %xmm4, %xmm2, %xmm2;
@@ -2303,7 +2267,6 @@ __camellia_avx_setup256:
vpshufb .Lbswap128_mask rRIP, KR128, KR128;
vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
- vmovq .Lsbox4_input_mask rRIP, %xmm12;
vbroadcastss .L0f0f0f0f rRIP, %xmm13;
vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;
@@ -2319,20 +2282,20 @@ __camellia_avx_setup256:
camellia_f(%xmm2, %xmm4, %xmm5,
%xmm7, %xmm8, %xmm9, %xmm10,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
vpxor %xmm4, %xmm3, %xmm3;
camellia_f(%xmm3, %xmm2, %xmm5,
%xmm7, %xmm8, %xmm9, %xmm10,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
vpxor %xmm6, %xmm2, %xmm2;
camellia_f(%xmm2, %xmm3, %xmm5,
%xmm7, %xmm8, %xmm9, %xmm10,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
vpxor %xmm4, %xmm3, %xmm3;
vpxor KR128, %xmm3, %xmm3;
camellia_f(%xmm3, %xmm4, %xmm5,
%xmm7, %xmm8, %xmm9, %xmm10,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
vpslldq $8, %xmm3, %xmm3;
vpxor %xmm4, %xmm2, %xmm2;
@@ -2350,12 +2313,12 @@ __camellia_avx_setup256:
camellia_f(%xmm4, %xmm5, %xmm6,
%xmm7, %xmm8, %xmm9, %xmm10,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP);
vpxor %xmm5, %xmm3, %xmm3;
camellia_f(%xmm3, %xmm5, %xmm6,
%xmm7, %xmm8, %xmm9, %xmm10,
- %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP);
+ %xmm11, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP);
vpslldq $8, %xmm3, %xmm3;
vpxor %xmm5, %xmm4, %xmm4;
vpsrldq $8, %xmm3, %xmm3;
--
2.51.0