[PATCH 2/4] camellia-simd128: optimize round key loading and key setup
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Dec 21 11:58:47 CET 2025
* cipher/camellia-simd128.h (if_vprolb128, vprolb128)
(vmovd128_amemld, vmovq128_amemld, vmovq128_memld)
(memory_barrier_with_vec, filter_8bit_3op): New.
(LE64_LO32, LE64_HI32): Remove.
(roundsm16, fls16, inpack16_pre, outunpack16): Use 'vmovd128_amemld'
and 'vmovq128_amemld' for loading round keys.
(camellia_f): Optimize/Rewrite and split core to ...
(camellia_f_core): ... this.
(camellia_f_xor_x): New.
(sp0044440444044404mask, sp1110111010011110mask)
(sp0222022222000222mask, sp3033303303303033mask): Adjust constants
for optimized/rewritten 'camellia_f'.
(camellia_setup128, camellia_setup256): Adjust for optimized
'camellia_f'; Use 'vmovq128_amemld' for loading round keys.
(FUNC_KEY_SETUP): Use 'vmovq128_amemld' instead of 'vmovq128'.
--
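For reference, the round-key load change boils down to the following kind of
difference (x86 shown; this is only an illustrative sketch, not the macro
bodies verbatim): the old 'vmovq128(*subkey, x)' pattern reads the 64-bit
subkey through a general-purpose register first, while 'vmovq128_amemld'
loads it straight from memory into the low lane of the vector register.

  #include <immintrin.h>
  #include <stdint.h>

  static __m128i subkey_via_gpr(const uint64_t *subkey)
  {
    /* old pattern: memory -> GPR -> vector (vmovq128) */
    return _mm_set_epi64x(0, *subkey);
  }

  static __m128i subkey_direct(const void *subkey)
  {
    /* new pattern: memory -> low 64 bits of vector (vmovq128_amemld) */
    return _mm_loadu_si64(subkey);
  }
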
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-simd128.h | 319 +++++++++++++++++++++++---------------
1 file changed, 198 insertions(+), 121 deletions(-)
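The rewritten 'camellia_f' handles the sbox2 (<<< 1) and sbox3 (>>> 1) output
rotations with a per-byte rotate where the target has native byte shifts
('vprolb128' on PowerPC and NEON; x86 keeps the old shift/or sequence).  Per
8-bit lane this is just the usual rotate, roughly (hypothetical scalar helper,
for illustration only):

  #include <stdint.h>

  /* Rotate one byte left by s bits, s in 1..7; vprolb128 does this on all
   * 16 byte lanes of a vector.  sbox2 uses s = 1, sbox3 uses s = 7
   * (i.e. rotate right by 1). */
  static inline uint8_t rol8(uint8_t b, unsigned int s)
  {
    return (uint8_t)(((b << s) | (b >> (8 - s))) & 0xff);
  }
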
diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h
index c39823ac..d0f6ea32 100644
--- a/cipher/camellia-simd128.h
+++ b/cipher/camellia-simd128.h
@@ -1,5 +1,5 @@
/* camellia-simd128.h - Camellia cipher SIMD128 intrinsics implementation
- * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2023,2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -97,6 +97,12 @@ asm_sbox_be(uint8x16_t b)
#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
+#define if_vprolb128(...) __VA_ARGS__
+#define if_not_vprolb128(...) /*_*/
+#define vprolb128(s, a, o, tmp) ({ vpsllb128((s), a, tmp); \
+ vpsrlb128((8-(s)), a, o); \
+ vpxor128(tmp, o, o); })
+
#define vpaddb128(a, b, o) (o = (__m128i)vec_add((uint8x16_t)b, (uint8x16_t)a))
#define vpcmpgtb128(a, b, o) (o = (__m128i)vec_cmpgt((int8x16_t)b, (int8x16_t)a))
@@ -120,6 +126,13 @@ asm_sbox_be(uint8x16_t b)
#define vmovq128(a, o) ({ uint64x2_t __tmp = { (a), 0 }; \
o = (__m128i)(__tmp); })
+#define vmovd128_amemld(z, a, o) ({ \
+ const uint32_t *__tmp_ptr = (const void *)(a); \
+ uint32x4_t __tmp = { __tmp_ptr[z], 0, 0, 0 }; \
+ o = (__m128i)(__tmp); })
+#define vmovq128_amemld(a, o) ({ uint64x2_t __tmp = { *(const uint64_t *)(a), 0 }; \
+ o = (__m128i)(__tmp); })
+
#define vmovdqa128_memld(a, o) (o = *(const __m128i *)(a))
#define vmovdqa128_memst(a, o) (*(__m128i *)(o) = (a))
#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o)
@@ -127,13 +140,15 @@ asm_sbox_be(uint8x16_t b)
/* Following operations may have unaligned memory input */
#define vmovdqu128_memld(a, o) (o = (__m128i)vec_xl(0, (const uint8_t *)(a)))
#define vpxor128_memld(a, b, o) vpxor128(b, (__m128i)vec_xl(0, (const uint8_t *)(a)), o)
+#define vmovq128_memld(a, o) ({ uint64x2_t __tmp = { *(const uint64_unaligned_t *)(a), 0 }; \
+ o = (__m128i)(__tmp); })
/* Following operations may have unaligned memory output */
#define vmovdqu128_memst(a, o) vec_xst((uint8x16_t)(a), 0, (uint8_t *)(o))
#define vmovq128_memst(a, o) (((uint64_unaligned_t *)(o))[0] = ((__m128i)(a))[0])
/* PowerPC AES encrypt last round => ShiftRows + SubBytes + XOR round key */
-static const uint8x16_t shift_row =
+static const uint8x16_t shift_row __attribute__((unused)) =
{ 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };
#define vaesenclast128(a, b, o) \
({ uint64x2_t __tmp = (__m128i)vec_sbox_be((uint8x16_t)(b)); \
@@ -152,6 +167,8 @@ static const uint8x16_t shift_row =
#define if_aes_subbytes(...) __VA_ARGS__
#define if_not_aes_subbytes(...) /*_*/
+#define memory_barrier_with_vec(a) __asm__("" : "+wa"(a) :: "memory")
+
#endif /* __powerpc__ */
#ifdef __ARM_NEON
@@ -189,6 +206,13 @@ static const uint8x16_t shift_row =
#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
+#define if_vprolb128(...) __VA_ARGS__
+#define if_not_vprolb128(...) /*_*/
+#define vprolb128(s, a, o, t) ({ t = (__m128i)vshlq_n_u8((uint8x16_t)a, s); \
+ o = (__m128i)vsriq_n_u8((uint8x16_t)t, \
+ (uint8x16_t)a, \
+ 8-(s)); })
+
#define vpaddb128(a, b, o) (o = (__m128i)vaddq_u8((uint8x16_t)b, (uint8x16_t)a))
#define vpcmpgtb128(a, b, o) (o = (__m128i)vcgtq_s8((int8x16_t)b, (int8x16_t)a))
@@ -210,6 +234,13 @@ static const uint8x16_t shift_row =
#define vmovd128(a, o) ({ uint32x4_t __tmp = { a, 0, 0, 0 }; o = (__m128i)__tmp; })
#define vmovq128(a, o) ({ uint64x2_t __tmp = { a, 0 }; o = (__m128i)__tmp; })
+#define vmovd128_amemld(z, a, o) ({ \
+ const uint32_t *__tmp_ptr = (const void *)(a); \
+ uint32x4_t __tmp = { __tmp_ptr[z], 0, 0, 0 }; \
+ o = (__m128i)(__tmp); })
+#define vmovq128_amemld(a, o) ({ uint64x1_t __tmp = vld1_u64((const uint64_t *)(a)); \
+ o = (__m128i)vcombine_u64(__tmp, vcreate_u64(0)); })
+
#define vmovdqa128_memld(a, o) (o = (*(const __m128i *)(a)))
#define vmovdqa128_memst(a, o) (*(__m128i *)(o) = (a))
#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o)
@@ -217,6 +248,8 @@ static const uint8x16_t shift_row =
/* Following operations may have unaligned memory input */
#define vmovdqu128_memld(a, o) (o = (__m128i)vld1q_u8((const uint8_t *)(a)))
#define vpxor128_memld(a, b, o) vpxor128(b, (__m128i)vld1q_u8((const uint8_t *)(a)), o)
+#define vmovq128_memld(a, o) ({ uint8x8_t __tmp = vld1_u8((const uint8_t *)(a)); \
+ o = (__m128i)vcombine_u8(__tmp, vcreate_u8(0)); })
/* Following operations may have unaligned memory output */
#define vmovdqu128_memst(a, o) vst1q_u8((uint8_t *)(o), (uint8x16_t)a)
@@ -232,6 +265,8 @@ static const uint8x16_t shift_row =
#define if_aes_subbytes(...) /*_*/
#define if_not_aes_subbytes(...) __VA_ARGS__
+#define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+
#endif /* __ARM_NEON */
#if defined(__x86_64__) || defined(__i386__)
@@ -260,6 +295,9 @@ static const uint8x16_t shift_row =
#define vpsrl_byte_128(s, a, o) vpsrld128(s, a, o)
#define vpsll_byte_128(s, a, o) vpslld128(s, a, o)
+#define if_vprolb128(...) /*_*/
+#define if_not_vprolb128(...) __VA_ARGS__
+
#define vpaddb128(a, b, o) (o = _mm_add_epi8(b, a))
#define vpcmpgtb128(a, b, o) (o = _mm_cmpgt_epi8(b, a))
@@ -281,6 +319,11 @@ static const uint8x16_t shift_row =
#define vmovd128(a, o) (o = _mm_set_epi32(0, 0, 0, a))
#define vmovq128(a, o) (o = _mm_set_epi64x(0, a))
+#define vmovd128_amemld(z, a, o) ({ \
+ const uint32_t *__tmp_ptr = (const void *)(a); \
+ o = (__m128i)_mm_loadu_si32(__tmp_ptr + (z)); })
+#define vmovq128_amemld(a, o) (o = (__m128i)_mm_loadu_si64(a))
+
#define vmovdqa128_memld(a, o) (o = (*(const __m128i *)(a)))
#define vmovdqa128_memst(a, o) (*(__m128i *)(o) = (a))
#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o)
@@ -289,6 +332,7 @@ static const uint8x16_t shift_row =
#define vmovdqu128_memld(a, o) (o = _mm_loadu_si128((const __m128i *)(a)))
#define vpxor128_memld(a, b, o) \
vpxor128(b, _mm_loadu_si128((const __m128i *)(a)), o)
+#define vmovq128_memld(a, o) vmovq128_amemld(a, o)
/* Following operations may have unaligned memory output */
#define vmovdqu128_memst(a, o) _mm_storeu_si128((__m128i *)(o), a)
@@ -305,7 +349,6 @@ static const uint8x16_t shift_row =
#define if_not_aes_subbytes(...) __VA_ARGS__
#define memory_barrier_with_vec(a) __asm__("" : "+x"(a) :: "memory")
-#define clear_vec_regs() ((void)0)
#endif /* defined(__x86_64__) || defined(__i386__) */
@@ -322,6 +365,10 @@ static const uint8x16_t shift_row =
vpshufb128(x, hi_t, x); \
vpxor128(tmp0, x, x);
+#define filter_8bit_3op(out, in, lo_t, hi_t, mask4bit, tmp0) \
+ vmovdqa128(in, out); \
+ filter_8bit(out, lo_t, hi_t, mask4bit, tmp0);
+
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
vpunpckhdq128(x1, x0, t2); \
vpunpckldq128(x1, x0, x0); \
@@ -462,7 +509,7 @@ static const uint8x16_t shift_row =
filter_8bit(x2, t2, t3, t7, t6); \
filter_8bit(x5, t2, t3, t7, t6); \
\
- vmovq128((key), t0); \
+ vmovq128_amemld(&(key), t0); \
\
/* postfilter sbox 2 */ \
filter_8bit(x1, t4, t5, t7, t2); \
@@ -582,9 +629,6 @@ static const uint8x16_t shift_row =
two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
-#define LE64_LO32(x) ((x) & 0xffffffffU)
-#define LE64_HI32(x) ((x >> 32) & 0xffffffffU)
-
/*
* IN:
* v0..3: byte-sliced 32-bit integers
@@ -633,7 +677,7 @@ static const uint8x16_t shift_row =
* lr ^= rol32(t0, 1); \
*/ \
load_zero(tt0); \
- vmovd128(LE64_LO32(*(kl)), t0); \
+ vmovd128_amemld(0, kl, t0); \
vpshufb128(tt0, t0, t3); \
vpshufb128(bcast[1], t0, t2); \
vpshufb128(bcast[2], t0, t1); \
@@ -661,7 +705,7 @@ static const uint8x16_t shift_row =
* rl ^= t2; \
*/ \
\
- vmovd128(LE64_HI32(*(kr)), t0); \
+ vmovd128_amemld(1, kr, t0); \
vpshufb128(tt0, t0, t3); \
vpshufb128(bcast[1], t0, t2); \
vpshufb128(bcast[2], t0, t1); \
@@ -686,7 +730,7 @@ static const uint8x16_t shift_row =
* t2 &= rl; \
* rr ^= rol32(t2, 1); \
*/ \
- vmovd128(LE64_LO32(*(kr)), t0); \
+ vmovd128_amemld(0, kr, t0); \
vpshufb128(tt0, t0, t3); \
vpshufb128(bcast[1], t0, t2); \
vpshufb128(bcast[2], t0, t1); \
@@ -714,7 +758,7 @@ static const uint8x16_t shift_row =
* ll ^= t0; \
*/ \
\
- vmovd128(LE64_HI32(*(kl)), t0); \
+ vmovd128_amemld(1, kl, t0); \
vpshufb128(tt0, t0, t3); \
vpshufb128(bcast[1], t0, t2); \
vpshufb128(bcast[2], t0, t1); \
@@ -786,7 +830,7 @@ static const uint8x16_t shift_row =
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, rio, key) \
- vmovq128((key), x0); \
+ vmovq128_amemld(&(key), x0); \
vpshufb128(pack_bswap_stack, x0, x0); \
\
vpxor128_memld((rio) + 0 * 16, x0, y7); \
@@ -837,7 +881,7 @@ static const uint8x16_t shift_row =
\
vmovdqa128(x0, stack_tmp0); \
\
- vmovq128((key), x0); \
+ vmovq128_amemld(&(key), x0); \
vpshufb128(pack_bswap_stack, x0, x0); \
\
vpxor128(x0, y7, y7); \
@@ -1200,64 +1244,92 @@ FUNC_DEC_BLK16(const void *key_table, void *vout, const void *vin,
/********* Key setup **********************************************************/
-/*
- * Camellia F-function, 1-way SIMD/AESNI.
- *
- * IN:
- * ab: 64-bit AB state
- * cd: 64-bit CD state
- */
-#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \
- _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \
- vmovq128((key), t0); \
- load_zero(t3); \
- \
- vpxor128(ab, t0, x); \
- \
+/* Camellia F-function, 1-way SIMD128. */
+#define camellia_f_core(ab, x, t0, t1, t2, t3, t4, inv_shift_row_n_s2n3_shuffle, \
+ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, \
+ sp1mask, sp2mask, sp3mask, sp4mask, fn_out_xor, \
+ out_xor_dst) \
/* \
* S-function with AES subbytes \
*/ \
\
- /* input rotation for sbox4 (<<< 1) */ \
- vpand128(x, sbox4mask, t0); \
- vpandn128(x, sbox4mask, x); \
- vpaddb128(t0, t0, t1); \
- vpsrl_byte_128(7, t0, t0); \
- vpor128(t0, t1, t0); \
- vpand128(sbox4mask, t0, t0); \
- vpor128(t0, x, x); \
+ vmovdqa128_memld(&pre_tf_lo_s4, t0); \
+ vmovdqa128_memld(&pre_tf_hi_s4, t1); \
+ if_not_aes_subbytes(load_zero(t3)); \
+ \
+ /* prefilter sboxes s1,s2,s3 */ \
+ filter_8bit_3op(t4, ab, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
+ \
+ /* prefilter sbox s4 */ \
+ filter_8bit_3op(x, ab, t0, t1, _0f0f0f0fmask, t2); \
\
vmovdqa128_memld(&post_tf_lo_s1, t0); \
vmovdqa128_memld(&post_tf_hi_s1, t1); \
\
- /* prefilter sboxes */ \
- filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
+ if_not_aes_subbytes(/* AES subbytes + AES shift rows */); \
+ if_not_aes_subbytes(aes_subbytes_and_shuf_and_xor(t3, t4, t4)); \
+ if_not_aes_subbytes(aes_subbytes_and_shuf_and_xor(t3, x, x)); \
+ \
+ if_aes_subbytes(/* AES subbytes */); \
+ if_aes_subbytes(aes_subbytes(t4, t4)); \
+ if_aes_subbytes(aes_subbytes(x, x)); \
\
- /* AES subbytes + AES shift rows + AES inv shift rows */ \
- aes_subbytes_and_shuf_and_xor(t3, x, x); \
+ /* postfilter sboxes s1,s2,s3 */ \
+ filter_8bit(t4, t0, t1, _0f0f0f0fmask, t2); \
\
- /* postfilter sboxes */ \
+ /* postfilter sbox s4 */ \
filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \
\
/* output rotation for sbox2 (<<< 1) */ \
/* output rotation for sbox3 (>>> 1) */ \
- aes_inv_shuf(inv_shift_row, x, t1); \
- vpshufb128_amemld(&sp0044440444044404mask, x, t4); \
- vpshufb128_amemld(&sp1110111010011110mask, x, x); \
- vpaddb128(t1, t1, t2); \
- vpsrl_byte_128(7, t1, t0); \
- vpsll_byte_128(7, t1, t3); \
- vpor128(t0, t2, t0); \
- vpsrl_byte_128(1, t1, t1); \
- vpshufb128_amemld(&sp0222022222000222mask, t0, t0); \
- vpor128(t1, t3, t1); \
+ /* permutation */ \
+ if_vprolb128(vpshufb128(sp2mask, t4, t0)); \
+ if_vprolb128(vpshufb128(sp3mask, t4, t1)); \
+ if_vprolb128(vpshufb128(sp1mask, t4, t4)); \
+ if_vprolb128(vpshufb128(sp4mask, x, x)); \
+ if_vprolb128(vprolb128(1, t0, t0, t2)); \
+ if_vprolb128(vprolb128(7, t1, t1, t3)); \
+ if_not_vprolb128(aes_inv_shuf(inv_shift_row_n_s2n3_shuffle, t4, t1)); \
+ if_not_vprolb128(vpshufb128(sp1mask, t4, t4)); \
+ if_not_vprolb128(vpshufb128(sp4mask, x, x)); \
+ if_not_vprolb128(vpaddb128(t1, t1, t2)); \
+ if_not_vprolb128(vpsrl_byte_128(7, t1, t0)); \
+ if_not_vprolb128(vpsll_byte_128(7, t1, t3)); \
+ if_not_vprolb128(vpor128(t0, t2, t0)); \
+ if_not_vprolb128(vpsrl_byte_128(1, t1, t1)); \
+ if_not_vprolb128(vpshufb128(sp2mask, t0, t0)); \
+ if_not_vprolb128(vpor128(t1, t3, t1)); \
+ if_not_vprolb128(vpshufb128(sp3mask, t1, t1)); \
\
vpxor128(x, t4, t4); \
- vpshufb128_amemld(&sp3033303303303033mask, t1, t1); \
vpxor128(t4, t0, t0); \
vpxor128(t1, t0, t0); \
vpsrldq128(8, t0, x); \
- vpxor128(t0, x, x); \
+ fn_out_xor(t0, x, out_xor_dst);
+
+#define camellia_f_xor_x(t0, x, _) \
+ vpxor128(t0, x, x);
+
+/*
+ * IN:
+ * ab: 64-bit AB state
+ * cd: 64-bit CD state
+ */
+#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, \
+ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \
+ ({ \
+ __m128i sp1mask, sp2mask, sp3mask, sp4mask; \
+ vmovq128_amemld(&(key), t0); \
+ vmovdqa128_memld(&sp1110111010011110mask, sp1mask); \
+ vmovdqa128_memld(&sp0222022222000222mask, sp2mask); \
+ vmovdqa128_memld(&sp3033303303303033mask, sp3mask); \
+ vmovdqa128_memld(&sp0044440444044404mask, sp4mask); \
+ vpxor128(ab, t0, x); \
+ camellia_f_core(x, x, t0, t1, t2, t3, t4, inv_shift_row, \
+ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, \
+ sp1mask, sp2mask, sp3mask, sp4mask, \
+ camellia_f_xor_x, _); \
+ })
#define vec_rol128(in, out, nrol, t0) \
vpshufd128_0x4e(in, out); \
@@ -1292,24 +1364,31 @@ FUNC_DEC_BLK16(const void *key_table, void *vout, const void *vin,
static const __m128i bswap128_mask =
M128I_BYTE(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-static const __m128i inv_shift_row_and_unpcklbw =
- M128I_BYTE(0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff,
- 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff);
+if_not_vprolb128(
+ static const __m128i inv_shift_row_and_unpcklbw =
+ M128I_BYTE(0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff,
+ 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff);
+)
static const __m128i sp0044440444044404mask =
- M128I_U32(0xffff0404, 0x0404ff04, 0x0d0dff0d, 0x0d0dff0d);
+ if_aes_subbytes(M128I_U32(0xffff0404, 0x0404ff04, 0x0101ff01, 0x0101ff01))
+ if_not_aes_subbytes(M128I_U32(0xffff0404, 0x0404ff04, 0x0d0dff0d, 0x0d0dff0d));
static const __m128i sp1110111010011110mask =
- M128I_U32(0x000000ff, 0x000000ff, 0x0bffff0b, 0x0b0b0bff);
+ if_aes_subbytes(M128I_U32(0x000000ff, 0x000000ff, 0x07ffff07, 0x070707ff))
+ if_not_aes_subbytes(M128I_U32(0x000000ff, 0x000000ff, 0x0bffff0b, 0x0b0b0bff));
static const __m128i sp0222022222000222mask =
- M128I_U32(0xff060606, 0xff060606, 0x0c0cffff, 0xff0c0c0c);
+ if_aes_subbytes(if_vprolb128(M128I_U32(0xff030303, 0xff030303, 0x0606ffff, 0xff060606)))
+ if_aes_subbytes(if_not_vprolb128(M128I_U32(0xff0e0e0e, 0xff0e0e0e, 0x0c0cffff, 0xff0c0c0c)))
+ if_not_aes_subbytes(if_vprolb128(M128I_U32(0xff070707, 0xff070707, 0x0e0effff, 0xff0e0e0e)))
+ if_not_aes_subbytes(if_not_vprolb128(M128I_U32(0xff060606, 0xff060606, 0x0c0cffff, 0xff0c0c0c)));
static const __m128i sp3033303303303033mask =
- M128I_U32(0x04ff0404, 0x04ff0404, 0xff0a0aff, 0x0aff0a0a);
-
-static const u64 sbox4_input_mask =
- U64_BYTE(0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00);
+ if_aes_subbytes(if_vprolb128(M128I_U32(0x02ff0202, 0x02ff0202, 0xff0505ff, 0x05ff0505)))
+ if_aes_subbytes(if_not_vprolb128(M128I_U32(0x04ff0404, 0x04ff0404, 0xff0202ff, 0x0202ff02)))
+ if_not_aes_subbytes(if_vprolb128(M128I_U32(0x0aff0a0a, 0x0aff0a0a, 0xff0101ff, 0x01ff0101)))
+ if_not_aes_subbytes(if_not_vprolb128(M128I_U32(0x04ff0404, 0x04ff0404, 0xff0a0aff, 0x0aff0a0a)));
static const u64 sigma1 =
U64_U32(0x3BCC908B, 0xA09E667F);
@@ -1353,8 +1432,7 @@ camellia_setup128(void *key_table, __m128i x0)
vpshufb128_amemld(&bswap128_mask, KL128, KL128);
- vmovdqa128_memld(&inv_shift_row_and_unpcklbw, x11);
- vmovq128(sbox4_input_mask, x12);
+ if_not_vprolb128(vmovdqa128_memld(&inv_shift_row_and_unpcklbw, x11));
vmovdqa128_memld(&mask_0f, x13);
vmovdqa128_memld(&pre_tf_lo_s1, x14);
vmovdqa128_memld(&pre_tf_hi_s1, x15);
@@ -1369,18 +1447,18 @@ camellia_setup128(void *key_table, __m128i x0)
camellia_f(x2, x4, x1,
x5, x6, x7, x8,
- x11, x12, x13, x14, x15, sigma1);
+ x11, x13, x14, x15, sigma1);
vpxor128(x4, x3, x3);
camellia_f(x3, x2, x1,
x5, x6, x7, x8,
- x11, x12, x13, x14, x15, sigma2);
+ x11, x13, x14, x15, sigma2);
camellia_f(x2, x3, x1,
x5, x6, x7, x8,
- x11, x12, x13, x14, x15, sigma3);
+ x11, x13, x14, x15, sigma3);
vpxor128(x4, x3, x3);
camellia_f(x3, x4, x1,
x5, x6, x7, x8,
- x11, x12, x13, x14, x15, sigma4);
+ x11, x13, x14, x15, sigma4);
vpslldq128(8, x3, x3);
vpxor128(x4, x2, x2);
@@ -1581,10 +1659,10 @@ camellia_setup128(void *key_table, __m128i x0)
vmovq128_memst(x4, cmll_sub(5, ctx));
vmovq128_memst(x5, cmll_sub(6, ctx));
- vmovq128(*cmll_sub(7, ctx), x7);
- vmovq128(*cmll_sub(8, ctx), x8);
- vmovq128(*cmll_sub(9, ctx), x9);
- vmovq128(*cmll_sub(10, ctx), x10);
+ vmovq128_amemld(cmll_sub(7, ctx), x7);
+ vmovq128_amemld(cmll_sub(8, ctx), x8);
+ vmovq128_amemld(cmll_sub(9, ctx), x9);
+ vmovq128_amemld(cmll_sub(10, ctx), x10);
/* tl = subl(10) ^ (subr(10) & ~subr(8)); */
vpandn128(x10, x8, x15);
vpsrldq128(4, x15, x15);
@@ -1601,11 +1679,11 @@ camellia_setup128(void *key_table, __m128i x0)
vpxor128(x0, x6, x6);
vmovq128_memst(x6, cmll_sub(7, ctx));
- vmovq128(*cmll_sub(11, ctx), x11);
- vmovq128(*cmll_sub(12, ctx), x12);
- vmovq128(*cmll_sub(13, ctx), x13);
- vmovq128(*cmll_sub(14, ctx), x14);
- vmovq128(*cmll_sub(15, ctx), x15);
+ vmovq128_amemld(cmll_sub(11, ctx), x11);
+ vmovq128_amemld(cmll_sub(12, ctx), x12);
+ vmovq128_amemld(cmll_sub(13, ctx), x13);
+ vmovq128_amemld(cmll_sub(14, ctx), x14);
+ vmovq128_amemld(cmll_sub(15, ctx), x15);
/* tl = subl(7) ^ (subr(7) & ~subr(9)); */
vpandn128(x7, x9, x1);
vpsrldq128(4, x1, x1);
@@ -1630,11 +1708,11 @@ camellia_setup128(void *key_table, __m128i x0)
vmovq128_memst(x12, cmll_sub(13, ctx));
vmovq128_memst(x13, cmll_sub(14, ctx));
- vmovq128(*cmll_sub(16, ctx), x6);
- vmovq128(*cmll_sub(17, ctx), x7);
- vmovq128(*cmll_sub(18, ctx), x8);
- vmovq128(*cmll_sub(19, ctx), x9);
- vmovq128(*cmll_sub(20, ctx), x10);
+ vmovq128_amemld(cmll_sub(16, ctx), x6);
+ vmovq128_amemld(cmll_sub(17, ctx), x7);
+ vmovq128_amemld(cmll_sub(18, ctx), x8);
+ vmovq128_amemld(cmll_sub(19, ctx), x9);
+ vmovq128_amemld(cmll_sub(20, ctx), x10);
/* tl = subl(18) ^ (subr(18) & ~subr(16)); */
vpandn128(x8, x6, x1);
vpsrldq128(4, x1, x1);
@@ -1664,10 +1742,10 @@ camellia_setup128(void *key_table, __m128i x0)
vpsrldq128(8, x1, x1);
vpxor128(x1, x0, x0);
- vmovq128(*cmll_sub(21, ctx), x1);
- vmovq128(*cmll_sub(22, ctx), x2);
- vmovq128(*cmll_sub(23, ctx), x3);
- vmovq128(*cmll_sub(24, ctx), x4);
+ vmovq128_amemld(cmll_sub(21, ctx), x1);
+ vmovq128_amemld(cmll_sub(22, ctx), x2);
+ vmovq128_amemld(cmll_sub(23, ctx), x3);
+ vmovq128_amemld(cmll_sub(24, ctx), x4);
vpxor128(x9, x0, x0);
vpxor128(x10, x8, x8);
@@ -1720,8 +1798,7 @@ camellia_setup256(void *key_table, __m128i x0, __m128i x1)
vpshufb128_amemld(&bswap128_mask, KL128, KL128);
vpshufb128_amemld(&bswap128_mask, KR128, KR128);
- vmovdqa128_memld(&inv_shift_row_and_unpcklbw, x11);
- vmovq128(*&sbox4_input_mask, x12);
+ if_not_vprolb128(vmovdqa128_memld(&inv_shift_row_and_unpcklbw, x11));
vmovdqa128_memld(&mask_0f, x13);
vmovdqa128_memld(&pre_tf_lo_s1, x14);
vmovdqa128_memld(&pre_tf_hi_s1, x15);
@@ -1737,20 +1814,20 @@ camellia_setup256(void *key_table, __m128i x0, __m128i x1)
camellia_f(x2, x4, x5,
x7, x8, x9, x10,
- x11, x12, x13, x14, x15, sigma1);
+ x11, x13, x14, x15, sigma1);
vpxor128(x4, x3, x3);
camellia_f(x3, x2, x5,
x7, x8, x9, x10,
- x11, x12, x13, x14, x15, sigma2);
+ x11, x13, x14, x15, sigma2);
vpxor128(x6, x2, x2);
camellia_f(x2, x3, x5,
x7, x8, x9, x10,
- x11, x12, x13, x14, x15, sigma3);
+ x11, x13, x14, x15, sigma3);
vpxor128(x4, x3, x3);
vpxor128(KR128, x3, x3);
camellia_f(x3, x4, x5,
x7, x8, x9, x10,
- x11, x12, x13, x14, x15, sigma4);
+ x11, x13, x14, x15, sigma4);
vpslldq128(8, x3, x3);
vpxor128(x4, x2, x2);
@@ -1768,12 +1845,12 @@ camellia_setup256(void *key_table, __m128i x0, __m128i x1)
camellia_f(x4, x5, x6,
x7, x8, x9, x10,
- x11, x12, x13, x14, x15, sigma5);
+ x11, x13, x14, x15, sigma5);
vpxor128(x5, x3, x3);
camellia_f(x3, x5, x6,
x7, x8, x9, x10,
- x11, x12, x13, x14, x15, sigma6);
+ x11, x13, x14, x15, sigma6);
vpslldq128(8, x3, x3);
vpxor128(x5, x4, x4);
vpsrldq128(8, x3, x3);
@@ -2031,10 +2108,10 @@ camellia_setup256(void *key_table, __m128i x0, __m128i x1)
vmovq128_memst(x4, cmll_sub(5, ctx));
vmovq128_memst(x5, cmll_sub(6, ctx));
- vmovq128(*cmll_sub(7, ctx), x7);
- vmovq128(*cmll_sub(8, ctx), x8);
- vmovq128(*cmll_sub(9, ctx), x9);
- vmovq128(*cmll_sub(10, ctx), x10);
+ vmovq128_amemld(cmll_sub(7, ctx), x7);
+ vmovq128_amemld(cmll_sub(8, ctx), x8);
+ vmovq128_amemld(cmll_sub(9, ctx), x9);
+ vmovq128_amemld(cmll_sub(10, ctx), x10);
/* tl = subl(10) ^ (subr(10) & ~subr(8)); */
vpandn128(x10, x8, x15);
vpsrldq128(4, x15, x15);
@@ -2051,11 +2128,11 @@ camellia_setup256(void *key_table, __m128i x0, __m128i x1)
vpxor128(x0, x6, x6);
vmovq128_memst(x6, cmll_sub(7, ctx));
- vmovq128(*cmll_sub(11, ctx), x11);
- vmovq128(*cmll_sub(12, ctx), x12);
- vmovq128(*cmll_sub(13, ctx), x13);
- vmovq128(*cmll_sub(14, ctx), x14);
- vmovq128(*cmll_sub(15, ctx), x15);
+ vmovq128_amemld(cmll_sub(11, ctx), x11);
+ vmovq128_amemld(cmll_sub(12, ctx), x12);
+ vmovq128_amemld(cmll_sub(13, ctx), x13);
+ vmovq128_amemld(cmll_sub(14, ctx), x14);
+ vmovq128_amemld(cmll_sub(15, ctx), x15);
/* tl = subl(7) ^ (subr(7) & ~subr(9)); */
vpandn128(x7, x9, x1);
vpsrldq128(4, x1, x1);
@@ -2080,11 +2157,11 @@ camellia_setup256(void *key_table, __m128i x0, __m128i x1)
vmovq128_memst(x12, cmll_sub(13, ctx));
vmovq128_memst(x13, cmll_sub(14, ctx));
- vmovq128(*cmll_sub(16, ctx), x6);
- vmovq128(*cmll_sub(17, ctx), x7);
- vmovq128(*cmll_sub(18, ctx), x8);
- vmovq128(*cmll_sub(19, ctx), x9);
- vmovq128(*cmll_sub(20, ctx), x10);
+ vmovq128_amemld(cmll_sub(16, ctx), x6);
+ vmovq128_amemld(cmll_sub(17, ctx), x7);
+ vmovq128_amemld(cmll_sub(18, ctx), x8);
+ vmovq128_amemld(cmll_sub(19, ctx), x9);
+ vmovq128_amemld(cmll_sub(20, ctx), x10);
/* tl = subl(18) ^ (subr(18) & ~subr(16)); */
vpandn128(x8, x6, x1);
vpsrldq128(4, x1, x1);
@@ -2114,10 +2191,10 @@ camellia_setup256(void *key_table, __m128i x0, __m128i x1)
vpsrldq128(8, x1, x1);
vpxor128(x1, x0, x0);
- vmovq128(*cmll_sub(21, ctx), x1);
- vmovq128(*cmll_sub(22, ctx), x2);
- vmovq128(*cmll_sub(23, ctx), x3);
- vmovq128(*cmll_sub(24, ctx), x4);
+ vmovq128_amemld(cmll_sub(21, ctx), x1);
+ vmovq128_amemld(cmll_sub(22, ctx), x2);
+ vmovq128_amemld(cmll_sub(23, ctx), x3);
+ vmovq128_amemld(cmll_sub(24, ctx), x4);
vpxor128(x9, x0, x0);
vpxor128(x10, x8, x8);
@@ -2131,14 +2208,14 @@ camellia_setup256(void *key_table, __m128i x0, __m128i x1)
vmovq128_memst(x10, cmll_sub(21, ctx));
vmovq128_memst(x1, cmll_sub(22, ctx));
- vmovq128(*cmll_sub(25, ctx), x5);
- vmovq128(*cmll_sub(26, ctx), x6);
- vmovq128(*cmll_sub(27, ctx), x7);
- vmovq128(*cmll_sub(28, ctx), x8);
- vmovq128(*cmll_sub(29, ctx), x9);
- vmovq128(*cmll_sub(30, ctx), x10);
- vmovq128(*cmll_sub(31, ctx), x11);
- vmovq128(*cmll_sub(32, ctx), x12);
+ vmovq128_amemld(cmll_sub(25, ctx), x5);
+ vmovq128_amemld(cmll_sub(26, ctx), x6);
+ vmovq128_amemld(cmll_sub(27, ctx), x7);
+ vmovq128_amemld(cmll_sub(28, ctx), x8);
+ vmovq128_amemld(cmll_sub(29, ctx), x9);
+ vmovq128_amemld(cmll_sub(30, ctx), x10);
+ vmovq128_amemld(cmll_sub(31, ctx), x11);
+ vmovq128_amemld(cmll_sub(32, ctx), x12);
/* tl = subl(26) ^ (subr(26) & ~subr(24)); */
vpandn128(x6, x4, x15);
@@ -2223,7 +2300,7 @@ FUNC_KEY_SETUP(void *key_table, const void *vkey, unsigned int keylen)
case 24:
vmovdqu128_memld(key, x0);
- vmovq128(*(uint64_unaligned_t *)(key + 16), x1);
+ vmovq128_amemld((uint64_unaligned_t *)(key + 16), x1);
x2[0] = -1;
x2[1] = -1;
--
2.51.0