[PATCH] twofish-avx2: de-unroll round function
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon May 29 20:52:57 CEST 2023
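* cipher/twofish-avx2-amd64.S (RROUND, RROUNDd): Use %r12 instead
of %rbp.
(encrypt_round_end16, decrypt_round_end16): Address round keys through
index register 'r' instead of a compile-time round offset.
(encrypt_cycle16, encrypt_cycle_first16, encrypt_cycle_last16)
(decrypt_cycle16, decrypt_cycle_first16, decrypt_cycle_last16)
(read_blocks8, write_blocks8): Remove.
(__twofish_enc_blk16)
(__twofish_dec_blk16): Use loop structure instead of unrolling; save
and restore RROUND.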
--
De-unrolling reduces code size significantly and gives a
small (<1%) increase in speed (tested on Zen 4 and Tiger Lake).
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/twofish-avx2-amd64.S | 115 +++++++++++++++---------------------
1 file changed, 49 insertions(+), 66 deletions(-)
diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S
index 8a6aae19..d05ec1f9 100644
--- a/cipher/twofish-avx2-amd64.S
+++ b/cipher/twofish-avx2-amd64.S
@@ -39,8 +39,8 @@
/* register macros */
#define CTX %rdi
-#define RROUND %rbp
-#define RROUNDd %ebp
+#define RROUND %r12
+#define RROUNDd %r12d
#define RS0 CTX
#define RS1 %r8
#define RS2 %r9
@@ -154,9 +154,9 @@
#define encrypt_round_end16(a, b, c, d, nk, r) \
vpaddd RY0, RX0, RX0; \
vpaddd RX0, RY0, RY0; \
- vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpbroadcastd ((nk))(RK,r), RT0; \
vpaddd RT0, RX0, RX0; \
- vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpbroadcastd 4+((nk))(RK,r), RT0; \
vpaddd RT0, RY0, RY0; \
\
vpxor RY0, d ## 0, d ## 0; \
@@ -168,9 +168,9 @@
\
vpaddd RY1, RX1, RX1; \
vpaddd RX1, RY1, RY1; \
- vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpbroadcastd ((nk))(RK,r), RT0; \
vpaddd RT0, RX1, RX1; \
- vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpbroadcastd 4+((nk))(RK,r), RT0; \
vpaddd RT0, RY1, RY1; \
\
vpxor RY1, d ## 1, d ## 1; \
@@ -216,9 +216,9 @@
#define decrypt_round_end16(a, b, c, d, nk, r) \
vpaddd RY0, RX0, RX0; \
vpaddd RX0, RY0, RY0; \
- vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpbroadcastd ((nk))(RK,r), RT0; \
vpaddd RT0, RX0, RX0; \
- vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpbroadcastd 4+((nk))(RK,r), RT0; \
vpaddd RT0, RY0, RY0; \
\
vpxor RX0, c ## 0, c ## 0; \
@@ -230,9 +230,9 @@
\
vpaddd RY1, RX1, RX1; \
vpaddd RX1, RY1, RY1; \
- vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpbroadcastd ((nk))(RK,r), RT0; \
vpaddd RT0, RX1, RX1; \
- vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpbroadcastd 4+((nk))(RK,r), RT0; \
vpaddd RT0, RY1, RY1; \
\
vpxor RX1, c ## 1, c ## 1; \
@@ -275,30 +275,6 @@
\
decrypt_round_end16(a, b, c, d, nk, r);
-#define encrypt_cycle16(r) \
- encrypt_round16(RA, RB, RC, RD, 0, r); \
- encrypt_round16(RC, RD, RA, RB, 8, r);
-
-#define encrypt_cycle_first16(r) \
- encrypt_round_first16(RA, RB, RC, RD, 0, r); \
- encrypt_round16(RC, RD, RA, RB, 8, r);
-
-#define encrypt_cycle_last16(r) \
- encrypt_round16(RA, RB, RC, RD, 0, r); \
- encrypt_round_last16(RC, RD, RA, RB, 8, r);
-
-#define decrypt_cycle16(r) \
- decrypt_round16(RC, RD, RA, RB, 8, r); \
- decrypt_round16(RA, RB, RC, RD, 0, r);
-
-#define decrypt_cycle_first16(r) \
- decrypt_round_first16(RC, RD, RA, RB, 8, r); \
- decrypt_round16(RA, RB, RC, RD, 0, r);
-
-#define decrypt_cycle_last16(r) \
- decrypt_round16(RC, RD, RA, RB, 8, r); \
- decrypt_round_last16(RA, RB, RC, RD, 0, r);
-
#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
vpunpckhdq x1, x0, t2; \
vpunpckldq x1, x0, x0; \
@@ -312,22 +288,6 @@
vpunpckhqdq x2, t2, x3; \
vpunpcklqdq x2, t2, x2;
-#define read_blocks8(offs,a,b,c,d) \
- vmovdqu 16*offs(RIO), a; \
- vmovdqu 16*offs+32(RIO), b; \
- vmovdqu 16*offs+64(RIO), c; \
- vmovdqu 16*offs+96(RIO), d; \
- \
- transpose_4x4(a, b, c, d, RX0, RY0);
-
-#define write_blocks8(offs,a,b,c,d) \
- transpose_4x4(a, b, c, d, RX0, RY0); \
- \
- vmovdqu a, 16*offs(RIO); \
- vmovdqu b, 16*offs+32(RIO); \
- vmovdqu c, 16*offs+64(RIO); \
- vmovdqu d, 16*offs+96(RIO);
-
#define inpack_enc8(a,b,c,d) \
vpbroadcastd 4*0(RW), RT0; \
vpxor RT0, a, a; \
@@ -414,23 +374,35 @@ __twofish_enc_blk16:
* ciphertext blocks
*/
CFI_STARTPROC();
+
+ pushq RROUND;
+ CFI_PUSH(RROUND);
+
init_round_constants();
transpose4x4_16(RA, RB, RC, RD);
inpack_enc16(RA, RB, RC, RD);
- encrypt_cycle_first16(0);
- encrypt_cycle16(2);
- encrypt_cycle16(4);
- encrypt_cycle16(6);
- encrypt_cycle16(8);
- encrypt_cycle16(10);
- encrypt_cycle16(12);
- encrypt_cycle_last16(14);
+ xorl RROUNDd, RROUNDd;
+
+ encrypt_round_first16(RA, RB, RC, RD, 0, RROUND);
+
+.align 16
+.Loop_enc16:
+ encrypt_round16(RC, RD, RA, RB, 8, RROUND);
+ encrypt_round16(RA, RB, RC, RD, 16, RROUND);
+ leal 16(RROUNDd), RROUNDd;
+ cmpl $8*14, RROUNDd;
+ jb .Loop_enc16;
+
+ encrypt_round_last16(RC, RD, RA, RB, 8, RROUND);
outunpack_enc16(RA, RB, RC, RD);
transpose4x4_16(RA, RB, RC, RD);
+ popq RROUND;
+ CFI_POP(RROUND);
+
ret_spec_stop;
CFI_ENDPROC();
ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;)
@@ -447,23 +419,34 @@ __twofish_dec_blk16:
* ciphertext blocks
*/
CFI_STARTPROC();
+
+ pushq RROUND;
+ CFI_PUSH(RROUND);
+
init_round_constants();
transpose4x4_16(RA, RB, RC, RD);
inpack_dec16(RA, RB, RC, RD);
- decrypt_cycle_first16(14);
- decrypt_cycle16(12);
- decrypt_cycle16(10);
- decrypt_cycle16(8);
- decrypt_cycle16(6);
- decrypt_cycle16(4);
- decrypt_cycle16(2);
- decrypt_cycle_last16(0);
+ movl $14*8, RROUNDd;
+
+ decrypt_round_first16(RC, RD, RA, RB, 8, RROUND);
+
+.align 16
+.Loop_dec16:
+ decrypt_round16(RA, RB, RC, RD, 0, RROUND);
+ decrypt_round16(RC, RD, RA, RB, -8, RROUND);
+ subl $16, RROUNDd;
+ jnz .Loop_dec16;
+
+ decrypt_round_last16(RA, RB, RC, RD, 0, RROUND);
outunpack_dec16(RA, RB, RC, RD);
transpose4x4_16(RA, RB, RC, RD);
+ popq RROUND;
+ CFI_POP(RROUND);
+
ret_spec_stop;
CFI_ENDPROC();
ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)
--
2.39.2
More information about the Gcrypt-devel
mailing list