[PATCH 1/2] Tweak Camellia-AVX key-setup for small speed-up

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Tue Nov 19 18:30:00 CET 2013


* cipher/camellia-aesni-avx-amd64.S (camellia_f): Merge S-function output
rotation with P-function.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
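Note, not part of the patch: a minimal C/SSE2 sketch of why the
vpsllw-to-vpaddw substitution in the first hunk is safe. Shifting each
16-bit lane left by one is the same as adding the register to itself,
and on several x86 microarchitectures vector adds can issue on more
execution ports than immediate shifts. Names below are illustrative
only, not taken from the patch.

#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char in[16], a[16], b[16];
	int i;

	/* Arbitrary test pattern. */
	for (i = 0; i < 16; i++)
		in[i] = (unsigned char)(i * 37 + 11);

	__m128i x = _mm_loadu_si128((const __m128i *)in);

	/* "vpsllw $1, x": shift each 16-bit lane left by one. */
	__m128i by_shift = _mm_slli_epi16(x, 1);

	/* "vpaddw x, x": adding a lane to itself is the same shift. */
	__m128i by_add = _mm_add_epi16(x, x);

	_mm_storeu_si128((__m128i *)a, by_shift);
	_mm_storeu_si128((__m128i *)b, by_add);

	printf("identical: %s\n", memcmp(a, b, 16) == 0 ? "yes" : "no");
	return 0;
}

The same instruction-count thinking drives the rest of the second hunk:
the eight 64-bit vpshufb masks are paired into four 128-bit masks, so
the rotated sbox outputs are shuffled and XORed directly instead of
first being merged back into x, and a final vpsrldq/vpxor folds the two
64-bit halves together.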
 cipher/camellia-aesni-avx-amd64.S |   72 ++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 44 deletions(-)

diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index b25a8c7..ffb1aed 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1213,7 +1213,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 	/* input rotation for sbox4 (<<< 1) */ \
 	vpand x, sbox4mask, t0; \
 	vpandn x, sbox4mask, x; \
-	vpsllw $1, t0, t1; \
+	vpaddw t0, t0, t1; \
 	vpsrlw $7, t0, t0; \
 	vpor t0, t1, t0; \
 	vpand sbox4mask, t0, t0; \
@@ -1238,34 +1238,22 @@ _gcry_camellia_aesni_avx_cfb_dec:
 	vpor sbox2mask, t4, t2; \
 	vpand x, sbox2mask, t0; \
 	vpand x, t4, t1; \
-	vpandn x, t2, x; \
-	vpsllw $1, t0, t2; \
+	vpaddb x, x, t2; \
+	vpshufb .Lsp1110111044044404mask RIP, x, t4; \
+	vpshufb .Lsp0044440410011110mask RIP, x, x; \
 	vpsrlw $7, t0, t0; \
-	vpor t0, t2, t0; \
-	vpand sbox2mask, t0, t0; \
-	vpsllw $7, t1, t2; \
+	vpsllw $7, t1, t3; \
 	vpsrlw $1, t1, t1; \
-	vpor t1, t2, t1; \
-	vpand t4, t1, t1; \
-	vpor x, t0, x; \
-	vpor x, t1, x; \
-	\
-	vpshufb .Lsp11101110mask RIP, x, t4; \
-	vpshufb .Lsp44044404mask RIP, x, t1; \
-	vpshufb .Lsp30333033mask RIP, x, t2; \
-	vpshufb .Lsp02220222mask RIP, x, t0; \
-	vpxor t2, t1, t1; \
-	\
-	vpshufb .Lsp00444404mask RIP, x, t2; \
-	vpxor t0, t1, t1; \
-	vpshufb .Lsp03303033mask RIP, x, t0; \
-	vpxor t2, t4, t4; \
-	vpshufb .Lsp22000222mask RIP, x, t2; \
-	vpxor t0, t1, t1; \
-	vpxor t2, t4, t4; \
-	vpshufb .Lsp10011110mask RIP, x, x; \
-	vpxor t1, x, x; \
-	vpxor t4, x, x;
+	vpor t0, t2, t0; \
+	vpshufb .Lsp0222022222000222mask RIP, t0, t0; \
+	vpor t1, t3, t1; \
+	vpshufb .Lsp3033303303303033mask RIP, t1, t1; \
+	\
+	vpxor x, t4, t4; \
+	vpxor t1, t0, t0; \
+	vpxor t4, t0, t0; \
+	vpsrldq $8, t0, x; \
+	vpxor t0, x, x;
 
 #define vec_rol128(in, out, nrol, t0) \
 	vpshufd $0x4e, in, out; \
@@ -1281,29 +1269,25 @@ _gcry_camellia_aesni_avx_cfb_dec:
 
 .data
 
-.align 8
+.align 16
+.Lsp1110111044044404mask:
+	.long 0x000000ff, 0x000000ff;
+	.long 0x0101ff01, 0x0101ff01;
+.Lsp0044440410011110mask:
+	.long 0xffff0404, 0x0404ff04;
+	.long 0x07ffff07, 0x070707ff;
+.Lsp0222022222000222mask:
+	.long 0xff030303, 0xff030303;
+	.long 0x0606ffff, 0xff060606;
+.Lsp3033303303303033mask:
+	.long 0x02ff0202, 0x02ff0202;
+	.long 0xff0505ff, 0x05ff0505;
 .Lsbox2_output_mask:
 	.byte 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0xff, 0x00;
 .Lsbox3_output_mask:
 	.byte 0x00, 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00;
 .Lsbox4_input_mask:
 	.byte 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00;
-.Lsp11101110mask:
-	.long 0x000000ff, 0x000000ff;
-.Lsp44044404mask:
-	.long 0x0101ff01, 0x0101ff01;
-.Lsp30333033mask:
-	.long 0x02ff0202, 0x02ff0202;
-.Lsp02220222mask:
-	.long 0xff030303, 0xff030303;
-.Lsp00444404mask:
-	.long 0xffff0404, 0x0404ff04;
-.Lsp03303033mask:
-	.long 0xff0505ff, 0x05ff0505;
-.Lsp22000222mask:
-	.long 0x0606ffff, 0xff060606;
-.Lsp10011110mask:
-	.long 0x07ffff07, 0x070707ff;
 .Lsigma1:
 	.long 0x3BCC908B, 0xA09E667F;
 .Lsigma2:



