[PATCH v2] camellia-gfni: use GFNI for uint8 right shift in FLS

Tue Mar 14 18:28:55 CET 2023

* cipher/camellia-aesni-avx2-amd64.h (IF_GFNI, IF_NOT_GFNI): New.
[CAMELLIA_GFNI_BUILD] (rol32_1_32): Add GFNI variant which uses
vgf2p8affineqb for uint8 right shift by 7.
(fls32): Load 'right shift by 7' bit-matrix on GFNI build.
[CAMELLIA_GFNI_BUILD] (.Lright_shift_by_7): New.
* cipher/camellia-gfni-avx512-amd64.S (clear_regs): Don't clear %k1.
(rol32_1_64): Use vgf2p8affineqb for uint8 right shift by 7.
(fls64): Adjust for rol32_1_64 changes.
(.Lbyte_ones): Remove.
(.Lright_shift_by_7): New.
(_gcry_camellia_gfni_avx512_ctr_enc): Clear %k1 after use.
--

Benchmark on Intel Core i3-1115G4:

Before:
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.194 ns/B      4920 MiB/s     0.794 c/B      4096±4
        ECB dec |     0.194 ns/B      4916 MiB/s     0.793 c/B      4089

After (~1.7% faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.190 ns/B      5008 MiB/s     0.780 c/B      4096±3
        ECB dec |     0.191 ns/B      5002 MiB/s     0.781 c/B      4096±3

[v2]:
  Apply the same optimization to the GFNI build of "cipher/camellia-aesni-avx2-amd64.h".

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/camellia-aesni-avx2-amd64.h  | 43 ++++++++++++++++++++++++++++-
 cipher/camellia-gfni-avx512-amd64.S | 37 +++++++++++++------------
 2 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index 003c4496..dff8b386 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -73,6 +73,14 @@
 # define IF_VAES(...)
 #endif
 
+#ifdef CAMELLIA_GFNI_BUILD
+# define IF_GFNI(...) __VA_ARGS__
+# define IF_NOT_GFNI(...)
+#else
+# define IF_GFNI(...)
+# define IF_NOT_GFNI(...) __VA_ARGS__
+#endif
+
 /**********************************************************************
   GFNI helper macros and constants
  **********************************************************************/
@@ -459,6 +467,26 @@
  * OUT:
  *  v0..3: (IN <<< 1)
  */
+#ifdef CAMELLIA_GFNI_BUILD
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, right_shift_by_7) \
+	vgf2p8affineqb $0, right_shift_by_7, v0, t0; \
+	vpaddb v0, v0, v0; \
+	\
+	vgf2p8affineqb $0, right_shift_by_7, v1, t1; \
+	vpaddb v1, v1, v1; \
+	\
+	vgf2p8affineqb $0, right_shift_by_7, v2, t2; \
+	vpaddb v2, v2, v2; \
+	\
+	vpor t0, v1, v1; \
+	\
+	vgf2p8affineqb $0, right_shift_by_7, v3, t0; \
+	vpaddb v3, v3, v3; \
+	\
+	vpor t1, v2, v2; \
+	vpor t2, v3, v3; \
+	vpor t0, v0, v0;
+#else
 #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
 	vpcmpgtb v0, zero, t0; \
 	vpaddb v0, v0, v0; \
@@ -481,6 +509,7 @@
 	vpor t1, v2, v2; \
 	vpor t2, v3, v3; \
 	vpor t0, v0, v0;
+#endif
 
 /*
  * IN:
@@ -496,7 +525,8 @@
 	 * t0 &= ll; \
 	 * lr ^= rol32(t0, 1); \
 	 */ \
-	vpxor tt0, tt0, tt0; \
+	IF_NOT_GFNI(vpxor tt0, tt0, tt0); \
+	IF_GFNI(vpbroadcastq .Lright_shift_by_7 rRIP, tt0); \
 	vpbroadcastb 0+kll, t3; \
 	vpbroadcastb 1+kll, t2; \
 	vpbroadcastb 2+kll, t1; \
@@ -867,6 +897,17 @@ ELF(.type   FUNC_NAME(_constants),@object;)
 		    BV8(0, 0, 0, 1, 1, 1, 0, 0),
 		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 
+/* Bit-matrix for right shifting uint8_t values in vector by 7. */
+.Lright_shift_by_7:
+	.quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0))
+
 #else /* CAMELLIA_GFNI_BUILD */
 
 /*
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
index b676379f..643eed3e 100644
--- a/cipher/camellia-gfni-avx512-amd64.S
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -105,7 +105,6 @@
 	clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31)
 
 #define clear_regs() \
-	kxorq %k1, %k1, %k1; \
 	vzeroall; \
 	clear_zmm16_zmm31()
 
@@ -307,22 +306,18 @@
  *  v0..3: (IN << 1)
- *  t0, t1, t2, zero: (IN >> 7)
+ *  t0, t1, t2, t3: (IN >> 7)
  */
-#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, zero, one) \
-	vpcmpltb zero, v0, %k1; \
+#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, t3, right_shift_by_7) \
+	vgf2p8affineqb $0, right_shift_by_7, v0, t0; \
 	vpaddb v0, v0, v0; \
-	vpaddb one, zero, t0{%k1}{z}; \
 	\
-	vpcmpltb zero, v1, %k1; \
+	vgf2p8affineqb $0, right_shift_by_7, v1, t1; \
 	vpaddb v1, v1, v1; \
-	vpaddb one, zero, t1{%k1}{z}; \
 	\
-	vpcmpltb zero, v2, %k1; \
+	vgf2p8affineqb $0, right_shift_by_7, v2, t2; \
 	vpaddb v2, v2, v2; \
-	vpaddb one, zero, t2{%k1}{z}; \
 	\
-	vpcmpltb zero, v3, %k1; \
-	vpaddb v3, v3, v3; \
-	vpaddb one, zero, zero{%k1}{z};
+	vgf2p8affineqb $0, right_shift_by_7, v3, t3; \
+	vpaddb v3, v3, v3;
 
 /*
  * IN:
@@ -338,8 +333,7 @@
 	 * t0 &= ll; \
 	 * lr ^= rol32(t0, 1); \
 	 */ \
-	vpbroadcastq .Lbyte_ones rRIP, tmp; \
-	vpxor tt3##_y, tt3##_y, tt3##_y; \
+	vpbroadcastq .Lright_shift_by_7 rRIP, tmp; \
 	vpbroadcastb 0+kll, t3; \
 	vpbroadcastb 1+kll, t2; \
 	vpbroadcastb 2+kll, t1; \
@@ -360,7 +354,6 @@
 	vmovdqu64 l6, l##_6; \
 	vpternlogq $0x96, tt3, t3, l7; \
 	vmovdqu64 l7, l##_7; \
-	vpxor tt3##_y, tt3##_y, tt3##_y; \
 	\
 	/* \
 	 * t2 = krr; \
@@ -399,7 +392,6 @@
 	vpternlogq $0x96, tt1, t1, r##_5; \
 	vpternlogq $0x96, tt0, t2, r##_6; \
 	vpternlogq $0x96, tt3, t3, r##_7; \
-	vpxor tt3##_y, tt3##_y, tt3##_y; \
 	\
 	/* \
 	 * t0 = klr; \
@@ -596,9 +588,6 @@ ELF(.type   _gcry_camellia_gfni_avx512__constants,@object;)
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-.Lbyte_ones:
-	.quad 0x0101010101010101
-
 /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3
  * and s4.
  *   See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
@@ -663,6 +652,17 @@ ELF(.type   _gcry_camellia_gfni_avx512__constants, at object;)
 		    BV8(0, 0, 0, 1, 1, 1, 0, 0),
 		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 
+/* Bit-matrix for right shifting uint8_t values in vector by 7. */
+.Lright_shift_by_7:
+	.quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 0))
+
 /* CTR byte addition constants */
 .align 64
 .Lbige_addb_0_1:
@@ -904,6 +904,7 @@ _gcry_camellia_gfni_avx512_ctr_enc:
 	add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */
 	add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */
 	add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */
+	kxorq %k1, %k1, %k1;
 
 .align 4
 .Lload_ctr_done:
-- 
2.37.2