[PATCH 5/8] camellia-avx2: add fast path for full 32 block ECB input
Jussi Kivilinna
jussi.kivilinna at iki.fi
Wed Feb 22 20:29:21 CET 2023
* cipher/camellia-aesni-avx2-amd64.h (enc_blk1_32, dec_blk1_32): Add
fast path for 32 block input.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-aesni-avx2-amd64.h | 41 ++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 8 deletions(-)
diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index 7d451c09..92f0ce5f 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -2127,12 +2127,9 @@ FUNC_NAME(enc_blk1_32):
cmpl $31, %ecx;
vpxor %xmm0, %xmm0, %xmm0;
- ja 1f;
+ ja .Lenc_blk32;
jb 2f;
vmovdqu 15 * 32(%rdx), %xmm0;
- jmp 2f;
- 1:
- vmovdqu 15 * 32(%rdx), %ymm0;
2:
vmovdqu %ymm0, (%rax);
@@ -2195,13 +2192,29 @@ FUNC_NAME(enc_blk1_32):
STORE_OUTPUT(ymm9, 14);
STORE_OUTPUT(ymm8, 15);
+.align 8
2:
+.Lenc_blk32_done:
vzeroall;
leave;
CFI_LEAVE();
ret_spec_stop;
CFI_ENDPROC();
+
+.align 8
+.Lenc_blk32:
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX));
+
+ call FUNC_NAME(enc_blk32);
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+ jmp .Lenc_blk32_done;
+ CFI_ENDPROC();
ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);)
.align 16
@@ -2235,12 +2248,9 @@ FUNC_NAME(dec_blk1_32):
cmpl $31, %ecx;
vpxor %xmm0, %xmm0, %xmm0;
- ja 1f;
+ ja .Ldec_blk32;
jb 2f;
vmovdqu 15 * 32(%rdx), %xmm0;
- jmp 2f;
- 1:
- vmovdqu 15 * 32(%rdx), %ymm0;
2:
vmovdqu %ymm0, (%rax);
@@ -2284,12 +2294,27 @@ FUNC_NAME(dec_blk1_32):
STORE_OUTPUT(ymm9, 14);
STORE_OUTPUT(ymm8, 15);
+.align 8
2:
+.Ldec_blk32_done:
vzeroall;
leave;
CFI_LEAVE();
ret_spec_stop;
+
+.align 8
+.Ldec_blk32:
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ call FUNC_NAME(dec_blk32);
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+ jmp .Ldec_blk32_done;
CFI_ENDPROC();
ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);)
--
2.37.2
More information about the Gcrypt-devel
mailing list