[PATCH 4/7] sm4: deduplicate AVX2 cipher mode code between AES-NI and GFNI
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Jun 28 14:37:38 CEST 2026
* cipher/Makefile.am: Add 'sm4-avx2-amd64.h'.
* cipher/sm4-avx2-amd64.h: New, shared cipher mode functions moved
here from 'sm4-aesni-avx2-amd64.S' and 'sm4-gfni-avx2-amd64.S'.
* cipher/sm4-aesni-avx2-amd64.S (__sm4_crypt_blk16): Rename to
'__sm4_aesni_crypt_blk16'.
(_gcry_sm4_aesni_avx2_ctr_enc, _gcry_sm4_aesni_avx2_cbc_dec)
(_gcry_sm4_aesni_avx2_cfb_dec, _gcry_sm4_aesni_avx2_ocb_enc)
(_gcry_sm4_aesni_avx2_ocb_dec, _gcry_sm4_aesni_avx2_ocb_auth)
(inc_le128, .Lbige_addb_*): Move to 'sm4-avx2-amd64.h'.
(FUNC_NAME, SM4_CRYPT_BLK16): New.
* cipher/sm4-gfni-avx2-amd64.S (_gcry_sm4_gfni_avx2_ctr_enc)
(_gcry_sm4_gfni_avx2_cbc_dec, _gcry_sm4_gfni_avx2_cfb_dec)
(_gcry_sm4_gfni_avx2_ocb_enc, _gcry_sm4_gfni_avx2_ocb_dec)
(_gcry_sm4_gfni_avx2_ocb_auth, inc_le128, .Lbige_addb_*): Move
to 'sm4-avx2-amd64.h'.
(FUNC_NAME, SM4_CRYPT_BLK16): New.
--
The CTR/CBC/CFB/OCB mode functions were identical between the AES-NI
and GFNI AVX2 implementations apart from the public symbol prefix and
the 16-block transform function called. Move them to a shared header
included by both, parametrized through FUNC_NAME and SM4_CRYPT_BLK16,
as already done for the Camellia AVX2 implementations.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 1 +
cipher/sm4-aesni-avx2-amd64.S | 630 +--------------------------------
cipher/sm4-avx2-amd64.h | 648 ++++++++++++++++++++++++++++++++++
cipher/sm4-gfni-avx2-amd64.S | 620 +-------------------------------
4 files changed, 662 insertions(+), 1237 deletions(-)
create mode 100644 cipher/sm4-avx2-amd64.h
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index b18ccfd9..f8777837 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -137,6 +137,7 @@ EXTRA_libcipher_la_SOURCES = \
serpent-avx512-x86.c serpent-armv7-neon.S \
simd-common-aarch64.h simd-common-ppc.h simd-common-riscv.h \
sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
+ sm4-avx2-amd64.h \
sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \
sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \
sm4-ppc.c \
diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S
index 03f979fa..128cac80 100644
--- a/cipher/sm4-aesni-avx2-amd64.S
+++ b/cipher/sm4-aesni-avx2-amd64.S
@@ -1,4 +1,4 @@
-/* sm4-avx2-amd64.S - AVX2 implementation of SM4 cipher
+/* sm4-aesni-avx2-amd64.S - AESNI/AVX2 implementation of SM4 cipher
*
* Copyright (C) 2020, 2022-2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
@@ -163,7 +163,7 @@ _sm4_aesni_avx2_consts:
.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
-/* For CTR-mode IV byteswap */
+/* For CTR-mode IV byteswap (used by sm4-avx2-amd64.h) and output word byteswap */
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -171,33 +171,6 @@ _sm4_aesni_avx2_consts:
.Lbswap32_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
-/* CTR byte addition constants */
-.align 32
-.Lbige_addb_0_1:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
-.Lbige_addb_2_3:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
-.Lbige_addb_4_5:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
-.Lbige_addb_6_7:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
-.Lbige_addb_8_9:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
-.Lbige_addb_10_11:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
-.Lbige_addb_12_13:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
-.Lbige_addb_14_15:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
-
.align 4
/* 4-bit mask */
.L0f0f0f0f:
@@ -206,8 +179,8 @@ _sm4_aesni_avx2_consts:
.text
.align 16
-ELF(.type __sm4_crypt_blk16, at function;)
-__sm4_crypt_blk16:
+ELF(.type __sm4_aesni_crypt_blk16, at function;)
+__sm4_aesni_crypt_blk16:
/* input:
* %rdi: ctx, CTX
* RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
@@ -318,7 +291,7 @@ __sm4_crypt_blk16:
ret_spec_stop;
CFI_ENDPROC();
-ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;)
+ELF(.size __sm4_aesni_crypt_blk16,.-__sm4_aesni_crypt_blk16;)
.align 16
.globl _gcry_sm4_aesni_avx2_crypt_blk1_16
@@ -352,7 +325,7 @@ _gcry_sm4_aesni_avx2_crypt_blk1_16:
#undef LOAD_INPUT
.Lblk16_load_input_done:
- call __sm4_crypt_blk16;
+ call __sm4_aesni_crypt_blk16;
#define STORE_OUTPUT(yreg, offset) \
cmpq $(1 + 2 * (offset)), %rcx; \
@@ -380,594 +353,9 @@ _gcry_sm4_aesni_avx2_crypt_blk1_16:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx2_crypt_blk1_16,.-_gcry_sm4_aesni_avx2_crypt_blk1_16;)
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-.align 16
-.globl _gcry_sm4_aesni_avx2_ctr_enc
-ELF(.type _gcry_sm4_aesni_avx2_ctr_enc, at function;)
-_gcry_sm4_aesni_avx2_ctr_enc:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (big endian, 128bit)
- */
- CFI_STARTPROC();
-
- cmpb $(0x100 - 16), 15(%rcx);
- jbe .Lctr_byteadd;
-
- movq 8(%rcx), %rax;
- bswapq %rax;
-
- vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
- vpcmpeqd RNOT, RNOT, RNOT;
- vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
- vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
-
- /* load IV and byteswap */
- vmovdqu (%rcx), RTMP4x;
- vpshufb RTMP3x, RTMP4x, RTMP4x;
- vmovdqa RTMP4x, RTMP0x;
- inc_le128(RTMP4x, RNOTx, RTMP1x);
- vinserti128 $1, RTMP4x, RTMP0, RTMP0;
- vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
-
- /* check need for handling 64-bit overflow and carry */
- cmpq $(0xffffffffffffffff - 16), %rax;
- ja .Lhandle_ctr_carry;
-
- /* construct IVs */
- vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
- vpshufb RTMP3, RTMP0, RA1;
- vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
- vpshufb RTMP3, RTMP0, RA2;
- vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
- vpshufb RTMP3, RTMP0, RA3;
- vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
- vpshufb RTMP3, RTMP0, RB0;
- vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
- vpshufb RTMP3, RTMP0, RB1;
- vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
- vpshufb RTMP3, RTMP0, RB2;
- vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
- vpshufb RTMP3, RTMP0, RB3;
- vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
- vpshufb RTMP3x, RTMP0x, RTMP0x;
-
- jmp .Lctr_carry_done;
-
-.Lhandle_ctr_carry:
- /* construct IVs */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
- inc_le128(RTMP0, RNOT, RTMP1);
- vextracti128 $1, RTMP0, RTMP0x;
- vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
-
-.Lctr_carry_done:
- /* store new IV */
- vmovdqu RTMP0x, (%rcx);
-
-.align 8
-.Lload_ctr_done:
- call __sm4_crypt_blk16;
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
- vpxor (1 * 32)(%rdx), RA1, RA1;
- vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
- vpxor (5 * 32)(%rdx), RB1, RB1;
- vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
-
- ret_spec_stop;
-
-.align 8
-.Lctr_byteadd_full_ctr_carry:
- movq 8(%rcx), %r11;
- movq (%rcx), %r10;
- bswapq %r11;
- bswapq %r10;
- addq $16, %r11;
- adcq $0, %r10;
- bswapq %r11;
- bswapq %r10;
- movq %r11, 8(%rcx);
- movq %r10, (%rcx);
- jmp .Lctr_byteadd_ymm;
-.align 8
-.Lctr_byteadd:
- vbroadcasti128 (%rcx), RB3;
- je .Lctr_byteadd_full_ctr_carry;
- addb $16, 15(%rcx);
-.Lctr_byteadd_ymm:
- vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
- vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
- vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
- vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
- vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
- vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
- vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
- vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
-
- jmp .Lload_ctr_done;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;)
-
-.align 16
-.globl _gcry_sm4_aesni_avx2_cbc_dec
-ELF(.type _gcry_sm4_aesni_avx2_cbc_dec, at function;)
-_gcry_sm4_aesni_avx2_cbc_dec:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv
- */
- CFI_STARTPROC();
-
- vmovdqu (0 * 32)(%rdx), RA0;
- vmovdqu (1 * 32)(%rdx), RA1;
- vmovdqu (2 * 32)(%rdx), RA2;
- vmovdqu (3 * 32)(%rdx), RA3;
- vmovdqu (4 * 32)(%rdx), RB0;
- vmovdqu (5 * 32)(%rdx), RB1;
- vmovdqu (6 * 32)(%rdx), RB2;
- vmovdqu (7 * 32)(%rdx), RB3;
-
- call __sm4_crypt_blk16;
-
- vmovdqu (%rcx), RNOTx;
- vinserti128 $1, (%rdx), RNOT, RNOT;
- vpxor RNOT, RA0, RA0;
- vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
- vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
- vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
- vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
- vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
- vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
- vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
- vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
- vmovdqu RNOTx, (%rcx); /* store new IV */
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_aesni_avx2_cbc_dec,.-_gcry_sm4_aesni_avx2_cbc_dec;)
-
-.align 16
-.globl _gcry_sm4_aesni_avx2_cfb_dec
-ELF(.type _gcry_sm4_aesni_avx2_cfb_dec, at function;)
-_gcry_sm4_aesni_avx2_cfb_dec:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv
- */
- CFI_STARTPROC();
-
- /* Load input */
- vmovdqu (%rcx), RNOTx;
- vinserti128 $1, (%rdx), RNOT, RA0;
- vmovdqu (0 * 32 + 16)(%rdx), RA1;
- vmovdqu (1 * 32 + 16)(%rdx), RA2;
- vmovdqu (2 * 32 + 16)(%rdx), RA3;
- vmovdqu (3 * 32 + 16)(%rdx), RB0;
- vmovdqu (4 * 32 + 16)(%rdx), RB1;
- vmovdqu (5 * 32 + 16)(%rdx), RB2;
- vmovdqu (6 * 32 + 16)(%rdx), RB3;
-
- /* Update IV */
- vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
- vmovdqu RNOTx, (%rcx);
-
- call __sm4_crypt_blk16;
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
- vpxor (1 * 32)(%rdx), RA1, RA1;
- vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
- vpxor (5 * 32)(%rdx), RB1, RB1;
- vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_aesni_avx2_cfb_dec,.-_gcry_sm4_aesni_avx2_cfb_dec;)
-
-.align 16
-.globl _gcry_sm4_aesni_avx2_ocb_enc
-ELF(.type _gcry_sm4_aesni_avx2_ocb_enc, at function;)
-
-_gcry_sm4_aesni_avx2_ocb_enc:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: offset
- * %r8 : checksum
- * %r9 : L pointers (void *L[16])
- */
- CFI_STARTPROC();
-
- subq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(4 * 8);
-
- movq %r10, (0 * 8)(%rsp);
- movq %r11, (1 * 8)(%rsp);
- movq %r12, (2 * 8)(%rsp);
- movq %r13, (3 * 8)(%rsp);
- CFI_REL_OFFSET(%r10, 0 * 8);
- CFI_REL_OFFSET(%r11, 1 * 8);
- CFI_REL_OFFSET(%r12, 2 * 8);
- CFI_REL_OFFSET(%r13, 3 * 8);
-
- vmovdqu (%rcx), RTMP0x;
- vmovdqu (%r8), RTMP1x;
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Checksum_i = Checksum_{i-1} xor P_i */
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
-
-#define OCB_INPUT(n, l0reg, l1reg, yreg) \
- vmovdqu (n * 32)(%rdx), yreg; \
- vpxor (l0reg), RTMP0x, RNOTx; \
- vpxor (l1reg), RNOTx, RTMP0x; \
- vinserti128 $1, RTMP0x, RNOT, RNOT; \
- vpxor yreg, RTMP1, RTMP1; \
- vpxor yreg, RNOT, yreg; \
- vmovdqu RNOT, (n * 32)(%rsi);
-
- movq (0 * 8)(%r9), %r10;
- movq (1 * 8)(%r9), %r11;
- movq (2 * 8)(%r9), %r12;
- movq (3 * 8)(%r9), %r13;
- OCB_INPUT(0, %r10, %r11, RA0);
- OCB_INPUT(1, %r12, %r13, RA1);
- movq (4 * 8)(%r9), %r10;
- movq (5 * 8)(%r9), %r11;
- movq (6 * 8)(%r9), %r12;
- movq (7 * 8)(%r9), %r13;
- OCB_INPUT(2, %r10, %r11, RA2);
- OCB_INPUT(3, %r12, %r13, RA3);
- movq (8 * 8)(%r9), %r10;
- movq (9 * 8)(%r9), %r11;
- movq (10 * 8)(%r9), %r12;
- movq (11 * 8)(%r9), %r13;
- OCB_INPUT(4, %r10, %r11, RB0);
- OCB_INPUT(5, %r12, %r13, RB1);
- movq (12 * 8)(%r9), %r10;
- movq (13 * 8)(%r9), %r11;
- movq (14 * 8)(%r9), %r12;
- movq (15 * 8)(%r9), %r13;
- OCB_INPUT(6, %r10, %r11, RB2);
- OCB_INPUT(7, %r12, %r13, RB3);
-#undef OCB_INPUT
-
- vextracti128 $1, RTMP1, RNOTx;
- vmovdqu RTMP0x, (%rcx);
- vpxor RNOTx, RTMP1x, RTMP1x;
- vmovdqu RTMP1x, (%r8);
-
- movq (0 * 8)(%rsp), %r10;
- movq (1 * 8)(%rsp), %r11;
- movq (2 * 8)(%rsp), %r12;
- movq (3 * 8)(%rsp), %r13;
- CFI_RESTORE(%r10);
- CFI_RESTORE(%r11);
- CFI_RESTORE(%r12);
- CFI_RESTORE(%r13);
-
- call __sm4_crypt_blk16;
-
- addq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(-4 * 8);
-
- vpxor (0 * 32)(%rsi), RA0, RA0;
- vpxor (1 * 32)(%rsi), RA1, RA1;
- vpxor (2 * 32)(%rsi), RA2, RA2;
- vpxor (3 * 32)(%rsi), RA3, RA3;
- vpxor (4 * 32)(%rsi), RB0, RB0;
- vpxor (5 * 32)(%rsi), RB1, RB1;
- vpxor (6 * 32)(%rsi), RB2, RB2;
- vpxor (7 * 32)(%rsi), RB3, RB3;
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_aesni_avx2_ocb_enc,.-_gcry_sm4_aesni_avx2_ocb_enc;)
-
-.align 16
-.globl _gcry_sm4_aesni_avx2_ocb_dec
-ELF(.type _gcry_sm4_aesni_avx2_ocb_dec, at function;)
-
-_gcry_sm4_aesni_avx2_ocb_dec:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: offset
- * %r8 : checksum
- * %r9 : L pointers (void *L[16])
- */
- CFI_STARTPROC();
-
- subq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(4 * 8);
-
- movq %r10, (0 * 8)(%rsp);
- movq %r11, (1 * 8)(%rsp);
- movq %r12, (2 * 8)(%rsp);
- movq %r13, (3 * 8)(%rsp);
- CFI_REL_OFFSET(%r10, 0 * 8);
- CFI_REL_OFFSET(%r11, 1 * 8);
- CFI_REL_OFFSET(%r12, 2 * 8);
- CFI_REL_OFFSET(%r13, 3 * 8);
-
- vmovdqu (%rcx), RTMP0x;
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
-
-#define OCB_INPUT(n, l0reg, l1reg, yreg) \
- vmovdqu (n * 32)(%rdx), yreg; \
- vpxor (l0reg), RTMP0x, RNOTx; \
- vpxor (l1reg), RNOTx, RTMP0x; \
- vinserti128 $1, RTMP0x, RNOT, RNOT; \
- vpxor yreg, RNOT, yreg; \
- vmovdqu RNOT, (n * 32)(%rsi);
-
- movq (0 * 8)(%r9), %r10;
- movq (1 * 8)(%r9), %r11;
- movq (2 * 8)(%r9), %r12;
- movq (3 * 8)(%r9), %r13;
- OCB_INPUT(0, %r10, %r11, RA0);
- OCB_INPUT(1, %r12, %r13, RA1);
- movq (4 * 8)(%r9), %r10;
- movq (5 * 8)(%r9), %r11;
- movq (6 * 8)(%r9), %r12;
- movq (7 * 8)(%r9), %r13;
- OCB_INPUT(2, %r10, %r11, RA2);
- OCB_INPUT(3, %r12, %r13, RA3);
- movq (8 * 8)(%r9), %r10;
- movq (9 * 8)(%r9), %r11;
- movq (10 * 8)(%r9), %r12;
- movq (11 * 8)(%r9), %r13;
- OCB_INPUT(4, %r10, %r11, RB0);
- OCB_INPUT(5, %r12, %r13, RB1);
- movq (12 * 8)(%r9), %r10;
- movq (13 * 8)(%r9), %r11;
- movq (14 * 8)(%r9), %r12;
- movq (15 * 8)(%r9), %r13;
- OCB_INPUT(6, %r10, %r11, RB2);
- OCB_INPUT(7, %r12, %r13, RB3);
-#undef OCB_INPUT
-
- vmovdqu RTMP0x, (%rcx);
-
- movq (0 * 8)(%rsp), %r10;
- movq (1 * 8)(%rsp), %r11;
- movq (2 * 8)(%rsp), %r12;
- movq (3 * 8)(%rsp), %r13;
- CFI_RESTORE(%r10);
- CFI_RESTORE(%r11);
- CFI_RESTORE(%r12);
- CFI_RESTORE(%r13);
-
- call __sm4_crypt_blk16;
-
- addq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(-4 * 8);
-
- vmovdqu (%r8), RTMP1x;
-
- vpxor (0 * 32)(%rsi), RA0, RA0;
- vpxor (1 * 32)(%rsi), RA1, RA1;
- vpxor (2 * 32)(%rsi), RA2, RA2;
- vpxor (3 * 32)(%rsi), RA3, RA3;
- vpxor (4 * 32)(%rsi), RB0, RB0;
- vpxor (5 * 32)(%rsi), RB1, RB1;
- vpxor (6 * 32)(%rsi), RB2, RB2;
- vpxor (7 * 32)(%rsi), RB3, RB3;
-
- /* Checksum_i = Checksum_{i-1} xor P_i */
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vpxor RA0, RTMP1, RTMP1;
- vmovdqu RA1, (1 * 32)(%rsi);
- vpxor RA1, RTMP1, RTMP1;
- vmovdqu RA2, (2 * 32)(%rsi);
- vpxor RA2, RTMP1, RTMP1;
- vmovdqu RA3, (3 * 32)(%rsi);
- vpxor RA3, RTMP1, RTMP1;
- vmovdqu RB0, (4 * 32)(%rsi);
- vpxor RB0, RTMP1, RTMP1;
- vmovdqu RB1, (5 * 32)(%rsi);
- vpxor RB1, RTMP1, RTMP1;
- vmovdqu RB2, (6 * 32)(%rsi);
- vpxor RB2, RTMP1, RTMP1;
- vmovdqu RB3, (7 * 32)(%rsi);
- vpxor RB3, RTMP1, RTMP1;
-
- vextracti128 $1, RTMP1, RNOTx;
- vpxor RNOTx, RTMP1x, RTMP1x;
- vmovdqu RTMP1x, (%r8);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_aesni_avx2_ocb_dec,.-_gcry_sm4_aesni_avx2_ocb_dec;)
-
-.align 16
-.globl _gcry_sm4_aesni_avx2_ocb_auth
-ELF(.type _gcry_sm4_aesni_avx2_ocb_auth, at function;)
-
-_gcry_sm4_aesni_avx2_ocb_auth:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: abuf (16 blocks)
- * %rdx: offset
- * %rcx: checksum
- * %r8 : L pointers (void *L[16])
- */
- CFI_STARTPROC();
-
- subq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(4 * 8);
-
- movq %r10, (0 * 8)(%rsp);
- movq %r11, (1 * 8)(%rsp);
- movq %r12, (2 * 8)(%rsp);
- movq %r13, (3 * 8)(%rsp);
- CFI_REL_OFFSET(%r10, 0 * 8);
- CFI_REL_OFFSET(%r11, 1 * 8);
- CFI_REL_OFFSET(%r12, 2 * 8);
- CFI_REL_OFFSET(%r13, 3 * 8);
-
- vmovdqu (%rdx), RTMP0x;
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
-
-#define OCB_INPUT(n, l0reg, l1reg, yreg) \
- vmovdqu (n * 32)(%rsi), yreg; \
- vpxor (l0reg), RTMP0x, RNOTx; \
- vpxor (l1reg), RNOTx, RTMP0x; \
- vinserti128 $1, RTMP0x, RNOT, RNOT; \
- vpxor yreg, RNOT, yreg;
-
- movq (0 * 8)(%r8), %r10;
- movq (1 * 8)(%r8), %r11;
- movq (2 * 8)(%r8), %r12;
- movq (3 * 8)(%r8), %r13;
- OCB_INPUT(0, %r10, %r11, RA0);
- OCB_INPUT(1, %r12, %r13, RA1);
- movq (4 * 8)(%r8), %r10;
- movq (5 * 8)(%r8), %r11;
- movq (6 * 8)(%r8), %r12;
- movq (7 * 8)(%r8), %r13;
- OCB_INPUT(2, %r10, %r11, RA2);
- OCB_INPUT(3, %r12, %r13, RA3);
- movq (8 * 8)(%r8), %r10;
- movq (9 * 8)(%r8), %r11;
- movq (10 * 8)(%r8), %r12;
- movq (11 * 8)(%r8), %r13;
- OCB_INPUT(4, %r10, %r11, RB0);
- OCB_INPUT(5, %r12, %r13, RB1);
- movq (12 * 8)(%r8), %r10;
- movq (13 * 8)(%r8), %r11;
- movq (14 * 8)(%r8), %r12;
- movq (15 * 8)(%r8), %r13;
- OCB_INPUT(6, %r10, %r11, RB2);
- OCB_INPUT(7, %r12, %r13, RB3);
-#undef OCB_INPUT
-
- vmovdqu RTMP0x, (%rdx);
-
- movq (0 * 8)(%rsp), %r10;
- movq (1 * 8)(%rsp), %r11;
- movq (2 * 8)(%rsp), %r12;
- movq (3 * 8)(%rsp), %r13;
- CFI_RESTORE(%r10);
- CFI_RESTORE(%r11);
- CFI_RESTORE(%r12);
- CFI_RESTORE(%r13);
-
- call __sm4_crypt_blk16;
-
- addq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(-4 * 8);
-
- vpxor RA0, RB0, RA0;
- vpxor RA1, RB1, RA1;
- vpxor RA2, RB2, RA2;
- vpxor RA3, RB3, RA3;
-
- vpxor RA1, RA0, RA0;
- vpxor RA3, RA2, RA2;
-
- vpxor RA2, RA0, RTMP1;
-
- vextracti128 $1, RTMP1, RNOTx;
- vpxor (%rcx), RTMP1x, RTMP1x;
- vpxor RNOTx, RTMP1x, RTMP1x;
- vmovdqu RTMP1x, (%rcx);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_aesni_avx2_ocb_auth,.-_gcry_sm4_aesni_avx2_ocb_auth;)
+#define FUNC_NAME(func) _gcry_sm4_aesni_avx2_ ## func /* symbol prefix applied to the shared mode functions */
+#define SM4_CRYPT_BLK16 __sm4_aesni_crypt_blk16 /* 16-block transform called by the shared mode functions */
+#include "sm4-avx2-amd64.h" /* emits ctr_enc, cbc_dec, cfb_dec, ocb_* under the names above */
#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
#endif /*__x86_64*/
diff --git a/cipher/sm4-avx2-amd64.h b/cipher/sm4-avx2-amd64.h
new file mode 100644
index 00000000..6d3740dd
--- /dev/null
+++ b/cipher/sm4-avx2-amd64.h
@@ -0,0 +1,648 @@
+/* sm4-avx2-amd64.h - Shared AVX2 cipher mode code for SM4 cipher
+ *
+ * Copyright (C) 2020, 2022-2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_SM4_AVX2_AMD64_H
+#define GCRY_SM4_AVX2_AMD64_H
+
+SECTION_RODATA
+.align 32
+
+ELF(.type FUNC_NAME(cipher_mode_consts), at object)
+FUNC_NAME(cipher_mode_consts):
+
+/* CTR byte addition constants: each label is a 32-byte pair of addends (n, n+1) held in the last byte of each 16-byte lane; used with vpaddb at .Lctr_byteadd_ymm */
+.align 32
+.Lbige_addb_0_1:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
+.text
+
+#define inc_le128(x, minus_one, tmp) /* x += 1 per 128-bit lane (little-endian qwords); callers pass minus_one = -1:0 */ \
+ vpcmpeqq minus_one, x, tmp; /* low qword of tmp = all-ones when low qword of x is -1 (about to wrap) */ \
+ vpsubq minus_one, x, x; /* low qword: x - (-1) = x + 1; high qword: x - 0 */ \
+ vpslldq $8, tmp, tmp; /* shift the wrap mask from the low qword up into the high qword */ \
+ vpsubq tmp, x, x; /* high qword += 1 only when the low qword wrapped */
+
+.align 16
+.globl FUNC_NAME(ctr_enc)
+ELF(.type FUNC_NAME(ctr_enc), at function;)
+FUNC_NAME(ctr_enc):
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit); iv + 16 is stored back on every path
+ */
+ CFI_STARTPROC();
+
+ cmpb $(0x100 - 16), 15(%rcx); /* compare last IV byte against 240 */
+ jbe .Lctr_byteadd; /* <= 240: counters +0..+15 stay within the last byte */
+
+ movq 8(%rcx), %rax;
+ bswapq %rax; /* rax = low 64 bits of the counter in native byte order */
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; /* byte-reversal mask; label must be provided by the including file */
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x; /* RTMP0x = IV + 0 */
+ inc_le128(RTMP4x, RNOTx, RTMP1x); /* RTMP4x = IV + 1 */
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0; /* low lane = IV + 0, high lane = IV + 1 */
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry; /* low qword could wrap within +16: use carry-propagating increments */
+
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RA2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RA3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RB2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RB3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* low lane (+16) back to big endian */
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1); /* +16 ; +15 */
+ vextracti128 $1, RTMP0, RTMP0x; /* take the high lane, which holds +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+.align 8
+.Lload_ctr_done:
+ call SM4_CRYPT_BLK16; /* run the 16-block transform on the counters in RA0..RB3 */
+
+ vpxor (0 * 32)(%rdx), RA0, RA0; /* XOR transform output with src */
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi); /* write the 16 result blocks to dst */
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall; /* zero all vector registers before returning */
+
+ ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry: /* last byte == 240: add 16 to the whole 128-bit big-endian IV in memory */
+ movq 8(%rcx), %r11;
+ movq (%rcx), %r10;
+ bswapq %r11;
+ bswapq %r10;
+ addq $16, %r11; /* low half += 16 */
+ adcq $0, %r10; /* propagate carry into the high half */
+ bswapq %r11;
+ bswapq %r10;
+ movq %r11, 8(%rcx);
+ movq %r10, (%rcx);
+ jmp .Lctr_byteadd_ymm; /* RB3 still holds the old IV loaded at .Lctr_byteadd */
+.align 8
+.Lctr_byteadd:
+ vbroadcasti128 (%rcx), RB3; /* RB3 = old IV in both lanes; does not alter the flags set by cmpb */
+ je .Lctr_byteadd_full_ctr_carry; /* last byte == 240: stored IV + 16 carries past the last byte */
+ addb $16, 15(%rcx); /* last byte < 240: new IV differs only in the last byte */
+.Lctr_byteadd_ymm:
+ vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0; /* counters +0..+15 by byte-wise addition */
+ vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
+ vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
+ vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
+ vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
+ vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
+ vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
+ vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
+
+ jmp .Lload_ctr_done;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);)
+
+.align 16
+.globl FUNC_NAME(cbc_dec)
+ELF(.type FUNC_NAME(cbc_dec), at function;)
+FUNC_NAME(cbc_dec):
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv; replaced with the last src block before returning
+ */
+ CFI_STARTPROC();
+
+ vmovdqu (0 * 32)(%rdx), RA0; /* load the 16 src blocks, two per register */
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ vmovdqu (4 * 32)(%rdx), RB0;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RB2;
+ vmovdqu (7 * 32)(%rdx), RB3;
+
+ call SM4_CRYPT_BLK16; /* run the 16-block transform on RA0..RB3 */
+
+ vmovdqu (%rcx), RNOTx; /* low lane = iv */
+ vinserti128 $1, (%rdx), RNOT, RNOT; /* high lane = src block 0 */
+ vpxor RNOT, RA0, RA0; /* block 0 ^= iv, block 1 ^= src block 0 */
+ vpxor (0 * 32 + 16)(%rdx), RA1, RA1; /* remaining blocks ^= the src block one position earlier */
+ vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+ vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+ vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+ vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx; /* src block 15; read before any store to dst */
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi); /* write the 16 result blocks to dst */
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall; /* zero all vector registers before returning */
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);)
+
+.align 16
+.globl FUNC_NAME(cfb_dec)
+ELF(.type FUNC_NAME(cfb_dec), at function;)
+FUNC_NAME(cfb_dec):
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv; replaced with the last src block before the transform runs
+ */
+ CFI_STARTPROC();
+
+ /* Load input */
+ vmovdqu (%rcx), RNOTx; /* low lane = iv */
+ vinserti128 $1, (%rdx), RNOT, RA0; /* RA0 = iv : src block 0 */
+ vmovdqu (0 * 32 + 16)(%rdx), RA1; /* src blocks 1..14 fill the remaining transform inputs */
+ vmovdqu (1 * 32 + 16)(%rdx), RA2;
+ vmovdqu (2 * 32 + 16)(%rdx), RA3;
+ vmovdqu (3 * 32 + 16)(%rdx), RB0;
+ vmovdqu (4 * 32 + 16)(%rdx), RB1;
+ vmovdqu (5 * 32 + 16)(%rdx), RB2;
+ vmovdqu (6 * 32 + 16)(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx; /* src block 15 */
+ vmovdqu RNOTx, (%rcx);
+
+ call SM4_CRYPT_BLK16; /* run the 16-block transform on RA0..RB3 */
+
+ vpxor (0 * 32)(%rdx), RA0, RA0; /* XOR transform output with the same-position src block */
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi); /* write the 16 result blocks to dst */
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall; /* zero all vector registers before returning */
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);)
+
+.align 16
+.globl FUNC_NAME(ocb_enc)
+ELF(.type FUNC_NAME(ocb_enc), at function;)
+
+FUNC_NAME(ocb_enc):
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks); also used as scratch for the per-block offsets
+ * %rdx: src (16 blocks)
+ * %rcx: offset; updated to the offset of the last block
+ * %r8 : checksum; updated with the XOR of all 16 src blocks
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp; /* four stack slots for r10-r13 */
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x; /* RTMP0x = running offset */
+ vmovdqu (%r8), RTMP1x; /* RTMP1 = checksum accumulator (incoming value in the low lane) */
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; /* two src blocks */ \
+ vpxor (l0reg), RTMP0x, RNOTx; /* offset for the first block of the pair */ \
+ vpxor (l1reg), RNOTx, RTMP0x; /* offset for the second block; becomes the running offset */ \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; /* RNOT = both offsets */ \
+ vpxor yreg, RTMP1, RTMP1; /* accumulate checksum, one partial sum per lane */ \
+ vpxor yreg, RNOT, yreg; /* transform input = block xor offset */ \
+ vmovdqu RNOT, (n * 32)(%rsi); /* park the offsets in dst for the XOR after the call */
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vextracti128 $1, RTMP1, RNOTx; /* high-lane partial checksum */
+ vmovdqu RTMP0x, (%rcx); /* store the final offset */
+ vpxor RNOTx, RTMP1x, RTMP1x; /* fold both lanes into one 128-bit checksum */
+ vmovdqu RTMP1x, (%r8);
+
+ movq (0 * 8)(%rsp), %r10; /* restore r10-r13 before the call; the slots are released after it */
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call SM4_CRYPT_BLK16; /* run the 16-block transform on RA0..RB3 */
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor (0 * 32)(%rsi), RA0, RA0; /* XOR transform output with the offsets parked in dst */
+ vpxor (1 * 32)(%rsi), RA1, RA1;
+ vpxor (2 * 32)(%rsi), RA2, RA2;
+ vpxor (3 * 32)(%rsi), RA3, RA3;
+ vpxor (4 * 32)(%rsi), RB0, RB0;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RB2, RB2;
+ vpxor (7 * 32)(%rsi), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi); /* overwrite the parked offsets with the 16 result blocks */
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall; /* zero all vector registers before returning */
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);)
+
+.align 16
+.globl FUNC_NAME(ocb_dec)
+ELF(.type FUNC_NAME(ocb_dec),@function;)
+
+FUNC_NAME(ocb_dec):
+ /* OCB decryption of 16 blocks. input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks); also holds the per-block offsets until the final xor
+ * %rdx: src (16 blocks)
+ * %rcx: offset (in/out: loaded, advanced over 16 blocks, stored back)
+ * %r8 : checksum (in/out: xored with the 16 output blocks)
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp; /* four 8-byte spill slots for %r10-%r13 */
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x; /* RTMP0x = running offset */
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; /* load two source blocks */ \
+ vpxor (l0reg), RTMP0x, RNOTx; /* offset for the first block */ \
+ vpxor (l1reg), RNOTx, RTMP0x; /* offset for the second block */ \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; /* RNOT = second:first offset */ \
+ vpxor yreg, RNOT, yreg; /* cipher input = block xor offset */ \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rcx); /* store the offset after the 16th block */
+
+ movq (0 * 8)(%rsp), %r10; /* L pointers no longer needed: reload saved values */
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call SM4_CRYPT_BLK16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vmovdqu (%r8), RTMP1x; /* checksum in the low lane; VEX 128-bit load clears the high lane */
+
+ vpxor (0 * 32)(%rsi), RA0, RA0; /* xor with the offsets parked in dst */
+ vpxor (1 * 32)(%rsi), RA1, RA1;
+ vpxor (2 * 32)(%rsi), RA2, RA2;
+ vpxor (3 * 32)(%rsi), RA3, RA3;
+ vpxor (4 * 32)(%rsi), RB0, RB0;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RB2, RB2;
+ vpxor (7 * 32)(%rsi), RB3, RB3;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vpxor RA0, RTMP1, RTMP1;
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vpxor RA1, RTMP1, RTMP1;
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vpxor RA2, RTMP1, RTMP1;
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vpxor RA3, RTMP1, RTMP1;
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vpxor RB0, RTMP1, RTMP1;
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vpxor RB1, RTMP1, RTMP1;
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vpxor RB2, RTMP1, RTMP1;
+ vmovdqu RB3, (7 * 32)(%rsi);
+ vpxor RB3, RTMP1, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx; /* fold the high lane into the low lane */
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ vzeroall; /* clear all vector registers before returning */
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);)
+
+.align 16
+.globl FUNC_NAME(ocb_auth)
+ELF(.type FUNC_NAME(ocb_auth),@function;)
+
+FUNC_NAME(ocb_auth):
+ /* OCB authentication of 16 blocks. input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (16 blocks); only read
+ * %rdx: offset (in/out: loaded, advanced over 16 blocks, stored back)
+ * %rcx: checksum (in/out: xored with the 16 cipher outputs)
+ * %r8 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp; /* four 8-byte spill slots for %r10-%r13 */
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rdx), RTMP0x; /* RTMP0x = running offset */
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rsi), yreg; /* load two abuf blocks */ \
+ vpxor (l0reg), RTMP0x, RNOTx; /* offset for the first block */ \
+ vpxor (l1reg), RNOTx, RTMP0x; /* offset for the second block */ \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; /* RNOT = second:first offset */ \
+ vpxor yreg, RNOT, yreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rdx); /* store the offset after the 16th block */
+
+ movq (0 * 8)(%rsp), %r10; /* L pointers no longer needed: reload saved values */
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call SM4_CRYPT_BLK16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor RA0, RB0, RA0; /* xor-reduce the eight output registers pairwise */
+ vpxor RA1, RB1, RA1;
+ vpxor RA2, RB2, RA2;
+ vpxor RA3, RB3, RA3;
+
+ vpxor RA1, RA0, RA0;
+ vpxor RA3, RA2, RA2;
+
+ vpxor RA2, RA0, RTMP1; /* RTMP1 = xor of all eight registers */
+
+ vextracti128 $1, RTMP1, RNOTx; /* high lane, folded in below */
+ vpxor (%rcx), RTMP1x, RTMP1x; /* add the incoming checksum */
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%rcx);
+
+ vzeroall; /* clear all vector registers before returning */
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);)
+
+#endif /* GCRY_SM4_AVX2_AMD64_H */
diff --git a/cipher/sm4-gfni-avx2-amd64.S b/cipher/sm4-gfni-avx2-amd64.S
index 464da399..9aff85c6 100644
--- a/cipher/sm4-gfni-avx2-amd64.S
+++ b/cipher/sm4-gfni-avx2-amd64.S
@@ -128,7 +128,7 @@ _sm4_gfni_avx2_consts:
.byte 0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04
.byte 0x09, 0x0a, 0x0b, 0x08, 0x0d, 0x0e, 0x0f, 0x0c
-/* For CTR-mode IV byteswap */
+/* For output word byteswap */
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -136,33 +136,6 @@ _sm4_gfni_avx2_consts:
.Lbswap32_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
-/* CTR byte addition constants */
-.align 32
-.Lbige_addb_0_1:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
-.Lbige_addb_2_3:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
-.Lbige_addb_4_5:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
-.Lbige_addb_6_7:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
-.Lbige_addb_8_9:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
-.Lbige_addb_10_11:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
-.Lbige_addb_12_13:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
-.Lbige_addb_14_15:
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
-
.text
.align 16
@@ -667,594 +640,9 @@ _gcry_sm4_gfni_avx2_crypt_blk1_16:
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_16,.-_gcry_sm4_gfni_avx2_crypt_blk1_16;)
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-.align 16
-.globl _gcry_sm4_gfni_avx2_ctr_enc
-ELF(.type _gcry_sm4_gfni_avx2_ctr_enc,@function;)
-_gcry_sm4_gfni_avx2_ctr_enc:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (big endian, 128bit)
- */
- CFI_STARTPROC();
-
- cmpb $(0x100 - 16), 15(%rcx);
- jbe .Lctr_byteadd;
-
- movq 8(%rcx), %rax;
- bswapq %rax;
-
- vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
- vpcmpeqd RNOT, RNOT, RNOT;
- vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
- vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
-
- /* load IV and byteswap */
- vmovdqu (%rcx), RTMP4x;
- vpshufb RTMP3x, RTMP4x, RTMP4x;
- vmovdqa RTMP4x, RTMP0x;
- inc_le128(RTMP4x, RNOTx, RTMP1x);
- vinserti128 $1, RTMP4x, RTMP0, RTMP0;
- vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
-
- /* check need for handling 64-bit overflow and carry */
- cmpq $(0xffffffffffffffff - 16), %rax;
- ja .Lhandle_ctr_carry;
-
- /* construct IVs */
- vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
- vpshufb RTMP3, RTMP0, RA1;
- vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
- vpshufb RTMP3, RTMP0, RA2;
- vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
- vpshufb RTMP3, RTMP0, RA3;
- vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
- vpshufb RTMP3, RTMP0, RB0;
- vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
- vpshufb RTMP3, RTMP0, RB1;
- vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
- vpshufb RTMP3, RTMP0, RB2;
- vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
- vpshufb RTMP3, RTMP0, RB3;
- vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
- vpshufb RTMP3x, RTMP0x, RTMP0x;
-
- jmp .Lctr_carry_done;
-
-.Lhandle_ctr_carry:
- /* construct IVs */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
- inc_le128(RTMP0, RNOT, RTMP1);
- inc_le128(RTMP0, RNOT, RTMP1);
- vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
- inc_le128(RTMP0, RNOT, RTMP1);
- vextracti128 $1, RTMP0, RTMP0x;
- vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
-
-.Lctr_carry_done:
- /* store new IV */
- vmovdqu RTMP0x, (%rcx);
-
-.align 8
-.Lload_ctr_done:
- call __sm4_gfni_crypt_blk16;
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
- vpxor (1 * 32)(%rdx), RA1, RA1;
- vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
- vpxor (5 * 32)(%rdx), RB1, RB1;
- vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
-
- ret_spec_stop;
-
-.align 8
-.Lctr_byteadd_full_ctr_carry:
- movq 8(%rcx), %r11;
- movq (%rcx), %r10;
- bswapq %r11;
- bswapq %r10;
- addq $16, %r11;
- adcq $0, %r10;
- bswapq %r11;
- bswapq %r10;
- movq %r11, 8(%rcx);
- movq %r10, (%rcx);
- jmp .Lctr_byteadd_ymm;
-.align 8
-.Lctr_byteadd:
- vbroadcasti128 (%rcx), RB3;
- je .Lctr_byteadd_full_ctr_carry;
- addb $16, 15(%rcx);
-.Lctr_byteadd_ymm:
- vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
- vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
- vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
- vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
- vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
- vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
- vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
- vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
-
- jmp .Lload_ctr_done;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_gfni_avx2_ctr_enc,.-_gcry_sm4_gfni_avx2_ctr_enc;)
-
-.align 16
-.globl _gcry_sm4_gfni_avx2_cbc_dec
-ELF(.type _gcry_sm4_gfni_avx2_cbc_dec,@function;)
-_gcry_sm4_gfni_avx2_cbc_dec:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv
- */
- CFI_STARTPROC();
-
- vmovdqu (0 * 32)(%rdx), RA0;
- vmovdqu (1 * 32)(%rdx), RA1;
- vmovdqu (2 * 32)(%rdx), RA2;
- vmovdqu (3 * 32)(%rdx), RA3;
- vmovdqu (4 * 32)(%rdx), RB0;
- vmovdqu (5 * 32)(%rdx), RB1;
- vmovdqu (6 * 32)(%rdx), RB2;
- vmovdqu (7 * 32)(%rdx), RB3;
-
- call __sm4_gfni_crypt_blk16;
-
- vmovdqu (%rcx), RNOTx;
- vinserti128 $1, (%rdx), RNOT, RNOT;
- vpxor RNOT, RA0, RA0;
- vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
- vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
- vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
- vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
- vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
- vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
- vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
- vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
- vmovdqu RNOTx, (%rcx); /* store new IV */
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_gfni_avx2_cbc_dec,.-_gcry_sm4_gfni_avx2_cbc_dec;)
-
-.align 16
-.globl _gcry_sm4_gfni_avx2_cfb_dec
-ELF(.type _gcry_sm4_gfni_avx2_cfb_dec,@function;)
-_gcry_sm4_gfni_avx2_cfb_dec:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv
- */
- CFI_STARTPROC();
-
- /* Load input */
- vmovdqu (%rcx), RNOTx;
- vinserti128 $1, (%rdx), RNOT, RA0;
- vmovdqu (0 * 32 + 16)(%rdx), RA1;
- vmovdqu (1 * 32 + 16)(%rdx), RA2;
- vmovdqu (2 * 32 + 16)(%rdx), RA3;
- vmovdqu (3 * 32 + 16)(%rdx), RB0;
- vmovdqu (4 * 32 + 16)(%rdx), RB1;
- vmovdqu (5 * 32 + 16)(%rdx), RB2;
- vmovdqu (6 * 32 + 16)(%rdx), RB3;
-
- /* Update IV */
- vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
- vmovdqu RNOTx, (%rcx);
-
- call __sm4_gfni_crypt_blk16;
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
- vpxor (1 * 32)(%rdx), RA1, RA1;
- vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
- vpxor (5 * 32)(%rdx), RB1, RB1;
- vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_gfni_avx2_cfb_dec,.-_gcry_sm4_gfni_avx2_cfb_dec;)
-
-.align 16
-.globl _gcry_sm4_gfni_avx2_ocb_enc
-ELF(.type _gcry_sm4_gfni_avx2_ocb_enc,@function;)
-
-_gcry_sm4_gfni_avx2_ocb_enc:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: offset
- * %r8 : checksum
- * %r9 : L pointers (void *L[16])
- */
- CFI_STARTPROC();
-
- subq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(4 * 8);
-
- movq %r10, (0 * 8)(%rsp);
- movq %r11, (1 * 8)(%rsp);
- movq %r12, (2 * 8)(%rsp);
- movq %r13, (3 * 8)(%rsp);
- CFI_REL_OFFSET(%r10, 0 * 8);
- CFI_REL_OFFSET(%r11, 1 * 8);
- CFI_REL_OFFSET(%r12, 2 * 8);
- CFI_REL_OFFSET(%r13, 3 * 8);
-
- vmovdqu (%rcx), RTMP0x;
- vmovdqu (%r8), RTMP1x;
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Checksum_i = Checksum_{i-1} xor P_i */
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
-
-#define OCB_INPUT(n, l0reg, l1reg, yreg) \
- vmovdqu (n * 32)(%rdx), yreg; \
- vpxor (l0reg), RTMP0x, RNOTx; \
- vpxor (l1reg), RNOTx, RTMP0x; \
- vinserti128 $1, RTMP0x, RNOT, RNOT; \
- vpxor yreg, RTMP1, RTMP1; \
- vpxor yreg, RNOT, yreg; \
- vmovdqu RNOT, (n * 32)(%rsi);
-
- movq (0 * 8)(%r9), %r10;
- movq (1 * 8)(%r9), %r11;
- movq (2 * 8)(%r9), %r12;
- movq (3 * 8)(%r9), %r13;
- OCB_INPUT(0, %r10, %r11, RA0);
- OCB_INPUT(1, %r12, %r13, RA1);
- movq (4 * 8)(%r9), %r10;
- movq (5 * 8)(%r9), %r11;
- movq (6 * 8)(%r9), %r12;
- movq (7 * 8)(%r9), %r13;
- OCB_INPUT(2, %r10, %r11, RA2);
- OCB_INPUT(3, %r12, %r13, RA3);
- movq (8 * 8)(%r9), %r10;
- movq (9 * 8)(%r9), %r11;
- movq (10 * 8)(%r9), %r12;
- movq (11 * 8)(%r9), %r13;
- OCB_INPUT(4, %r10, %r11, RB0);
- OCB_INPUT(5, %r12, %r13, RB1);
- movq (12 * 8)(%r9), %r10;
- movq (13 * 8)(%r9), %r11;
- movq (14 * 8)(%r9), %r12;
- movq (15 * 8)(%r9), %r13;
- OCB_INPUT(6, %r10, %r11, RB2);
- OCB_INPUT(7, %r12, %r13, RB3);
-#undef OCB_INPUT
-
- vextracti128 $1, RTMP1, RNOTx;
- vmovdqu RTMP0x, (%rcx);
- vpxor RNOTx, RTMP1x, RTMP1x;
- vmovdqu RTMP1x, (%r8);
-
- movq (0 * 8)(%rsp), %r10;
- movq (1 * 8)(%rsp), %r11;
- movq (2 * 8)(%rsp), %r12;
- movq (3 * 8)(%rsp), %r13;
- CFI_RESTORE(%r10);
- CFI_RESTORE(%r11);
- CFI_RESTORE(%r12);
- CFI_RESTORE(%r13);
-
- call __sm4_gfni_crypt_blk16;
-
- addq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(-4 * 8);
-
- vpxor (0 * 32)(%rsi), RA0, RA0;
- vpxor (1 * 32)(%rsi), RA1, RA1;
- vpxor (2 * 32)(%rsi), RA2, RA2;
- vpxor (3 * 32)(%rsi), RA3, RA3;
- vpxor (4 * 32)(%rsi), RB0, RB0;
- vpxor (5 * 32)(%rsi), RB1, RB1;
- vpxor (6 * 32)(%rsi), RB2, RB2;
- vpxor (7 * 32)(%rsi), RB3, RB3;
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_gfni_avx2_ocb_enc,.-_gcry_sm4_gfni_avx2_ocb_enc;)
-
-.align 16
-.globl _gcry_sm4_gfni_avx2_ocb_dec
-ELF(.type _gcry_sm4_gfni_avx2_ocb_dec,@function;)
-
-_gcry_sm4_gfni_avx2_ocb_dec:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: offset
- * %r8 : checksum
- * %r9 : L pointers (void *L[16])
- */
- CFI_STARTPROC();
-
- subq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(4 * 8);
-
- movq %r10, (0 * 8)(%rsp);
- movq %r11, (1 * 8)(%rsp);
- movq %r12, (2 * 8)(%rsp);
- movq %r13, (3 * 8)(%rsp);
- CFI_REL_OFFSET(%r10, 0 * 8);
- CFI_REL_OFFSET(%r11, 1 * 8);
- CFI_REL_OFFSET(%r12, 2 * 8);
- CFI_REL_OFFSET(%r13, 3 * 8);
-
- vmovdqu (%rcx), RTMP0x;
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
-
-#define OCB_INPUT(n, l0reg, l1reg, yreg) \
- vmovdqu (n * 32)(%rdx), yreg; \
- vpxor (l0reg), RTMP0x, RNOTx; \
- vpxor (l1reg), RNOTx, RTMP0x; \
- vinserti128 $1, RTMP0x, RNOT, RNOT; \
- vpxor yreg, RNOT, yreg; \
- vmovdqu RNOT, (n * 32)(%rsi);
-
- movq (0 * 8)(%r9), %r10;
- movq (1 * 8)(%r9), %r11;
- movq (2 * 8)(%r9), %r12;
- movq (3 * 8)(%r9), %r13;
- OCB_INPUT(0, %r10, %r11, RA0);
- OCB_INPUT(1, %r12, %r13, RA1);
- movq (4 * 8)(%r9), %r10;
- movq (5 * 8)(%r9), %r11;
- movq (6 * 8)(%r9), %r12;
- movq (7 * 8)(%r9), %r13;
- OCB_INPUT(2, %r10, %r11, RA2);
- OCB_INPUT(3, %r12, %r13, RA3);
- movq (8 * 8)(%r9), %r10;
- movq (9 * 8)(%r9), %r11;
- movq (10 * 8)(%r9), %r12;
- movq (11 * 8)(%r9), %r13;
- OCB_INPUT(4, %r10, %r11, RB0);
- OCB_INPUT(5, %r12, %r13, RB1);
- movq (12 * 8)(%r9), %r10;
- movq (13 * 8)(%r9), %r11;
- movq (14 * 8)(%r9), %r12;
- movq (15 * 8)(%r9), %r13;
- OCB_INPUT(6, %r10, %r11, RB2);
- OCB_INPUT(7, %r12, %r13, RB3);
-#undef OCB_INPUT
-
- vmovdqu RTMP0x, (%rcx);
-
- movq (0 * 8)(%rsp), %r10;
- movq (1 * 8)(%rsp), %r11;
- movq (2 * 8)(%rsp), %r12;
- movq (3 * 8)(%rsp), %r13;
- CFI_RESTORE(%r10);
- CFI_RESTORE(%r11);
- CFI_RESTORE(%r12);
- CFI_RESTORE(%r13);
-
- call __sm4_gfni_crypt_blk16;
-
- addq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(-4 * 8);
-
- vmovdqu (%r8), RTMP1x;
-
- vpxor (0 * 32)(%rsi), RA0, RA0;
- vpxor (1 * 32)(%rsi), RA1, RA1;
- vpxor (2 * 32)(%rsi), RA2, RA2;
- vpxor (3 * 32)(%rsi), RA3, RA3;
- vpxor (4 * 32)(%rsi), RB0, RB0;
- vpxor (5 * 32)(%rsi), RB1, RB1;
- vpxor (6 * 32)(%rsi), RB2, RB2;
- vpxor (7 * 32)(%rsi), RB3, RB3;
-
- /* Checksum_i = Checksum_{i-1} xor P_i */
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vpxor RA0, RTMP1, RTMP1;
- vmovdqu RA1, (1 * 32)(%rsi);
- vpxor RA1, RTMP1, RTMP1;
- vmovdqu RA2, (2 * 32)(%rsi);
- vpxor RA2, RTMP1, RTMP1;
- vmovdqu RA3, (3 * 32)(%rsi);
- vpxor RA3, RTMP1, RTMP1;
- vmovdqu RB0, (4 * 32)(%rsi);
- vpxor RB0, RTMP1, RTMP1;
- vmovdqu RB1, (5 * 32)(%rsi);
- vpxor RB1, RTMP1, RTMP1;
- vmovdqu RB2, (6 * 32)(%rsi);
- vpxor RB2, RTMP1, RTMP1;
- vmovdqu RB3, (7 * 32)(%rsi);
- vpxor RB3, RTMP1, RTMP1;
-
- vextracti128 $1, RTMP1, RNOTx;
- vpxor RNOTx, RTMP1x, RTMP1x;
- vmovdqu RTMP1x, (%r8);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_gfni_avx2_ocb_dec,.-_gcry_sm4_gfni_avx2_ocb_dec;)
-
-.align 16
-.globl _gcry_sm4_gfni_avx2_ocb_auth
-ELF(.type _gcry_sm4_gfni_avx2_ocb_auth,@function;)
-
-_gcry_sm4_gfni_avx2_ocb_auth:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: abuf (16 blocks)
- * %rdx: offset
- * %rcx: checksum
- * %r8 : L pointers (void *L[16])
- */
- CFI_STARTPROC();
-
- subq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(4 * 8);
-
- movq %r10, (0 * 8)(%rsp);
- movq %r11, (1 * 8)(%rsp);
- movq %r12, (2 * 8)(%rsp);
- movq %r13, (3 * 8)(%rsp);
- CFI_REL_OFFSET(%r10, 0 * 8);
- CFI_REL_OFFSET(%r11, 1 * 8);
- CFI_REL_OFFSET(%r12, 2 * 8);
- CFI_REL_OFFSET(%r13, 3 * 8);
-
- vmovdqu (%rdx), RTMP0x;
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
-
-#define OCB_INPUT(n, l0reg, l1reg, yreg) \
- vmovdqu (n * 32)(%rsi), yreg; \
- vpxor (l0reg), RTMP0x, RNOTx; \
- vpxor (l1reg), RNOTx, RTMP0x; \
- vinserti128 $1, RTMP0x, RNOT, RNOT; \
- vpxor yreg, RNOT, yreg;
-
- movq (0 * 8)(%r8), %r10;
- movq (1 * 8)(%r8), %r11;
- movq (2 * 8)(%r8), %r12;
- movq (3 * 8)(%r8), %r13;
- OCB_INPUT(0, %r10, %r11, RA0);
- OCB_INPUT(1, %r12, %r13, RA1);
- movq (4 * 8)(%r8), %r10;
- movq (5 * 8)(%r8), %r11;
- movq (6 * 8)(%r8), %r12;
- movq (7 * 8)(%r8), %r13;
- OCB_INPUT(2, %r10, %r11, RA2);
- OCB_INPUT(3, %r12, %r13, RA3);
- movq (8 * 8)(%r8), %r10;
- movq (9 * 8)(%r8), %r11;
- movq (10 * 8)(%r8), %r12;
- movq (11 * 8)(%r8), %r13;
- OCB_INPUT(4, %r10, %r11, RB0);
- OCB_INPUT(5, %r12, %r13, RB1);
- movq (12 * 8)(%r8), %r10;
- movq (13 * 8)(%r8), %r11;
- movq (14 * 8)(%r8), %r12;
- movq (15 * 8)(%r8), %r13;
- OCB_INPUT(6, %r10, %r11, RB2);
- OCB_INPUT(7, %r12, %r13, RB3);
-#undef OCB_INPUT
-
- vmovdqu RTMP0x, (%rdx);
-
- movq (0 * 8)(%rsp), %r10;
- movq (1 * 8)(%rsp), %r11;
- movq (2 * 8)(%rsp), %r12;
- movq (3 * 8)(%rsp), %r13;
- CFI_RESTORE(%r10);
- CFI_RESTORE(%r11);
- CFI_RESTORE(%r12);
- CFI_RESTORE(%r13);
-
- call __sm4_gfni_crypt_blk16;
-
- addq $(4 * 8), %rsp;
- CFI_ADJUST_CFA_OFFSET(-4 * 8);
-
- vpxor RA0, RB0, RA0;
- vpxor RA1, RB1, RA1;
- vpxor RA2, RB2, RA2;
- vpxor RA3, RB3, RA3;
-
- vpxor RA1, RA0, RA0;
- vpxor RA3, RA2, RA2;
-
- vpxor RA2, RA0, RTMP1;
-
- vextracti128 $1, RTMP1, RNOTx;
- vpxor (%rcx), RTMP1x, RTMP1x;
- vpxor RNOTx, RTMP1x, RTMP1x;
- vmovdqu RTMP1x, (%rcx);
-
- vzeroall;
-
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_sm4_gfni_avx2_ocb_auth,.-_gcry_sm4_gfni_avx2_ocb_auth;)
+#define FUNC_NAME(func) _gcry_sm4_gfni_avx2_ ## func /* symbol name given to each function in the shared header */
+#define SM4_CRYPT_BLK16 __sm4_gfni_crypt_blk16 /* 16-block transform that the shared code calls */
+#include "sm4-avx2-amd64.h" /* shared mode functions, parametrized by the two macros above */
#endif /*defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
#endif /*__x86_64*/
--
2.53.0
More information about the Gcrypt-devel
mailing list