[PATCH 5/7] sm4: add Intel SM4 instructions accelerated AVX2 implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Jun 28 14:37:39 CEST 2026
* cipher/Makefile.am: Add 'sm4-intel-avx2-amd64.S'.
* cipher/sm4-intel-avx2-amd64.S: New.
* cipher/sm4.c (USE_INTEL_SM4_AVX2): New.
(ASM_FUNC_ABI): Define also for Intel SM4 implementation.
(SM4_context): Add 'use_intel_sm4_avx2'.
(_gcry_sm4_intel_avx2_expand_key, _gcry_sm4_intel_avx2_ctr_enc)
(_gcry_sm4_intel_avx2_cbc_dec, _gcry_sm4_intel_avx2_cfb_dec)
(_gcry_sm4_intel_avx2_ocb_enc, _gcry_sm4_intel_avx2_ocb_dec)
(_gcry_sm4_intel_avx2_ocb_auth, _gcry_sm4_intel_avx2_crypt_blk1_16)
(sm4_intel_avx2_crypt_blk1_16): New.
(sm4_expand_key): Add Intel SM4 code-path.
(sm4_setkey): Use Intel SM4 implementation if supported by CPU.
(sm4_encrypt, sm4_decrypt, sm4_get_crypt_blk1_16_fn)
(_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
(_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [USE_INTEL_SM4_AVX2]: Add
Intel SM4 code-path.
* configure.ac (gcry_cv_gcc_inline_asm_sm4, HAVE_GCC_INLINE_ASM_SM4): New.
(GCRYPT_ASM_CIPHERS) [x86_64]: Add 'sm4-intel-avx2-amd64.lo'.
* doc/gcrypt.texi: Add "intel-sm4" to HW features list.
* src/g10lib.h (HWF_INTEL_SM4): New.
* src/hwf-x86.c (detect_x86_gnuc): Add Intel SM4 detection.
* src/hwfeatures.c (hwflist): Add "intel-sm4".
--
Add SM4 implementation using the Intel SM4 instruction set extension:
VSM4KEY4 for the key schedule and VSM4RNDS4 for the encryption and
decryption rounds. When supported by the CPU, it is used in preference
to the AES-NI (AVX, AVX2) and GFNI (AVX2, AVX512) implementations.
Tested with Intel SDE; not yet tested on real hardware.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/Makefile.am | 1 +
cipher/sm4-intel-avx2-amd64.S | 376 ++++++++++++++++++++++++++++++++++
cipher/sm4.c | 216 ++++++++++++++++++-
configure.ac | 24 +++
doc/gcrypt.texi | 1 +
src/g10lib.h | 1 +
src/hwf-x86.c | 5 +
src/hwfeatures.c | 1 +
8 files changed, 624 insertions(+), 1 deletion(-)
create mode 100644 cipher/sm4-intel-avx2-amd64.S
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index f8777837..04094a10 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -138,6 +138,7 @@ EXTRA_libcipher_la_SOURCES = \
simd-common-aarch64.h simd-common-ppc.h simd-common-riscv.h \
sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
sm4-avx2-amd64.h \
+ sm4-intel-avx2-amd64.S \
sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \
sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \
sm4-ppc.c \
diff --git a/cipher/sm4-intel-avx2-amd64.S b/cipher/sm4-intel-avx2-amd64.S
new file mode 100644
index 00000000..4c715104
--- /dev/null
+++ b/cipher/sm4-intel-avx2-amd64.S
@@ -0,0 +1,376 @@
+/* sm4-intel-avx2-amd64.S - Intel SM4 instructions accelerated AVX2 SM4 cipher
+ *
+ * Copyright (C) 2026 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_SM4)
+
+#include "asm-common-amd64.h"
+
+/* vector registers */
+#define RA0 %ymm0
+#define RA1 %ymm1
+#define RA2 %ymm2
+#define RA3 %ymm3
+#define RA0x %xmm0
+#define RA1x %xmm1
+#define RA2x %xmm2
+#define RA3x %xmm3
+
+#define RTMP0 %ymm4
+#define RTMP1 %ymm5
+#define RTMP2 %ymm6
+#define RTMP3 %ymm7
+#define RTMP4 %ymm8
+#define RTMP0x %xmm4
+#define RTMP1x %xmm5
+#define RTMP2x %xmm6
+#define RTMP3x %xmm7
+#define RTMP4x %xmm8
+
+#define RB0 %ymm9
+#define RB1 %ymm10
+#define RB2 %ymm11
+#define RB3 %ymm12
+#define RB0x %xmm9
+#define RB1x %xmm10
+#define RB2x %xmm11
+#define RB3x %xmm12
+
+#define RNOT %ymm13
+#define RNOTx %xmm13
+
+SECTION_RODATA
+.align 32
+
+ELF(.type _sm4_intel_avx2_consts,@object)
+_sm4_intel_avx2_consts:
+
+/* Output shuffle: reverses all 16 bytes of each 128-bit lane. */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* Input shuffle: byte-swaps each 32-bit word within a 128-bit lane. */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.text
+
+/**********************************************************************
+ SM4 key schedule with Intel SM4 instructions
+ **********************************************************************/
+
+.align 16
+.globl _gcry_sm4_intel_avx2_expand_key
+ELF(.type _gcry_sm4_intel_avx2_expand_key,@function;)
+_gcry_sm4_intel_avx2_expand_key:
+ /* input:
+ * %rdi: 128-bit master key
+ * %rsi: rkey_enc (32 words written, in round order)
+ * %rdx: rkey_dec (32 words written, rkey_enc in reverse word order)
+ * %rcx: fk array (4 words read)
+ * %r8: ck array (32 words read)
+ */
+ CFI_STARTPROC();
+ /* Initial state: master key with each 32-bit word byte-swapped, XOR fk. */
+ vmovdqu (%rdi), RTMP0x;
+ vpshufb .Lbswap32_mask rRIP, RTMP0x, RTMP0x;
+ vpxor (%rcx), RTMP0x, RTMP0x;
+ /* Preload all ck constants, four words per register. */
+ vmovdqu (0*4*4)(%r8), RA0x;
+ vmovdqu (1*4*4)(%r8), RA1x;
+ vmovdqu (2*4*4)(%r8), RA2x;
+ vmovdqu (3*4*4)(%r8), RA3x;
+ vmovdqu (4*4*4)(%r8), RB0x;
+ vmovdqu (5*4*4)(%r8), RB1x;
+ vmovdqu (6*4*4)(%r8), RB2x;
+ vmovdqu (7*4*4)(%r8), RB3x;
+ /* Step r: VSM4KEY4 turns the previous four key words into the next four. */
+#define EXPAND_ROUND4(ck, r) \
+ vsm4key4 ck, RTMP0x, RTMP0x; \
+ vpshufd $0x1b, RTMP0x, RTMP1x; \
+ vmovdqu RTMP0x, (4*4*(r))(%rsi); \
+ vmovdqu RTMP1x, ((32-4)*4)-4*4*(r)(%rdx);
+ /* rkey_enc is filled front to back; rkey_dec back to front, word-reversed. */
+ EXPAND_ROUND4(RA0x, 0);
+ EXPAND_ROUND4(RA1x, 1);
+ EXPAND_ROUND4(RA2x, 2);
+ EXPAND_ROUND4(RA3x, 3);
+ EXPAND_ROUND4(RB0x, 4);
+ EXPAND_ROUND4(RB1x, 5);
+ EXPAND_ROUND4(RB2x, 6);
+ EXPAND_ROUND4(RB3x, 7);
+
+#undef EXPAND_ROUND4
+ /* Clear all vector registers; they held key material. */
+ vzeroall;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_intel_avx2_expand_key,.-_gcry_sm4_intel_avx2_expand_key;)
+
+/**********************************************************************
+ 16-way SM4 with Intel SM4 instructions
+ **********************************************************************/
+
+.align 16
+ELF(.type __sm4_intel_crypt_blk16,@function;)
+__sm4_intel_crypt_blk16:
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * ciphertext blocks (RTMP0 and RTMP1 are clobbered)
+ */
+ CFI_STARTPROC();
+ /* Byte-swap each 32-bit word of the input blocks. */
+ vbroadcasti128 .Lbswap32_mask rRIP, RTMP0;
+ vpshufb RTMP0, RA0, RA0;
+ vpshufb RTMP0, RA1, RA1;
+ vpshufb RTMP0, RA2, RA2;
+ vpshufb RTMP0, RA3, RA3;
+ vpshufb RTMP0, RB0, RB0;
+ vpshufb RTMP0, RB1, RB1;
+ vpshufb RTMP0, RB2, RB2;
+ vpshufb RTMP0, RB3, RB3;
+ /* Four rounds on all eight registers; the same four round keys per lane. */
+#define ROUND4(rkoff) \
+ vbroadcasti128 (rkoff)(%rdi), RTMP1; \
+ vsm4rnds4 RTMP1, RA0, RA0; \
+ vsm4rnds4 RTMP1, RA1, RA1; \
+ vsm4rnds4 RTMP1, RA2, RA2; \
+ vsm4rnds4 RTMP1, RA3, RA3; \
+ vsm4rnds4 RTMP1, RB0, RB0; \
+ vsm4rnds4 RTMP1, RB1, RB1; \
+ vsm4rnds4 RTMP1, RB2, RB2; \
+ vsm4rnds4 RTMP1, RB3, RB3;
+ /* Eight groups of four rounds = 32 rounds. */
+ ROUND4(0 * 16);
+ ROUND4(1 * 16);
+ ROUND4(2 * 16);
+ ROUND4(3 * 16);
+ ROUND4(4 * 16);
+ ROUND4(5 * 16);
+ ROUND4(6 * 16);
+ ROUND4(7 * 16);
+
+#undef ROUND4
+ /* Reverse all 16 bytes of each block to produce the output layout. */
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP0;
+ vpshufb RTMP0, RA0, RA0;
+ vpshufb RTMP0, RA1, RA1;
+ vpshufb RTMP0, RA2, RA2;
+ vpshufb RTMP0, RA3, RA3;
+ vpshufb RTMP0, RB0, RB0;
+ vpshufb RTMP0, RB1, RB1;
+ vpshufb RTMP0, RB2, RB2;
+ vpshufb RTMP0, RB3, RB3;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size __sm4_intel_crypt_blk16,.-__sm4_intel_crypt_blk16;)
+
+/* Four rounds with the round keys at rkoff(%rdi); 8 blocks in RA0..RA3. */
+#define ROUND4_BLK8(rkoff) \
+ vbroadcasti128 (rkoff)(%rdi), RTMP1; \
+ vsm4rnds4 RTMP1, RA0, RA0; \
+ vsm4rnds4 RTMP1, RA1, RA1; \
+ vsm4rnds4 RTMP1, RA2, RA2; \
+ vsm4rnds4 RTMP1, RA3, RA3;
+/* Same, for 4 blocks in RA0..RA1. */
+#define ROUND4_BLK4(rkoff) \
+ vbroadcasti128 (rkoff)(%rdi), RTMP1; \
+ vsm4rnds4 RTMP1, RA0, RA0; \
+ vsm4rnds4 RTMP1, RA1, RA1;
+/* Same, for 2 blocks in RA0. */
+#define ROUND4_BLK2(rkoff) \
+ vbroadcasti128 (rkoff)(%rdi), RTMP1; \
+ vsm4rnds4 RTMP1, RA0, RA0;
+/* Same, for 1 block in RA0x; round keys are used directly from memory. */
+#define ROUND4_BLK1(rkoff) \
+ vsm4rnds4 (rkoff)(%rdi), RA0x, RA0x;
+
+/**********************************************************************
+ 8/4/2/1-way SM4 with Intel SM4 instructions
+ **********************************************************************/
+.align 16
+.globl _gcry_sm4_intel_avx2_crypt_blk1_16
+ELF(.type _gcry_sm4_intel_avx2_crypt_blk1_16,@function;)
+_gcry_sm4_intel_avx2_crypt_blk1_16:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..16 blocks)
+ * %rdx: src (1..16 blocks)
+ * %rcx: num blocks (1..16)
+ */
+ CFI_STARTPROC();
+
+ cmpq $16, %rcx;
+ jne .Lblk1_15_intel;
+
+ /* Sixteen blocks: use the eight-register block function. */
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ vmovdqu (4 * 32)(%rdx), RB0;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RB2;
+ vmovdqu (7 * 32)(%rdx), RB3;
+ call __sm4_intel_crypt_blk16;
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+ jmp .Lblk1_16_done_intel;
+ /* 1..15 blocks: %rax counts what is left; handled as 8 + 4 + 2 + 1. */
+.Lblk1_15_intel:
+ movq %rcx, %rax;
+ vbroadcasti128 .Lbswap32_mask rRIP, RTMP0;
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+ /* Eight blocks (four YMM registers). */
+ cmpq $8, %rax;
+ jb .Lblk1_16_blk4_intel;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ leaq (8 * 16)(%rdx), %rdx;
+ vpshufb RTMP0, RA0, RA0;
+ vpshufb RTMP0, RA1, RA1;
+ vpshufb RTMP0, RA2, RA2;
+ vpshufb RTMP0, RA3, RA3;
+ ROUND4_BLK8(0 * 16);
+ ROUND4_BLK8(1 * 16);
+ ROUND4_BLK8(2 * 16);
+ ROUND4_BLK8(3 * 16);
+ ROUND4_BLK8(4 * 16);
+ ROUND4_BLK8(5 * 16);
+ ROUND4_BLK8(6 * 16);
+ ROUND4_BLK8(7 * 16);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ leaq (8 * 16)(%rsi), %rsi;
+
+ subq $8, %rax;
+ jz .Lblk1_16_done_intel;
+
+.Lblk1_16_blk4_intel:
+ /* Four blocks (two YMM registers). */
+ cmpq $4, %rax;
+ jb .Lblk1_16_blk2_intel;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ leaq (4 * 16)(%rdx), %rdx;
+ vpshufb RTMP0, RA0, RA0;
+ vpshufb RTMP0, RA1, RA1;
+ ROUND4_BLK4(0 * 16);
+ ROUND4_BLK4(1 * 16);
+ ROUND4_BLK4(2 * 16);
+ ROUND4_BLK4(3 * 16);
+ ROUND4_BLK4(4 * 16);
+ ROUND4_BLK4(5 * 16);
+ ROUND4_BLK4(6 * 16);
+ ROUND4_BLK4(7 * 16);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ leaq (4 * 16)(%rsi), %rsi;
+
+ subq $4, %rax;
+ jz .Lblk1_16_done_intel;
+
+.Lblk1_16_blk2_intel:
+ /* Two blocks (one YMM register). */
+ cmpq $2, %rax;
+ jb .Lblk1_16_blk1_intel;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ leaq (2 * 16)(%rdx), %rdx;
+ vpshufb RTMP0, RA0, RA0;
+ ROUND4_BLK2(0 * 16);
+ ROUND4_BLK2(1 * 16);
+ ROUND4_BLK2(2 * 16);
+ ROUND4_BLK2(3 * 16);
+ ROUND4_BLK2(4 * 16);
+ ROUND4_BLK2(5 * 16);
+ ROUND4_BLK2(6 * 16);
+ ROUND4_BLK2(7 * 16);
+ vpshufb RTMP2, RA0, RA0;
+ vmovdqu RA0, (0 * 32)(%rsi);
+ leaq (2 * 16)(%rsi), %rsi;
+
+ subq $2, %rax;
+ jz .Lblk1_16_done_intel;
+
+.Lblk1_16_blk1_intel:
+ /* One block (one XMM register). */
+ cmpq $1, %rax;
+ jb .Lblk1_16_done_intel;
+
+ vmovdqu (%rdx), RA0x;
+ vpshufb RTMP0x, RA0x, RA0x;
+ ROUND4_BLK1(0 * 16);
+ ROUND4_BLK1(1 * 16);
+ ROUND4_BLK1(2 * 16);
+ ROUND4_BLK1(3 * 16);
+ ROUND4_BLK1(4 * 16);
+ ROUND4_BLK1(5 * 16);
+ ROUND4_BLK1(6 * 16);
+ ROUND4_BLK1(7 * 16);
+ vpshufb RTMP2x, RA0x, RA0x;
+ vmovdqu RA0x, (%rsi);
+ /* Common exit: return 0 in %eax and clear all vector registers. */
+.Lblk1_16_done_intel:
+ xorl %eax, %eax;
+ vzeroall;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_intel_avx2_crypt_blk1_16,.-_gcry_sm4_intel_avx2_crypt_blk1_16;)
+
+#undef ROUND4_BLK8
+#undef ROUND4_BLK4
+#undef ROUND4_BLK2
+#undef ROUND4_BLK1
+
+#define FUNC_NAME(func) _gcry_sm4_intel_avx2_ ## func
+#define SM4_CRYPT_BLK16 __sm4_intel_crypt_blk16
+#include "sm4-avx2-amd64.h"
+
+#endif /*defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_SM4)*/
+#endif /*__x86_64*/
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 06b843f8..e397e452 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -74,11 +74,22 @@
# endif
#endif
+/* USE_INTEL_SM4_AVX2 indicates whether to compile with Intel SM4 instructions
+ * (VSM4KEY4, VSM4RNDS4) based AVX2 code; x86-64 with compatible assembler. */
+#undef USE_INTEL_SM4_AVX2
+#if defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_SM4)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_INTEL_SM4_AVX2 1
+# endif
+#endif
+
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || \
- defined(USE_GFNI_AVX2) || defined(USE_GFNI_AVX512)
+ defined(USE_GFNI_AVX2) || defined(USE_GFNI_AVX512) || \
+ defined(USE_INTEL_SM4_AVX2)
# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
# define ASM_FUNC_ABI __attribute__((sysv_abi))
# else
@@ -168,6 +179,9 @@ typedef struct
#ifdef USE_GFNI_AVX512
unsigned int use_gfni_avx512:1;
#endif
+#ifdef USE_INTEL_SM4_AVX2
+ unsigned int use_intel_sm4_avx2:1;
+#endif
#ifdef USE_AARCH64_SIMD
unsigned int use_aarch64_simd:1;
#endif
@@ -356,6 +370,55 @@ sm4_aesni_avx2_crypt_blk1_16(void *rk, byte *out, const byte *in,
#endif /* USE_AESNI_AVX2 */
+#ifdef USE_INTEL_SM4_AVX2
+extern void _gcry_sm4_intel_avx2_expand_key(const byte *key, u32 *rk_enc,
+ u32 *rk_dec, const u32 *fk,
+ const u32 *ck) ASM_FUNC_ABI;
+/* The bulk mode routines below are invoked on 16 blocks per call. */
+extern void _gcry_sm4_intel_avx2_ctr_enc(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_cbc_dec(const u32 *rk_dec, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_cfb_dec(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_ocb_enc(const u32 *rk_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_ocb_dec(const u32 *rk_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_ocb_auth(const u32 *rk_enc,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+/* Processes 1..16 blocks from 'in' to 'out' with the given round keys. */
+extern unsigned int
+_gcry_sm4_intel_avx2_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks) ASM_FUNC_ABI;
+/* Wrapper with the signature handed out by sm4_get_crypt_blk1_16_fn(). */
+static inline unsigned int
+sm4_intel_avx2_crypt_blk1_16(void *rk, byte *out, const byte *in,
+ size_t num_blks)
+{
+ return _gcry_sm4_intel_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+#endif /* USE_INTEL_SM4_AVX2 */
+
#ifdef USE_GFNI_AVX2
extern void _gcry_sm4_gfni_avx2_expand_key(const byte *key, u32 *rk_enc,
u32 *rk_dec, const u32 *fk,
@@ -700,6 +763,15 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
u32 rk[4];
int i;
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ {
+ _gcry_sm4_intel_avx2_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+ fk, ck);
+ return;
+ }
+#endif
+
#ifdef USE_GFNI_AVX512
if (ctx->use_gfni_avx512)
{
@@ -798,6 +870,9 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
#ifdef USE_GFNI_AVX512
ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512);
#endif
+#ifdef USE_INTEL_SM4_AVX2
+ ctx->use_intel_sm4_avx2 = (hwf & HWF_INTEL_SM4) && (hwf & HWF_INTEL_AVX2);
+#endif
#ifdef USE_AARCH64_SIMD
ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON);
#endif
@@ -827,6 +902,26 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
}
#endif
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ {
+ /* Disable AESNI and GFNI implementations when Intel SM4 implementation
+ * is enabled. */
+#ifdef USE_AESNI_AVX
+ ctx->use_aesni_avx = 0;
+#endif
+#ifdef USE_AESNI_AVX2
+ ctx->use_aesni_avx2 = 0;
+#endif
+#ifdef USE_GFNI_AVX2
+ ctx->use_gfni_avx2 = 0;
+#endif
+#ifdef USE_GFNI_AVX512
+ ctx->use_gfni_avx512 = 0;
+#endif
+ }
+#endif
+
ctx->crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
/* Setup bulk encryption routines. */
@@ -876,6 +971,11 @@ sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
{
SM4_context *ctx = context;
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ return sm4_intel_avx2_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
+#endif
+
#ifdef USE_GFNI_AVX512
if (ctx->use_gfni_avx512)
return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
@@ -901,6 +1001,11 @@ sm4_decrypt (void *context, byte *outbuf, const byte *inbuf)
{
SM4_context *ctx = context;
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ return sm4_intel_avx2_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
+#endif
+
#ifdef USE_GFNI_AVX512
if (ctx->use_gfni_avx512)
return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
@@ -1005,6 +1110,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
{
if (0)
;
+#ifdef USE_INTEL_SM4_AVX2
+ else if (ctx->use_intel_sm4_avx2)
+ {
+ return &sm4_intel_avx2_crypt_blk1_16;
+ }
+#endif
#ifdef USE_GFNI_AVX512
else if (ctx->use_gfni_avx512)
{
@@ -1077,6 +1188,21 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
const byte *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_intel_avx2_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX512
if (ctx->use_gfni_avx512)
{
@@ -1227,6 +1353,21 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_intel_avx2_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX512
if (ctx->use_gfni_avx512)
{
@@ -1376,6 +1517,21 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_intel_avx2_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX512
if (ctx->use_gfni_avx512)
{
@@ -1681,6 +1837,37 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
u64 blkn = c->u_mode.ocb.data_nblocks;
int burn_stack_depth = 0;
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ {
+ u64 Ls[16];
+ u64 *l;
+
+ if (nblocks >= 16)
+ {
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_sm4_intel_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+ else
+ _gcry_sm4_intel_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX512
if (ctx->use_gfni_avx512)
{
@@ -1861,6 +2048,33 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
u64 blkn = c->u_mode.ocb.aad_nblocks;
int burn_stack_depth = 0;
+#ifdef USE_INTEL_SM4_AVX2
+ if (ctx->use_intel_sm4_avx2)
+ {
+ u64 Ls[16];
+ u64 *l;
+
+ if (nblocks >= 16)
+ {
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_sm4_intel_avx2_ocb_auth(ctx->rkey_enc, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * 16;
+ }
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX512
if (ctx->use_gfni_avx512)
{
diff --git a/configure.ac b/configure.ac
index b174e518..f12b590e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1636,6 +1636,29 @@ if test "$gcry_cv_gcc_inline_asm_sm3" = "yes" ; then
fi
+#
+# Check whether GCC inline assembler supports Intel SM4 instructions.
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports Intel SM4 instructions],
+ [gcry_cv_gcc_inline_asm_sm4],
+ [if test "$mpi_cpu_arch" != "x86" ||
+ test "$try_asm_modules" != "yes" ; then
+ gcry_cv_gcc_inline_asm_sm4="n/a"
+ else
+ gcry_cv_gcc_inline_asm_sm4=no
+ AC_LINK_IFELSE([AC_LANG_PROGRAM(
+ [[void a(void) {
+ __asm__("vsm4key4 %%ymm2, %%ymm1, %%ymm3\n\t":::"cc");
+ __asm__("vsm4rnds4 %%ymm2, %%ymm1, %%ymm3\n\t":::"cc");
+ }]], [ a(); ] )],
+ [gcry_cv_gcc_inline_asm_sm4=yes])
+ fi])
+if test "$gcry_cv_gcc_inline_asm_sm4" = "yes" ; then
+ AC_DEFINE(HAVE_GCC_INLINE_ASM_SM4,1,
+ [Defined if inline assembler supports Intel SM4 instructions])
+fi
+
+
#
# Check whether GCC inline assembler supports SSE4.1 instructions.
#
@@ -3850,6 +3873,7 @@ if test "$found" = "1" ; then
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo"
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo"
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx512-amd64.lo"
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-intel-avx2-amd64.lo"
;;
aarch64-*-*)
# Build with the assembly implementation
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index b233cd78..004c4e30 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -588,6 +588,7 @@ are
@item intel-gfni
@item intel-sha512
@item intel-sm3
+@item intel-sm4
@item arm-neon
@item arm-aes
@item arm-sha1
diff --git a/src/g10lib.h b/src/g10lib.h
index d0e64a69..0b4ca0c7 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -241,6 +241,7 @@ char **_gcry_strtokenize (const char *string, const char *delim);
#define HWF_INTEL_GFNI (1 << 18)
#define HWF_INTEL_SHA512 (1 << 19)
#define HWF_INTEL_SM3 (1 << 20)
+#define HWF_INTEL_SM4 (1 << 21)
#elif defined(HAVE_CPU_ARCH_ARM)
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index f8c3c948..df3ba4f9 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -429,6 +429,11 @@ detect_x86_gnuc (
/* Test bit 1 for Intel SM3 instructions. */
if ((intel_feat3 & (1 << 1)) && os_supports_avx_avx2_registers)
result |= HWF_INTEL_SM3;
+
+ /* Test bit 2 for Intel SM4 instructions. These instructions are
+ * available for AVX/AVX2/AVX512. */
+ if ((intel_feat3 & (1 << 2)) && os_supports_avx_avx2_registers)
+ result |= HWF_INTEL_SM4;
}
/* Check additional feature flags. */
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 4f9053af..dbc84857 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -86,6 +86,7 @@ static struct
{ HWF_INTEL_GFNI, "intel-gfni" },
{ HWF_INTEL_SHA512, "intel-sha512" },
{ HWF_INTEL_SM3, "intel-sm3" },
+ { HWF_INTEL_SM4, "intel-sm4" },
/* Following removed HW feature strings are kept for API compatibility. */
{ 0, "intel-fast-vpgather" },
#elif defined(HAVE_CPU_ARCH_ARM)
--
2.53.0
More information about the Gcrypt-devel
mailing list