[PATCH 5/7] sm4: add Intel SM4 instructions accelerated AVX2 implementation

Sun Jun 28 14:37:39 CEST 2026

* cipher/Makefile.am: Add 'sm4-intel-avx2-amd64.S'.
* cipher/sm4-intel-avx2-amd64.S: New.
* cipher/sm4.c (USE_INTEL_SM4_AVX2): New.
(ASM_FUNC_ABI): Define also for Intel SM4 implementation.
(SM4_context): Add 'use_intel_sm4_avx2'.
(_gcry_sm4_intel_avx2_expand_key, _gcry_sm4_intel_avx2_ctr_enc)
(_gcry_sm4_intel_avx2_cbc_dec, _gcry_sm4_intel_avx2_cfb_dec)
(_gcry_sm4_intel_avx2_ocb_enc, _gcry_sm4_intel_avx2_ocb_dec)
(_gcry_sm4_intel_avx2_ocb_auth, _gcry_sm4_intel_avx2_crypt_blk1_16)
(sm4_intel_avx2_crypt_blk1_16): New.
(sm4_expand_key): Add Intel SM4 code-path.
(sm4_setkey): Use Intel SM4 implementation if supported by CPU.
(sm4_encrypt, sm4_decrypt, sm4_get_crypt_blk1_16_fn)
(_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
(_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [USE_INTEL_SM4_AVX2]: Add
Intel SM4 code-path.
* configure.ac (gcry_cv_gcc_inline_asm_sm4, HAVE_GCC_INLINE_ASM_SM4): New.
(GCRYPT_ASM_CIPHERS) [x86_64]: Add 'sm4-intel-avx2-amd64.lo'.
* doc/gcrypt.texi: Add "intel-sm4" to HW features list.
* src/g10lib.h (HWF_INTEL_SM4): New.
* src/hwf-x86.c (detect_x86_gnuc): Add Intel SM4 detection.
* src/hwfeatures.c (hwflist): Add "intel-sm4".
--

Add SM4 implementation using the Intel SM4 instruction set extension:
VSM4KEY4 for the key schedule and VSM4RNDS4 for encryption. It is used
in preference to the GFNI/AVX512 implementation when available.

Tested with Intel SDE; not yet tested on real hardware.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am            |   1 +
 cipher/sm4-intel-avx2-amd64.S | 376 ++++++++++++++++++++++++++++++++++
 cipher/sm4.c                  | 216 ++++++++++++++++++-
 configure.ac                  |  24 +++
 doc/gcrypt.texi               |   1 +
 src/g10lib.h                  |   1 +
 src/hwf-x86.c                 |   5 +
 src/hwfeatures.c              |   1 +
 8 files changed, 624 insertions(+), 1 deletion(-)
 create mode 100644 cipher/sm4-intel-avx2-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index f8777837..04094a10 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -138,6 +138,7 @@ EXTRA_libcipher_la_SOURCES = \
 	simd-common-aarch64.h simd-common-ppc.h simd-common-riscv.h \
 	sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
 	sm4-avx2-amd64.h \
+	sm4-intel-avx2-amd64.S \
 	sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \
 	sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \
 	sm4-ppc.c \
diff --git a/cipher/sm4-intel-avx2-amd64.S b/cipher/sm4-intel-avx2-amd64.S
new file mode 100644
index 00000000..4c715104
--- /dev/null
+++ b/cipher/sm4-intel-avx2-amd64.S
@@ -0,0 +1,376 @@
+/* sm4-intel-avx2-amd64.S  -  Intel SM4 instructions accelerated AVX2 SM4 cipher
+ *
+ * Copyright (C) 2026 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_SM4)
+
+#include "asm-common-amd64.h"
+
+/* vector registers */
+#define RA0          %ymm0
+#define RA1          %ymm1
+#define RA2          %ymm2
+#define RA3          %ymm3
+#define RA0x         %xmm0
+#define RA1x         %xmm1
+#define RA2x         %xmm2
+#define RA3x         %xmm3
+
+#define RTMP0        %ymm4
+#define RTMP1        %ymm5
+#define RTMP2        %ymm6
+#define RTMP3        %ymm7
+#define RTMP4        %ymm8
+#define RTMP0x       %xmm4
+#define RTMP1x       %xmm5
+#define RTMP2x       %xmm6
+#define RTMP3x       %xmm7
+#define RTMP4x       %xmm8
+
+#define RB0          %ymm9
+#define RB1          %ymm10
+#define RB2          %ymm11
+#define RB3          %ymm12
+#define RB0x         %xmm9
+#define RB1x         %xmm10
+#define RB2x         %xmm11
+#define RB3x         %xmm12
+
+#define RNOT         %ymm13
+#define RNOTx        %xmm13
+
+SECTION_RODATA
+.align 32
+
+ELF(.type _sm4_intel_avx2_consts,@object)
+_sm4_intel_avx2_consts:
+
+/* For output: reverse all 16 bytes (word order and byte order) */
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.text
+
+/**********************************************************************
+  SM4 key schedule with Intel SM4 instructions
+ **********************************************************************/
+
+.align 16
+.globl _gcry_sm4_intel_avx2_expand_key
+ELF(.type   _gcry_sm4_intel_avx2_expand_key,@function;)
+_gcry_sm4_intel_avx2_expand_key:
+	/* input:
+	 *	%rdi: 128-bit master key
+	 *	%rsi: rkey_enc
+	 *	%rdx: rkey_dec
+	 *	%rcx: fk array
+	 *	%r8: ck array
+	 */
+	CFI_STARTPROC();
+
+	vmovdqu (%rdi), RTMP0x;
+	vpshufb .Lbswap32_mask rRIP, RTMP0x, RTMP0x;
+	vpxor (%rcx), RTMP0x, RTMP0x;
+
+	vmovdqu (0*4*4)(%r8), RA0x;
+	vmovdqu (1*4*4)(%r8), RA1x;
+	vmovdqu (2*4*4)(%r8), RA2x;
+	vmovdqu (3*4*4)(%r8), RA3x;
+	vmovdqu (4*4*4)(%r8), RB0x;
+	vmovdqu (5*4*4)(%r8), RB1x;
+	vmovdqu (6*4*4)(%r8), RB2x;
+	vmovdqu (7*4*4)(%r8), RB3x;
+
+#define EXPAND_ROUND4(ck, r) \
+	vsm4key4 ck, RTMP0x, RTMP0x; \
+	vpshufd $0x1b, RTMP0x, RTMP1x; \
+	vmovdqu RTMP0x, (4*4*(r))(%rsi); \
+	vmovdqu RTMP1x, ((32-4)*4)-4*4*(r)(%rdx);
+
+	EXPAND_ROUND4(RA0x, 0);
+	EXPAND_ROUND4(RA1x, 1);
+	EXPAND_ROUND4(RA2x, 2);
+	EXPAND_ROUND4(RA3x, 3);
+	EXPAND_ROUND4(RB0x, 4);
+	EXPAND_ROUND4(RB1x, 5);
+	EXPAND_ROUND4(RB2x, 6);
+	EXPAND_ROUND4(RB3x, 7);
+
+#undef EXPAND_ROUND4
+
+	vzeroall;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_intel_avx2_expand_key,.-_gcry_sm4_intel_avx2_expand_key;)
+
+/**********************************************************************
+  16-way SM4 with Intel SM4 instructions
+ **********************************************************************/
+
+.align 16
+ELF(.type   __sm4_intel_crypt_blk16,@function;)
+__sm4_intel_crypt_blk16:
+	/* input:
+	 *	%rdi: round key array, CTX
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+	 *						plaintext blocks
+	 * output:
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+	 * 						ciphertext blocks
+	 */
+	CFI_STARTPROC();
+
+	vbroadcasti128 .Lbswap32_mask rRIP, RTMP0;
+	vpshufb RTMP0, RA0, RA0;
+	vpshufb RTMP0, RA1, RA1;
+	vpshufb RTMP0, RA2, RA2;
+	vpshufb RTMP0, RA3, RA3;
+	vpshufb RTMP0, RB0, RB0;
+	vpshufb RTMP0, RB1, RB1;
+	vpshufb RTMP0, RB2, RB2;
+	vpshufb RTMP0, RB3, RB3;
+
+#define ROUND4(rkoff) \
+	vbroadcasti128 (rkoff)(%rdi), RTMP1; \
+	vsm4rnds4 RTMP1, RA0, RA0; \
+	vsm4rnds4 RTMP1, RA1, RA1; \
+	vsm4rnds4 RTMP1, RA2, RA2; \
+	vsm4rnds4 RTMP1, RA3, RA3; \
+	vsm4rnds4 RTMP1, RB0, RB0; \
+	vsm4rnds4 RTMP1, RB1, RB1; \
+	vsm4rnds4 RTMP1, RB2, RB2; \
+	vsm4rnds4 RTMP1, RB3, RB3;
+
+	ROUND4(0 * 16);
+	ROUND4(1 * 16);
+	ROUND4(2 * 16);
+	ROUND4(3 * 16);
+	ROUND4(4 * 16);
+	ROUND4(5 * 16);
+	ROUND4(6 * 16);
+	ROUND4(7 * 16);
+
+#undef ROUND4
+
+	vbroadcasti128 .Lbswap128_mask rRIP, RTMP0;
+	vpshufb RTMP0, RA0, RA0;
+	vpshufb RTMP0, RA1, RA1;
+	vpshufb RTMP0, RA2, RA2;
+	vpshufb RTMP0, RA3, RA3;
+	vpshufb RTMP0, RB0, RB0;
+	vpshufb RTMP0, RB1, RB1;
+	vpshufb RTMP0, RB2, RB2;
+	vpshufb RTMP0, RB3, RB3;
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size __sm4_intel_crypt_blk16,.-__sm4_intel_crypt_blk16;)
+
+/* Four rounds (one VSM4RNDS4 group) on 8, 4, 2 or 1 parallel blocks. */
+#define ROUND4_BLK8(rkoff) \
+	vbroadcasti128 (rkoff)(%rdi), RTMP1; \
+	vsm4rnds4 RTMP1, RA0, RA0; \
+	vsm4rnds4 RTMP1, RA1, RA1; \
+	vsm4rnds4 RTMP1, RA2, RA2; \
+	vsm4rnds4 RTMP1, RA3, RA3;
+
+#define ROUND4_BLK4(rkoff) \
+	vbroadcasti128 (rkoff)(%rdi), RTMP1; \
+	vsm4rnds4 RTMP1, RA0, RA0; \
+	vsm4rnds4 RTMP1, RA1, RA1;
+
+#define ROUND4_BLK2(rkoff) \
+	vbroadcasti128 (rkoff)(%rdi), RTMP1; \
+	vsm4rnds4 RTMP1, RA0, RA0;
+
+#define ROUND4_BLK1(rkoff) \
+	vsm4rnds4 (rkoff)(%rdi), RA0x, RA0x;
+
+/**********************************************************************
+  8/4/2/1-way SM4 with Intel SM4 instructions
+ **********************************************************************/
+.align 16
+.globl _gcry_sm4_intel_avx2_crypt_blk1_16
+ELF(.type   _gcry_sm4_intel_avx2_crypt_blk1_16,@function;)
+_gcry_sm4_intel_avx2_crypt_blk1_16:
+	/* input:
+	 *	%rdi: round key array, CTX
+	 *	%rsi: dst (1..16 blocks)
+	 *	%rdx: src (1..16 blocks)
+	 *	%rcx: num blocks (1..16)
+	 */
+	CFI_STARTPROC();
+
+	cmpq $16, %rcx;
+	jne .Lblk1_15_intel;
+
+	/* Sixteen blocks: use the eight-register block function. */
+	vmovdqu (0 * 32)(%rdx), RA0;
+	vmovdqu (1 * 32)(%rdx), RA1;
+	vmovdqu (2 * 32)(%rdx), RA2;
+	vmovdqu (3 * 32)(%rdx), RA3;
+	vmovdqu (4 * 32)(%rdx), RB0;
+	vmovdqu (5 * 32)(%rdx), RB1;
+	vmovdqu (6 * 32)(%rdx), RB2;
+	vmovdqu (7 * 32)(%rdx), RB3;
+	call __sm4_intel_crypt_blk16;
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA3, (3 * 32)(%rsi);
+	vmovdqu RB0, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vmovdqu RB3, (7 * 32)(%rsi);
+	jmp .Lblk1_16_done_intel;
+
+.Lblk1_15_intel:
+	movq %rcx, %rax;
+	vbroadcasti128 .Lbswap32_mask rRIP, RTMP0;
+	vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+	/* Eight blocks (four YMM registers). */
+	cmpq $8, %rax;
+	jb .Lblk1_16_blk4_intel;
+
+	vmovdqu (0 * 32)(%rdx), RA0;
+	vmovdqu (1 * 32)(%rdx), RA1;
+	vmovdqu (2 * 32)(%rdx), RA2;
+	vmovdqu (3 * 32)(%rdx), RA3;
+	leaq (8 * 16)(%rdx), %rdx;
+	vpshufb RTMP0, RA0, RA0;
+	vpshufb RTMP0, RA1, RA1;
+	vpshufb RTMP0, RA2, RA2;
+	vpshufb RTMP0, RA3, RA3;
+	ROUND4_BLK8(0 * 16);
+	ROUND4_BLK8(1 * 16);
+	ROUND4_BLK8(2 * 16);
+	ROUND4_BLK8(3 * 16);
+	ROUND4_BLK8(4 * 16);
+	ROUND4_BLK8(5 * 16);
+	ROUND4_BLK8(6 * 16);
+	ROUND4_BLK8(7 * 16);
+	vpshufb RTMP2, RA0, RA0;
+	vpshufb RTMP2, RA1, RA1;
+	vpshufb RTMP2, RA2, RA2;
+	vpshufb RTMP2, RA3, RA3;
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA3, (3 * 32)(%rsi);
+	leaq (8 * 16)(%rsi), %rsi;
+
+	subq $8, %rax;
+	jz .Lblk1_16_done_intel;
+
+.Lblk1_16_blk4_intel:
+	/* Four blocks (two YMM registers). */
+	cmpq $4, %rax;
+	jb .Lblk1_16_blk2_intel;
+
+	vmovdqu (0 * 32)(%rdx), RA0;
+	vmovdqu (1 * 32)(%rdx), RA1;
+	leaq (4 * 16)(%rdx), %rdx;
+	vpshufb RTMP0, RA0, RA0;
+	vpshufb RTMP0, RA1, RA1;
+	ROUND4_BLK4(0 * 16);
+	ROUND4_BLK4(1 * 16);
+	ROUND4_BLK4(2 * 16);
+	ROUND4_BLK4(3 * 16);
+	ROUND4_BLK4(4 * 16);
+	ROUND4_BLK4(5 * 16);
+	ROUND4_BLK4(6 * 16);
+	ROUND4_BLK4(7 * 16);
+	vpshufb RTMP2, RA0, RA0;
+	vpshufb RTMP2, RA1, RA1;
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	leaq (4 * 16)(%rsi), %rsi;
+
+	subq $4, %rax;
+	jz .Lblk1_16_done_intel;
+
+.Lblk1_16_blk2_intel:
+	/* Two blocks (one YMM register). */
+	cmpq $2, %rax;
+	jb .Lblk1_16_blk1_intel;
+
+	vmovdqu (0 * 32)(%rdx), RA0;
+	leaq (2 * 16)(%rdx), %rdx;
+	vpshufb RTMP0, RA0, RA0;
+	ROUND4_BLK2(0 * 16);
+	ROUND4_BLK2(1 * 16);
+	ROUND4_BLK2(2 * 16);
+	ROUND4_BLK2(3 * 16);
+	ROUND4_BLK2(4 * 16);
+	ROUND4_BLK2(5 * 16);
+	ROUND4_BLK2(6 * 16);
+	ROUND4_BLK2(7 * 16);
+	vpshufb RTMP2, RA0, RA0;
+	vmovdqu RA0, (0 * 32)(%rsi);
+	leaq (2 * 16)(%rsi), %rsi;
+
+	subq $2, %rax;
+	jz .Lblk1_16_done_intel;
+
+.Lblk1_16_blk1_intel:
+	/* One block (one XMM register). */
+	cmpq $1, %rax;
+	jb .Lblk1_16_done_intel;
+
+	vmovdqu (%rdx), RA0x;
+	vpshufb RTMP0x, RA0x, RA0x;
+	ROUND4_BLK1(0 * 16);
+	ROUND4_BLK1(1 * 16);
+	ROUND4_BLK1(2 * 16);
+	ROUND4_BLK1(3 * 16);
+	ROUND4_BLK1(4 * 16);
+	ROUND4_BLK1(5 * 16);
+	ROUND4_BLK1(6 * 16);
+	ROUND4_BLK1(7 * 16);
+	vpshufb RTMP2x, RA0x, RA0x;
+	vmovdqu RA0x, (%rsi);
+
+.Lblk1_16_done_intel:
+	xorl %eax, %eax;
+	vzeroall;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_intel_avx2_crypt_blk1_16,.-_gcry_sm4_intel_avx2_crypt_blk1_16;)
+
+#undef ROUND4_BLK8
+#undef ROUND4_BLK4
+#undef ROUND4_BLK2
+#undef ROUND4_BLK1
+
+#define FUNC_NAME(func) _gcry_sm4_intel_avx2_ ## func
+#define SM4_CRYPT_BLK16 __sm4_intel_crypt_blk16
+#include "sm4-avx2-amd64.h"
+
+#endif /*defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_SM4)*/
+#endif /*__x86_64*/
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 06b843f8..e397e452 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -74,11 +74,22 @@
 # endif
 #endif
 
+/* USE_INTEL_SM4_AVX2 indicates whether to compile with Intel SM4 instructions
+ * (VSM4RNDS4) based AVX2 code. */
+#undef USE_INTEL_SM4_AVX2
+#if defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_SM4)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_INTEL_SM4_AVX2 1
+# endif
+#endif
+
 /* Assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || \
-    defined(USE_GFNI_AVX2) || defined(USE_GFNI_AVX512)
+    defined(USE_GFNI_AVX2) || defined(USE_GFNI_AVX512) || \
+    defined(USE_INTEL_SM4_AVX2)
 # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 #  define ASM_FUNC_ABI __attribute__((sysv_abi))
 # else
@@ -168,6 +179,9 @@ typedef struct
 #ifdef USE_GFNI_AVX512
   unsigned int use_gfni_avx512:1;
 #endif
+#ifdef USE_INTEL_SM4_AVX2
+  unsigned int use_intel_sm4_avx2:1;
+#endif
 #ifdef USE_AARCH64_SIMD
   unsigned int use_aarch64_simd:1;
 #endif
@@ -356,6 +370,55 @@ sm4_aesni_avx2_crypt_blk1_16(void *rk, byte *out, const byte *in,
 
 #endif /* USE_AESNI_AVX2 */
 
+#ifdef USE_INTEL_SM4_AVX2
+extern void _gcry_sm4_intel_avx2_expand_key(const byte *key, u32 *rk_enc,
+					    u32 *rk_dec, const u32 *fk,
+					    const u32 *ck) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_ctr_enc(const u32 *rk_enc, byte *out,
+					 const byte *in,
+					 byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_cbc_dec(const u32 *rk_dec, byte *out,
+					 const byte *in,
+					 byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_cfb_dec(const u32 *rk_enc, byte *out,
+					 const byte *in,
+					 byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_ocb_enc(const u32 *rk_enc,
+					 unsigned char *out,
+					 const unsigned char *in,
+					 unsigned char *offset,
+					 unsigned char *checksum,
+					 const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_ocb_dec(const u32 *rk_dec,
+					 unsigned char *out,
+					 const unsigned char *in,
+					 unsigned char *offset,
+					 unsigned char *checksum,
+					 const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_intel_avx2_ocb_auth(const u32 *rk_enc,
+					  const unsigned char *abuf,
+					  unsigned char *offset,
+					  unsigned char *checksum,
+					  const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_intel_avx2_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+				   unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_intel_avx2_crypt_blk1_16(void *rk, byte *out, const byte *in,
+			     size_t num_blks)
+{
+  return _gcry_sm4_intel_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+#endif /* USE_INTEL_SM4_AVX2 */
+
 #ifdef USE_GFNI_AVX2
 extern void _gcry_sm4_gfni_avx2_expand_key(const byte *key, u32 *rk_enc,
                                            u32 *rk_dec, const u32 *fk,
@@ -700,6 +763,15 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
   u32 rk[4];
   int i;
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    {
+      _gcry_sm4_intel_avx2_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+                                       fk, ck);
+      return;
+    }
+#endif
+
 #ifdef USE_GFNI_AVX512
   if (ctx->use_gfni_avx512)
     {
@@ -798,6 +870,9 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
 #ifdef USE_GFNI_AVX512
   ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512);
 #endif
+#ifdef USE_INTEL_SM4_AVX2
+  ctx->use_intel_sm4_avx2 = (hwf & HWF_INTEL_SM4) && (hwf & HWF_INTEL_AVX2);
+#endif
 #ifdef USE_AARCH64_SIMD
   ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON);
 #endif
@@ -827,6 +902,26 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
     }
 #endif
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    {
+      /* Disable AESNI and GFNI implementations when Intel SM4 implementation
+       * is enabled. */
+#ifdef USE_AESNI_AVX
+      ctx->use_aesni_avx = 0;
+#endif
+#ifdef USE_AESNI_AVX2
+      ctx->use_aesni_avx2 = 0;
+#endif
+#ifdef USE_GFNI_AVX2
+      ctx->use_gfni_avx2 = 0;
+#endif
+#ifdef USE_GFNI_AVX512
+      ctx->use_gfni_avx512 = 0;
+#endif
+    }
+#endif
+
   ctx->crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
 
   /* Setup bulk encryption routines.  */
@@ -876,6 +971,11 @@ sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
 {
   SM4_context *ctx = context;
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    return sm4_intel_avx2_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
+#endif
+
 #ifdef USE_GFNI_AVX512
   if (ctx->use_gfni_avx512)
     return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
@@ -901,6 +1001,11 @@ sm4_decrypt (void *context, byte *outbuf, const byte *inbuf)
 {
   SM4_context *ctx = context;
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    return sm4_intel_avx2_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
+#endif
+
 #ifdef USE_GFNI_AVX512
   if (ctx->use_gfni_avx512)
     return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
@@ -1005,6 +1110,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
 {
   if (0)
     ;
+#ifdef USE_INTEL_SM4_AVX2
+  else if (ctx->use_intel_sm4_avx2)
+    {
+      return &sm4_intel_avx2_crypt_blk1_16;
+    }
+#endif
 #ifdef USE_GFNI_AVX512
   else if (ctx->use_gfni_avx512)
     {
@@ -1077,6 +1188,21 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
   const byte *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    {
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_sm4_intel_avx2_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
 #ifdef USE_GFNI_AVX512
   if (ctx->use_gfni_avx512)
     {
@@ -1227,6 +1353,21 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    {
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_sm4_intel_avx2_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
 #ifdef USE_GFNI_AVX512
   if (ctx->use_gfni_avx512)
     {
@@ -1376,6 +1517,21 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    {
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_sm4_intel_avx2_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
 #ifdef USE_GFNI_AVX512
   if (ctx->use_gfni_avx512)
     {
@@ -1681,6 +1837,37 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   u64 blkn = c->u_mode.ocb.data_nblocks;
   int burn_stack_depth = 0;
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    {
+      u64 Ls[16];
+      u64 *l;
+
+      if (nblocks >= 16)
+	{
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
+	    {
+	      blkn += 16;
+	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+	      if (encrypt)
+		_gcry_sm4_intel_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+					     c->u_iv.iv, c->u_ctr.ctr, Ls);
+	      else
+		_gcry_sm4_intel_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+					     c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+	      nblocks -= 16;
+	      outbuf += 16 * 16;
+	      inbuf += 16 * 16;
+	    }
+	}
+    }
+#endif
+
 #ifdef USE_GFNI_AVX512
   if (ctx->use_gfni_avx512)
     {
@@ -1861,6 +2048,33 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
   u64 blkn = c->u_mode.ocb.aad_nblocks;
   int burn_stack_depth = 0;
 
+#ifdef USE_INTEL_SM4_AVX2
+  if (ctx->use_intel_sm4_avx2)
+    {
+      u64 Ls[16];
+      u64 *l;
+
+      if (nblocks >= 16)
+	{
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
+	    {
+	      blkn += 16;
+	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+	      _gcry_sm4_intel_avx2_ocb_auth(ctx->rkey_enc, abuf,
+					    c->u_mode.ocb.aad_offset,
+					    c->u_mode.ocb.aad_sum, Ls);
+
+	      nblocks -= 16;
+	      abuf += 16 * 16;
+	    }
+	}
+    }
+#endif
+
 #ifdef USE_GFNI_AVX512
   if (ctx->use_gfni_avx512)
     {
diff --git a/configure.ac b/configure.ac
index b174e518..f12b590e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1636,6 +1636,29 @@ if test "$gcry_cv_gcc_inline_asm_sm3" = "yes" ; then
 fi
 
 
+#
+# Check whether GCC inline assembler supports Intel SM4 instructions.
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports Intel SM4 instructions],
+       [gcry_cv_gcc_inline_asm_sm4],
+       [if test "$mpi_cpu_arch" != "x86" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_sm4="n/a"
+        else
+          gcry_cv_gcc_inline_asm_sm4=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[void a(void) {
+              __asm__("vsm4key4 %%ymm2, %%ymm1, %%ymm3\n\t":::"cc");
+              __asm__("vsm4rnds4 %%ymm2, %%ymm1, %%ymm3\n\t":::"cc");
+            }]], [ a(); ] )],
+          [gcry_cv_gcc_inline_asm_sm4=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_sm4" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_SM4,1,
+     [Defined if inline assembler supports Intel SM4 instructions])
+fi
+
+
 #
 # Check whether GCC inline assembler supports SSE4.1 instructions.
 #
@@ -3850,6 +3873,7 @@ if test "$found" = "1" ; then
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo"
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo"
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx512-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-intel-avx2-amd64.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index b233cd78..004c4e30 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -588,6 +588,7 @@ are
 @item intel-gfni
 @item intel-sha512
 @item intel-sm3
+@item intel-sm4
 @item arm-neon
 @item arm-aes
 @item arm-sha1
diff --git a/src/g10lib.h b/src/g10lib.h
index d0e64a69..0b4ca0c7 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -241,6 +241,7 @@ char **_gcry_strtokenize (const char *string, const char *delim);
 #define HWF_INTEL_GFNI          (1 << 18)
 #define HWF_INTEL_SHA512        (1 << 19)
 #define HWF_INTEL_SM3           (1 << 20)
+#define HWF_INTEL_SM4           (1 << 21)
 
 #elif defined(HAVE_CPU_ARCH_ARM)
 
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index f8c3c948..df3ba4f9 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -429,6 +429,11 @@ detect_x86_gnuc (
       /* Test bit 1 for Intel SM3 instructions. */
       if ((intel_feat3 & (1 << 1)) && os_supports_avx_avx2_registers)
         result |= HWF_INTEL_SM3;
+
+      /* Test bit 2 for Intel SM4 instructions.  These instructions are
+       * available for AVX/AVX2/AVX512. */
+      if ((intel_feat3 & (1 << 2)) && os_supports_avx_avx2_registers)
+        result |= HWF_INTEL_SM4;
     }
 
   /* Check additional feature flags. */
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 4f9053af..dbc84857 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -86,6 +86,7 @@ static struct
     { HWF_INTEL_GFNI,          "intel-gfni" },
     { HWF_INTEL_SHA512,        "intel-sha512" },
     { HWF_INTEL_SM3,           "intel-sm3" },
+    { HWF_INTEL_SM4,           "intel-sm4" },
     /* Following removed HW feature strings are kept for API compatibility. */
     { 0,                       "intel-fast-vpgather" },
 #elif defined(HAVE_CPU_ARCH_ARM)
-- 
2.53.0