[PATCH] sm4: add ARMv8 CE accelerated implementation for XTS mode

Tianjia Zhang tianjia.zhang at linux.alibaba.com
Thu Jul 28 10:26:54 CEST 2022


* cipher/sm4-armv8-aarch64-ce.S (_gcry_sm4_armv8_ce_xts_crypt): New.
* cipher/sm4.c (_gcry_sm4_armv8_ce_xts_crypt): New.
(_gcry_sm4_xts_crypt) [USE_ARM_CE]: Add ARMv8 CE implementation for XTS.
--

Benchmark on T-Head Yitian-710 2.75 GHz:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |     0.373 ns/B      2560 MiB/s      1.02 c/B      2749
        XTS dec |     0.372 ns/B      2562 MiB/s      1.02 c/B      2750

After (1.18x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |     0.314 ns/B      3038 MiB/s     0.863 c/B      2749
        XTS dec |     0.314 ns/B      3037 MiB/s     0.863 c/B      2749

Signed-off-by: Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
---
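
For reviewers, a scalar C sketch of what the new code computes per block may
help: the standard XTS composition, with the GF(2^128) doubling that the
tweak_next macro below implements with NEON. The helper names here
(sm4_encrypt_block, xts_tweak_mul_x, xts_enc_block) are illustrative only
and are not symbols added by this patch:

  #include <stdint.h>
  #include <string.h>

  /* Illustrative stand-in for one SM4 block encryption (what SM4_CRYPT_BLK
   * and crypt_blk8 do in the assembly); not a symbol added by this patch. */
  void sm4_encrypt_block (const uint32_t *rk, uint8_t dst[16],
                          const uint8_t src[16]);

  /* Multiply the tweak by x in GF(2^128) with reduction polynomial 0x87;
   * scalar equivalent of the tweak_next macro.  The tweak is taken as a
   * little-endian 128-bit value: t[0] = low half, t[1] = high half. */
  static void
  xts_tweak_mul_x (uint64_t t[2])
  {
    uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;

    t[1] = (t[1] << 1) | (t[0] >> 63);
    t[0] = (t[0] << 1) ^ carry;
  }

  /* One XTS block: C = E_K(P xor T) xor T, then advance T.  Assumes a
   * little-endian host, as on AArch64. */
  static void
  xts_enc_block (const uint32_t *rk, uint8_t *dst, const uint8_t *src,
                 uint64_t tweak[2])
  {
    uint64_t blk[2];

    memcpy (blk, src, 16);
    blk[0] ^= tweak[0];
    blk[1] ^= tweak[1];
    sm4_encrypt_block (rk, (uint8_t *)blk, (const uint8_t *)blk);
    blk[0] ^= tweak[0];
    blk[1] ^= tweak[1];
    memcpy (dst, blk, 16);

    xts_tweak_mul_x (tweak);
  }

The assembly additionally keeps a byte-rotated copy of the tweak in RIV so
that the reduction and carry masks for both 64-bit halves can be derived
with a single sshr+and pair per tweak_next step.
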
 cipher/sm4-armv8-aarch64-ce.S | 151 ++++++++++++++++++++++++++++++++++
 cipher/sm4.c                  |  18 +++-
 2 files changed, 168 insertions(+), 1 deletion(-)
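
As a usage note, not part of the patch: once ctx->use_arm_ce is set by the
runtime hardware feature detection, the accelerated XTS path is reached
through the regular libgcrypt cipher API. A minimal, hypothetical test
program (the buffer size and all-zero key/tweak are placeholders):

  #include <gcrypt.h>
  #include <stdio.h>

  int
  main (void)
  {
    gcry_cipher_hd_t hd;
    const unsigned char key[32] = { 0 };   /* XTS: double-length key (2 x 16) */
    unsigned char tweak[16] = { 0 };       /* 128-bit XTS tweak */
    unsigned char buf[4096] = { 0 };       /* one data unit, encrypted in place */
    gcry_error_t err;

    if (!gcry_check_version (GCRYPT_VERSION))
      return 1;
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

    if (gcry_cipher_open (&hd, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_XTS, 0))
      return 1;
    err = gcry_cipher_setkey (hd, key, sizeof key);
    if (!err)
      err = gcry_cipher_setiv (hd, tweak, sizeof tweak);
    if (!err)
      err = gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
    if (err)
      fprintf (stderr, "SM4-XTS: %s\n", gcry_strerror (err));

    gcry_cipher_close (hd);
    return err ? 1 : 0;
  }
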

diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S
index 5fb55947edc1..1a4ff736ad27 100644
--- a/cipher/sm4-armv8-aarch64-ce.S
+++ b/cipher/sm4-armv8-aarch64-ce.S
@@ -62,6 +62,7 @@
 #define RTMP3   v19
 
 #define RIV     v20
+#define RMASK   v21
 
 /* Helper macros. */
 
@@ -69,6 +70,20 @@
         ld1 {v24.16b-v27.16b}, [ptr], #64; \
         ld1 {v28.16b-v31.16b}, [ptr];
 
+#define SM4_CRYPT_BLK(b0)                       \
+        rev32       b0.16b, b0.16b;             \
+        sm4e(b0, v24);                          \
+        sm4e(b0, v25);                          \
+        sm4e(b0, v26);                          \
+        sm4e(b0, v27);                          \
+        sm4e(b0, v28);                          \
+        sm4e(b0, v29);                          \
+        sm4e(b0, v30);                          \
+        sm4e(b0, v31);                          \
+        rev64       b0.4s, b0.4s;               \
+        ext         b0.16b, b0.16b, b0.16b, #8; \
+        rev32       b0.16b, b0.16b;
+
 #define crypt_blk4(b0, b1, b2, b3)         \
         rev32 b0.16b, b0.16b;              \
         rev32 b1.16b, b1.16b;              \
@@ -577,4 +592,140 @@ _gcry_sm4_armv8_ce_ctr_enc:
     CFI_ENDPROC();
 ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)
 
+.align 3
+.global _gcry_sm4_armv8_ce_xts_crypt
+ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;)
+_gcry_sm4_armv8_ce_xts_crypt:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: tweak (little endian, 128 bit)
+     *   x4: nblocks
+     */
+    CFI_STARTPROC()
+    VPUSH_ABI
+
+    load_rkey(x0)
+
+    mov         x7, #0x87
+    mov         x8, #0x1
+    mov         RMASK.d[0], x7
+    mov         RMASK.d[1], x8
+
+    ld1         {RIV.16b}, [x3]
+    mov         v8.16b, RIV.16b
+    ext         RIV.16b, RIV.16b, RIV.16b, #8
+
+.Lxts_loop_blk:
+    sub         x4, x4, #8
+    tbnz        x4, #63, .Lxts_tail8
+
+#define tweak_next(vt, vin, RTMP)                       \
+        sshr        RTMP.2d, RIV.2d, #63;               \
+        add         vt.2d, vin.2d, vin.2d;              \
+        and         RTMP.16b, RTMP.16b, RMASK.16b;      \
+        add         RIV.2d, RIV.2d, RIV.2d;             \
+        eor         vt.16b, vt.16b, RTMP.16b;
+
+    tweak_next( v9,  v8, RTMP0)
+    tweak_next(v10,  v9, RTMP1)
+    tweak_next(v11, v10, RTMP2)
+    tweak_next(v12, v11, RTMP3)
+    tweak_next(v13, v12, RTMP0)
+    tweak_next(v14, v13, RTMP1)
+    tweak_next(v15, v14, RTMP2)
+
+    ld1         {v0.16b-v3.16b}, [x2], #64
+    eor         v0.16b, v0.16b,  v8.16b
+    eor         v1.16b, v1.16b,  v9.16b
+    eor         v2.16b, v2.16b, v10.16b
+    eor         v3.16b, v3.16b, v11.16b
+    ld1         {v4.16b-v7.16b}, [x2], #64
+    eor         v4.16b, v4.16b, v12.16b
+    eor         v5.16b, v5.16b, v13.16b
+    eor         v6.16b, v6.16b, v14.16b
+    eor         v7.16b, v7.16b, v15.16b
+
+    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+    eor         v0.16b, v0.16b,  v8.16b
+    eor         v1.16b, v1.16b,  v9.16b
+    eor         v2.16b, v2.16b, v10.16b
+    eor         v3.16b, v3.16b, v11.16b
+    st1         {v0.16b-v3.16b}, [x1], #64
+    eor         v4.16b, v4.16b, v12.16b
+    eor         v5.16b, v5.16b, v13.16b
+    eor         v6.16b, v6.16b, v14.16b
+    eor         v7.16b, v7.16b, v15.16b
+    st1         {v4.16b-v7.16b}, [x1], #64
+
+    tweak_next(v8, v15, RTMP3)
+
+    cbz         x4, .Lxts_end
+    b           .Lxts_loop_blk
+
+.Lxts_tail8:
+    add         x4, x4, #8
+    cmp         x4, #4
+    blt         .Lxts_tail4
+
+    sub         x4, x4, #4
+
+    tweak_next( v9,  v8, RTMP0)
+    tweak_next(v10,  v9, RTMP1)
+    tweak_next(v11, v10, RTMP2)
+
+    ld1         {v0.16b-v3.16b}, [x2], #64
+    eor         v0.16b, v0.16b,  v8.16b
+    eor         v1.16b, v1.16b,  v9.16b
+    eor         v2.16b, v2.16b, v10.16b
+    eor         v3.16b, v3.16b, v11.16b
+
+    crypt_blk4(v0, v1, v2, v3);
+
+    eor         v0.16b, v0.16b,  v8.16b
+    eor         v1.16b, v1.16b,  v9.16b
+    eor         v2.16b, v2.16b, v10.16b
+    eor         v3.16b, v3.16b, v11.16b
+    st1         {v0.16b-v3.16b}, [x1], #64
+
+    tweak_next(v8, v11, RTMP3)
+
+    cbz         x4, .Lxts_end
+
+.Lxts_tail4:
+    sub         x4, x4, #1
+
+    ld1         {v0.16b}, [x2], #16
+    eor         v0.16b, v0.16b, v8.16b
+
+    SM4_CRYPT_BLK(v0)
+
+    eor         v0.16b, v0.16b, v8.16b
+    st1         {v0.16b}, [x1], #16
+
+    tweak_next(v8, v8, RTMP0)
+
+    cbnz        x4, .Lxts_tail4
+
+.Lxts_end:
+    /* store new tweak */
+    st1         {v8.16b}, [x3]
+
+    CLEAR_REG(v8)
+    CLEAR_REG(v9)
+    CLEAR_REG(v10)
+    CLEAR_REG(v11)
+    CLEAR_REG(v12)
+    CLEAR_REG(v13)
+    CLEAR_REG(v14)
+    CLEAR_REG(v15)
+    CLEAR_REG(RIV)
+
+    VPOP_ABI
+    ret_spec_stop
+    CFI_ENDPROC()
+ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;)
+
 #endif
diff --git a/cipher/sm4.c b/cipher/sm4.c
index b5d4691ddbcb..4cac3b6c64b0 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -1,6 +1,6 @@
 /* sm4.c  -  SM4 Cipher Algorithm
  * Copyright (C) 2020 Alibaba Group.
- * Copyright (C) 2020 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
+ * Copyright (C) 2020-2022 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
  * Copyright (C) 2020-2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
@@ -539,6 +539,11 @@ extern void _gcry_sm4_armv8_ce_cfb_dec(const u32 *rk_enc, byte *out,
 				       byte *iv,
 				       size_t nblocks);
 
+extern void _gcry_sm4_armv8_ce_xts_crypt(const u32 *rk, byte *out,
+					 const byte *in,
+					 byte *tweak,
+					 size_t nblocks);
+
 extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out,
 					    const byte *in,
 					    size_t num_blocks);
@@ -1510,6 +1515,17 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_ARM_CE
+  if (ctx->use_arm_ce)
+    {
+      /* Process all blocks at once. */
+      _gcry_sm4_armv8_ce_xts_crypt(encrypt ? ctx->rkey_enc : ctx->rkey_dec,
+                                   outbuf, inbuf, tweak, nblocks);
+
+      nblocks = 0;
+    }
+#endif
+
   /* Process remaining blocks. */
   if (nblocks)
     {
-- 
2.24.3 (Apple Git-128)
