[PATCH] sm4: add ARMv8 CE accelerated implementation for XTS mode
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Aug 1 16:25:34 CEST 2022
Hello,
Patch applied to master, thanks.
-Jussi
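(The throughput figures quoted below look like output from libgcrypt's
tests/bench-slope tool. Assuming that is the tool used -- the mail itself
does not say -- a comparable before/after measurement would be along the
lines of:

  $ ./tests/bench-slope --cpu-mhz auto cipher sm4

where --cpu-mhz auto is what produces the "auto Mhz" column.)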
On 28.7.2022 11.26, Tianjia Zhang via Gcrypt-devel wrote:
> * cipher/sm4-armv8-aarch64-ce.S (_gcry_sm4_armv8_ce_xts_crypt): New.
> * cipher/sm4.c (_gcry_sm4_armv8_ce_xts_crypt): New.
> (_gcry_sm4_xts_crypt) [USE_ARM_CE]: Add ARMv8 CE implementation for XTS.
> --
>
> Benchmark on T-Head Yitian-710 2.75 GHz:
>
> Before:
>  SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>         XTS enc |     0.373 ns/B      2560 MiB/s      1.02 c/B      2749
>         XTS dec |     0.372 ns/B      2562 MiB/s      1.02 c/B      2750
>
> After (1.18x faster):
>  SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>         XTS enc |     0.314 ns/B      3038 MiB/s     0.863 c/B      2749
>         XTS dec |     0.314 ns/B      3037 MiB/s     0.863 c/B      2749
>
> Signed-off-by: Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> ---
> cipher/sm4-armv8-aarch64-ce.S | 151 ++++++++++++++++++++++++++++++++++
> cipher/sm4.c | 18 +++-
> 2 files changed, 168 insertions(+), 1 deletion(-)
>
> diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S
> index 5fb55947edc1..1a4ff736ad27 100644
> --- a/cipher/sm4-armv8-aarch64-ce.S
> +++ b/cipher/sm4-armv8-aarch64-ce.S
> @@ -62,6 +62,7 @@
> #define RTMP3 v19
>
> #define RIV v20
> +#define RMASK v21
>
> /* Helper macros. */
>
> @@ -69,6 +70,20 @@
> ld1 {v24.16b-v27.16b}, [ptr], #64; \
> ld1 {v28.16b-v31.16b}, [ptr];
>
> +#define SM4_CRYPT_BLK(b0) \
> + rev32 b0.16b, b0.16b; \
> + sm4e(b0, v24); \
> + sm4e(b0, v25); \
> + sm4e(b0, v26); \
> + sm4e(b0, v27); \
> + sm4e(b0, v28); \
> + sm4e(b0, v29); \
> + sm4e(b0, v30); \
> + sm4e(b0, v31); \
> + rev64 b0.4s, b0.4s; \
> + ext b0.16b, b0.16b, b0.16b, #8; \
> + rev32 b0.16b, b0.16b;
> +
> #define crypt_blk4(b0, b1, b2, b3) \
> rev32 b0.16b, b0.16b; \
> rev32 b1.16b, b1.16b; \
> @@ -577,4 +592,140 @@ _gcry_sm4_armv8_ce_ctr_enc:
> CFI_ENDPROC();
> ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)
>
> +.align 3
> +.global _gcry_sm4_armv8_ce_xts_crypt
> +ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;)
> +_gcry_sm4_armv8_ce_xts_crypt:
> + /* input:
> + * x0: round key array, CTX
> + * x1: dst
> + * x2: src
> + * x3: tweak (big endian, 128 bit)
> + * x4: nblocks
> + */
> + CFI_STARTPROC()
> + VPUSH_ABI
> +
> + load_rkey(x0)
> +
> + mov x7, #0x87
> + mov x8, #0x1
> + mov RMASK.d[0], x7
> + mov RMASK.d[1], x8
> +
> + ld1 {RIV.16b}, [x3]
> + mov v8.16b, RIV.16b
> + ext RIV.16b, RIV.16b, RIV.16b, #8
> +
> +.Lxts_loop_blk:
> + sub x4, x4, #8
> + tbnz x4, #63, .Lxts_tail8
> +
> +#define tweak_next(vt, vin, RTMP) \
> + sshr RTMP.2d, RIV.2d, #63; \
> + add vt.2d, vin.2d, vin.2d; \
> + and RTMP.16b, RTMP.16b, RMASK.16b; \
> + add RIV.2d, RIV.2d, RIV.2d; \
> + eor vt.16b, vt.16b, RTMP.16b;
> +
> + tweak_next( v9, v8, RTMP0)
> + tweak_next(v10, v9, RTMP1)
> + tweak_next(v11, v10, RTMP2)
> + tweak_next(v12, v11, RTMP3)
> + tweak_next(v13, v12, RTMP0)
> + tweak_next(v14, v13, RTMP1)
> + tweak_next(v15, v14, RTMP2)
> +
> + ld1 {v0.16b-v3.16b}, [x2], #64
> + eor v0.16b, v0.16b, v8.16b
> + eor v1.16b, v1.16b, v9.16b
> + eor v2.16b, v2.16b, v10.16b
> + eor v3.16b, v3.16b, v11.16b
> + ld1 {v4.16b-v7.16b}, [x2], #64
> + eor v4.16b, v4.16b, v12.16b
> + eor v5.16b, v5.16b, v13.16b
> + eor v6.16b, v6.16b, v14.16b
> + eor v7.16b, v7.16b, v15.16b
> +
> + crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7)
> +
> + eor v0.16b, v0.16b, v8.16b
> + eor v1.16b, v1.16b, v9.16b
> + eor v2.16b, v2.16b, v10.16b
> + eor v3.16b, v3.16b, v11.16b
> + st1 {v0.16b-v3.16b}, [x1], #64
> + eor v4.16b, v4.16b, v12.16b
> + eor v5.16b, v5.16b, v13.16b
> + eor v6.16b, v6.16b, v14.16b
> + eor v7.16b, v7.16b, v15.16b
> + st1 {v4.16b-v7.16b}, [x1], #64
> +
> + tweak_next(v8, v15, RTMP3)
> +
> + cbz x4, .Lxts_end
> + b .Lxts_loop_blk
> +
> +.Lxts_tail8:
> + add x4, x4, #8
> + cmp x4, #4
> + blt .Lxts_tail4
> +
> + sub x4, x4, #4
> +
> + tweak_next( v9, v8, RTMP0)
> + tweak_next(v10, v9, RTMP1)
> + tweak_next(v11, v10, RTMP2)
> +
> + ld1 {v0.16b-v3.16b}, [x2], #64
> + eor v0.16b, v0.16b, v8.16b
> + eor v1.16b, v1.16b, v9.16b
> + eor v2.16b, v2.16b, v10.16b
> + eor v3.16b, v3.16b, v11.16b
> +
> + crypt_blk4(v0, v1, v2, v3);
> +
> + eor v0.16b, v0.16b, v8.16b
> + eor v1.16b, v1.16b, v9.16b
> + eor v2.16b, v2.16b, v10.16b
> + eor v3.16b, v3.16b, v11.16b
> + st1 {v0.16b-v3.16b}, [x1], #64
> +
> + tweak_next(v8, v11, RTMP3)
> +
> + cbz x4, .Lxts_end
> +
> +.Lxts_tail4:
> + sub x4, x4, #1
> +
> + ld1 {v0.16b}, [x2], #16
> + eor v0.16b, v0.16b, v8.16b
> +
> + SM4_CRYPT_BLK(v0)
> +
> + eor v0.16b, v0.16b, v8.16b
> + st1 {v0.16b}, [x1], #16
> +
> + tweak_next(v8, v8, RTMP0)
> +
> + cbnz x4, .Lxts_tail4
> +
> +.Lxts_end:
> + /* store new tweak */
> + st1 {v8.16b}, [x3]
> +
> + CLEAR_REG(v8)
> + CLEAR_REG(v9)
> + CLEAR_REG(v10)
> + CLEAR_REG(v11)
> + CLEAR_REG(v12)
> + CLEAR_REG(v13)
> + CLEAR_REG(v14)
> + CLEAR_REG(v15)
> + CLEAR_REG(RIV)
> +
> + VPOP_ABI
> + ret_spec_stop
> + CFI_ENDPROC()
> +ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;)
> +
> #endif
> diff --git a/cipher/sm4.c b/cipher/sm4.c
> index b5d4691ddbcb..4cac3b6c64b0 100644
> --- a/cipher/sm4.c
> +++ b/cipher/sm4.c
> @@ -1,6 +1,6 @@
> /* sm4.c - SM4 Cipher Algorithm
> * Copyright (C) 2020 Alibaba Group.
> - * Copyright (C) 2020 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> + * Copyright (C) 2020-2022 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> * Copyright (C) 2020-2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
> *
> * This file is part of Libgcrypt.
> @@ -539,6 +539,11 @@ extern void _gcry_sm4_armv8_ce_cfb_dec(const u32 *rk_enc, byte *out,
> byte *iv,
> size_t nblocks);
>
> +extern void _gcry_sm4_armv8_ce_xts_crypt(const u32 *rk, byte *out,
> + const byte *in,
> + byte *tweak,
> + size_t nblocks);
> +
> extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out,
> const byte *in,
> size_t num_blocks);
> @@ -1510,6 +1515,17 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
> const unsigned char *inbuf = inbuf_arg;
> int burn_stack_depth = 0;
>
> +#ifdef USE_ARM_CE
> + if (ctx->use_arm_ce)
> + {
> + /* Process all blocks at once. */
> + _gcry_sm4_armv8_ce_xts_crypt(encrypt ? ctx->rkey_enc : ctx->rkey_dec,
> + outbuf, inbuf, tweak, nblocks);
> +
> + nblocks = 0;
> + }
> +#endif
> +
> /* Process remaining blocks. */
> if (nblocks)
> {
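(For readers skimming the assembly: the 0x87 constant loaded into RMASK is
the XTS reduction constant, and each tweak_next step advances the tweak by
one multiplication by x in GF(2^128), i.e. modulo x^128 + x^7 + x^2 + x + 1.
As a point of reference, here is a minimal scalar sketch of that per-block
update, assuming the tweak is held as two little-endian 64-bit limbs
(t[0] low, t[1] high); the helper name xts_tweak_mul_x is just for
illustration and is not part of the patch:

  #include <stdint.h>

  /* Multiply the 128-bit XTS tweak by x in GF(2^128).  t[0] holds the low
   * 64 bits, t[1] the high 64 bits.  Scalar illustration only, not taken
   * from the patch. */
  static void xts_tweak_mul_x (uint64_t t[2])
  {
    uint64_t carry = (t[1] >> 63) * 0x87;  /* reduction term if the top bit is set */

    t[1] = (t[1] << 1) | (t[0] >> 63);     /* shift the 128-bit value left by one */
    t[0] = (t[0] << 1) ^ carry;            /* fold the reduction constant back in */
  }

On the C side, the new dispatch in _gcry_sm4_xts_crypt hands every block to
_gcry_sm4_armv8_ce_xts_crypt and then sets nblocks to 0, so the generic
per-block fallback that follows is skipped whenever the ARMv8 CE path is
available.)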