[PATCH 4/4] Add ARMv8-CE HW acceleration for GCM-SIV counter mode
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Aug 13 17:01:29 CEST 2021
* cipher/rijndael-armv8-aarch32-ce.S
(_gcry_aes_ctr32le_enc_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S
(_gcry_aes_ctr32le_enc_armv8_ce): New.
* cipher/rijndael-armv8-ce.c
(_gcry_aes_ctr32le_enc_armv8_ce)
(_gcry_aes_armv8_ce_ctr32le_enc): New.
* cipher/rijndael.c
(_gcry_aes_armv8_ce_ctr32le_enc): New prototype.
(do_setkey): Add setup of 'bulk_ops->ctr32le_enc' for ARMv8-CE.
--
Benchmark on Cortex-A53 (aarch64):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
    GCM-SIV enc |     11.77 ns/B     81.03 MiB/s      7.63 c/B     647.9
    GCM-SIV dec |     11.92 ns/B     79.98 MiB/s      7.73 c/B     647.9
   GCM-SIV auth |      2.99 ns/B     318.9 MiB/s      1.94 c/B     648.0

After (~2.4x faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
    GCM-SIV enc |      4.66 ns/B     204.5 MiB/s      3.02 c/B     647.9
    GCM-SIV dec |      4.82 ns/B     198.0 MiB/s      3.12 c/B     647.9
   GCM-SIV auth |      3.00 ns/B     318.4 MiB/s      1.94 c/B     648.0
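
For context, the ctr32le bulk operation implements the counter mode used
by GCM-SIV (RFC 8452): only the first 32-bit word of the 16-byte counter
block is incremented, as a little-endian integer wrapping modulo 2^32,
with no carry into the remaining twelve bytes. A minimal C sketch of
these semantics (the function name is illustrative, not libgcrypt code):

  #include <stdint.h>

  /* Advance a GCM-SIV counter block: bytes 0..3 hold a little-endian
   * 32-bit counter that wraps mod 2^32; bytes 4..15 never change. */
  static void ctr32le_inc (unsigned char ctr[16])
  {
    uint32_t lo = (uint32_t)ctr[0] | ((uint32_t)ctr[1] << 8)
                | ((uint32_t)ctr[2] << 16) | ((uint32_t)ctr[3] << 24);
    lo += 1;  /* wraps; no carry into ctr[4..15] */
    ctr[0] = lo & 0xff;
    ctr[1] = (lo >> 8) & 0xff;
    ctr[2] = (lo >> 16) & 0xff;
    ctr[3] = (lo >> 24) & 0xff;
  }

Each input block is XORed with the encrypted counter block and the
counter is advanced once per block; the assembly below vectorizes this
four blocks at a time.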
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-armv8-aarch32-ce.S | 121 +++++++++++++++++++++++++++++
cipher/rijndael-armv8-aarch64-ce.S | 109 ++++++++++++++++++++++++++
cipher/rijndael-armv8-ce.c | 17 ++++
cipher/rijndael.c | 5 ++
4 files changed, 252 insertions(+)
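
The 4-way loops generate four counter blocks per iteration using plain
vector adds against an increment vector whose lane 0 is one (the aarch32
variant instead builds -1..-4 in lane 0 and subtracts). As a rough C/NEON
illustration of the aarch64 counter setup (the helper name and layout are
mine, not part of the patch):

  #include <arm_neon.h>

  /* 'one' is {1,0,0,0}; lanes 1..3 of every increment are zero, so
   * only the low 32-bit word of the counter changes, with no carry. */
  static inline uint32x4_t ctr32le_x4 (uint32x4_t ctr, uint32x4_t one,
                                       uint32x4_t out[4])
  {
    uint32x4_t two = vaddq_u32 (one, one);
    out[0] = ctr;                                    /* ctr + 0 */
    out[1] = vaddq_u32 (ctr, one);                   /* ctr + 1 */
    out[2] = vaddq_u32 (ctr, two);                   /* ctr + 2 */
    out[3] = vaddq_u32 (ctr, vaddq_u32 (two, one));  /* ctr + 3 */
    return vaddq_u32 (ctr, vaddq_u32 (two, two));    /* ctr + 4, next iteration */
  }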
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index 66440bd4..6d78af0a 100644
--- a/cipher/rijndael-armv8-aarch32-ce.S
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -1016,6 +1016,127 @@ _gcry_aes_ctr_enc_armv8_ce:
.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;
+/*
+ * void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
+ *                                      unsigned char *outbuf,
+ *                                      const unsigned char *inbuf,
+ *                                      unsigned char *iv, size_t nblocks,
+ *                                      unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr32le_enc_armv8_ce
+.type _gcry_aes_ctr32le_enc_armv8_ce,%function;
+_gcry_aes_ctr32le_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ cmp r4, #0
+ beq .Lctr32le_enc_skip
+
+ cmp r5, #12
+ vld1.8 {q0}, [r3] /* load IV */
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lctr32le_enc_entry_192
+ bhi .Lctr32le_enc_entry_256
+
+#define CTR_ENC(bits, ...) \
+ .Lctr32le_enc_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lctr32le_enc_loop_##bits; \
+ \
+ .Lctr32le_enc_loop4_##bits: \
+ veor q2, q2; \
+ sub r4, r4, #4; \
+ vmov.i64 d4, #0xffffffff; /* q2 <= -1:0:0:0 */ \
+ vmov q1, q0; \
+ vadd.u32 q3, q2, q2; /* q3 <= -2:0:0:0 */ \
+ vadd.u32 q0, q3, q3; /* q0 <= -4:0:0:0 */ \
+ vadd.u32 q4, q3, q2; /* q4 <= -3:0:0:0 */ \
+ vsub.u32 q0, q1, q0; \
+ vsub.u32 q2, q1, q2; \
+ vst1.8 {q0}, [r3]; \
+ vsub.u32 q3, q1, q3; \
+ vsub.u32 q4, q1, q4; \
+ \
+ cmp r4, #4; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ veor q2, q2, q0; \
+ veor q3, q3, q1; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ vst1.8 {q2}, [r1]!; /* store plaintext */ \
+ veor q4, q4, q0; \
+ vld1.8 {q0}, [r3]; /* reload IV */ \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lctr32le_enc_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lctr32le_enc_done; \
+ \
+ .Lctr32le_enc_loop_##bits: \
+ \
+ veor q2, q2; \
+ vmov q1, q0; \
+ vmov.i64 d4, #0xffffffff; /* q2 <= -1:0:0:0 */ \
+ subs r4, r4, #1; \
+ vsub.u32 q0, q0, q2; \
+ vld1.8 {q2}, [r2]!; /* load ciphertext */ \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q2, q1; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lctr32le_enc_loop_##bits; \
+ b .Lctr32le_enc_done;
+
+ CTR_ENC(128)
+ CTR_ENC(192, r0, r6)
+ CTR_ENC(256, r0, r6)
+
+#undef CTR_ENC
+
+.Lctr32le_enc_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lctr32le_enc_skip:
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce;
+
+
/*
* void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
* unsigned char *outbuf,
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
index 3af29e0d..a87d2ca5 100644
--- a/cipher/rijndael-armv8-aarch64-ce.S
+++ b/cipher/rijndael-armv8-aarch64-ce.S
@@ -676,6 +676,115 @@ _gcry_aes_ctr_enc_armv8_ce:
ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;)
+/*
+ * void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
+ *                                      unsigned char *outbuf,
+ *                                      const unsigned char *inbuf,
+ *                                      unsigned char *iv, size_t nblocks,
+ *                                      unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr32le_enc_armv8_ce
+ELF(.type _gcry_aes_ctr32le_enc_armv8_ce,%function;)
+_gcry_aes_ctr32le_enc_armv8_ce:
+ /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x4, .Lctr32le_enc_skip
+
+ mov w6, #1
+ movi v16.16b, #0
+ mov v16.S[0], w6
+
+ /* load IV */
+ ld1 {v0.16b}, [x3]
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lctr32le_enc_entry_192
+ b.hi .Lctr32le_enc_entry_256
+
+#define CTR_ENC(bits) \
+ .Lctr32le_enc_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lctr32le_enc_loop_##bits; \
+ \
+ .Lctr32le_enc_loop4_##bits: \
+ sub x4, x4, #4; \
+ \
+ add v3.4s, v16.4s, v16.4s; /* 2 */ \
+ mov v1.16b, v0.16b; \
+ add v2.4s, v0.4s, v16.4s; \
+ add v4.4s, v3.4s, v16.4s; /* 3 */ \
+ add v6.4s, v3.4s, v3.4s; /* 4 */ \
+ add v3.4s, v0.4s, v3.4s; \
+ add v4.4s, v0.4s, v4.4s; \
+ add v0.4s, v0.4s, v6.4s; \
+ \
+ cmp x4, #4; \
+ ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; \
+ ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \
+ eor v2.16b, v2.16b, v6.16b; \
+ eor v3.16b, v3.16b, v7.16b; \
+ eor v4.16b, v4.16b, v5.16b; \
+ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+ \
+ b.hs .Lctr32le_enc_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lctr32le_enc_done; \
+ \
+ .Lctr32le_enc_loop_##bits: \
+ \
+ mov v1.16b, v0.16b; \
+ ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \
+ sub x4, x4, #1; \
+ add v0.4s, v0.4s, v16.4s; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ eor v1.16b, v2.16b, v1.16b; \
+ st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x4, .Lctr32le_enc_loop_##bits; \
+ b .Lctr32le_enc_done;
+
+ CTR_ENC(128)
+ CTR_ENC(192)
+ CTR_ENC(256)
+
+#undef CTR_ENC
+
+.Lctr32le_enc_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lctr32le_enc_skip:
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce;)
+
+
/*
* void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
* unsigned char *outbuf,
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
index 6e46830e..b24ae3e9 100644
--- a/cipher/rijndael-armv8-ce.c
+++ b/cipher/rijndael-armv8-ce.c
@@ -75,6 +75,12 @@ extern void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
unsigned char *iv, size_t nblocks,
unsigned int nrounds);
+extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+
extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
unsigned char *outbuf,
const unsigned char *inbuf,
@@ -345,6 +351,17 @@ _gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *iv,
_gcry_aes_ctr_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
}
+void
+_gcry_aes_armv8_ce_ctr32le_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf,
+ const unsigned char *inbuf, size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_ctr32le_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
size_t
_gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index c096321f..df41b911 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -209,6 +209,10 @@ extern void _gcry_aes_armv8_ce_cbc_enc (void *context, unsigned char *iv,
extern void _gcry_aes_armv8_ce_ctr_enc (void *context, unsigned char *ctr,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
+extern void _gcry_aes_armv8_ce_ctr32le_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
extern void _gcry_aes_armv8_ce_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
@@ -570,6 +574,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->cbc_enc = _gcry_aes_armv8_ce_cbc_enc;
bulk_ops->cbc_dec = _gcry_aes_armv8_ce_cbc_dec;
bulk_ops->ctr_enc = _gcry_aes_armv8_ce_ctr_enc;
+ bulk_ops->ctr32le_enc = _gcry_aes_armv8_ce_ctr32le_enc;
bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt;
bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth;
bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
--
2.30.2
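
For anyone wanting to exercise the new path: it is reached through the
GCM-SIV cipher mode added earlier in this series. A minimal usage sketch
against the public API, assuming the GCRY_CIPHER_MODE_GCM_SIV constant
from those patches (initialization and error handling omitted; key and
nonce values are placeholders):

  #include <gcrypt.h>

  static void aes_gcm_siv_demo (void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0 };
    unsigned char nonce[12] = { 0 };  /* GCM-SIV uses a 96-bit nonce */
    unsigned char buf[64] = { 0 };
    unsigned char tag[16];

    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_GCM_SIV, 0);
    gcry_cipher_setkey (hd, key, sizeof key);
    gcry_cipher_setiv (hd, nonce, sizeof nonce);
    gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);  /* encrypt in place */
    gcry_cipher_gettag (hd, tag, sizeof tag);
    gcry_cipher_close (hd);
  }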