[PATCH] Add ARMv8/CE acceleration for AES-XTS
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Jan 12 18:32:27 CET 2018
* cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_xts_enc_armv8_ce)
(_gcry_aes_xts_dec_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_xts_enc_armv8_ce)
(_gcry_aes_xts_dec_armv8_ce): New.
* cipher/rijndael-armv8-ce.c (_gcry_aes_xts_enc_armv8_ce)
(_gcry_aes_xts_dec_armv8_ce, xts_crypt_fn_t)
(_gcry_aes_armv8_ce_xts_crypt): New.
* cipher/rijndael.c (_gcry_aes_armv8_ce_xts_crypt): New prototype.
(_gcry_aes_xts_crypt) [USE_ARM_CE]: Add ARMv8/CE code path.
--
Benchmark on Cortex-A53 (AArch64, 1152 MHz):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 4.88 ns/B 195.5 MiB/s 5.62 c/B
XTS dec | 4.94 ns/B 192.9 MiB/s 5.70 c/B
=
AES192 | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 5.55 ns/B 171.8 MiB/s 6.39 c/B
XTS dec | 5.61 ns/B 169.9 MiB/s 6.47 c/B
=
AES256 | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 6.22 ns/B 153.3 MiB/s 7.17 c/B
XTS dec | 6.29 ns/B 151.7 MiB/s 7.24 c/B
=
After (~2.6x faster):
AES | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 1.83 ns/B 520.9 MiB/s 2.11 c/B
XTS dec | 1.82 ns/B 524.9 MiB/s 2.09 c/B
=
AES192 | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 1.97 ns/B 483.3 MiB/s 2.27 c/B
XTS dec | 1.96 ns/B 486.9 MiB/s 2.26 c/B
=
AES256 | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 2.11 ns/B 450.9 MiB/s 2.44 c/B
XTS dec | 2.10 ns/B 453.8 MiB/s 2.42 c/B
=
Benchmark on Cortex-A53 (AArch32, 1152 MHz):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 6.52 ns/B 146.2 MiB/s 7.51 c/B
XTS dec | 6.57 ns/B 145.2 MiB/s 7.57 c/B
=
AES192 | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 7.10 ns/B 134.3 MiB/s 8.18 c/B
XTS dec | 7.11 ns/B 134.2 MiB/s 8.19 c/B
=
AES256 | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 7.30 ns/B 130.7 MiB/s 8.41 c/B
XTS dec | 7.38 ns/B 129.3 MiB/s 8.50 c/B
=
After (~2.7x faster):
Cipher:
AES | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 2.33 ns/B 409.6 MiB/s 2.68 c/B
XTS dec | 2.35 ns/B 405.3 MiB/s 2.71 c/B
=
AES192 | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 2.53 ns/B 377.6 MiB/s 2.91 c/B
XTS dec | 2.54 ns/B 375.5 MiB/s 2.93 c/B
=
AES256 | nanosecs/byte mebibytes/sec cycles/byte
XTS enc | 2.75 ns/B 346.8 MiB/s 3.17 c/B
XTS dec | 2.76 ns/B 345.2 MiB/s 3.18 c/B
=
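For reference, the per-block tweak update implemented by the new code is
multiplication of the 128-bit tweak by x in GF(2^128), reduced by the XTS
polynomial x^128 + x^7 + x^2 + x + 1 (0x87).  A minimal C sketch of just
that update (hypothetical helper for illustration only, not the in-tree
code; assumes the tweak is kept as two little-endian 64-bit words):

  #include <stdint.h>

  static void xts_mul_x (uint64_t t[2])   /* t[0] = low, t[1] = high */
  {
    /* all-ones if bit 127 of the tweak is set, else zero */
    uint64_t carry = 0 - (t[1] >> 63);

    t[1] = (t[1] << 1) | (t[0] >> 63);    /* shift 128-bit tweak left by 1 */
    t[0] = (t[0] << 1) ^ (carry & 0x87);  /* fold the reduction constant in */
  }

In _gcry_aes_xts_crypt the accelerated path is taken automatically when the
context was set up with ARMv8 Crypto Extensions detected (ctx->use_arm_ce).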
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-armv8-aarch32-ce.S | 311 ++++++++++++++++++++++++++++++++++++
cipher/rijndael-armv8-aarch64-ce.S | 274 ++++++++++++++++++++++++++++++++
cipher/rijndael-armv8-ce.c | 28 +++
cipher/rijndael.c | 12 +
4 files changed, 625 insertions(+)
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index 5c8fa3c09..66440bd4e 100644
--- a/cipher/rijndael-armv8-aarch32-ce.S
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -1517,6 +1517,317 @@ _gcry_aes_ocb_auth_armv8_ce:
.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;
+
+/*
+ * void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ *                                  unsigned char *iv, size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_xts_enc_armv8_ce
+.type _gcry_aes_xts_enc_armv8_ce,%function;
+_gcry_aes_xts_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ cmp r4, #0
+ beq .Lxts_enc_skip
+
+ cmp r5, #12
+
+ vld1.8 {q0}, [r3] /* load tweak */
+ mov r7, #0x87;
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lxts_enc_entry_192
+ bhi .Lxts_enc_entry_256
+
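+/* The tweak update below multiplies the 128-bit tweak in q0 (d0 = low,
+ * d1 = high 64 bits) by x in GF(2^128): "vshr.s64 d16, d1, #63" turns
+ * bit 127 into an all-ones mask which, ANDed with d18 = 0x87, yields the
+ * reduction constant; "vshr.u64 d17, d0, #63" is the carry from the low
+ * into the high half; "vadd.u64 q0, q0, q0" doubles both halves and the
+ * final veor with q8 = {d16,d17} folds constant and carry back in.  In
+ * the four-block loop the per-block tweaks are parked in the output
+ * buffer and reloaded after the AES rounds to relieve register pressure. */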
+#define CTR_XTS(bits, ...) \
+ .Lxts_enc_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lxts_enc_loop_##bits; \
+ \
+ .Lxts_enc_loop4_##bits: \
+ sub r4, r4, #4; \
+ veor q9, q9, q9; \
+ \
+ vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \
+ veor q1, q1, q0; \
+ cmp r4, #4; \
+ vmov.u32 d18[0], r7; \
+ vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \
+ veor q2, q2, q0; \
+ vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ veor q3, q3, q0; \
+ vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ veor q4, q4, q0; \
+ vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \
+ sub r1, r1, #48; \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \
+ veor q1, q1, q8; \
+ veor q2, q2, q9; \
+ vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \
+ sub r1, r1, #32; \
+ veor q3, q3, q8; \
+ veor q4, q4, q9; \
+    vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \
+    vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \
+ \
+ bhs .Lxts_enc_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lxts_enc_done; \
+ \
+ .Lxts_enc_loop_##bits: \
+ \
+    vld1.8 {q1}, [r2]!; /* load plaintext */ \
+ \
+ veor q9, q9, q9; \
+ veor q1, q1, q0; \
+ vmov.u32 d18[0], r7; \
+ vmov q2, q0; \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q2; \
+    vst1.8 {q1}, [r1]!; /* store ciphertext */ \
+ \
+ bne .Lxts_enc_loop_##bits; \
+ b .Lxts_enc_done;
+
+ CTR_XTS(128re, r0, r6)
+ CTR_XTS(192, r0, r6)
+ CTR_XTS(256, r0, r6)
+
+#undef CTR_XTS
+
+.Lxts_enc_done:
+ vst1.8 {q0}, [r3] /* store tweak */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lxts_enc_skip:
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ *                                  unsigned char *iv, size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_xts_dec_armv8_ce
+.type _gcry_aes_xts_dec_armv8_ce,%function;
+_gcry_aes_xts_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ cmp r4, #0
+ beq .Lxts_dec_skip
+
+ cmp r5, #12
+
+ vld1.8 {q0}, [r3] /* load tweak */
+ mov r7, #0x87;
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lxts_dec_entry_192
+ bhi .Lxts_dec_entry_256
+
+#define CTR_XTS(bits, ...) \
+ .Lxts_dec_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lxts_dec_loop_##bits; \
+ \
+ .Lxts_dec_loop4_##bits: \
+ sub r4, r4, #4; \
+ veor q9, q9, q9; \
+ \
+    vld1.8 {q1-q2}, [r2]!; /* load ciphertext */ \
+ veor q1, q1, q0; \
+ cmp r4, #4; \
+ vmov.u32 d18[0], r7; \
+ vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+    vld1.8 {q3-q4}, [r2]!; /* load ciphertext */ \
+ veor q2, q2, q0; \
+ vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ veor q3, q3, q0; \
+ vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ veor q4, q4, q0; \
+ vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \
+ sub r1, r1, #48; \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \
+ veor q1, q1, q8; \
+ veor q2, q2, q9; \
+ vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \
+ sub r1, r1, #32; \
+ veor q3, q3, q8; \
+ veor q4, q4, q9; \
+ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lxts_dec_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lxts_dec_done; \
+ \
+ .Lxts_dec_loop_##bits: \
+ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ \
+ veor q9, q9, q9; \
+ veor q1, q1, q0; \
+ vmov.u32 d18[0], r7; \
+ vmov q2, q0; \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q2; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lxts_dec_loop_##bits; \
+ b .Lxts_dec_done;
+
+ CTR_XTS(128re, r0, r6)
+ CTR_XTS(192, r0, r6)
+ CTR_XTS(256, r0, r6)
+
+#undef CTR_XTS
+
+.Lxts_dec_done:
+ vst1.8 {q0}, [r3] /* store tweak */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lxts_dec_skip:
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;
+
+
/*
* u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
*/
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
index 708ef340d..40097a710 100644
--- a/cipher/rijndael-armv8-aarch64-ce.S
+++ b/cipher/rijndael-armv8-aarch64-ce.S
@@ -1276,6 +1276,280 @@ _gcry_aes_ocb_auth_armv8_ce:
.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;
+/*
+ * void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ *                                  unsigned char *tweak, size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_xts_enc_armv8_ce
+.type _gcry_aes_xts_enc_armv8_ce,%function;
+_gcry_aes_xts_enc_armv8_ce:
+ /* input:
+ *    x0: keysched
+ *    x1: outbuf
+ *    x2: inbuf
+ *    x3: tweak
+ * x4: nblocks
+ * w5: nrounds
+ */
+
+ cbz x4, .Lxts_enc_skip
+
+ /* load tweak */
+ ld1 {v0.16b}, [x3]
+
+ /* load gfmul mask */
+ mov x6, #0x87
+ mov x7, #0x01
+ mov v16.D[0], x6
+ mov v16.D[1], x7
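+  /* v16 = { 0x87, 0x01 }: the low lane holds the GF(2^128) reduction
+   * constant, the high lane the cross-lane carry bit.  The tweak in v0
+   * is multiplied by x below by swapping its 64-bit halves with "ext",
+   * turning each lane's top bit into an all-ones mask with "sshr #63",
+   * ANDing that mask with v16 and XORing the result into the doubled
+   * ("add .2d") tweak. */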
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lxts_enc_entry_192
+ b.hi .Lxts_enc_entry_256
+
+#define XTS_ENC(bits) \
+ .Lxts_enc_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lxts_enc_loop_##bits; \
+ \
+ .Lxts_enc_loop4_##bits: \
+ \
+ ext v4.16b, v0.16b, v0.16b, #8; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v5.2d, v0.2d, v0.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v5.16b, v5.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v6.2d, v5.2d, v5.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v6.16b, v6.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v7.2d, v6.2d, v6.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v7.16b, v7.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v3.2d, v7.2d, v7.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v3.16b, v3.16b, v2.16b; \
+ ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \
+ st1 {v3.16b}, [x3]; \
+ sub x4, x4, #4; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \
+ cmp x4, #4; \
+ eor v2.16b, v2.16b, v5.16b; \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ ld1 {v0.16b}, [x3]; \
+ eor v2.16b, v2.16b, v5.16b; \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+    st1 {v1.16b-v4.16b}, [x1], #64; /* store ciphertext */ \
+ \
+ b.hs .Lxts_enc_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lxts_enc_done; \
+ \
+ .Lxts_enc_loop_##bits: \
+ \
+ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
+ ext v3.16b, v0.16b, v0.16b, #8; \
+ mov v2.16b, v0.16b; \
+ sshr v3.2d, v3.2d, #63; \
+ add v0.2d, v0.2d, v0.2d; \
+ and v3.16b, v3.16b, v16.16b; \
+ eor v1.16b, v1.16b, v2.16b; \
+ eor v0.16b, v0.16b, v3.16b; \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ eor v1.16b, v1.16b, v2.16b; \
+ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \
+ \
+ cbnz x4, .Lxts_enc_loop_##bits; \
+ b .Lxts_enc_done;
+
+ XTS_ENC(128)
+ XTS_ENC(192)
+ XTS_ENC(256)
+
+#undef XTS_ENC
+
+.Lxts_enc_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store tweak */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lxts_enc_skip:
+ ret
+
+.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ *                                  unsigned char *tweak, size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_xts_dec_armv8_ce
+.type _gcry_aes_xts_dec_armv8_ce,%function;
+_gcry_aes_xts_dec_armv8_ce:
+ /* input:
+ *    x0: keysched
+ *    x1: outbuf
+ *    x2: inbuf
+ *    x3: tweak
+ * x4: nblocks
+ * w5: nrounds
+ */
+
+ cbz x4, .Lxts_dec_skip
+
+ /* load tweak */
+ ld1 {v0.16b}, [x3]
+
+ /* load gfmul mask */
+ mov x6, #0x87
+ mov x7, #0x01
+ mov v16.D[0], x6
+ mov v16.D[1], x7
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lxts_dec_entry_192
+ b.hi .Lxts_dec_entry_256
+
+#define XTS_DEC(bits) \
+ .Lxts_dec_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lxts_dec_loop_##bits; \
+ \
+ .Lxts_dec_loop4_##bits: \
+ \
+ ext v4.16b, v0.16b, v0.16b, #8; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v5.2d, v0.2d, v0.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v5.16b, v5.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v6.2d, v5.2d, v5.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v6.16b, v6.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v7.2d, v6.2d, v6.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v7.16b, v7.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v3.2d, v7.2d, v7.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v3.16b, v3.16b, v2.16b; \
+    ld1 {v1.16b-v2.16b}, [x2], #32; /* load ciphertext */ \
+ st1 {v3.16b}, [x3]; \
+ sub x4, x4, #4; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+    ld1 {v3.16b-v4.16b}, [x2], #32; /* load ciphertext */ \
+ cmp x4, #4; \
+ eor v2.16b, v2.16b, v5.16b; \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ \
+ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ ld1 {v0.16b}, [x3]; \
+ eor v2.16b, v2.16b, v5.16b; \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+ \
+ b.hs .Lxts_dec_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lxts_dec_done; \
+ \
+ .Lxts_dec_loop_##bits: \
+ \
+    ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+ ext v3.16b, v0.16b, v0.16b, #8; \
+ mov v2.16b, v0.16b; \
+ sshr v3.2d, v3.2d, #63; \
+ add v0.2d, v0.2d, v0.2d; \
+ and v3.16b, v3.16b, v16.16b; \
+ eor v1.16b, v1.16b, v2.16b; \
+ eor v0.16b, v0.16b, v3.16b; \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(d, imc, v1, v1); \
+ \
+ eor v1.16b, v1.16b, v2.16b; \
+    st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x4, .Lxts_dec_loop_##bits; \
+ b .Lxts_dec_done;
+
+ XTS_DEC(128)
+ XTS_DEC(192)
+ XTS_DEC(256)
+
+#undef XTS_DEC
+
+.Lxts_dec_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store tweak */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lxts_dec_skip:
+ ret
+
+.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;
+
+
/*
* u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
*/
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
index 334cf6848..6af7108f8 100644
--- a/cipher/rijndael-armv8-ce.c
+++ b/cipher/rijndael-armv8-ce.c
@@ -101,6 +101,16 @@ extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
size_t nblocks,
unsigned int nrounds,
unsigned int blkn);
+extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *tweak,
+ size_t nblocks, unsigned int nrounds);
+extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *tweak,
+ size_t nblocks, unsigned int nrounds);
typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
const unsigned char *inbuf,
@@ -108,6 +118,11 @@ typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
unsigned char *L_table, size_t nblocks,
unsigned int nrounds, unsigned int blkn);
+typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *tweak, size_t nblocks,
+ unsigned int nrounds);
+
void
_gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key)
{
@@ -361,4 +376,17 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
nblocks, nrounds, (unsigned int)blkn);
}
+void
+_gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks, int encrypt)
+{
+ const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+ xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce
+ : _gcry_aes_xts_dec_armv8_ce;
+ unsigned int nrounds = ctx->rounds;
+
+ crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds);
+}
+
#endif /* USE_ARM_CE */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index df1363f28..0f676fe14 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -211,6 +211,11 @@ extern void _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
int encrypt);
extern void _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c,
const void *abuf_arg, size_t nblocks);
+extern void _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx,
+ unsigned char *tweak,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ size_t nblocks, int encrypt);
#endif /*USE_ARM_ASM*/
static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
@@ -1473,6 +1478,13 @@ _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
burn_depth = 0;
}
#endif /*USE_AESNI*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
+ burn_depth = 0;
+ }
+#endif /*USE_ARM_CE*/
else
{
tweak_next_lo = buf_get_le64 (tweak + 0);