[PATCH 3/3] Add ARMv8/AArch32 Crypto Extension implementation of AES
Jussi Kivilinna
jussi.kivilinna at iki.fi
Tue Jul 12 11:54:06 CEST 2016
* cipher/Makefile.am: Add 'rijndael-armv8-ce.c' and
'rijndael-armv8-aarch32-ce.S'.
* cipher/rijndael-armv8-aarch32-ce.S: New.
* cipher/rijndael-armv8-ce.c: New.
* cipher/rijndael-internal.h (USE_ARM_CE): New.
(RIJNDAEL_context_s): Add 'use_arm_ce'.
* cipher/rijndael.c [USE_ARM_CE] (_gcry_aes_armv8_ce_setkey)
(_gcry_aes_armv8_ce_prepare_decryption)
(_gcry_aes_armv8_ce_encrypt, _gcry_aes_armv8_ce_decrypt)
(_gcry_aes_armv8_ce_cfb_enc, _gcry_aes_armv8_ce_cbc_enc)
(_gcry_aes_armv8_ce_ctr_enc, _gcry_aes_armv8_ce_cfb_dec)
(_gcry_aes_armv8_ce_cbc_dec, _gcry_aes_armv8_ce_ocb_crypt)
(_gcry_aes_armv8_ce_ocb_auth): New.
(do_setkey) [USE_ARM_CE]: Add ARM CE/AES HW feature check and key
setup for ARM CE.
(prepare_decryption, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
(_gcry_aes_ctr_enc, _gcry_aes_cfb_dec, _gcry_aes_cbc_dec)
(_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth) [USE_ARM_CE]: Add
ARM CE support.
* configure.ac: Add 'rijndael-armv8-ce.lo' and
'rijndael-armv8-aarch32-ce.lo'.
--
Improvement vs ARM assembly on Cortex-A53:
             AES-128   AES-192   AES-256
 CBC enc:      14.8x     12.8x     11.4x
 CBC dec:      21.4x     20.5x     19.4x
 CFB enc:      16.2x     13.6x     11.6x
 CFB dec:      21.6x     20.5x     19.4x
 CTR:          19.1x     18.6x     17.8x
 OCB enc:      16.0x     16.2x     16.1x
 OCB dec:      15.6x     15.9x     15.8x
 OCB auth:     18.3x     18.4x     18.0x
Benchmark on Cortex-A53 (1152 MHz):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 24.42 ns/B 39.06 MiB/s 28.13 c/B
ECB dec | 25.07 ns/B 38.05 MiB/s 28.88 c/B
CBC enc | 21.05 ns/B 45.30 MiB/s 24.25 c/B
CBC dec | 21.16 ns/B 45.07 MiB/s 24.38 c/B
CFB enc | 21.05 ns/B 45.31 MiB/s 24.25 c/B
CFB dec | 21.38 ns/B 44.61 MiB/s 24.62 c/B
OFB enc | 26.15 ns/B 36.47 MiB/s 30.13 c/B
OFB dec | 26.15 ns/B 36.47 MiB/s 30.13 c/B
CTR enc | 21.17 ns/B 45.06 MiB/s 24.38 c/B
CTR dec | 21.16 ns/B 45.06 MiB/s 24.38 c/B
CCM enc | 42.32 ns/B 22.53 MiB/s 48.75 c/B
CCM dec | 42.32 ns/B 22.53 MiB/s 48.75 c/B
CCM auth | 21.17 ns/B 45.06 MiB/s 24.38 c/B
GCM enc | 22.08 ns/B 43.19 MiB/s 25.44 c/B
GCM dec | 22.08 ns/B 43.18 MiB/s 25.44 c/B
GCM auth | 0.923 ns/B 1032.8 MiB/s 1.06 c/B
OCB enc | 26.20 ns/B 36.40 MiB/s 30.18 c/B
OCB dec | 25.97 ns/B 36.73 MiB/s 29.91 c/B
OCB auth | 24.52 ns/B 38.90 MiB/s 28.24 c/B
=
AES192 | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 27.83 ns/B 34.26 MiB/s 32.06 c/B
ECB dec | 28.54 ns/B 33.42 MiB/s 32.88 c/B
CBC enc | 24.47 ns/B 38.97 MiB/s 28.19 c/B
CBC dec | 25.27 ns/B 37.74 MiB/s 29.11 c/B
CFB enc | 25.08 ns/B 38.02 MiB/s 28.89 c/B
CFB dec | 25.31 ns/B 37.68 MiB/s 29.16 c/B
OFB enc | 29.57 ns/B 32.25 MiB/s 34.06 c/B
OFB dec | 29.57 ns/B 32.25 MiB/s 34.06 c/B
CTR enc | 25.24 ns/B 37.78 MiB/s 29.08 c/B
CTR dec | 25.24 ns/B 37.79 MiB/s 29.08 c/B
CCM enc | 49.81 ns/B 19.15 MiB/s 57.38 c/B
CCM dec | 49.80 ns/B 19.15 MiB/s 57.37 c/B
CCM auth | 24.58 ns/B 38.80 MiB/s 28.32 c/B
GCM enc | 26.15 ns/B 36.47 MiB/s 30.13 c/B
GCM dec | 26.11 ns/B 36.52 MiB/s 30.08 c/B
GCM auth | 0.923 ns/B 1033.0 MiB/s 1.06 c/B
OCB enc | 29.59 ns/B 32.23 MiB/s 34.09 c/B
OCB dec | 29.42 ns/B 32.42 MiB/s 33.89 c/B
OCB auth | 27.92 ns/B 34.16 MiB/s 32.16 c/B
=
AES256 | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 31.20 ns/B 30.57 MiB/s 35.94 c/B
ECB dec | 31.80 ns/B 29.99 MiB/s 36.63 c/B
CBC enc | 27.83 ns/B 34.27 MiB/s 32.06 c/B
CBC dec | 27.87 ns/B 34.21 MiB/s 32.11 c/B
CFB enc | 27.88 ns/B 34.20 MiB/s 32.12 c/B
CFB dec | 28.16 ns/B 33.87 MiB/s 32.44 c/B
OFB enc | 32.93 ns/B 28.96 MiB/s 37.94 c/B
OFB dec | 32.93 ns/B 28.96 MiB/s 37.94 c/B
CTR enc | 27.95 ns/B 34.13 MiB/s 32.19 c/B
CTR dec | 27.95 ns/B 34.12 MiB/s 32.20 c/B
CCM enc | 55.88 ns/B 17.07 MiB/s 64.38 c/B
CCM dec | 55.88 ns/B 17.07 MiB/s 64.38 c/B
CCM auth | 27.95 ns/B 34.12 MiB/s 32.20 c/B
GCM enc | 28.86 ns/B 33.05 MiB/s 33.25 c/B
GCM dec | 28.87 ns/B 33.04 MiB/s 33.25 c/B
GCM auth | 0.923 ns/B 1033.0 MiB/s 1.06 c/B
OCB enc | 32.96 ns/B 28.94 MiB/s 37.97 c/B
OCB dec | 32.73 ns/B 29.14 MiB/s 37.70 c/B
OCB auth | 31.29 ns/B 30.48 MiB/s 36.04 c/B
After:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 5.10 ns/B 187.0 MiB/s 5.88 c/B
ECB dec | 5.27 ns/B 181.0 MiB/s 6.07 c/B
CBC enc | 1.41 ns/B 675.8 MiB/s 1.63 c/B
CBC dec | 0.992 ns/B 961.7 MiB/s 1.14 c/B
CFB enc | 1.30 ns/B 732.4 MiB/s 1.50 c/B
CFB dec | 0.991 ns/B 962.7 MiB/s 1.14 c/B
OFB enc | 7.05 ns/B 135.2 MiB/s 8.13 c/B
OFB dec | 7.05 ns/B 135.2 MiB/s 8.13 c/B
CTR enc | 1.11 ns/B 856.9 MiB/s 1.28 c/B
CTR dec | 1.11 ns/B 857.0 MiB/s 1.28 c/B
CCM enc | 2.58 ns/B 369.8 MiB/s 2.97 c/B
CCM dec | 2.58 ns/B 369.5 MiB/s 2.97 c/B
CCM auth | 1.58 ns/B 605.2 MiB/s 1.82 c/B
GCM enc | 2.04 ns/B 467.9 MiB/s 2.35 c/B
GCM dec | 2.04 ns/B 466.6 MiB/s 2.35 c/B
GCM auth | 0.923 ns/B 1033.0 MiB/s 1.06 c/B
OCB enc | 1.64 ns/B 579.8 MiB/s 1.89 c/B
OCB dec | 1.66 ns/B 574.5 MiB/s 1.91 c/B
OCB auth | 1.33 ns/B 715.5 MiB/s 1.54 c/B
=
AES192 | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 5.64 ns/B 169.0 MiB/s 6.50 c/B
ECB dec | 5.81 ns/B 164.3 MiB/s 6.69 c/B
CBC enc | 1.90 ns/B 502.1 MiB/s 2.19 c/B
CBC dec | 1.24 ns/B 771.7 MiB/s 1.42 c/B
CFB enc | 1.84 ns/B 517.1 MiB/s 2.12 c/B
CFB dec | 1.23 ns/B 772.5 MiB/s 1.42 c/B
OFB enc | 7.60 ns/B 125.5 MiB/s 8.75 c/B
OFB dec | 7.60 ns/B 125.6 MiB/s 8.75 c/B
CTR enc | 1.36 ns/B 702.7 MiB/s 1.56 c/B
CTR dec | 1.36 ns/B 702.5 MiB/s 1.56 c/B
CCM enc | 3.31 ns/B 287.8 MiB/s 3.82 c/B
CCM dec | 3.31 ns/B 288.0 MiB/s 3.81 c/B
CCM auth | 2.06 ns/B 462.1 MiB/s 2.38 c/B
GCM enc | 2.28 ns/B 418.4 MiB/s 2.63 c/B
GCM dec | 2.28 ns/B 418.0 MiB/s 2.63 c/B
GCM auth | 0.923 ns/B 1032.8 MiB/s 1.06 c/B
OCB enc | 1.83 ns/B 520.1 MiB/s 2.11 c/B
OCB dec | 1.84 ns/B 517.8 MiB/s 2.12 c/B
OCB auth | 1.52 ns/B 626.1 MiB/s 1.75 c/B
=
AES256 | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 5.86 ns/B 162.7 MiB/s 6.75 c/B
ECB dec | 6.02 ns/B 158.3 MiB/s 6.94 c/B
CBC enc | 2.44 ns/B 390.5 MiB/s 2.81 c/B
CBC dec | 1.45 ns/B 656.4 MiB/s 1.67 c/B
CFB enc | 2.39 ns/B 399.5 MiB/s 2.75 c/B
CFB dec | 1.45 ns/B 656.8 MiB/s 1.67 c/B
OFB enc | 7.81 ns/B 122.1 MiB/s 9.00 c/B
OFB dec | 7.81 ns/B 122.1 MiB/s 9.00 c/B
CTR enc | 1.57 ns/B 605.8 MiB/s 1.81 c/B
CTR dec | 1.57 ns/B 605.9 MiB/s 1.81 c/B
CCM enc | 4.07 ns/B 234.3 MiB/s 4.69 c/B
CCM dec | 4.07 ns/B 234.1 MiB/s 4.69 c/B
CCM auth | 2.61 ns/B 365.7 MiB/s 3.00 c/B
GCM enc | 2.50 ns/B 381.9 MiB/s 2.88 c/B
GCM dec | 2.49 ns/B 382.3 MiB/s 2.87 c/B
GCM auth | 0.926 ns/B 1029.7 MiB/s 1.07 c/B
OCB enc | 2.05 ns/B 465.6 MiB/s 2.36 c/B
OCB dec | 2.06 ns/B 462.0 MiB/s 2.38 c/B
OCB auth | 1.74 ns/B 548.4 MiB/s 2.00 c/B
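(For reference, the cycles/byte column is just nanosecs/byte scaled by the
1152 MHz clock, c/B = ns/B * 1.152. For example, AES-128 CBC enc: before,
21.05 ns/B * 1.152 ≈ 24.25 c/B; after, 1.41 ns/B * 1.152 ≈ 1.63 c/B, i.e.
roughly 15x faster, in line with the 14.8x figure in the table above.)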
---
6 files changed
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 5d69a38..de619fe 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -81,6 +81,7 @@ md5.c \
poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \
rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
rijndael-padlock.c rijndael-amd64.S rijndael-arm.S rijndael-ssse3-amd64.c \
+ rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \
rmd160.c \
rsa.c \
salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
new file mode 100644
index 0000000..f3b5400
--- /dev/null
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -0,0 +1,1483 @@
+/* ARMv8 CE accelerated AES
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+
+.syntax unified
+.fpu crypto-neon-fp-armv8
+.arm
+
+.text
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+/* AES macros */
+
+#define aes_preload_keys(keysched, rekeysched) \
+ vldmia keysched!, {q5-q7}; \
+ mov rekeysched, keysched; \
+ vldmialo keysched!, {q8-q15}; /* 128-bit */ \
+ addeq keysched, #(2*16); \
+ vldmiaeq keysched!, {q10-q15}; /* 192-bit */ \
+ addhi keysched, #(4*16); \
+ vldmiahi keysched!, {q12-q15}; /* 256-bit */ \
+
+#define do_aes_one128(ed, mcimc, qo, qb) \
+ aes##ed.8 qb, q5; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q6; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q7; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q8; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q9; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q10; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q11; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q12; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q13; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q14; \
+ veor qo, qb, q15;
+
+#define do_aes_one128re(ed, mcimc, qo, qb, keysched, rekeysched) \
+ vldm rekeysched, {q8-q9}; \
+ do_aes_one128(ed, mcimc, qo, qb);
+
+#define do_aes_one192(ed, mcimc, qo, qb, keysched, rekeysched) \
+ vldm rekeysched!, {q8}; \
+ aes##ed.8 qb, q5; \
+ aes##mcimc.8 qb, qb; \
+ vldm rekeysched, {q9}; \
+ aes##ed.8 qb, q6; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q7; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q8; \
+ aes##mcimc.8 qb, qb; \
+ vldmia keysched!, {q8}; \
+ aes##ed.8 qb, q9; \
+ aes##mcimc.8 qb, qb; \
+ sub rekeysched, #(1*16); \
+ aes##ed.8 qb, q10; \
+ aes##mcimc.8 qb, qb; \
+ vldm keysched, {q9}; \
+ aes##ed.8 qb, q11; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q12; \
+ aes##mcimc.8 qb, qb; \
+ sub keysched, #16; \
+ aes##ed.8 qb, q13; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q14; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q15; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q8; \
+ veor qo, qb, q9; \
+
+#define do_aes_one256(ed, mcimc, qo, qb, keysched, rekeysched) \
+ vldmia rekeysched!, {q8}; \
+ aes##ed.8 qb, q5; \
+ aes##mcimc.8 qb, qb; \
+ vldmia rekeysched!, {q9}; \
+ aes##ed.8 qb, q6; \
+ aes##mcimc.8 qb, qb; \
+ vldmia rekeysched!, {q10}; \
+ aes##ed.8 qb, q7; \
+ aes##mcimc.8 qb, qb; \
+ vldm rekeysched, {q11}; \
+ aes##ed.8 qb, q8; \
+ aes##mcimc.8 qb, qb; \
+ vldmia keysched!, {q8}; \
+ aes##ed.8 qb, q9; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q10; \
+ aes##mcimc.8 qb, qb; \
+ vldmia keysched!, {q9}; \
+ aes##ed.8 qb, q11; \
+ aes##mcimc.8 qb, qb; \
+ sub rekeysched, #(3*16); \
+ aes##ed.8 qb, q12; \
+ aes##mcimc.8 qb, qb; \
+ vldmia keysched!, {q10}; \
+ aes##ed.8 qb, q13; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q14; \
+ aes##mcimc.8 qb, qb; \
+ vldm keysched, {q11}; \
+ aes##ed.8 qb, q15; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q8; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q9; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q10; \
+ veor qo, qb, q11; \
+ sub keysched, #(3*16); \
+
+#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \
+ aes##ed.8 b0, key; \
+ aes##mcimc.8 b0, b0; \
+ aes##ed.8 b1, key; \
+ aes##mcimc.8 b1, b1; \
+ aes##ed.8 b2, key; \
+ aes##mcimc.8 b2, b2; \
+ aes##ed.8 b3, key; \
+ aes##mcimc.8 b3, b3;
+
+#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \
+ aes##ed.8 b0, q14; \
+ veor b0, b0, q15; \
+ aes##ed.8 b1, q14; \
+ veor b1, b1, q15; \
+ aes##ed.8 b2, q14; \
+ veor b2, b2, q15; \
+ aes##ed.8 b3, q14; \
+ veor b3, b3, q15;
+
+#define do_aes_4_128re(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \
+ vldm rekeysched, {q8-q9}; \
+ do_aes_4_128(ed, mcimc, b0, b1, b2, b3);
+
+#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \
+ vldm rekeysched!, {q8}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \
+ vldm rekeysched, {q9}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \
+ vldmia keysched!, {q8}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \
+ sub rekeysched, #(1*16); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \
+ vldm keysched, {q9}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \
+ sub keysched, #16; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \
+ aes##ed.8 b0, q8; \
+ veor b0, b0, q9; \
+ aes##ed.8 b1, q8; \
+ veor b1, b1, q9; \
+ aes##ed.8 b2, q8; \
+ veor b2, b2, q9; \
+ aes##ed.8 b3, q8; \
+ veor b3, b3, q9;
+
+#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \
+ vldmia rekeysched!, {q8}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \
+ vldmia rekeysched!, {q9}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \
+ vldmia rekeysched!, {q10}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \
+ vldm rekeysched, {q11}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \
+ vldmia keysched!, {q8}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \
+ vldmia keysched!, {q9}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \
+ sub rekeysched, #(3*16); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \
+ vldmia keysched!, {q10}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \
+ vldm keysched, {q11}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \
+ sub keysched, #(3*16); \
+ aes##ed.8 b0, q10; \
+ veor b0, b0, q11; \
+ aes##ed.8 b1, q10; \
+ veor b1, b1, q11; \
+ aes##ed.8 b2, q10; \
+ veor b2, b2, q11; \
+ aes##ed.8 b3, q10; \
+ veor b3, b3, q11;
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst,
+ * const byte *src,
+ * unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_enc_armv8_ce
+.type _gcry_aes_enc_armv8_ce,%function;
+_gcry_aes_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: dst
+ * r2: src
+ * r3: nrounds
+ */
+
+ vldmia r0!, {q1-q3} /* load 3 round keys */
+
+ cmp r3, #12
+
+ vld1.8 {q0}, [r2]
+
+ bhi .Lenc1_256
+ beq .Lenc1_192
+
+.Lenc1_128:
+
+.Lenc1_tail:
+ vldmia r0, {q8-q15} /* load 8 round keys */
+
+ aese.8 q0, q1
+ aesmc.8 q0, q0
+ CLEAR_REG(q1)
+
+ aese.8 q0, q2
+ aesmc.8 q0, q0
+ CLEAR_REG(q2)
+
+ aese.8 q0, q3
+ aesmc.8 q0, q0
+ CLEAR_REG(q3)
+
+ aese.8 q0, q8
+ aesmc.8 q0, q0
+ CLEAR_REG(q8)
+
+ aese.8 q0, q9
+ aesmc.8 q0, q0
+ CLEAR_REG(q9)
+
+ aese.8 q0, q10
+ aesmc.8 q0, q0
+ CLEAR_REG(q10)
+
+ aese.8 q0, q11
+ aesmc.8 q0, q0
+ CLEAR_REG(q11)
+
+ aese.8 q0, q12
+ aesmc.8 q0, q0
+ CLEAR_REG(q12)
+
+ aese.8 q0, q13
+ aesmc.8 q0, q0
+ CLEAR_REG(q13)
+
+ aese.8 q0, q14
+ veor q0, q15
+ CLEAR_REG(q14)
+ CLEAR_REG(q15)
+
+ vst1.8 {q0}, [r1]
+ CLEAR_REG(q0)
+
+ mov r0, #0
+ bx lr
+
+.Lenc1_192:
+ aese.8 q0, q1
+ aesmc.8 q0, q0
+ vmov q1, q3
+
+ aese.8 q0, q2
+ aesmc.8 q0, q0
+ vldm r0!, {q2-q3} /* load 3 round keys */
+
+ b .Lenc1_tail
+
+.Lenc1_256:
+ vldm r0!, {q15} /* load 1 round key */
+ aese.8 q0, q1
+ aesmc.8 q0, q0
+
+ aese.8 q0, q2
+ aesmc.8 q0, q0
+
+ aese.8 q0, q3
+ aesmc.8 q0, q0
+ vldm r0!, {q1-q3} /* load 3 round keys */
+
+ aese.8 q0, q15
+ aesmc.8 q0, q0
+
+ b .Lenc1_tail
+.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;
+
+
+/*
+ * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst,
+ * const byte *src,
+ * unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_dec_armv8_ce
+.type _gcry_aes_dec_armv8_ce,%function;
+_gcry_aes_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: dst
+ * r2: src
+ * r3: nrounds
+ */
+
+ vldmia r0!, {q1-q3} /* load 3 round keys */
+
+ cmp r3, #12
+
+ vld1.8 {q0}, [r2]
+
+ bhi .Ldec1_256
+ beq .Ldec1_192
+
+.Ldec1_128:
+
+.Ldec1_tail:
+ vldmia r0, {q8-q15} /* load 8 round keys */
+
+ aesd.8 q0, q1
+ aesimc.8 q0, q0
+ CLEAR_REG(q1)
+
+ aesd.8 q0, q2
+ aesimc.8 q0, q0
+ CLEAR_REG(q2)
+
+ aesd.8 q0, q3
+ aesimc.8 q0, q0
+ CLEAR_REG(q3)
+
+ aesd.8 q0, q8
+ aesimc.8 q0, q0
+ CLEAR_REG(q8)
+
+ aesd.8 q0, q9
+ aesimc.8 q0, q0
+ CLEAR_REG(q9)
+
+ aesd.8 q0, q10
+ aesimc.8 q0, q0
+ CLEAR_REG(q10)
+
+ aesd.8 q0, q11
+ aesimc.8 q0, q0
+ CLEAR_REG(q11)
+
+ aesd.8 q0, q12
+ aesimc.8 q0, q0
+ CLEAR_REG(q12)
+
+ aesd.8 q0, q13
+ aesimc.8 q0, q0
+ CLEAR_REG(q13)
+
+ aesd.8 q0, q14
+ veor q0, q15
+ CLEAR_REG(q14)
+ CLEAR_REG(q15)
+
+ vst1.8 {q0}, [r1]
+ CLEAR_REG(q0)
+
+ mov r0, #0
+ bx lr
+
+.Ldec1_192:
+ aesd.8 q0, q1
+ aesimc.8 q0, q0
+ vmov q1, q3
+
+ aesd.8 q0, q2
+ aesimc.8 q0, q0
+ vldm r0!, {q2-q3} /* load 3 round keys */
+
+ b .Ldec1_tail
+
+.Ldec1_256:
+ vldm r0!, {q15} /* load 1 round key */
+ aesd.8 q0, q1
+ aesimc.8 q0, q0
+
+ aesd.8 q0, q2
+ aesimc.8 q0, q0
+
+ aesd.8 q0, q3
+ aesimc.8 q0, q0
+ vldm r0!, {q1-q3} /* load 3 round keys */
+
+ aesd.8 q0, q15
+ aesimc.8 q0, q0
+
+ b .Ldec1_tail
+.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * int cbc_mac, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_enc_armv8_ce
+.type _gcry_aes_cbc_enc_armv8_ce,%function;
+_gcry_aes_cbc_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: cbc_mac => r5
+ * %st+8: nrounds => r6
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ ldr r4, [sp, #(16+0)]
+ ldr r5, [sp, #(16+4)]
+ cmp r4, #0
+ ldr r6, [sp, #(16+8)]
+ beq .Lcbc_enc_skip
+ cmp r5, #0
+ vpush {q4-q7}
+ moveq r5, #16
+ movne r5, #0
+
+ cmp r6, #12
+ vld1.8 {q1}, [r3] /* load IV */
+
+ aes_preload_keys(r0, lr);
+
+ beq .Lcbc_enc_loop192
+ bhi .Lcbc_enc_loop256
+
+#define CBC_ENC(bits, ...) \
+ .Lcbc_enc_loop##bits: \
+ vld1.8 {q0}, [r2]!; /* load plaintext */ \
+ veor q1, q0, q1; \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ vst1.8 {q1}, [r1], r5; /* store ciphertext */ \
+ \
+ bne .Lcbc_enc_loop##bits; \
+ b .Lcbc_enc_done;
+
+ CBC_ENC(128)
+ CBC_ENC(192, r0, lr)
+ CBC_ENC(256, r0, lr)
+
+#undef CBC_ENC
+
+.Lcbc_enc_done:
+ vst1.8 {q1}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lcbc_enc_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_dec_armv8_ce
+.type _gcry_aes_cbc_dec_armv8_ce,%function;
+_gcry_aes_cbc_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ ldr r4, [sp, #(16+0)]
+ ldr r5, [sp, #(16+4)]
+ cmp r4, #0
+ beq .Lcbc_dec_skip
+ vpush {q4-q7}
+
+ cmp r5, #12
+ vld1.8 {q0}, [r3] /* load IV */
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lcbc_dec_entry_192
+ bhi .Lcbc_dec_entry_256
+
+#define CBC_DEC(bits, ...) \
+ .Lcbc_dec_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lcbc_dec_loop_##bits; \
+ \
+ .Lcbc_dec_loop4_##bits: \
+ \
+ vld1.8 {q1-q2}, [r2]!; /* load ciphertext */ \
+ sub r4, r4, #4; \
+ vld1.8 {q3-q4}, [r2]; /* load ciphertext */ \
+ cmp r4, #4; \
+ sub r2, #32; \
+ \
+ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV */ \
+ veor q2, q2, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV */ \
+ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \
+ veor q3, q3, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV */ \
+ veor q4, q4, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV */ \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lcbc_dec_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lcbc_dec_done; \
+ \
+ .Lcbc_dec_loop_##bits: \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ subs r4, r4, #1; \
+ vmov q2, q1; \
+ \
+ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vmov q0, q2; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lcbc_dec_loop_##bits; \
+ b .Lcbc_dec_done;
+
+ CBC_DEC(128)
+ CBC_DEC(192, r0, r6)
+ CBC_DEC(256, r0, r6)
+
+#undef CBC_DEC
+
+.Lcbc_dec_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lcbc_dec_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_enc_armv8_ce
+.type _gcry_aes_cfb_enc_armv8_ce,%function;
+_gcry_aes_cfb_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ ldr r4, [sp, #(16+0)]
+ ldr r5, [sp, #(16+4)]
+ cmp r4, #0
+ beq .Lcfb_enc_skip
+ vpush {q4-q7}
+
+ cmp r5, #12
+ vld1.8 {q0}, [r3] /* load IV */
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lcfb_enc_entry_192
+ bhi .Lcfb_enc_entry_256
+
+#define CFB_ENC(bits, ...) \
+ .Lcfb_enc_entry_##bits: \
+ .Lcfb_enc_loop_##bits: \
+ vld1.8 {q1}, [r2]!; /* load plaintext */ \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \
+ \
+ veor q0, q1, q0; \
+ vst1.8 {q0}, [r1]!; /* store ciphertext */ \
+ \
+ bne .Lcfb_enc_loop_##bits; \
+ b .Lcfb_enc_done;
+
+ CFB_ENC(128)
+ CFB_ENC(192, r0, r6)
+ CFB_ENC(256, r0, r6)
+
+#undef CFB_ENC
+
+.Lcfb_enc_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lcfb_enc_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_dec_armv8_ce
+.type _gcry_aes_cfb_dec_armv8_ce,%function;
+_gcry_aes_cfb_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ ldr r4, [sp, #(16+0)]
+ ldr r5, [sp, #(16+4)]
+ cmp r4, #0
+ beq .Lcfb_dec_skip
+ vpush {q4-q7}
+
+ cmp r5, #12
+ vld1.8 {q0}, [r3] /* load IV */
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lcfb_dec_entry_192
+ bhi .Lcfb_dec_entry_256
+
+#define CFB_DEC(bits, ...) \
+ .Lcfb_dec_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lcfb_dec_loop_##bits; \
+ \
+ .Lcfb_dec_loop4_##bits: \
+ \
+ vld1.8 {q2-q3}, [r2]!; /* load ciphertext */ \
+ vmov q1, q0; \
+ sub r4, r4, #4; \
+ vld1.8 {q4}, [r2]; /* load ciphertext */ \
+ sub r2, #32; \
+ cmp r4, #4; \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ veor q1, q1, q0; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ veor q2, q2, q0; \
+ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \
+ vld1.8 {q0}, [r2]!; \
+ veor q3, q3, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV / ciphertext */ \
+ veor q4, q4, q0; \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lcfb_dec_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lcfb_dec_done; \
+ \
+ .Lcfb_dec_loop_##bits: \
+ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \
+ \
+ veor q2, q1, q0; \
+ vmov q0, q1; \
+ vst1.8 {q2}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lcfb_dec_loop_##bits; \
+ b .Lcfb_dec_done;
+
+ CFB_DEC(128)
+ CFB_DEC(192, r0, r6)
+ CFB_DEC(256, r0, r6)
+
+#undef CFB_DEC
+
+.Lcfb_dec_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lcfb_dec_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr_enc_armv8_ce
+.type _gcry_aes_ctr_enc_armv8_ce,%function;
+_gcry_aes_ctr_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ cmp r4, #0
+ beq .Lctr_enc_skip
+
+ cmp r5, #12
+ ldm r3, {r7-r10}
+ vld1.8 {q0}, [r3] /* load IV */
+ rev r7, r7
+ rev r8, r8
+ rev r9, r9
+ rev r10, r10
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lctr_enc_entry_192
+ bhi .Lctr_enc_entry_256
+
+#define CTR_ENC(bits, ...) \
+ .Lctr_enc_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lctr_enc_loop_##bits; \
+ \
+ .Lctr_enc_loop4_##bits: \
+ cmp r10, #0xfffffffc; \
+ sub r4, r4, #4; \
+ blo .Lctr_enc_loop4_##bits##_nocarry; \
+ cmp r9, #0xffffffff; \
+ bne .Lctr_enc_loop4_##bits##_nocarry; \
+ \
+ adds r10, #1; \
+ vmov q1, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ vmov.32 d1[1], r11; \
+ \
+ adds r10, #1; \
+ vmov q2, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ vmov.32 d1[1], r11; \
+ \
+ adds r10, #1; \
+ vmov q3, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ vmov.32 d1[1], r11; \
+ \
+ adds r10, #1; \
+ vmov q4, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ vmov.32 d1[1], r11; \
+ \
+ b .Lctr_enc_loop4_##bits##_store_ctr; \
+ \
+ .Lctr_enc_loop4_##bits##_nocarry: \
+ \
+ veor q2, q2; \
+ vrev64.8 q1, q0; \
+ vceq.u32 d5, d5; \
+ vadd.u64 q3, q2, q2; \
+ vadd.u64 q4, q3, q2; \
+ vadd.u64 q0, q3, q3; \
+ vsub.u64 q2, q1, q2; \
+ vsub.u64 q3, q1, q3; \
+ vsub.u64 q4, q1, q4; \
+ vsub.u64 q0, q1, q0; \
+ vrev64.8 q1, q1; \
+ vrev64.8 q2, q2; \
+ vrev64.8 q3, q3; \
+ vrev64.8 q0, q0; \
+ vrev64.8 q4, q4; \
+ add r10, #4; \
+ \
+ .Lctr_enc_loop4_##bits##_store_ctr: \
+ \
+ vst1.8 {q0}, [r3]; \
+ cmp r4, #4; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ veor q2, q2, q0; \
+ veor q3, q3, q1; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ vst1.8 {q2}, [r1]!; /* store plaintext */ \
+ veor q4, q4, q0; \
+ vld1.8 {q0}, [r3]; /* reload IV */ \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lctr_enc_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lctr_enc_done; \
+ \
+ .Lctr_enc_loop_##bits: \
+ \
+ adds r10, #1; \
+ vmov q1, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ subs r4, r4, #1; \
+ vld1.8 {q2}, [r2]!; /* load ciphertext */ \
+ vmov.32 d1[1], r11; \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q2, q1; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lctr_enc_loop_##bits; \
+ b .Lctr_enc_done;
+
+ CTR_ENC(128)
+ CTR_ENC(192, r0, r6)
+ CTR_ENC(256, r0, r6)
+
+#undef CTR_ENC
+
+.Lctr_enc_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lctr_enc_skip:
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+
+.Lctr_overflow_one:
+ adcs r9, #0
+ adcs r8, #0
+ adc r7, #0
+ rev r11, r9
+ rev r12, r8
+ vmov.32 d1[0], r11
+ rev r11, r7
+ vmov.32 d0[1], r12
+ vmov.32 d0[0], r11
+ bx lr
+.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * void **Ls,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_enc_armv8_ce
+.type _gcry_aes_ocb_enc_armv8_ce,%function;
+_gcry_aes_ocb_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: offset
+ * %st+0: checksum => r4
+ * %st+4: Ls => r5
+ * %st+8: nblocks => r6 (0 < nblocks <= 32)
+ * %st+12: nrounds => r7
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r7, [sp, #(104+12)]
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ ldr r6, [sp, #(104+8)]
+
+ cmp r7, #12
+ vld1.8 {q0}, [r3] /* load offset */
+
+ aes_preload_keys(r0, r12);
+
+ beq .Locb_enc_entry_192
+ bhi .Locb_enc_entry_256
+
+#define OCB_ENC(bits, ...) \
+ .Locb_enc_entry_##bits: \
+ cmp r6, #4; \
+ blo .Locb_enc_loop_##bits; \
+ \
+ .Locb_enc_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
+ \
+ ldm r5!, {r8, r9, r10, r11}; \
+ sub r6, #4; \
+ \
+ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
+ vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \
+ vld1.8 {q8}, [r4]; /* load Checksum_{i-1} */ \
+ veor q0, q0, q9; /* Offset_i+0 */ \
+ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \
+ veor q8, q8, q1; /* Checksum_i+0 */ \
+ veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\
+ vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \
+ vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\
+ veor q0, q0, q9; /* Offset_i+1 */ \
+ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \
+ veor q8, q8, q2; /* Checksum_i+1 */ \
+ veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\
+ vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\
+ veor q0, q0, q9; /* Offset_i+2 */ \
+ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \
+ veor q8, q8, q3; /* Checksum_i+2 */ \
+ veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\
+ vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\
+ veor q0, q0, q9; /* Offset_i+3 */ \
+ veor q8, q8, q4; /* Checksum_i+3 */ \
+ veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\
+ vst1.8 {q0}, [r1]; /* store Offset_i+3 */\
+ sub r1, #(3*16); \
+ vst1.8 {q8}, [r4]; /* store Checksum_i+3 */\
+ \
+ cmp r6, #4; \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ mov r8, r1; \
+ vld1.8 {q8-q9}, [r1]!; \
+ veor q1, q1, q8; \
+ veor q2, q2, q9; \
+ vld1.8 {q8-q9}, [r1]!; \
+ vst1.8 {q1-q2}, [r8]!; \
+ veor q3, q3, q8; \
+ veor q4, q4, q9; \
+ vst1.8 {q3-q4}, [r8]; \
+ \
+ bhs .Locb_enc_loop4_##bits; \
+ cmp r6, #0; \
+ beq .Locb_enc_done; \
+ \
+ .Locb_enc_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
+ \
+ ldr r8, [r5], #4; \
+ vld1.8 {q1}, [r2]!; /* load plaintext */ \
+ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
+ vld1.8 {q3}, [r4]; /* load checksum */ \
+ subs r6, #1; \
+ veor q0, q0, q2; \
+ veor q3, q3, q1; \
+ veor q1, q1, q0; \
+ vst1.8 {q3}, [r4]; /* store checksum */ \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vst1.8 {q1}, [r1]!; /* store ciphertext */ \
+ \
+ bne .Locb_enc_loop_##bits; \
+ b .Locb_enc_done;
+
+ OCB_ENC(128re, r0, r12)
+ OCB_ENC(192, r0, r12)
+ OCB_ENC(256, r0, r12)
+
+#undef OCB_ENC
+
+.Locb_enc_done:
+ vst1.8 {q0}, [r3] /* store offset */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * void **Ls,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_dec_armv8_ce
+.type _gcry_aes_ocb_dec_armv8_ce,%function;
+_gcry_aes_ocb_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: offset
+ * %st+0: checksum => r4
+ * %st+4: Ls => r5
+ * %st+8: nblocks => r6 (0 < nblocks <= 32)
+ * %st+12: nrounds => r7
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r7, [sp, #(104+12)]
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ ldr r6, [sp, #(104+8)]
+
+ cmp r7, #12
+ vld1.8 {q0}, [r3] /* load offset */
+
+ aes_preload_keys(r0, r12);
+
+ beq .Locb_dec_entry_192
+ bhi .Locb_dec_entry_256
+
+#define OCB_DEC(bits, ...) \
+ .Locb_dec_entry_##bits: \
+ cmp r6, #4; \
+ blo .Locb_dec_loop_##bits; \
+ \
+ .Locb_dec_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ \
+ ldm r5!, {r8, r9, r10, r11}; \
+ sub r6, #4; \
+ \
+ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
+ vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \
+ veor q0, q0, q9; /* Offset_i+0 */ \
+ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \
+ veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\
+ vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \
+ vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\
+ veor q0, q0, q9; /* Offset_i+1 */ \
+ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \
+ veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\
+ vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\
+ veor q0, q0, q9; /* Offset_i+2 */ \
+ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \
+ veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\
+ vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\
+ veor q0, q0, q9; /* Offset_i+3 */ \
+ veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\
+ vst1.8 {q0}, [r1]; /* store Offset_i+3 */\
+ sub r1, #(3*16); \
+ \
+ cmp r6, #4; \
+ \
+ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ mov r8, r1; \
+ vld1.8 {q8-q9}, [r1]!; \
+ veor q1, q1, q8; \
+ veor q2, q2, q9; \
+ vld1.8 {q8-q9}, [r1]!; \
+ vst1.8 {q1-q2}, [r8]!; \
+ veor q1, q1, q2; \
+ vld1.8 {q2}, [r4]; /* load Checksum_{i-1} */ \
+ veor q3, q3, q8; \
+ veor q1, q1, q3; \
+ veor q4, q4, q9; \
+ veor q1, q1, q4; \
+ vst1.8 {q3-q4}, [r8]; \
+ veor q2, q2, q1; \
+ vst1.8 {q2}, [r4]; /* store Checksum_i+3 */ \
+ \
+ bhs .Locb_dec_loop4_##bits; \
+ cmp r6, #0; \
+ beq .Locb_dec_done; \
+ \
+ .Locb_dec_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ \
+ ldr r8, [r5], #4; \
+ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ subs r6, #1; \
+ veor q0, q0, q2; \
+ veor q1, q1, q0; \
+ \
+ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__) \
+ \
+ vld1.8 {q2}, [r4]; /* load checksum */ \
+ veor q1, q1, q0; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ veor q2, q2, q1; \
+ vst1.8 {q2}, [r4]; /* store checksum */ \
+ \
+ bne .Locb_dec_loop_##bits; \
+ b .Locb_dec_done;
+
+ OCB_DEC(128re, r0, r12)
+ OCB_DEC(192, r0, r12)
+ OCB_DEC(256, r0, r12)
+
+#undef OCB_DEC
+
+.Locb_dec_done:
+ vst1.8 {q0}, [r3] /* store offset */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * const unsigned char *abuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * void **Ls,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_auth_armv8_ce
+.type _gcry_aes_ocb_auth_armv8_ce,%function;
+_gcry_aes_ocb_auth_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: abuf
+ * r2: offset
+ * r3: checksum
+ * %st+0: Ls => r5
+ * %st+4: nblocks => r6 (0 < nblocks <= 32)
+ * %st+8: nrounds => r7
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r7, [sp, #(104+8)]
+ ldr r5, [sp, #(104+0)]
+ ldr r6, [sp, #(104+4)]
+
+ cmp r7, #12
+ vld1.8 {q0}, [r2] /* load offset */
+
+ aes_preload_keys(r0, r12);
+
+ beq .Locb_auth_entry_192
+ bhi .Locb_auth_entry_256
+
+#define OCB_AUTH(bits, ...) \
+ .Locb_auth_entry_##bits: \
+ cmp r6, #4; \
+ blo .Locb_auth_loop_##bits; \
+ \
+ .Locb_auth_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
+ \
+ ldm r5!, {r8, r9, r10, r11}; \
+ sub r6, #4; \
+ \
+ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
+ vld1.8 {q1-q2}, [r1]!; /* load A_i+<0-1> */ \
+ veor q0, q0, q9; /* Offset_i+0 */ \
+ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \
+ veor q1, q1, q0; /* A_i+0 xor Offset_i+0 */\
+ vld1.8 {q3-q4}, [r1]!; /* load A_i+<2-3> */ \
+ veor q0, q0, q9; /* Offset_i+1 */ \
+ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \
+ veor q2, q2, q0; /* A_i+1 xor Offset_i+1 */\
+ veor q0, q0, q9; /* Offset_i+2 */ \
+ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \
+ veor q3, q3, q0; /* A_i+2 xor Offset_i+2 */\
+ veor q0, q0, q9; /* Offset_i+3 */ \
+ veor q4, q4, q0; /* A_i+3 xor Offset_i+3 */\
+ \
+ cmp r6, #4; \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q2; \
+ veor q3, q3, q4; \
+ vld1.8 {q2}, [r3]; \
+ veor q1, q1, q3; \
+ veor q2, q2, q1; \
+ vst1.8 {q2}, [r3]; \
+ \
+ bhs .Locb_auth_loop4_##bits; \
+ cmp r6, #0; \
+ beq .Locb_auth_done; \
+ \
+ .Locb_auth_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
+ \
+ ldr r8, [r5], #4; \
+ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
+ vld1.8 {q1}, [r1]!; /* load aadtext */ \
+ subs r6, #1; \
+ veor q0, q0, q2; \
+ vld1.8 {q2}, [r3]; /* load checksum */ \
+ veor q1, q1, q0; \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__) \
+ \
+ veor q2, q2, q1; \
+ vst1.8 {q2}, [r3]; /* store checksum */ \
+ \
+ bne .Locb_auth_loop_##bits; \
+ b .Locb_auth_done;
+
+ OCB_AUTH(128re, r0, r12)
+ OCB_AUTH(192, r0, r12)
+ OCB_AUTH(256, r0, r12)
+
+#undef OCB_AUTH
+
+.Locb_auth_done:
+ vst1.8 {q0}, [r2] /* store offset */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;
+
+
+/*
+ * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
+ */
+.align 3
+.globl _gcry_aes_sbox4_armv8_ce
+.type _gcry_aes_sbox4_armv8_ce,%function;
+_gcry_aes_sbox4_armv8_ce:
+ /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+ vmov.i8 q0, #0x52
+ vmov.i8 q1, #0
+ vmov s0, r0
+ aese.8 q0, q1
+ veor d0, d1
+ vpadd.i32 d0, d0, d1
+ vmov r0, s0
+ CLEAR_REG(q0)
+ bx lr
+.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;
+
+
+/*
+ * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src);
+ */
+.align 3
+.globl _gcry_aes_invmixcol_armv8_ce
+.type _gcry_aes_invmixcol_armv8_ce,%function;
+_gcry_aes_invmixcol_armv8_ce:
+ vld1.8 {q0}, [r1]
+ aesimc.8 q0, q0
+ vst1.8 {q0}, [r0]
+ CLEAR_REG(q0)
+ bx lr
+.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;
+
+#endif
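A side note on the CTR code above: _gcry_aes_ctr_enc_armv8_ce keeps the
big-endian 128-bit counter words in r7-r10 and only takes the
.Lctr_overflow_one path when incrementing the low word (r10) produces a
carry. A plain C model of that counter increment, for reference only (not
part of the patch):

  /* Increment a 128-bit big-endian counter by one, propagating the carry
   * from the least significant byte upwards -- this is what the
   * r10/r9/r8/r7 handling above does 32 bits at a time. */
  static void ctr_inc_be128 (unsigned char ctr[16])
  {
    int i;

    for (i = 15; i >= 0; i--)
      if (++ctr[i]) /* stop once a byte does not wrap back to zero */
        break;
  }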
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
new file mode 100644
index 0000000..bed4066
--- /dev/null
+++ b/cipher/rijndael-armv8-ce.c
@@ -0,0 +1,469 @@
+/* ARMv8 Crypto Extension AES for Libgcrypt
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_ARM_CE
+
+
+typedef struct u128_s { u32 a, b, c, d; } u128_t;
+
+extern u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
+extern void _gcry_aes_invmixcol_armv8_ce(u128_t *dst, const u128_t *src);
+
+extern unsigned int _gcry_aes_enc_armv8_ce(const void *keysched, byte *dst,
+ const byte *src,
+ unsigned int nrounds);
+extern unsigned int _gcry_aes_dec_armv8_ce(const void *keysched, byte *dst,
+ const byte *src,
+ unsigned int nrounds);
+
+extern void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ int cbc_mac, unsigned int nrounds);
+extern void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+
+extern void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+extern void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+
+extern void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+
+extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ void **Ls,
+ size_t nblocks,
+ unsigned int nrounds);
+extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ void **Ls,
+ size_t nblocks,
+ unsigned int nrounds);
+extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ void **Ls,
+ size_t nblocks,
+ unsigned int nrounds);
+
+typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset, unsigned char *checksum,
+ void **Ls, size_t nblocks,
+ unsigned int nrounds);
+
+void
+_gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte data[MAXKC][4];
+ u32 data32[MAXKC];
+ } tkk[2];
+ unsigned int rounds = ctx->rounds;
+ int KC = rounds - 6;
+ unsigned int keylen = KC * 4;
+ unsigned int i, r, t;
+ byte rcon = 1;
+ int j;
+#define k tkk[0].data
+#define k_u32 tkk[0].data32
+#define tk tkk[1].data
+#define tk_u32 tkk[1].data32
+#define W (ctx->keyschenc)
+#define W_u32 (ctx->keyschenc32)
+
+ for (i = 0; i < keylen; i++)
+ {
+ k[i >> 2][i & 3] = key[i];
+ }
+
+ for (j = KC-1; j >= 0; j--)
+ {
+ tk_u32[j] = k_u32[j];
+ }
+ r = 0;
+ t = 0;
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+
+ while (r < rounds + 1)
+ {
+ tk_u32[0] ^= _gcry_aes_sbox4_armv8_ce(rol(tk_u32[KC - 1], 24)) ^ rcon;
+
+ if (KC != 8)
+ {
+ for (j = 1; j < KC; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+ else
+ {
+ for (j = 1; j < KC/2; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+
+ tk_u32[KC/2] ^= _gcry_aes_sbox4_armv8_ce(tk_u32[KC/2 - 1]);
+
+ for (j = KC/2 + 1; j < KC; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+
+ rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b);
+ }
+
+#undef W
+#undef tk
+#undef k
+#undef W_u32
+#undef tk_u32
+#undef k_u32
+ wipememory(&tkk, sizeof(tkk));
+}
+
+/* Make a decryption key from an encryption key. */
+void
+_gcry_aes_armv8_ce_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
+ u128_t *dkey = (u128_t *)(void *)ctx->keyschdec;
+ int rounds = ctx->rounds;
+ int rr;
+ int r;
+
+#define DO_AESIMC() _gcry_aes_invmixcol_armv8_ce(&dkey[r], &ekey[rr])
+
+ dkey[0] = ekey[rounds];
+ r = 1;
+ rr = rounds-1;
+ DO_AESIMC(); r++; rr--; /* round 1 */
+ DO_AESIMC(); r++; rr--; /* round 2 */
+ DO_AESIMC(); r++; rr--; /* round 3 */
+ DO_AESIMC(); r++; rr--; /* round 4 */
+ DO_AESIMC(); r++; rr--; /* round 5 */
+ DO_AESIMC(); r++; rr--; /* round 6 */
+ DO_AESIMC(); r++; rr--; /* round 7 */
+ DO_AESIMC(); r++; rr--; /* round 8 */
+ DO_AESIMC(); r++; rr--; /* round 9 */
+ if (rounds >= 12)
+ {
+ if (rounds > 12)
+ {
+ DO_AESIMC(); r++; rr--; /* round 10 */
+ DO_AESIMC(); r++; rr--; /* round 11 */
+ }
+
+ DO_AESIMC(); r++; rr--; /* round 12 / 10 */
+ DO_AESIMC(); r++; rr--; /* round 13 / 11 */
+ }
+
+ dkey[r] = ekey[0];
+
+#undef DO_AESIMC
+}
+
+unsigned int
+_gcry_aes_armv8_ce_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ return _gcry_aes_enc_armv8_ce(keysched, dst, src, nrounds);
+}
+
+unsigned int
+_gcry_aes_armv8_ce_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ const void *keysched = ctx->keyschdec32;
+ unsigned int nrounds = ctx->rounds;
+
+ return _gcry_aes_dec_armv8_ce(keysched, dst, src, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_cbc_enc (const RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks, int cbc_mac)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_cbc_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, cbc_mac,
+ nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschdec32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_cbc_dec_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_cfb_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_cfb_dec_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_ctr_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+ ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce
+ : _gcry_aes_ocb_dec_armv8_ce;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int nrounds = ctx->rounds;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+ u64 blkn_offs = blkn - blkn % 32;
+ unsigned int n = 32 - blkn % 32;
+ unsigned char l_tmp[16];
+ void *Ls[32];
+ void **l;
+ size_t i;
+
+ c->u_mode.ocb.data_nblocks = blkn + nblocks;
+
+ if (nblocks >= 32)
+ {
+ for (i = 0; i < 32; i += 8)
+ {
+ Ls[(i + 0 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = (void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = (void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = (void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 32] = (void *)c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = (void *)c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = (void *)c->u_mode.ocb.L[3];
+ l = &Ls[(31 + n) % 32];
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ blkn_offs += 32;
+ *l = (void *)ocb_get_l(c, l_tmp, blkn_offs);
+
+ crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls, 32,
+ nrounds);
+
+ nblocks -= 32;
+ outbuf += 32 * 16;
+ inbuf += 32 * 16;
+ }
+
+ if (nblocks && l < &Ls[nblocks])
+ {
+ *l = (void *)ocb_get_l(c, l_tmp, 32 + blkn_offs);
+ }
+ }
+ else
+ {
+ for (i = 0; i < nblocks; i++)
+ Ls[i] = (void *)ocb_get_l(c, l_tmp, ++blkn);
+ }
+
+ if (nblocks)
+ {
+ crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls, nblocks,
+ nrounds);
+ }
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+}
+
+void
+_gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const void *keysched = ctx->keyschenc32;
+ const unsigned char *abuf = abuf_arg;
+ unsigned int nrounds = ctx->rounds;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+ u64 blkn_offs = blkn - blkn % 32;
+ unsigned int n = 32 - blkn % 32;
+ unsigned char l_tmp[16];
+ void *Ls[32];
+ void **l;
+ size_t i;
+
+ c->u_mode.ocb.aad_nblocks = blkn + nblocks;
+
+ if (nblocks >= 32)
+ {
+ for (i = 0; i < 32; i += 8)
+ {
+ Ls[(i + 0 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = (void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = (void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = (void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 32] = (void *)c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = (void *)c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = (void *)c->u_mode.ocb.L[3];
+ l = &Ls[(31 + n) % 32];
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ blkn_offs += 32;
+ *l = (void *)ocb_get_l(c, l_tmp, blkn_offs);
+
+ _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls, 32, nrounds);
+
+ nblocks -= 32;
+ abuf += 32 * 16;
+ }
+
+ if (nblocks && l < &Ls[nblocks])
+ {
+ *l = (void *)ocb_get_l(c, l_tmp, 32 + blkn_offs);
+ }
+ }
+ else
+ {
+ for (i = 0; i < nblocks; i++)
+ Ls[i] = (void *)ocb_get_l(c, l_tmp, ++blkn);
+ }
+
+ if (nblocks)
+ {
+ _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls, nblocks, nrounds);
+ }
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+}
+
+#endif /* USE_ARM_CE */
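A note on the Ls[] tables built in _gcry_aes_armv8_ce_ocb_crypt() and
_gcry_aes_armv8_ce_ocb_auth() above: they precompute the OCB offsets
L_{ntz(i)} for one 32-block chunk. Slots with ntz 0, 1 and 2 reuse the
cached L[0], L[1] and L[2], the 8th and 24th slots get L[3], the 16th gets
L[4], and the 32nd slot is refilled per chunk through ocb_get_l(). A tiny
stand-alone check of that pattern (uses a GCC builtin; not part of the
patch):

  #include <stdio.h>

  /* Print ntz(i) for i = 1..32; the output
   *   0 1 0 2 0 1 0 3
   *   0 1 0 2 0 1 0 4
   *   0 1 0 2 0 1 0 3
   *   0 1 0 2 0 1 0 5
   * matches the fixed L[] layout used above. */
  int main (void)
  {
    int i;

    for (i = 1; i <= 32; i++)
      printf ("%d%c", __builtin_ctz (i), i % 8 ? ' ' : '\n');
    return 0;
  }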
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 6641728..7544fa0 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -82,6 +82,17 @@
# endif
#endif /* ENABLE_AESNI_SUPPORT */
+/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
+ * code. */
+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+# define USE_ARM_CE 1
+# endif
+#endif /* ENABLE_ARM_CRYPTO_SUPPORT */
+
struct RIJNDAEL_context_s;
typedef unsigned int (*rijndael_cryptfn_t)(const struct RIJNDAEL_context_s *ctx,
@@ -127,6 +138,9 @@ typedef struct RIJNDAEL_context_s
#ifdef USE_SSSE3
unsigned int use_ssse3:1; /* SSSE3 shall be used. */
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ unsigned int use_arm_ce:1; /* ARMv8 CE shall be used. */
+#endif /*USE_ARM_CE*/
rijndael_cryptfn_t encrypt_fn;
rijndael_cryptfn_t decrypt_fn;
rijndael_prefetchfn_t prefetch_enc_fn;
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 0130924..cc6a722 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -168,6 +168,46 @@ extern unsigned int _gcry_aes_arm_decrypt_block(const void *keysched_dec,
const void *decT);
#endif /*USE_ARM_ASM*/
+#ifdef USE_ARM_CE
+/* ARMv8 Crypto Extension implementations of AES */
+extern void _gcry_aes_armv8_ce_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_armv8_ce_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_armv8_ce_encrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_armv8_ce_decrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+
+extern void _gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_armv8_ce_cbc_enc (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ int cbc_mac);
+extern void _gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *ctr, size_t nblocks);
+extern void _gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern void _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c,
+ const void *abuf_arg, size_t nblocks);
+#endif /*USE_ARM_CE*/
+
static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
const unsigned char *ax);
static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
@@ -223,11 +263,12 @@ static gcry_err_code_t
do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
static int initialized = 0;
- static const char *selftest_failed=0;
+ static const char *selftest_failed = 0;
int rounds;
int i,j, r, t, rconpointer = 0;
int KC;
-#if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3)
+#if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) \
+ || defined(USE_ARM_CE)
unsigned int hwfeatures;
#endif
@@ -268,7 +309,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
ctx->rounds = rounds;
-#if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3)
+#if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) \
+ || defined(USE_ARM_CE)
hwfeatures = _gcry_get_hw_features ();
#endif
@@ -282,6 +324,9 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
#ifdef USE_SSSE3
ctx->use_ssse3 = 0;
#endif
+#ifdef USE_ARM_CE
+ ctx->use_arm_ce = 0;
+#endif
if (0)
{
@@ -318,6 +363,16 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
ctx->use_ssse3 = 1;
}
#endif
+#ifdef USE_ARM_CE
+ else if (hwfeatures & HWF_ARM_AES)
+ {
+ ctx->encrypt_fn = _gcry_aes_armv8_ce_encrypt;
+ ctx->decrypt_fn = _gcry_aes_armv8_ce_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->use_arm_ce = 1;
+ }
+#endif
else
{
ctx->encrypt_fn = do_encrypt;
@@ -340,6 +395,10 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
else if (ctx->use_ssse3)
_gcry_aes_ssse3_do_setkey (ctx, key);
#endif
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ _gcry_aes_armv8_ce_setkey (ctx, key);
+#endif
else
{
const byte *sbox = ((const byte *)encT) + 1;
@@ -471,6 +530,12 @@ prepare_decryption( RIJNDAEL_context *ctx )
_gcry_aes_ssse3_prepare_decryption (ctx);
}
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_prepare_decryption (ctx);
+ }
+#endif /*USE_ARM_CE*/
#ifdef USE_PADLOCK
else if (ctx->use_padlock)
{
@@ -744,6 +809,13 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
burn_depth = 0;
}
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_cfb_enc (ctx, outbuf, inbuf, iv, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_ARM_CE*/
else
{
rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
@@ -798,6 +870,13 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
burn_depth = 0;
}
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac);
+ burn_depth = 0;
+ }
+#endif /*USE_ARM_CE*/
else
{
rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
@@ -860,6 +939,13 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
burn_depth = 0;
}
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_ARM_CE*/
else
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } tmp;
@@ -1120,6 +1206,13 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
burn_depth = 0;
}
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_cfb_dec (ctx, outbuf, inbuf, iv, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_ARM_CE*/
else
{
rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
@@ -1173,6 +1266,13 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
burn_depth = 0;
}
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_cbc_dec (ctx, outbuf, inbuf, iv, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_ARM_CE*/
else
{
unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16;
@@ -1238,6 +1338,13 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
burn_depth = 0;
}
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
+ burn_depth = 0;
+ }
+#endif /*USE_ARM_CE*/
else if (encrypt)
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
@@ -1323,6 +1430,13 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
burn_depth = 0;
}
#endif /*USE_SSSE3*/
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ _gcry_aes_armv8_ce_ocb_auth (c, abuf, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_ARM_CE*/
else
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
diff --git a/configure.ac b/configure.ac
index 91dd285..915888a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2009,6 +2009,10 @@ if test "$found" = "1" ; then
arm*-*-*)
# Build with the assembly implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-arm.lo"
+
+ # Build with the ARMv8/AArch32 CE implementation
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch32-ce.lo"
;;
esac
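None of this changes the public API: the CE path is selected at runtime in
do_setkey() once _gcry_get_hw_features() reports HWF_ARM_AES, so existing
callers pick it up transparently. A minimal caller, for illustration only
(initialization boilerplate and error checking trimmed):

  #include <gcrypt.h>

  int main (void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0 }, iv[16] = { 0 }, buf[64] = { 0 };

    gcry_check_version (NULL);           /* initialize libgcrypt */
    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_CBC, 0);
    gcry_cipher_setkey (hd, key, sizeof(key));
    gcry_cipher_setiv (hd, iv, sizeof(iv));
    gcry_cipher_encrypt (hd, buf, sizeof(buf), NULL, 0);  /* in-place CBC */
    gcry_cipher_close (hd);
    return 0;
  }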