[PATCH] Add ARMv8/AArch64 implementation of chacha20
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Aug 6 14:09:49 CEST 2017
* cipher/Makefile.am: Add 'chacha20-aarch64.S'.
* cipher/chacha20-aarch64.S: New.
* cipher/chacha20.c (USE_AARCH64_SIMD): New.
(_gcry_chacha20_aarch64_blocks): New.
(chacha20_do_setkey): Add HWF selection for AArch64 implementation.
* configure.ac: Add 'chacha20-aarch64.lo'.
--
Patch adds an ARMv8/AArch64 SIMD implementation based on the public domain
ARMv7/NEON implementation by Andrew Moon at:
https://github.com/floodyberry/chacha-opt
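
For reference, the kernel the new assembly implements is the standard
ChaCha20 quarter round; a minimal C sketch (not part of the patch, for
illustration only):

  #define ROTL32(v, n)  (((v) << (n)) | ((v) >> (32 - (n))))
  #define QUARTERROUND(a, b, c, d) \
    a += b; d ^= a; d = ROTL32(d, 16); \
    c += d; b ^= c; b = ROTL32(b, 12); \
    a += b; d ^= a; d = ROTL32(d,  8); \
    c += d; b ^= c; b = ROTL32(b,  7);

The scalar 'ror #16/#20/#24/#25' and the vector 'rev32'/'shl'+'sri'
sequences in chacha20-aarch64.S perform these same left-rotates by 16,
12, 8 and 7 bits.  The main loop interleaves three block states held in
NEON registers with a fourth state held in general-purpose registers,
processing 256 bytes per iteration.
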
Benchmark on ARM Cortex-A53 (1536 MHz):
Before:
 CHACHA20      |  nanosecs/byte   mebibytes/sec   cycles/byte
    STREAM enc |      5.70 ns/B     167.2 MiB/s      8.76 c/B
    STREAM dec |      5.71 ns/B     166.9 MiB/s      8.78 c/B
After (~1.7x faster):
 CHACHA20      |  nanosecs/byte   mebibytes/sec   cycles/byte
    STREAM enc |      3.32 ns/B     287.7 MiB/s      5.09 c/B
    STREAM dec |      3.31 ns/B     287.9 MiB/s      5.09 c/B
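The cycle figures follow from the clock: 5.70 ns/B * 1.536 GHz = 8.76 c/B
before and 3.32 ns/B * 1.536 GHz = 5.10 c/B after, i.e. 8.76 / 5.09 = 1.72x.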
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 95c45108..26d25e1a 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -65,7 +65,7 @@ arcfour.c arcfour-amd64.S \
blowfish.c blowfish-amd64.S blowfish-arm.S \
cast5.c cast5-amd64.S cast5-arm.S \
chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
- chacha20-armv7-neon.S \
+ chacha20-armv7-neon.S chacha20-aarch64.S \
crc.c \
crc-intel-pclmul.c \
des.c des-amd64.S \
diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S
new file mode 100644
index 00000000..d07511ff
--- /dev/null
+++ b/cipher/chacha20-aarch64.S
@@ -0,0 +1,772 @@
+/* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function
+ *
+ * Copyright (C) 2014,2017 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain ARMv7/NEON implementation by Andrew Moon at
+ * https://github.com/floodyberry/chacha-opt
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
+ defined(USE_CHACHA20)
+
+.cpu generic+simd
+
+.text
+
+#define STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \
+ add x17, ptr, #8; \
+ stp l0, l1, [ptr], #16; \
+ stp l2, l3, [x17], #16; \
+ stp l4, l5, [ptr], #16; \
+ stp l6, l7, [x17];
+
+#define LDMIA16(ptr, l0, l1, l2, l3, l4, l5, l6, l7, \
+ l8, l9, l10, l11, l12, l13, l14, l15) \
+ add x17, ptr, #8; \
+ ldp l0, l1, [ptr], #16; \
+ ldp l2, l3, [x17], #16; \
+ ldp l4, l5, [ptr], #16; \
+ ldp l6, l7, [x17], #16; \
+ ldp l8, l9, [ptr], #16; \
+ ldp l10, l11, [x17], #16; \
+ ldp l12, l13, [ptr], #16; \
+ ldp l14, l15, [x17]; \
+
+#define LDMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \
+ add x17, ptr, #8; \
+ ldp l0, l1, [ptr], #16; \
+ ldp l2, l3, [x17], #16; \
+ ldp l4, l5, [ptr], #16; \
+ ldp l6, l7, [x17];
+
+#define LDMIA4(ptr, l0, l1, l2, l3) \
+ ldp l0, l1, [ptr], #8; \
+ ldp l2, l3, [ptr], #8;
+
+#define EXT32(a,b,c,n) \
+ ext a,b,c,#(n*4);
+
+.text
+
+#define STACK_STATE 48
+#define STACK_SRC 56
+#define STACK_SP 192
+#define STACK_DST 200
+#define STACK_BYTES 208
+#define STACK_DST_TMP 216
+
+.globl _gcry_chacha20_aarch64_blocks
+.type _gcry_chacha20_aarch64_blocks,%function;
+_gcry_chacha20_aarch64_blocks:
+.Lchacha_blocks_neon_local:
+ tst x3, x3
+ beq .Lchacha_blocks_neon_nobytes
+ mov x16, sp
+ mov x8, sp
+ sub x16, x16, #(216+8)
+ mov v16.16b, v8.16b
+ mov v17.16b, v9.16b
+ and x16, x16, #(-32)
+ mov v18.16b, v10.16b
+ mov v19.16b, v11.16b
+ mov v20.16b, v12.16b
+ mov sp, x16
+ add x16, x16, #64
+ mov v21.16b, v13.16b
+ mov v22.16b, v14.16b
+ mov v23.16b, v15.16b
+ mov w4, #20
+ ld1 {v24.4s-v27.4s}, [x0]
+ str x0, [sp, # STACK_STATE]
+ str x1, [sp, # STACK_SRC]
+ str x2, [sp, # STACK_DST]
+ str x3, [sp, # STACK_BYTES]
+ str x8, [sp, # STACK_SP]
+ st1 {v24.4s-v27.4s}, [x16]
+ str w4, [sp, #44]
+ cmp x3, #256
+ blo .Lchacha_blocks_neon_mainloop2
+.Lchacha_blocks_neon_mainloop1:
+ ldr w0, [sp, #44]
+ add x16, sp, #64
+ str w0, [sp, #0]
+ mov x2, #1
+ eor v12.16b, v12.16b, v12.16b
+ mov v0.16b, v24.16b
+ mov v1.16b, v25.16b
+ mov v2.16b, v26.16b
+ mov v3.16b, v27.16b
+ mov v12.2d[0], x2
+ add v3.2d, v3.2d, v12.2d
+ mov v4.16b, v0.16b
+ mov v5.16b, v1.16b
+ mov v6.16b, v2.16b
+ add v7.2d, v3.2d, v12.2d
+ LDMIA16(x16, w0, w1, w2, w3, w4, w5, w6, w7,
+ w8, w9, w10, w11, w12, w13, w14, w15)
+ mov v8.16b, v0.16b
+ mov v9.16b, v1.16b
+ mov v10.16b, v2.16b
+ add v11.2d, v7.2d, v12.2d
+ str w6, [sp, #8]
+ str w11, [sp, #12]
+ mov w11, w13
+ str w15, [sp, #28]
+.Lchacha_blocks_neon_rounds1:
+ ldr w6, [sp, #0]
+ add v0.4s, v0.4s, v1.4s
+ add w0, w0, w4
+ add v4.4s, v4.4s, v5.4s
+ add w1, w1, w5
+ add v8.4s, v8.4s, v9.4s
+ eor w12, w12, w0
+ eor v12.16b, v3.16b, v0.16b
+ eor w11, w11, w1
+ eor v13.16b, v7.16b, v4.16b
+ ror w12, w12, #16
+ eor v14.16b, v11.16b, v8.16b
+ ror w11, w11, #16
+ rev32 v3.8h, v12.8h
+ subs w6, w6, #2
+ rev32 v7.8h, v13.8h
+ add w8, w8, w12
+ rev32 v11.8h, v14.8h
+ add w9, w9, w11
+ add v2.4s, v2.4s, v3.4s
+ eor w4, w4, w8
+ add v6.4s, v6.4s, v7.4s
+ eor w5, w5, w9
+ add v10.4s, v10.4s, v11.4s
+ str w6, [sp, #0]
+ eor v12.16b, v1.16b, v2.16b
+ ror w4, w4, #20
+ eor v13.16b, v5.16b, v6.16b
+ ror w5, w5, #20
+ eor v14.16b, v9.16b, v10.16b
+ add w0, w0, w4
+ shl v1.4s, v12.4s, #12
+ add w1, w1, w5
+ shl v5.4s, v13.4s, #12
+ ldr w6, [sp, #8]
+ shl v9.4s, v14.4s, #12
+ eor w12, w12, w0
+ sri v1.4s, v12.4s, #20
+ eor w11, w11, w1
+ sri v5.4s, v13.4s, #20
+ ror w12, w12, #24
+ sri v9.4s, v14.4s, #20
+ ror w11, w11, #24
+ add v0.4s, v0.4s, v1.4s
+ add w8, w8, w12
+ add v4.4s, v4.4s, v5.4s
+ add w9, w9, w11
+ add v8.4s, v8.4s, v9.4s
+ eor w4, w4, w8
+ eor v12.16b, v3.16b, v0.16b
+ eor w5, w5, w9
+ eor v13.16b, v7.16b, v4.16b
+ str w11, [sp, #20]
+ eor v14.16b, v11.16b, v8.16b
+ ror w4, w4, #25
+ shl v3.4s, v12.4s, #8
+ ror w5, w5, #25
+ shl v7.4s, v13.4s, #8
+ str w4, [sp, #4]
+ shl v11.4s, v14.4s, #8
+ ldr w4, [sp, #28]
+ sri v3.4s, v12.4s, #24
+ add w2, w2, w6
+ sri v7.4s, v13.4s, #24
+ add w3, w3, w7
+ sri v11.4s, v14.4s, #24
+ ldr w11, [sp, #12]
+ add v2.4s, v2.4s, v3.4s
+ eor w14, w14, w2
+ add v6.4s, v6.4s, v7.4s
+ eor w4, w4, w3
+ add v10.4s, v10.4s, v11.4s
+ ror w14, w14, #16
+ eor v12.16b, v1.16b, v2.16b
+ ror w4, w4, #16
+ eor v13.16b, v5.16b, v6.16b
+ add w10, w10, w14
+ eor v14.16b, v9.16b, v10.16b
+ add w11, w11, w4
+ shl v1.4s, v12.4s, #7
+ eor w6, w6, w10
+ shl v5.4s, v13.4s, #7
+ eor w7, w7, w11
+ shl v9.4s, v14.4s, #7
+ ror w6, w6, #20
+ sri v1.4s, v12.4s, #25
+ ror w7, w7, #20
+ sri v5.4s, v13.4s, #25
+ add w2, w2, w6
+ sri v9.4s, v14.4s, #25
+ add w3, w3, w7
+ EXT32(v3.16b, v3.16b, v3.16b, 3)
+ eor w14, w14, w2
+ EXT32(v7.16b, v7.16b, v7.16b, 3)
+ eor w4, w4, w3
+ EXT32(v11.16b, v11.16b, v11.16b, 3)
+ ror w14, w14, #24
+ EXT32(v1.16b, v1.16b, v1.16b, 1)
+ ror w4, w4, #24
+ EXT32(v5.16b, v5.16b, v5.16b, 1)
+ add w10, w10, w14
+ EXT32(v9.16b, v9.16b, v9.16b, 1)
+ add w11, w11, w4
+ EXT32(v2.16b, v2.16b, v2.16b, 2)
+ eor w6, w6, w10
+ EXT32(v6.16b, v6.16b, v6.16b, 2)
+ eor w7, w7, w11
+ EXT32(v10.16b, v10.16b, v10.16b, 2)
+ ror w6, w6, #25
+ add v0.4s, v0.4s, v1.4s
+ ror w7, w7, #25
+ add v4.4s, v4.4s, v5.4s
+ add w0, w0, w5
+ add v8.4s, v8.4s, v9.4s
+ add w1, w1, w6
+ eor v12.16b, v3.16b, v0.16b
+ eor w4, w4, w0
+ eor v13.16b, v7.16b, v4.16b
+ eor w12, w12, w1
+ eor v14.16b, v11.16b, v8.16b
+ ror w4, w4, #16
+ rev32 v3.8h, v12.8h
+ ror w12, w12, #16
+ rev32 v7.8h, v13.8h
+ add w10, w10, w4
+ rev32 v11.8h, v14.8h
+ add w11, w11, w12
+ add v2.4s, v2.4s, v3.4s
+ eor w5, w5, w10
+ add v6.4s, v6.4s, v7.4s
+ eor w6, w6, w11
+ add v10.4s, v10.4s, v11.4s
+ ror w5, w5, #20
+ eor v12.16b, v1.16b, v2.16b
+ ror w6, w6, #20
+ eor v13.16b, v5.16b, v6.16b
+ add w0, w0, w5
+ eor v14.16b, v9.16b, v10.16b
+ add w1, w1, w6
+ shl v1.4s, v12.4s, #12
+ eor w4, w4, w0
+ shl v5.4s, v13.4s, #12
+ eor w12, w12, w1
+ shl v9.4s, v14.4s, #12
+ ror w4, w4, #24
+ sri v1.4s, v12.4s, #20
+ ror w12, w12, #24
+ sri v5.4s, v13.4s, #20
+ add w10, w10, w4
+ sri v9.4s, v14.4s, #20
+ add w11, w11, w12
+ add v0.4s, v0.4s, v1.4s
+ eor w5, w5, w10
+ add v4.4s, v4.4s, v5.4s
+ eor w6, w6, w11
+ add v8.4s, v8.4s, v9.4s
+ str w11, [sp, #12]
+ eor v12.16b, v3.16b, v0.16b
+ ror w5, w5, #25
+ eor v13.16b, v7.16b, v4.16b
+ ror w6, w6, #25
+ eor v14.16b, v11.16b, v8.16b
+ str w4, [sp, #28]
+ shl v3.4s, v12.4s, #8
+ ldr w4, [sp, #4]
+ shl v7.4s, v13.4s, #8
+ add w2, w2, w7
+ shl v11.4s, v14.4s, #8
+ add w3, w3, w4
+ sri v3.4s, v12.4s, #24
+ ldr w11, [sp, #20]
+ sri v7.4s, v13.4s, #24
+ eor w11, w11, w2
+ sri v11.4s, v14.4s, #24
+ eor w14, w14, w3
+ add v2.4s, v2.4s, v3.4s
+ ror w11, w11, #16
+ add v6.4s, v6.4s, v7.4s
+ ror w14, w14, #16
+ add v10.4s, v10.4s, v11.4s
+ add w8, w8, w11
+ eor v12.16b, v1.16b, v2.16b
+ add w9, w9, w14
+ eor v13.16b, v5.16b, v6.16b
+ eor w7, w7, w8
+ eor v14.16b, v9.16b, v10.16b
+ eor w4, w4, w9
+ shl v1.4s, v12.4s, #7
+ ror w7, w7, #20
+ shl v5.4s, v13.4s, #7
+ ror w4, w4, #20
+ shl v9.4s, v14.4s, #7
+ str w6, [sp, #8]
+ sri v1.4s, v12.4s, #25
+ add w2, w2, w7
+ sri v5.4s, v13.4s, #25
+ add w3, w3, w4
+ sri v9.4s, v14.4s, #25
+ eor w11, w11, w2
+ EXT32(v3.16b, v3.16b, v3.16b, 1)
+ eor w14, w14, w3
+ EXT32(v7.16b, v7.16b, v7.16b, 1)
+ ror w11, w11, #24
+ EXT32(v11.16b, v11.16b, v11.16b, 1)
+ ror w14, w14, #24
+ EXT32(v1.16b, v1.16b, v1.16b, 3)
+ add w8, w8, w11
+ EXT32(v5.16b, v5.16b, v5.16b, 3)
+ add w9, w9, w14
+ EXT32(v9.16b, v9.16b, v9.16b, 3)
+ eor w7, w7, w8
+ EXT32(v2.16b, v2.16b, v2.16b, 2)
+ eor w4, w4, w9
+ EXT32(v6.16b, v6.16b, v6.16b, 2)
+ ror w7, w7, #25
+ EXT32(v10.16b, v10.16b, v10.16b, 2)
+ ror w4, w4, #25
+ bne .Lchacha_blocks_neon_rounds1
+ str w8, [sp, #0]
+ str w9, [sp, #4]
+ mov v12.16b, v24.16b
+ str w10, [sp, #8]
+ str w12, [sp, #16]
+ mov v13.16b, v25.16b
+ str w11, [sp, #20]
+ str w14, [sp, #24]
+ mov v14.16b, v26.16b
+ mov v15.16b, v27.16b
+ ldr x12, [sp, # STACK_SRC]
+ ldr x14, [sp, # STACK_DST]
+ add v0.4s, v0.4s, v12.4s
+ ldr w8, [sp, #(64 +0)]
+ add v4.4s, v4.4s, v12.4s
+ ldr w9, [sp, #(64 +4)]
+ add v8.4s, v8.4s, v12.4s
+ ldr w10, [sp, #(64 +8)]
+ add v1.4s, v1.4s, v13.4s
+ ldr w11, [sp, #(64 +12)]
+ add v5.4s, v5.4s, v13.4s
+ add w0, w0, w8
+ add v9.4s, v9.4s, v13.4s
+ add w1, w1, w9
+ add v2.4s, v2.4s, v14.4s
+ add w2, w2, w10
+ add v6.4s, v6.4s, v14.4s
+ ldr w8, [sp, #(64 +16)]
+ add v10.4s, v10.4s, v14.4s
+ add w3, w3, w11
+ eor v14.16b, v14.16b, v14.16b
+ ldr w9, [sp, #(64 +20)]
+ mov x11, #1
+ add w4, w4, w8
+ mov v14.2d[0], x11
+ ldr w10, [sp, #(64 +24)]
+ add v12.2d, v14.2d, v15.2d
+ add w5, w5, w9
+ add v13.2d, v14.2d, v12.2d
+ ldr w11, [sp, #(64 +28)]
+ add v14.2d, v14.2d, v13.2d
+ add w6, w6, w10
+ add v3.4s, v3.4s, v12.4s
+ tst x12, x12
+ add v7.4s, v7.4s, v13.4s
+ add w7, w7, w11
+ add v11.4s, v11.4s, v14.4s
+ beq .Lchacha_blocks_neon_nomessage11
+ LDMIA4(x12, w8, w9, w10, w11)
+ tst x12, x12
+ eor w0, w0, w8
+ eor w1, w1, w9
+ eor w2, w2, w10
+ ldr w8, [x12, #0]
+ eor w3, w3, w11
+ ldr w9, [x12, #4]
+ eor w4, w4, w8
+ ldr w10, [x12, #8]
+ eor w5, w5, w9
+ ldr w11, [x12, #12]
+ eor w6, w6, w10
+ add x12, x12, #16
+ eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage11:
+ mov x16, sp
+ STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+ tst x12, x12
+ LDMIA8(x16, w0, w1, w2, w3, w4, w5, w6, w7)
+ ldr w8, [sp, #(64 +32)]
+ ldr w9, [sp, #(64 +36)]
+ ldr w10, [sp, #(64 +40)]
+ ldr w11, [sp, #(64 +44)]
+ add w0, w0, w8
+ add w1, w1, w9
+ add w2, w2, w10
+ ldr w8, [sp, #(64 +48)]
+ add w3, w3, w11
+ ldr w9, [sp, #(64 +52)]
+ add w4, w4, w8
+ ldr w10, [sp, #(64 +56)]
+ add w5, w5, w9
+ ldr w11, [sp, #(64 +60)]
+ add w6, w6, w10
+ adds w8, w8, #4
+ add w7, w7, w11
+ adc w9, w9, wzr
+ str w8, [sp, #(64 +48)]
+ mov v27.4s[0], w8
+ tst x12, x12
+ str w9, [sp, #(64 +52)]
+ mov v27.4s[1], w9
+ beq .Lchacha_blocks_neon_nomessage12
+ LDMIA4(x12, w8, w9, w10, w11)
+ tst x12, x12
+ eor w0, w0, w8
+ eor w1, w1, w9
+ eor w2, w2, w10
+ ldr w8, [x12, #0]
+ eor w3, w3, w11
+ ldr w9, [x12, #4]
+ eor w4, w4, w8
+ ldr w10, [x12, #8]
+ eor w5, w5, w9
+ ldr w11, [x12, #12]
+ eor w6, w6, w10
+ add x12, x12, #16
+ eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage12:
+ STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+ tst x12, x12
+ beq .Lchacha_blocks_neon_nomessage13
+ ld1 {v12.4s-v15.4s}, [x12], #64
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v13.16b
+ eor v2.16b, v2.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+.Lchacha_blocks_neon_nomessage13:
+ st1 {v0.4s-v3.4s}, [x14], #64
+ beq .Lchacha_blocks_neon_nomessage14
+ ld1 {v12.4s-v15.4s}, [x12], #64
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+.Lchacha_blocks_neon_nomessage14:
+ st1 {v4.4s-v7.4s}, [x14], #64
+ beq .Lchacha_blocks_neon_nomessage15
+ ld1 {v12.4s-v15.4s}, [x12], #64
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+.Lchacha_blocks_neon_nomessage15:
+ st1 {v8.4s-v11.4s}, [x14], #64
+ str x12, [sp, # STACK_SRC]
+ str x14, [sp, # STACK_DST]
+ ldr x3, [sp, # STACK_BYTES]
+ sub x3, x3, #256
+ cmp x3, #256
+ str x3, [sp, # STACK_BYTES]
+ bhs .Lchacha_blocks_neon_mainloop1
+ tst x3, x3
+ beq .Lchacha_blocks_neon_done
+.Lchacha_blocks_neon_mainloop2:
+ ldr x3, [sp, # STACK_BYTES]
+ ldr x1, [sp, # STACK_SRC]
+ cmp x3, #64
+ bhs .Lchacha_blocks_neon_noswap1
+ add x4, sp, #128
+ mov x5, x4
+ tst x1, x1
+ beq .Lchacha_blocks_neon_nocopy1
+.Lchacha_blocks_neon_copyinput1:
+ subs x3, x3, #1
+ ldrb w0, [x1], #1
+ strb w0, [x4], #1
+ bne .Lchacha_blocks_neon_copyinput1
+ str x5, [sp, # STACK_SRC]
+.Lchacha_blocks_neon_nocopy1:
+ ldr x4, [sp, # STACK_DST]
+ str x5, [sp, # STACK_DST]
+ str x4, [sp, # STACK_DST_TMP]
+.Lchacha_blocks_neon_noswap1:
+ add x16, sp, #64
+ ldr w0, [sp, #44]
+ str w0, [sp, #0]
+ LDMIA16(x16, w0, w1, w2, w3, w4, w5, w6, w7,
+ w8, w9, w10, w11, w12, w13, w14, w15)
+ str w6, [sp, #8]
+ str w11, [sp, #12]
+ mov w11, w13
+ str w15, [sp, #28]
+.Lchacha_blocks_neon_rounds2:
+ ldr w6, [sp, #0]
+ add w0, w0, w4
+ add w1, w1, w5
+ eor w12, w12, w0
+ eor w11, w11, w1
+ ror w12, w12, #16
+ ror w11, w11, #16
+ subs w6, w6, #2
+ add w8, w8, w12
+ add w9, w9, w11
+ eor w4, w4, w8
+ eor w5, w5, w9
+ str w6, [sp, #0]
+ ror w4, w4, #20
+ ror w5, w5, #20
+ add w0, w0, w4
+ add w1, w1, w5
+ ldr w6, [sp, #8]
+ eor w12, w12, w0
+ eor w11, w11, w1
+ ror w12, w12, #24
+ ror w11, w11, #24
+ add w8, w8, w12
+ add w9, w9, w11
+ eor w4, w4, w8
+ eor w5, w5, w9
+ str w11, [sp, #20]
+ ror w4, w4, #25
+ ror w5, w5, #25
+ str w4, [sp, #4]
+ ldr w4, [sp, #28]
+ add w2, w2, w6
+ add w3, w3, w7
+ ldr w11, [sp, #12]
+ eor w14, w14, w2
+ eor w4, w4, w3
+ ror w14, w14, #16
+ ror w4, w4, #16
+ add w10, w10, w14
+ add w11, w11, w4
+ eor w6, w6, w10
+ eor w7, w7, w11
+ ror w6, w6, #20
+ ror w7, w7, #20
+ add w2, w2, w6
+ add w3, w3, w7
+ eor w14, w14, w2
+ eor w4, w4, w3
+ ror w14, w14, #24
+ ror w4, w4, #24
+ add w10, w10, w14
+ add w11, w11, w4
+ eor w6, w6, w10
+ eor w7, w7, w11
+ ror w6, w6, #25
+ ror w7, w7, #25
+ add w0, w0, w5
+ add w1, w1, w6
+ eor w4, w4, w0
+ eor w12, w12, w1
+ ror w4, w4, #16
+ ror w12, w12, #16
+ add w10, w10, w4
+ add w11, w11, w12
+ eor w5, w5, w10
+ eor w6, w6, w11
+ ror w5, w5, #20
+ ror w6, w6, #20
+ add w0, w0, w5
+ add w1, w1, w6
+ eor w4, w4, w0
+ eor w12, w12, w1
+ ror w4, w4, #24
+ ror w12, w12, #24
+ add w10, w10, w4
+ add w11, w11, w12
+ eor w5, w5, w10
+ eor w6, w6, w11
+ str w11, [sp, #12]
+ ror w5, w5, #25
+ ror w6, w6, #25
+ str w4, [sp, #28]
+ ldr w4, [sp, #4]
+ add w2, w2, w7
+ add w3, w3, w4
+ ldr w11, [sp, #20]
+ eor w11, w11, w2
+ eor w14, w14, w3
+ ror w11, w11, #16
+ ror w14, w14, #16
+ add w8, w8, w11
+ add w9, w9, w14
+ eor w7, w7, w8
+ eor w4, w4, w9
+ ror w7, w7, #20
+ ror w4, w4, #20
+ str w6, [sp, #8]
+ add w2, w2, w7
+ add w3, w3, w4
+ eor w11, w11, w2
+ eor w14, w14, w3
+ ror w11, w11, #24
+ ror w14, w14, #24
+ add w8, w8, w11
+ add w9, w9, w14
+ eor w7, w7, w8
+ eor w4, w4, w9
+ ror w7, w7, #25
+ ror w4, w4, #25
+ bne .Lchacha_blocks_neon_rounds2
+ str w8, [sp, #0]
+ str w9, [sp, #4]
+ str w10, [sp, #8]
+ str w12, [sp, #16]
+ str w11, [sp, #20]
+ str w14, [sp, #24]
+ ldr x12, [sp, # STACK_SRC]
+ ldr x14, [sp, # STACK_DST]
+ ldr w8, [sp, #(64 +0)]
+ ldr w9, [sp, #(64 +4)]
+ ldr w10, [sp, #(64 +8)]
+ ldr w11, [sp, #(64 +12)]
+ add w0, w0, w8
+ add w1, w1, w9
+ add w2, w2, w10
+ ldr w8, [sp, #(64 +16)]
+ add w3, w3, w11
+ ldr w9, [sp, #(64 +20)]
+ add w4, w4, w8
+ ldr w10, [sp, #(64 +24)]
+ add w5, w5, w9
+ ldr w11, [sp, #(64 +28)]
+ add w6, w6, w10
+ tst x12, x12
+ add w7, w7, w11
+ beq .Lchacha_blocks_neon_nomessage21
+ LDMIA4(x12, w8, w9, w10, w11)
+ tst x12, x12
+ eor w0, w0, w8
+ eor w1, w1, w9
+ eor w2, w2, w10
+ ldr w8, [x12, #0]
+ eor w3, w3, w11
+ ldr w9, [x12, #4]
+ eor w4, w4, w8
+ ldr w10, [x12, #8]
+ eor w5, w5, w9
+ ldr w11, [x12, #12]
+ eor w6, w6, w10
+ add x12, x12, #16
+ eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage21:
+ mov x16, sp
+ STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+ LDMIA8(x16, w0, w1, w2, w3, w4, w5, w6, w7)
+ ldr w8, [sp, #(64 +32)]
+ ldr w9, [sp, #(64 +36)]
+ ldr w10, [sp, #(64 +40)]
+ ldr w11, [sp, #(64 +44)]
+ add w0, w0, w8
+ add w1, w1, w9
+ add w2, w2, w10
+ ldr w8, [sp, #(64 +48)]
+ add w3, w3, w11
+ ldr w9, [sp, #(64 +52)]
+ add w4, w4, w8
+ ldr w10, [sp, #(64 +56)]
+ add w5, w5, w9
+ ldr w11, [sp, #(64 +60)]
+ add w6, w6, w10
+ adds w8, w8, #1
+ add w7, w7, w11
+ adc w9, w9, wzr
+ str w8, [sp, #(64 +48)]
+ tst x12, x12
+ str w9, [sp, #(64 +52)]
+ beq .Lchacha_blocks_neon_nomessage22
+ LDMIA4(x12, w8, w9, w10, w11)
+ tst x12, x12
+ eor w0, w0, w8
+ eor w1, w1, w9
+ eor w2, w2, w10
+ ldr w8, [x12, #0]
+ eor w3, w3, w11
+ ldr w9, [x12, #4]
+ eor w4, w4, w8
+ ldr w10, [x12, #8]
+ eor w5, w5, w9
+ ldr w11, [x12, #12]
+ eor w6, w6, w10
+ add x12, x12, #16
+ eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage22:
+ STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+ str x12, [sp, # STACK_SRC]
+ str x14, [sp, # STACK_DST]
+ ldr x3, [sp, # STACK_BYTES]
+ cmp x3, #64
+ sub x4, x3, #64
+ str x4, [sp, # STACK_BYTES]
+ bhi .Lchacha_blocks_neon_mainloop2
+ cmp x3, #64
+ beq .Lchacha_blocks_neon_nocopy2
+ ldr x1, [sp, # STACK_DST_TMP]
+ sub x14, x14, #64
+.Lchacha_blocks_neon_copyinput2:
+ subs x3, x3, #1
+ ldrb w0, [x14], #1
+ strb w0, [x1], #1
+ bne .Lchacha_blocks_neon_copyinput2
+.Lchacha_blocks_neon_nocopy2:
+.Lchacha_blocks_neon_done:
+ ldr x16, [sp, # STACK_SP]
+ ldr x7, [sp, # STACK_STATE]
+ ldr w8, [sp, #(64 +48)]
+ ldr w9, [sp, #(64 +52)]
+ str w8, [x7, #(48 + 0)]
+ str w9, [x7, #(48 + 4)]
+ sub x0, sp, #8
+ mov v8.16b, v16.16b
+ mov v9.16b, v17.16b
+ mov v10.16b, v18.16b
+ mov v11.16b, v19.16b
+ mov sp, x16
+ mov v12.16b, v20.16b
+ mov v13.16b, v21.16b
+ mov v14.16b, v22.16b
+ mov v15.16b, v23.16b
+ sub x0, sp, x0
+ eor v0.16b, v0.16b, v0.16b
+ eor v1.16b, v1.16b, v1.16b
+ eor v2.16b, v2.16b, v2.16b
+ eor v3.16b, v3.16b, v3.16b
+ eor v4.16b, v4.16b, v4.16b
+ eor v5.16b, v5.16b, v5.16b
+ eor v6.16b, v6.16b, v6.16b
+ eor v7.16b, v7.16b, v7.16b
+ ret
+.Lchacha_blocks_neon_nobytes:
+ mov x0, xzr;
+ ret
+.ltorg
+.size _gcry_chacha20_aarch64_blocks,.-_gcry_chacha20_aarch64_blocks;
+
+#endif
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 613fa82a..a11986c1 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -81,6 +81,16 @@
# endif
#endif /*ENABLE_NEON_SUPPORT*/
+/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
+ * code. */
+#undef USE_AARCH64_SIMD
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(__AARCH64EL__) \
+ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
+# define USE_AARCH64_SIMD 1
+# endif
+#endif
struct CHACHA20_context_s;
@@ -144,6 +154,14 @@ unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in,
#endif /* USE_NEON */
+#ifdef USE_AARCH64_SIMD
+
+unsigned int _gcry_chacha20_aarch64_blocks(u32 *state, const byte *in,
+ byte *out,
+ size_t bytes) ASM_FUNC_ABI;
+
+#endif /* USE_AARCH64_SIMD */
+
static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
static const char *selftest (void);
@@ -406,6 +424,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
if (features & HWF_ARM_NEON)
ctx->blocks = _gcry_chacha20_armv7_neon_blocks;
#endif
+#ifdef USE_AARCH64_SIMD
+ if (features & HWF_ARM_NEON)
+ ctx->blocks = _gcry_chacha20_aarch64_blocks;
+#endif
(void)features;
diff --git a/configure.ac b/configure.ac
index 66e7cd67..1e6ac9d7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2243,6 +2243,10 @@ if test "$found" = "1" ; then
GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo"
GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo"
;;
+ aarch64-*-*)
+ # Build with the assembly implementation
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-aarch64.lo"
+ ;;
esac
if test x"$neonsupport" = xyes ; then
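
For completeness, a minimal sketch (not part of the patch) of how the new
code path is exercised through the public API: on AArch64 with NEON,
chacha20_do_setkey() sees HWF_ARM_NEON and installs
_gcry_chacha20_aarch64_blocks, so an ordinary stream-cipher call picks it
up transparently.  Key and nonce sizes below follow RFC 7539; error
handling is omitted.

  #include <gcrypt.h>

  /* Assumes libgcrypt has already been initialized via
   * gcry_check_version().  Encrypts BUF of LEN bytes in place. */
  static void
  chacha20_encrypt_inplace (const unsigned char key[32],
                            const unsigned char nonce[12],
                            unsigned char *buf, size_t len)
  {
    gcry_cipher_hd_t hd;

    gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_STREAM, 0);
    gcry_cipher_setkey (hd, key, 32);
    gcry_cipher_setiv (hd, nonce, 12);
    /* In-place encryption; the selected blocks function does the work. */
    gcry_cipher_encrypt (hd, buf, len, NULL, 0);
    gcry_cipher_close (hd);
  }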