[PATCH 3/3] OCB ARM CE: Move ocb_get_l handling to assembly part
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Dec 5 15:14:34 CET 2016
* cipher/rijndael-armv8-aarch32-ce.S: Add OCB 'L_{ntz(i)}' calculation.
* cipher/rijndael-armv8-aarch64-ce.S: Ditto.
* cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce)
(_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce)
(ocb_crypt_fn_t): Updated arguments.
(_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_ocb_auth): Remove
'ocb_get_l' handling and splitting input to 32 block chunks, instead
pass full buffers to assembly.
--
Performance on Cortex-A53 (AArch32):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
OCB enc | 1.63 ns/B 583.8 MiB/s 1.88 c/B
OCB dec | 1.67 ns/B 572.1 MiB/s 1.92 c/B
OCB auth | 1.33 ns/B 717.1 MiB/s 1.53 c/B
After (~12% faster):
AES | nanosecs/byte mebibytes/sec cycles/byte
OCB enc | 1.47 ns/B 650.2 MiB/s 1.69 c/B
OCB dec | 1.48 ns/B 644.5 MiB/s 1.70 c/B
OCB auth | 1.19 ns/B 798.2 MiB/s 1.38 c/B
Performance on Cortex-A53 (AArch64):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
OCB enc | 1.29 ns/B 738.5 MiB/s 1.49 c/B
OCB dec | 1.32 ns/B 723.5 MiB/s 1.52 c/B
OCB auth | 1.15 ns/B 827.0 MiB/s 1.33 c/B
After (~8% faster):
AES | nanosecs/byte mebibytes/sec cycles/byte
OCB enc | 1.21 ns/B 789.1 MiB/s 1.39 c/B
OCB dec | 1.21 ns/B 789.2 MiB/s 1.39 c/B
OCB auth | 1.10 ns/B 867.0 MiB/s 1.27 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-armv8-aarch32-ce.S | 98 ++++++++++++++++++++++++---
cipher/rijndael-armv8-aarch64-ce.S | 125 ++++++++++++++++++++++++-----------
cipher/rijndael-armv8-ce.c | 129 ++++--------------------------------
3 files changed, 188 insertions(+), 164 deletions(-)
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index bf68f20..f375f67 100644
--- a/cipher/rijndael-armv8-aarch32-ce.S
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -1021,9 +1021,10 @@ _gcry_aes_ctr_enc_armv8_ce:
* const unsigned char *inbuf,
* unsigned char *offset,
* unsigned char *checksum,
- * void **Ls,
+ * unsigned char *L_table,
* size_t nblocks,
- * unsigned int nrounds);
+ * unsigned int nrounds,
+ * unsigned int blkn);
*/
.align 3
@@ -1039,6 +1040,7 @@ _gcry_aes_ocb_enc_armv8_ce:
* %st+4: Ls => r5
* %st+8: nblocks => r6 (0 < nblocks <= 32)
* %st+12: nrounds => r7
+ * %st+16: blkn => lr
*/
vpush {q4-q7}
@@ -1047,6 +1049,7 @@ _gcry_aes_ocb_enc_armv8_ce:
ldr r4, [sp, #(104+0)]
ldr r5, [sp, #(104+4)]
ldr r6, [sp, #(104+8)]
+ ldr lr, [sp, #(104+16)]
cmp r7, #12
vld1.8 {q0}, [r3] /* load offset */
@@ -1059,6 +1062,7 @@ _gcry_aes_ocb_enc_armv8_ce:
#define OCB_ENC(bits, ...) \
.Locb_enc_entry_##bits: \
cmp r6, #4; \
+ add lr, #1; \
blo .Locb_enc_loop_##bits; \
\
.Locb_enc_loop4_##bits: \
@@ -1067,7 +1071,23 @@ _gcry_aes_ocb_enc_armv8_ce:
/* Checksum_i = Checksum_{i-1} xor P_i */ \
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
\
- ldm r5!, {r8, r9, r10, r11}; \
+ add r9, lr, #1; \
+ add r10, lr, #2; \
+ add r11, lr, #3; \
+ rbit r8, lr; \
+ add lr, lr, #4; \
+ rbit r9, r9; \
+ rbit r10, r10; \
+ rbit r11, r11; \
+ clz r8, r8; /* ntz(i+0) */ \
+ clz r9, r9; /* ntz(i+1) */ \
+ clz r10, r10; /* ntz(i+2) */ \
+ clz r11, r11; /* ntz(i+3) */ \
+ add r8, r5, r8, lsl #4; \
+ add r9, r5, r9, lsl #4; \
+ add r10, r5, r10, lsl #4; \
+ add r11, r5, r11, lsl #4; \
+ \
sub r6, #4; \
\
vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
@@ -1120,7 +1140,11 @@ _gcry_aes_ocb_enc_armv8_ce:
/* Checksum_i = Checksum_{i-1} xor P_i */ \
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
\
- ldr r8, [r5], #4; \
+ rbit r8, lr; \
+ add lr, #1; \
+ clz r8, r8; /* ntz(i) */ \
+ add r8, r5, r8, lsl #4; \
+ \
vld1.8 {q1}, [r2]!; /* load plaintext */ \
vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
vld1.8 {q3}, [r4]; /* load checksum */ \
@@ -1171,9 +1195,10 @@ _gcry_aes_ocb_enc_armv8_ce:
* const unsigned char *inbuf,
* unsigned char *offset,
* unsigned char *checksum,
- * void **Ls,
+ * unsigned char *L_table,
* size_t nblocks,
- * unsigned int nrounds);
+ * unsigned int nrounds,
+ * unsigned int blkn);
*/
.align 3
@@ -1189,6 +1214,7 @@ _gcry_aes_ocb_dec_armv8_ce:
* %st+4: Ls => r5
* %st+8: nblocks => r6 (0 < nblocks <= 32)
* %st+12: nrounds => r7
+ * %st+16: blkn => lr
*/
vpush {q4-q7}
@@ -1197,6 +1223,7 @@ _gcry_aes_ocb_dec_armv8_ce:
ldr r4, [sp, #(104+0)]
ldr r5, [sp, #(104+4)]
ldr r6, [sp, #(104+8)]
+ ldr lr, [sp, #(104+16)]
cmp r7, #12
vld1.8 {q0}, [r3] /* load offset */
@@ -1209,6 +1236,7 @@ _gcry_aes_ocb_dec_armv8_ce:
#define OCB_DEC(bits, ...) \
.Locb_dec_entry_##bits: \
cmp r6, #4; \
+ add lr, #1; \
blo .Locb_dec_loop_##bits; \
\
.Locb_dec_loop4_##bits: \
@@ -1217,7 +1245,23 @@ _gcry_aes_ocb_dec_armv8_ce:
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
/* Checksum_i = Checksum_{i-1} xor P_i */ \
\
- ldm r5!, {r8, r9, r10, r11}; \
+ add r9, lr, #1; \
+ add r10, lr, #2; \
+ add r11, lr, #3; \
+ rbit r8, lr; \
+ add lr, lr, #4; \
+ rbit r9, r9; \
+ rbit r10, r10; \
+ rbit r11, r11; \
+ clz r8, r8; /* ntz(i+0) */ \
+ clz r9, r9; /* ntz(i+1) */ \
+ clz r10, r10; /* ntz(i+2) */ \
+ clz r11, r11; /* ntz(i+3) */ \
+ add r8, r5, r8, lsl #4; \
+ add r9, r5, r9, lsl #4; \
+ add r10, r5, r10, lsl #4; \
+ add r11, r5, r11, lsl #4; \
+ \
sub r6, #4; \
\
vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
@@ -1270,7 +1314,11 @@ _gcry_aes_ocb_dec_armv8_ce:
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
/* Checksum_i = Checksum_{i-1} xor P_i */ \
\
- ldr r8, [r5], #4; \
+ rbit r8, lr; \
+ add lr, #1; \
+ clz r8, r8; /* ntz(i) */ \
+ add r8, r5, r8, lsl #4; \
+ \
vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
vld1.8 {q1}, [r2]!; /* load ciphertext */ \
subs r6, #1; \
@@ -1320,9 +1368,10 @@ _gcry_aes_ocb_dec_armv8_ce:
* const unsigned char *abuf,
* unsigned char *offset,
* unsigned char *checksum,
- * void **Ls,
+ * unsigned char *L_table,
* size_t nblocks,
- * unsigned int nrounds);
+ * unsigned int nrounds,
+ * unsigned int blkn);
*/
.align 3
@@ -1337,6 +1386,7 @@ _gcry_aes_ocb_auth_armv8_ce:
* %st+0: Ls => r5
* %st+4: nblocks => r6 (0 < nblocks <= 32)
* %st+8: nrounds => r7
+ * %st+12: blkn => lr
*/
vpush {q4-q7}
@@ -1344,6 +1394,7 @@ _gcry_aes_ocb_auth_armv8_ce:
ldr r7, [sp, #(104+8)]
ldr r5, [sp, #(104+0)]
ldr r6, [sp, #(104+4)]
+ ldr lr, [sp, #(104+12)]
cmp r7, #12
vld1.8 {q0}, [r2] /* load offset */
@@ -1356,6 +1407,7 @@ _gcry_aes_ocb_auth_armv8_ce:
#define OCB_AUTH(bits, ...) \
.Locb_auth_entry_##bits: \
cmp r6, #4; \
+ add lr, #1; \
blo .Locb_auth_loop_##bits; \
\
.Locb_auth_loop4_##bits: \
@@ -1363,7 +1415,23 @@ _gcry_aes_ocb_auth_armv8_ce:
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
\
- ldm r5!, {r8, r9, r10, r11}; \
+ add r9, lr, #1; \
+ add r10, lr, #2; \
+ add r11, lr, #3; \
+ rbit r8, lr; \
+ add lr, lr, #4; \
+ rbit r9, r9; \
+ rbit r10, r10; \
+ rbit r11, r11; \
+ clz r8, r8; /* ntz(i+0) */ \
+ clz r9, r9; /* ntz(i+1) */ \
+ clz r10, r10; /* ntz(i+2) */ \
+ clz r11, r11; /* ntz(i+3) */ \
+ add r8, r5, r8, lsl #4; \
+ add r9, r5, r9, lsl #4; \
+ add r10, r5, r10, lsl #4; \
+ add r11, r5, r11, lsl #4; \
+ \
sub r6, #4; \
\
vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
@@ -1401,8 +1469,12 @@ _gcry_aes_ocb_auth_armv8_ce:
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
\
- ldr r8, [r5], #4; \
- vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
+ rbit r8, lr; \
+ add lr, #1; \
+ clz r8, r8; /* ntz(i) */ \
+ add r8, r5, r8, lsl #4; \
+ \
+ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
vld1.8 {q1}, [r1]!; /* load aadtext */ \
subs r6, #1; \
veor q0, q0, q2; \
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
index 21d0aec..1ebb363 100644
--- a/cipher/rijndael-armv8-aarch64-ce.S
+++ b/cipher/rijndael-armv8-aarch64-ce.S
@@ -28,23 +28,6 @@
.text
-#if (SIZEOF_VOID_P == 4)
- #define ptr8 w8
- #define ptr9 w9
- #define ptr10 w10
- #define ptr11 w11
- #define ptr_sz 4
-#elif (SIZEOF_VOID_P == 8)
- #define ptr8 x8
- #define ptr9 x9
- #define ptr10 x10
- #define ptr11 x11
- #define ptr_sz 8
-#else
- #error "missing SIZEOF_VOID_P"
-#endif
-
-
#define GET_DATA_POINTER(reg, name) \
adrp reg, :got:name ; \
ldr reg, [reg, #:got_lo12:name] ;
@@ -855,9 +838,10 @@ _gcry_aes_cfb_dec_armv8_ce:
* const unsigned char *inbuf,
* unsigned char *offset,
* unsigned char *checksum,
- * void **Ls,
+ * unsigned char *L_table,
* size_t nblocks,
- * unsigned int nrounds);
+ * unsigned int nrounds,
+ * unsigned int blkn);
*/
.align 3
@@ -870,11 +854,13 @@ _gcry_aes_ocb_enc_armv8_ce:
* x2: inbuf
* x3: offset
* x4: checksum
- * x5: Ls
+ * x5: Ltable
* x6: nblocks (0 < nblocks <= 32)
* w7: nrounds
+ * %st+0: blkn => w12
*/
+ ldr w12, [sp]
ld1 {v0.16b}, [x3] /* load offset */
ld1 {v16.16b}, [x4] /* load checksum */
@@ -886,6 +872,7 @@ _gcry_aes_ocb_enc_armv8_ce:
#define OCB_ENC(bits, ...) \
.Locb_enc_entry_##bits: \
cmp x6, #4; \
+ add x12, x12, #1; \
b.lo .Locb_enc_loop_##bits; \
\
.Locb_enc_loop4_##bits: \
@@ -894,10 +881,24 @@ _gcry_aes_ocb_enc_armv8_ce:
/* Checksum_i = Checksum_{i-1} xor P_i */ \
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
\
- ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+ add w9, w12, #1; \
+ add w10, w12, #2; \
+ add w11, w12, #3; \
+ rbit w8, w12; \
+ add w12, w12, #4; \
+ rbit w9, w9; \
+ rbit w10, w10; \
+ rbit w11, w11; \
+ clz w8, w8; /* ntz(i+0) */ \
+ clz w9, w9; /* ntz(i+1) */ \
+ clz w10, w10; /* ntz(i+2) */ \
+ clz w11, w11; /* ntz(i+3) */ \
+ add x8, x5, x8, lsl #4; \
+ ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \
+ add x9, x5, x9, lsl #4; \
+ add x10, x5, x10, lsl #4; \
+ add x11, x5, x11, lsl #4; \
\
- ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \
- ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
sub x6, x6, #4; \
\
ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
@@ -940,7 +941,11 @@ _gcry_aes_ocb_enc_armv8_ce:
/* Checksum_i = Checksum_{i-1} xor P_i */ \
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
\
- ldr ptr8, [x5], #(ptr_sz); \
+ rbit x8, x12; \
+ add x12, x12, #1; \
+ clz x8, x8; /* ntz(i) */ \
+ add x8, x5, x8, lsl #4; \
+ \
ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
sub x6, x6, #1; \
@@ -983,9 +988,10 @@ _gcry_aes_ocb_enc_armv8_ce:
* const unsigned char *inbuf,
* unsigned char *offset,
* unsigned char *checksum,
- * void **Ls,
+ * unsigned char *L_table,
* size_t nblocks,
- * unsigned int nrounds);
+ * unsigned int nrounds,
+ * unsigned int blkn);
*/
.align 3
@@ -998,11 +1004,13 @@ _gcry_aes_ocb_dec_armv8_ce:
* x2: inbuf
* x3: offset
* x4: checksum
- * x5: Ls
+ * x5: Ltable
* x6: nblocks (0 < nblocks <= 32)
* w7: nrounds
+ * %st+0: blkn => w12
*/
+ ldr w12, [sp]
ld1 {v0.16b}, [x3] /* load offset */
ld1 {v16.16b}, [x4] /* load checksum */
@@ -1014,6 +1022,7 @@ _gcry_aes_ocb_dec_armv8_ce:
#define OCB_DEC(bits) \
.Locb_dec_entry_##bits: \
cmp x6, #4; \
+ add w12, w12, #1; \
b.lo .Locb_dec_loop_##bits; \
\
.Locb_dec_loop4_##bits: \
@@ -1022,10 +1031,24 @@ _gcry_aes_ocb_dec_armv8_ce:
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
/* Checksum_i = Checksum_{i-1} xor P_i */ \
\
- ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+ add w9, w12, #1; \
+ add w10, w12, #2; \
+ add w11, w12, #3; \
+ rbit w8, w12; \
+ add w12, w12, #4; \
+ rbit w9, w9; \
+ rbit w10, w10; \
+ rbit w11, w11; \
+ clz w8, w8; /* ntz(i+0) */ \
+ clz w9, w9; /* ntz(i+1) */ \
+ clz w10, w10; /* ntz(i+2) */ \
+ clz w11, w11; /* ntz(i+3) */ \
+ add x8, x5, x8, lsl #4; \
+ ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \
+ add x9, x5, x9, lsl #4; \
+ add x10, x5, x10, lsl #4; \
+ add x11, x5, x11, lsl #4; \
\
- ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \
- ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
sub x6, x6, #4; \
\
ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
@@ -1068,7 +1091,11 @@ _gcry_aes_ocb_dec_armv8_ce:
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
/* Checksum_i = Checksum_{i-1} xor P_i */ \
\
- ldr ptr8, [x5], #(ptr_sz); \
+ rbit w8, w12; \
+ add w12, w12, #1; \
+ clz w8, w8; /* ntz(i) */ \
+ add x8, x5, x8, lsl #4; \
+ \
ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
sub x6, x6, #1; \
@@ -1110,9 +1137,10 @@ _gcry_aes_ocb_dec_armv8_ce:
* const unsigned char *abuf,
* unsigned char *offset,
* unsigned char *checksum,
- * void **Ls,
+ * unsigned char *L_table,
* size_t nblocks,
- * unsigned int nrounds);
+ * unsigned int nrounds,
+ * unsigned int blkn);
*/
.align 3
@@ -1124,10 +1152,12 @@ _gcry_aes_ocb_auth_armv8_ce:
* x1: abuf
* x2: offset => x3
* x3: checksum => x4
- * x4: Ls => x5
+ * x4: Ltable => x5
* x5: nblocks => x6 (0 < nblocks <= 32)
* w6: nrounds => w7
+ * w7: blkn => w12
*/
+ mov x12, x7
mov x7, x6
mov x6, x5
mov x5, x4
@@ -1145,6 +1175,7 @@ _gcry_aes_ocb_auth_armv8_ce:
#define OCB_AUTH(bits) \
.Locb_auth_entry_##bits: \
cmp x6, #4; \
+ add w12, w12, #1; \
b.lo .Locb_auth_loop_##bits; \
\
.Locb_auth_loop4_##bits: \
@@ -1152,10 +1183,24 @@ _gcry_aes_ocb_auth_armv8_ce:
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
\
- ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+ add w9, w12, #1; \
+ add w10, w12, #2; \
+ add w11, w12, #3; \
+ rbit w8, w12; \
+ add w12, w12, #4; \
+ rbit w9, w9; \
+ rbit w10, w10; \
+ rbit w11, w11; \
+ clz w8, w8; /* ntz(i+0) */ \
+ clz w9, w9; /* ntz(i+1) */ \
+ clz w10, w10; /* ntz(i+2) */ \
+ clz w11, w11; /* ntz(i+3) */ \
+ add x8, x5, x8, lsl #4; \
+ ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \
+ add x9, x5, x9, lsl #4; \
+ add x10, x5, x10, lsl #4; \
+ add x11, x5, x11, lsl #4; \
\
- ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \
- ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
sub x6, x6, #4; \
\
ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
@@ -1192,7 +1237,11 @@ _gcry_aes_ocb_auth_armv8_ce:
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
\
- ldr ptr8, [x5], #(ptr_sz); \
+ rbit w8, w12; \
+ add w12, w12, #1; \
+ clz w8, w8; /* ntz(i) */ \
+ add x8, x5, x8, lsl #4; \
+ \
ld1 {v1.16b}, [x1], #16; /* load aadtext */ \
ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
sub x6, x6, #1; \
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
index 1bf74da..334cf68 100644
--- a/cipher/rijndael-armv8-ce.c
+++ b/cipher/rijndael-armv8-ce.c
@@ -80,30 +80,33 @@ extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
const unsigned char *inbuf,
unsigned char *offset,
unsigned char *checksum,
- void **Ls,
+ unsigned char *L_table,
size_t nblocks,
- unsigned int nrounds);
+ unsigned int nrounds,
+ unsigned int blkn);
extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
unsigned char *outbuf,
const unsigned char *inbuf,
unsigned char *offset,
unsigned char *checksum,
- void **Ls,
+ unsigned char *L_table,
size_t nblocks,
- unsigned int nrounds);
+ unsigned int nrounds,
+ unsigned int blkn);
extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
const unsigned char *abuf,
unsigned char *offset,
unsigned char *checksum,
- void **Ls,
+ unsigned char *L_table,
size_t nblocks,
- unsigned int nrounds);
+ unsigned int nrounds,
+ unsigned int blkn);
typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
const unsigned char *inbuf,
unsigned char *offset, unsigned char *checksum,
- void **Ls, size_t nblocks,
- unsigned int nrounds);
+ unsigned char *L_table, size_t nblocks,
+ unsigned int nrounds, unsigned int blkn);
void
_gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key)
@@ -334,62 +337,11 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const unsigned char *inbuf = inbuf_arg;
unsigned int nrounds = ctx->rounds;
u64 blkn = c->u_mode.ocb.data_nblocks;
- u64 blkn_offs = blkn - blkn % 32;
- unsigned int n = 32 - blkn % 32;
- void *Ls[32];
- void **l;
- size_t i;
c->u_mode.ocb.data_nblocks = blkn + nblocks;
- if (nblocks >= 32)
- {
- for (i = 0; i < 32; i += 8)
- {
- Ls[(i + 0 + n) % 32] = (void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 32] = (void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 32] = (void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 32] = (void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 32] = (void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 32] = (void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 32] = (void *)c->u_mode.ocb.L[0];
- }
-
- Ls[(7 + n) % 32] = (void *)c->u_mode.ocb.L[3];
- Ls[(15 + n) % 32] = (void *)c->u_mode.ocb.L[4];
- Ls[(23 + n) % 32] = (void *)c->u_mode.ocb.L[3];
- l = &Ls[(31 + n) % 32];
-
- /* Process data in 32 block chunks. */
- while (nblocks >= 32)
- {
- blkn_offs += 32;
- *l = (void *)ocb_get_l(c, blkn_offs);
-
- crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls, 32,
- nrounds);
-
- nblocks -= 32;
- outbuf += 32 * 16;
- inbuf += 32 * 16;
- }
-
- if (nblocks && l < &Ls[nblocks])
- {
- *l = (void *)ocb_get_l(c, 32 + blkn_offs);
- }
- }
- else
- {
- for (i = 0; i < nblocks; i++)
- Ls[i] = (void *)ocb_get_l(c, ++blkn);
- }
-
- if (nblocks)
- {
- crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls, nblocks,
- nrounds);
- }
+ crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
+ c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn);
}
void
@@ -401,61 +353,12 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
const unsigned char *abuf = abuf_arg;
unsigned int nrounds = ctx->rounds;
u64 blkn = c->u_mode.ocb.aad_nblocks;
- u64 blkn_offs = blkn - blkn % 32;
- unsigned int n = 32 - blkn % 32;
- void *Ls[32];
- void **l;
- size_t i;
c->u_mode.ocb.aad_nblocks = blkn + nblocks;
- if (nblocks >= 32)
- {
- for (i = 0; i < 32; i += 8)
- {
- Ls[(i + 0 + n) % 32] = (void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 32] = (void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 32] = (void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 32] = (void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 32] = (void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 32] = (void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 32] = (void *)c->u_mode.ocb.L[0];
- }
-
- Ls[(7 + n) % 32] = (void *)c->u_mode.ocb.L[3];
- Ls[(15 + n) % 32] = (void *)c->u_mode.ocb.L[4];
- Ls[(23 + n) % 32] = (void *)c->u_mode.ocb.L[3];
- l = &Ls[(31 + n) % 32];
-
- /* Process data in 32 block chunks. */
- while (nblocks >= 32)
- {
- blkn_offs += 32;
- *l = (void *)ocb_get_l(c, blkn_offs);
-
- _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls, 32, nrounds);
-
- nblocks -= 32;
- abuf += 32 * 16;
- }
-
- if (nblocks && l < &Ls[nblocks])
- {
- *l = (void *)ocb_get_l(c, 32 + blkn_offs);
- }
- }
- else
- {
- for (i = 0; i < nblocks; i++)
- Ls[i] = (void *)ocb_get_l(c, ++blkn);
- }
-
- if (nblocks)
- {
- _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls, nblocks, nrounds);
- }
+ _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
+ nblocks, nrounds, (unsigned int)blkn);
}
#endif /* USE_ARM_CE */
More information about the Gcrypt-devel
mailing list