[PATCH] twofish: add ARMv6 assembly implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Mon Oct 21 15:21:21 CEST 2013


* cipher/Makefile.am: Add 'twofish-armv6.S'.
* cipher/twofish-armv6.S: New.
* cipher/twofish.c (USE_ARMV6_ASM): New macro.
[USE_ARMV6_ASM] (_gcry_twofish_armv6_encrypt_block)
(_gcry_twofish_armv6_decrypt_block): New prototypes.
[USE_ARMV6_ASM] (twofish_encrypt, twofish_decrypt): Add.
[USE_AMD64_ASM] (do_twofish_encrypt, do_twofish_decrypt): Remove.
(_gcry_twofish_ctr_enc, _gcry_twofish_cfb_dec): Use 'twofish_encrypt'
instead of 'do_twofish_encrypt'.
(_gcry_twofish_cbc_dec): Use 'twofish_decrypt' instead of
'do_twofish_decrypt'.
* configure.ac [arm]: Add 'twofish-armv6.lo'.
--

Add optimized ARMv6 assembly implementation for Twofish. The implementation is
tuned for Cortex-A8, and unaligned access handling is done in the assembly code.
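
For reference, the byte-wise fallback used when the CPU lacks unaligned word
access (i.e. when __ARM_FEATURE_UNALIGNED is not defined) is equivalent to the
following C sketch of the ldr_unaligned_le/str_unaligned_le macros;
load_le32/store_le32 are illustrative names only, not functions in the tree:

  #include <stdint.h>

  /* Assemble a little-endian 32-bit word from four byte loads, so the
   * source pointer may have any alignment (cf. ldr_unaligned_le). */
  static uint32_t load_le32(const unsigned char *p)
  {
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8)
           | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
  }

  /* Store a 32-bit word as four little-endian byte stores
   * (cf. str_unaligned_le). */
  static void store_le32(unsigned char *p, uint32_t x)
  {
    p[0] = x & 0xff;
    p[1] = (x >> 8) & 0xff;
    p[2] = (x >> 16) & 0xff;
    p[3] = (x >> 24) & 0xff;
  }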

For now, this is only enabled on little-endian systems, as big-endian
correctness has not been tested yet.
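
Each encrypt_round in the assembly below computes the standard Twofish round
function, roughly the following C (a sketch only: twofish_round, s[][] and k[]
are illustrative names for the key-dependent S-box tables and round subkeys
held in TWOFISH_context):

  #include <stdint.h>

  #define rol32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
  #define ror32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

  /* One Twofish encryption round 'n': x = g(a), y = g(rol(b, 8)),
   * then a pseudo-Hadamard transform plus the two round subkeys. */
  static void twofish_round(const uint32_t s[4][256], const uint32_t *k,
                            int n, uint32_t a, uint32_t b,
                            uint32_t *c, uint32_t *d)
  {
    uint32_t x = s[0][a & 0xff] ^ s[1][(a >> 8) & 0xff]
               ^ s[2][(a >> 16) & 0xff] ^ s[3][(a >> 24) & 0xff];
    uint32_t y = s[1][b & 0xff] ^ s[2][(b >> 8) & 0xff]
               ^ s[3][(b >> 16) & 0xff] ^ s[0][(b >> 24) & 0xff];
    *c = ror32(*c ^ (x + y + k[2 * n]), 1);
    *d = rol32(*d, 1) ^ (x + 2 * y + k[2 * n + 1]);
  }

In the assembly the 1-bit rotate of 'c' is applied lazily at the start of the
next round (the ror_*/adj_* macro arguments), and the times-4 scaling of the
byte indices for the table loads is folded into the barrel shifter by using
RMASK = 0xff << 2.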

Old (gcc-4.8) vs new (twofish-asm), Cortex-A8 (on armhf):
           ECB/Stream         CBC             CFB             OFB             CTR             CCM
         --------------- --------------- --------------- --------------- --------------- ---------------
TWOFISH   1.23x   1.25x   1.16x   1.26x   1.16x   1.30x   1.18x   1.17x   1.23x   1.23x   1.22x   1.22x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am     |    2 +-
 cipher/twofish-armv6.S |  365 ++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/twofish.c       |   88 ++++++++----
 configure.ac           |    4 +
 4 files changed, 432 insertions(+), 27 deletions(-)
 create mode 100644 cipher/twofish-armv6.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index b0efd89..3d8149a 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -80,7 +80,7 @@ sha512.c sha512-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c \
-twofish.c twofish-amd64.S \
+twofish.c twofish-amd64.S twofish-armv6.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
   camellia-aesni-avx2-amd64.S camellia-armv6.S
diff --git a/cipher/twofish-armv6.S b/cipher/twofish-armv6.S
new file mode 100644
index 0000000..b76ab37
--- /dev/null
+++ b/cipher/twofish-armv6.S
@@ -0,0 +1,365 @@
+/* twofish-armv6.S  -  ARM assembly implementation of Twofish cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of TWOFISH_context: */
+#define s0 0
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w  ((s3) + 4 * 256)
+#define k  ((w) + 4 * 8)
+
+/* register macros */
+#define CTX %r0
+#define CTXs0 %r0
+#define CTXs1 %r1
+#define CTXs3 %r7
+
+#define RA %r3
+#define RB %r4
+#define RC %r5
+#define RD %r6
+
+#define RX %r2
+#define RY %ip
+
+#define RMASK %lr
+
+#define RT0 %r8
+#define RT1 %r9
+#define RT2 %r10
+#define RT3 %r11
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 0)]; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 3)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 0)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 1)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 2)]; \
+	strb rtmp0, [rdst, #((offs) + 3)];
+
+#ifndef __ARMEL__
+	/* bswap on big-endian */
+	#define host_to_le(reg) \
+		rev reg, reg;
+	#define le_to_host(reg) \
+		rev reg, reg;
+#else
+	/* nop on little-endian */
+	#define host_to_le(reg) /*_*/
+	#define le_to_host(reg) /*_*/
+#endif
+
+#define ldr_input_aligned_le(rin, a, b, c, d) \
+	ldr a, [rin, #0]; \
+	ldr b, [rin, #4]; \
+	le_to_host(a); \
+	ldr c, [rin, #8]; \
+	le_to_host(b); \
+	ldr d, [rin, #12]; \
+	le_to_host(c); \
+	le_to_host(d);
+
+#define str_output_aligned_le(rout, a, b, c, d) \
+	le_to_host(a); \
+	le_to_host(b); \
+	str a, [rout, #0]; \
+	le_to_host(c); \
+	str b, [rout, #4]; \
+	le_to_host(d); \
+	str c, [rout, #8]; \
+	str d, [rout, #12];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads/writes allowed */
+	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
+		ldr_input_aligned_le(rin, ra, rb, rc, rd)
+
+	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+		str_output_aligned_le(rout, ra, rb, rc, rd)
+#else
+	/* need to handle unaligned reads/writes by byte reads */
+	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_le(ra, rin, 0, rtmp0); \
+			ldr_unaligned_le(rb, rin, 4, rtmp0); \
+			ldr_unaligned_le(rc, rin, 8, rtmp0); \
+			ldr_unaligned_le(rd, rin, 12, rtmp0); \
+			b 2f; \
+		1:;\
+			ldr_input_aligned_le(rin, ra, rb, rc, rd); \
+		2:;
+
+	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
+			str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
+			str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
+			str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			str_output_aligned_le(rout, ra, rb, rc, rd); \
+		2:;
+#endif
+
+/**********************************************************************
+  1-way twofish (ror_*/adj_* defer a 1-bit rotate from each round into the next)
+ **********************************************************************/
+#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
+	and RT0, RMASK, b, lsr#(8 - 2); \
+	and RY, RMASK, b, lsr#(16 - 2); \
+	add RT0, RT0, #(s2 - s1); \
+	and RT1, RMASK, b, lsr#(24 - 2); \
+	ldr RY, [CTXs3, RY]; \
+	and RT2, RMASK, b, lsl#(2); \
+	ldr RT0, [CTXs1, RT0]; \
+	and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
+	ldr RT1, [CTXs0, RT1]; \
+	and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
+	ldr RT2, [CTXs1, RT2]; \
+	add RT3, RT3, #(s2 - s1); \
+	ldr RX, [CTXs1, RX]; \
+	ror_a(a); \
+	\
+	eor RY, RY, RT0; \
+	ldr RT3, [CTXs1, RT3]; \
+	and RT0, RMASK, a, lsl#(2); \
+	eor RY, RY, RT1; \
+	and RT1, RMASK, a, lsr#(24 - 2); \
+	eor RY, RY, RT2; \
+	ldr RT0, [CTXs0, RT0]; \
+	eor RX, RX, RT3; \
+	ldr RT1, [CTXs3, RT1]; \
+	eor RX, RX, RT0; \
+	\
+	ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+	eor RX, RX, RT1; \
+	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+	\
+	add RT0, RX, RY, lsl #1; \
+	add RX, RX, RY; \
+	add RT0, RT0, RT3; \
+	add RX, RX, RT2; \
+	eor rd, RT0, rd, ror #31; \
+	eor rc, rc, RX;
+
+#define dummy(x) /*_*/
+
+#define ror1(r) \
+	ror r, r, #1;
+
+#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
+	and RT3, RMASK, b, lsl#(2 - (adj_b)); \
+	and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
+	ror_b(b); \
+	and RT2, RMASK, a, lsl#(2); \
+	and RT0, RMASK, a, lsr#(8 - 2); \
+	\
+	ldr RY, [CTXs1, RT3]; \
+	add RT1, RT1, #(s2 - s1); \
+	ldr RX, [CTXs0, RT2]; \
+	and RT3, RMASK, b, lsr#(16 - 2); \
+	ldr RT1, [CTXs1, RT1]; \
+	and RT2, RMASK, a, lsr#(16 - 2); \
+	ldr RT0, [CTXs1, RT0]; \
+	\
+	add RT2, RT2, #(s2 - s1); \
+	ldr RT3, [CTXs3, RT3]; \
+	eor RY, RY, RT1; \
+	\
+	and RT1, RMASK, b, lsr#(24 - 2); \
+	eor RX, RX, RT0; \
+	ldr RT2, [CTXs1, RT2]; \
+	and RT0, RMASK, a, lsr#(24 - 2); \
+	\
+	ldr RT1, [CTXs0, RT1]; \
+	\
+	eor RY, RY, RT3; \
+	ldr RT0, [CTXs3, RT0]; \
+	eor RX, RX, RT2; \
+	eor RY, RY, RT1; \
+	\
+	ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+	eor RX, RX, RT0; \
+	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+	\
+	add RT0, RX, RY, lsl #1; \
+	add RX, RX, RY; \
+	add RT0, RT0, RT1; \
+	add RX, RX, RT2; \
+	eor rd, rd, RT0; \
+	eor rc, RX, rc, ror #31;
+
+#define first_encrypt_cycle(nc) \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define encrypt_cycle(nc) \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define last_encrypt_cycle(nc) \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	ror1(RA);
+
+#define first_decrypt_cycle(nc) \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define decrypt_cycle(nc) \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define last_decrypt_cycle(nc) \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	ror1(RD);
+
+.align 3
+.global _gcry_twofish_armv6_encrypt_block
+.type   _gcry_twofish_armv6_encrypt_block,%function;
+
+_gcry_twofish_armv6_encrypt_block:
+	/* input:
+	 *	%r0: ctx
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	add RY, CTXs0, #w;
+
+	ldr_input_le(%r2, RA, RB, RC, RD, RT0);
+
+	/* Input whitening */
+	ldm RY, {RT0, RT1, RT2, RT3};
+	add CTXs3, CTXs0, #(s3 - s0);
+	add CTXs1, CTXs0, #(s1 - s0);
+	mov RMASK, #(0xff << 2); /* byte mask pre-scaled by 4 for u32 table indexing */
+	eor RA, RA, RT0;
+	eor RB, RB, RT1;
+	eor RC, RC, RT2;
+	eor RD, RD, RT3;
+
+	first_encrypt_cycle(0);
+	encrypt_cycle(1);
+	encrypt_cycle(2);
+	encrypt_cycle(3);
+	encrypt_cycle(4);
+	encrypt_cycle(5);
+	encrypt_cycle(6);
+	last_encrypt_cycle(7);
+
+	add RY, CTXs3, #(w + 4*4 - s3);
+	pop {%r1}; /* dst */
+
+	/* Output whitening */
+	ldm RY, {RT0, RT1, RT2, RT3};
+	eor RC, RC, RT0;
+	eor RD, RD, RT1;
+	eor RA, RA, RT2;
+	eor RB, RB, RT3;
+
+	str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %lr};
+	bx %lr;
+.ltorg
+.size _gcry_twofish_armv6_encrypt_block,.-_gcry_twofish_armv6_encrypt_block;
+
+.align 3
+.global _gcry_twofish_armv6_decrypt_block
+.type   _gcry_twofish_armv6_decrypt_block,%function;
+
+_gcry_twofish_armv6_decrypt_block:
+	/* input:
+	 *	%r0: ctx
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	add CTXs3, CTXs0, #(s3 - s0);
+
+	ldr_input_le(%r2, RC, RD, RA, RB, RT0);
+
+	add RY, CTXs3, #(w + 4*4 - s3);
+	add CTXs3, CTXs0, #(s3 - s0);
+
+	/* Input whitening */
+	ldm RY, {RT0, RT1, RT2, RT3};
+	add CTXs1, CTXs0, #(s1 - s0);
+	mov RMASK, #(0xff << 2);
+	eor RC, RC, RT0;
+	eor RD, RD, RT1;
+	eor RA, RA, RT2;
+	eor RB, RB, RT3;
+
+	first_decrypt_cycle(7);
+	decrypt_cycle(6);
+	decrypt_cycle(5);
+	decrypt_cycle(4);
+	decrypt_cycle(3);
+	decrypt_cycle(2);
+	decrypt_cycle(1);
+	last_decrypt_cycle(0);
+
+	add RY, CTXs0, #w;
+	pop {%r1}; /* dst */
+
+	/* Output whitening */
+	ldm RY, {RT0, RT1, RT2, RT3};
+	eor RA, RA, RT0;
+	eor RB, RB, RT1;
+	eor RC, RC, RT2;
+	eor RD, RD, RT3;
+
+	str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %lr};
+	bx %lr;
+.size _gcry_twofish_armv6_decrypt_block,.-_gcry_twofish_armv6_decrypt_block;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*HAVE_ARM_ARCH_V6 && __ARMEL__*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 993ad0f..d2cabbe 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -57,6 +57,14 @@
 # define USE_AMD64_ASM 1
 #endif
 
+/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */
+#undef USE_ARMV6_ASM
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+#  define USE_ARMV6_ASM 1
+# endif
+#endif
+
 
 /* Prototype for the self-test function. */
 static const char *selftest(void);
@@ -746,7 +754,16 @@ extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out,
 extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
 					const byte *in, byte *iv);
 
-#else /*!USE_AMD64_ASM*/
+#elif defined(USE_ARMV6_ASM)
+
+/* Assembly implementations of Twofish. */
+extern void _gcry_twofish_armv6_encrypt_block(const TWOFISH_context *c,
+					      byte *out, const byte *in);
+
+extern void _gcry_twofish_armv6_decrypt_block(const TWOFISH_context *c,
+					      byte *out, const byte *in);
+
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 /* Macros to compute the g() function in the encryption and decryption
  * rounds.  G1 is the straight g() function; G2 includes the 8-bit
@@ -812,21 +829,25 @@ extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
 
 #ifdef USE_AMD64_ASM
 
-static void
-do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
+static unsigned int
+twofish_encrypt (void *context, byte *out, const byte *in)
 {
+  TWOFISH_context *ctx = context;
   _gcry_twofish_amd64_encrypt_block(ctx, out, in);
+  return /*burn_stack*/ (4*sizeof (void*));
 }
 
+#elif defined(USE_ARMV6_ASM)
+
 static unsigned int
 twofish_encrypt (void *context, byte *out, const byte *in)
 {
   TWOFISH_context *ctx = context;
-  _gcry_twofish_amd64_encrypt_block(ctx, out, in);
+  _gcry_twofish_armv6_encrypt_block(ctx, out, in);
   return /*burn_stack*/ (4*sizeof (void*));
 }
 
-#else /*!USE_AMD64_ASM*/
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 static void
 do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
@@ -868,28 +889,32 @@ twofish_encrypt (void *context, byte *out, const byte *in)
   return /*burn_stack*/ (24+3*sizeof (void*));
 }
 
-#endif /*!USE_AMD64_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 

 /* Decrypt one block.  in and out may be the same. */
 
 #ifdef USE_AMD64_ASM
 
-static void
-do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
+static unsigned int
+twofish_decrypt (void *context, byte *out, const byte *in)
 {
+  TWOFISH_context *ctx = context;
   _gcry_twofish_amd64_decrypt_block(ctx, out, in);
+  return /*burn_stack*/ (4*sizeof (void*));
 }
 
+#elif defined(USE_ARMV6_ASM)
+
 static unsigned int
 twofish_decrypt (void *context, byte *out, const byte *in)
 {
   TWOFISH_context *ctx = context;
-  _gcry_twofish_amd64_decrypt_block(ctx, out, in);
+  _gcry_twofish_armv6_decrypt_block(ctx, out, in);
   return /*burn_stack*/ (4*sizeof (void*));
 }
 
-#else /*!USE_AMD64_ASM*/
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 static void
 do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
@@ -932,7 +957,7 @@ twofish_decrypt (void *context, byte *out, const byte *in)
   return /*burn_stack*/ (24+3*sizeof (void*));
 }
 
-#endif /*!USE_AMD64_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
 
 

 
@@ -947,14 +972,11 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char tmpbuf[TWOFISH_BLOCKSIZE];
-  int burn_stack_depth = 24 + 3 * sizeof (void*);
+  unsigned int burn, burn_stack_depth = 0;
   int i;
 
 #ifdef USE_AMD64_ASM
   {
-    if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*))
-      burn_stack_depth = 8 * sizeof(void*);
-
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
@@ -963,6 +985,10 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;
         inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+        burn = 8 * sizeof(void*);
+        if (burn > burn_stack_depth)
+          burn_stack_depth = burn;
       }
 
     /* Use generic code to handle smaller chunks... */
@@ -973,7 +999,10 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
   for ( ;nblocks; nblocks-- )
     {
       /* Encrypt the counter. */
-      do_twofish_encrypt(ctx, tmpbuf, ctr);
+      burn = twofish_encrypt(ctx, tmpbuf, ctr);
+      if (burn > burn_stack_depth)
+        burn_stack_depth = burn;
+
       /* XOR the input with the encrypted counter and store in output.  */
       buf_xor(outbuf, tmpbuf, inbuf, TWOFISH_BLOCKSIZE);
       outbuf += TWOFISH_BLOCKSIZE;
@@ -1002,13 +1031,10 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char savebuf[TWOFISH_BLOCKSIZE];
-  int burn_stack_depth = 24 + 3 * sizeof (void*);
+  unsigned int burn, burn_stack_depth = 0;
 
 #ifdef USE_AMD64_ASM
   {
-    if (nblocks >= 3 && burn_stack_depth < 9 * sizeof(void*))
-      burn_stack_depth = 9 * sizeof(void*);
-
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
@@ -1017,6 +1043,10 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;
         inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+        burn = 9 * sizeof(void*);
+        if (burn > burn_stack_depth)
+          burn_stack_depth = burn;
       }
 
     /* Use generic code to handle smaller chunks... */
@@ -1029,7 +1059,9 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
          OUTBUF.  */
       memcpy(savebuf, inbuf, TWOFISH_BLOCKSIZE);
 
-      do_twofish_decrypt (ctx, outbuf, inbuf);
+      burn = twofish_decrypt (ctx, outbuf, inbuf);
+      if (burn > burn_stack_depth)
+        burn_stack_depth = burn;
 
       buf_xor(outbuf, outbuf, iv, TWOFISH_BLOCKSIZE);
       memcpy(iv, savebuf, TWOFISH_BLOCKSIZE);
@@ -1051,13 +1083,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
   TWOFISH_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  int burn_stack_depth = 24 + 3 * sizeof (void*);
+  unsigned int burn, burn_stack_depth = 0;
 
 #ifdef USE_AMD64_ASM
   {
-    if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*))
-      burn_stack_depth = 8 * sizeof(void*);
-
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
@@ -1066,6 +1095,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;
         inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+        burn = 8 * sizeof(void*);
+        if (burn > burn_stack_depth)
+          burn_stack_depth = burn;
       }
 
     /* Use generic code to handle smaller chunks... */
@@ -1074,7 +1107,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
   for ( ;nblocks; nblocks-- )
     {
-      do_twofish_encrypt(ctx, iv, iv);
+      burn = twofish_encrypt(ctx, iv, iv);
+      if (burn > burn_stack_depth)
+        burn_stack_depth = burn;
+
       buf_xor_n_copy(outbuf, iv, inbuf, TWOFISH_BLOCKSIZE);
       outbuf += TWOFISH_BLOCKSIZE;
       inbuf += TWOFISH_BLOCKSIZE;
diff --git a/configure.ac b/configure.ac
index a803b5f..66fb6b9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1479,6 +1479,10 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo"
       ;;
+      arm*-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-armv6.lo"
+      ;;
    esac
 fi
 