[PATCH 2/2] cast5: add amd64 assembly implementation

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Fri May 24 11:43:29 CEST 2013


* cipher/Makefile.am: Add 'cast5-amd64.S'.
* cipher/cast5-amd64.S: New file.
* cipher/cast5.c (USE_AMD64_ASM): New macro.
(_gcry_cast5_s1to4): Merge arrays s1, s2, s3, s4 into a single array to
simplify access from the assembly implementation.
(s1, s2, s3, s4): New macros pointing to subarrays in
_gcry_cast5_s1to4.
[USE_AMD64_ASM] (_gcry_cast5_amd64_encrypt_block)
(_gcry_cast5_amd64_decrypt_block, _gcry_cast5_amd64_ctr_enc)
(_gcry_cast5_amd64_cbc_dec, _gcry_cast5_amd64_cfb_dec): New prototypes.
[USE_AMD64_ASM] (do_encrypt_block, do_decrypt_block, encrypt_block)
(decrypt_block): New functions.
(_gcry_cast5_ctr_enc, _gcry_cast5_cbc_dec, _gcry_cast5_cfb_dec)
(selftest_ctr, selftest_cbc, selftest_cfb): New functions.
(selftest): Call new bulk selftests.
* cipher/cipher.c (gcry_cipher_open) [USE_CAST5]: Register CAST5 bulk
functions for ctr-enc, cbc-dec and cfb-dec.
* configure.ac (cast5) [x86_64]: Add 'cast5-amd64.lo'.
* src/cipher.h (_gcry_cast5_ctr_enc, _gcry_cast5_cbc_dec)
(_gcry_cast5_cfb_dec): New prototypes.
--

Provides non-parallel implementations for a small speed-up and 4-way parallel
implementations that are accelerated on `out-of-order' CPUs.
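
Not part of the patch, but for illustration: once cipher.c registers the bulk
functions, callers reach the 4-way code through the ordinary public API.  A
minimal sketch (key/counter values are arbitrary, error handling trimmed) that
should exercise _gcry_cast5_ctr_enc and, for the full group of four blocks,
the 4-way _gcry_cast5_amd64_ctr_enc:

  #include <gcrypt.h>
  #include <stdio.h>

  int
  main (void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78,
                              0x23, 0x45, 0x67, 0x89, 0x34, 0x56, 0x78, 0x9a };
    unsigned char ctr[8] = { 0 };      /* CAST5 block size is 8 bytes */
    unsigned char buf[4 * 8] = { 0 };  /* four blocks, encrypted in place */

    gcry_check_version (NULL);
    if (gcry_cipher_open (&hd, GCRY_CIPHER_CAST5, GCRY_CIPHER_MODE_CTR, 0)
        || gcry_cipher_setkey (hd, key, sizeof key)
        || gcry_cipher_setctr (hd, ctr, sizeof ctr)
        || gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0))
      {
        fprintf (stderr, "cast5-ctr sketch failed\n");
        return 1;
      }
    gcry_cipher_close (hd);
    return 0;
  }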

Speed old vs. new on AMD Phenom II X6 1055T:
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
CAST5         1.23x   1.22x   1.21x   2.86x   1.21x   2.83x   1.22x   1.17x   2.73x   2.73x

Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge):
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
CAST5         1.00x   1.04x   1.06x   2.56x   1.06x   2.37x   1.03x   1.01x   2.43x   2.41x
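
An implementation note on the table merge listed in the ChangeLog: the four
256-entry s-boxes now live in consecutive rows of _gcry_cast5_s1to4[4][256],
so the assembly only needs one external pointer (RTAB) plus fixed byte
offsets (the s1..s4 defines in cast5-amd64.S).  A self-contained C model of
that addressing, using a stand-in zeroed table instead of the real one:

  #include <assert.h>
  #include <stdint.h>
  #include <string.h>

  static const uint32_t s1to4[4][256] = { { 0 } };  /* stand-in table */

  /* Byte offsets matching the s1..s4 defines in cast5-amd64.S. */
  enum { S1 = 0, S2 = S1 + 4 * 256, S3 = S2 + 4 * 256, S4 = S3 + 4 * 256 };

  int
  main (void)
  {
    const uint8_t *rtab = (const uint8_t *) s1to4;  /* RTAB in the assembly */
    uint8_t idx = 0x5a;
    uint32_t v;

    /* "movl s3(RTAB,RT1,4), RT0d" is the flat-offset form of s1to4[2][idx]. */
    memcpy (&v, rtab + S3 + 4 * (size_t) idx, sizeof v);
    assert (v == s1to4[2][idx]);
    return 0;
  }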

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am   |    2 
 cipher/cast5-amd64.S |  587 ++++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/cast5.c       |  278 +++++++++++++++++++++++-
 cipher/cipher.c      |    7 +
 configure.ac         |    7 +
 src/cipher.h         |   13 +
 6 files changed, 885 insertions(+), 9 deletions(-)
 create mode 100644 cipher/cast5-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 69f1e6d..1e2696f 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -54,7 +54,7 @@ rmd.h
 EXTRA_libcipher_la_SOURCES = \
 arcfour.c \
 blowfish.c \
-cast5.c \
+cast5.c cast5-amd64.S \
 crc.c \
 des.c \
 dsa.c \
diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S
new file mode 100644
index 0000000..c3007d3
--- /dev/null
+++ b/cipher/cast5-amd64.S
@@ -0,0 +1,587 @@
+/* cast5-amd64.S  -  AMD64 assembly implementation of CAST5 cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_CAST5)
+
+#ifdef __PIC__
+#  define RIP %rip
+#  define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
+#else
+#  define RIP
+#  define GET_EXTERN_POINTER(name, reg) leaq name, reg
+#endif
+
+.text
+
+.extern _gcry_cast5_s1to4;
+
+#define s1 0
+#define s2 (s1 + (4 * 256))
+#define s3 (s2 + (4 * 256))
+#define s4 (s3 + (4 * 256))
+
+/* structure of CAST5_context: */
+#define Km 0
+#define Kr (Km + (16 * 4))
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+#define RTAB %r8
+
+#define RLR0 %r9
+#define RLR1 %r10
+#define RLR2 %r11
+#define RLR3 %r12
+
+#define RLR0d %r9d
+#define RLR1d %r10d
+#define RLR2d %r11d
+#define RLR3d %r12d
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %dh
+
+#define RKR %rcx
+#define RKRd %ecx
+#define RKRbl %cl
+
+#define RT0 %rbp
+#define RT1 %rsi
+
+#define RT0d %ebp
+#define RT1d %esi
+
+#define RKM0d %r13d
+#define RKM1d %r14d
+
+/***********************************************************************
+ * 1-way cast5
+ ***********************************************************************/
+#define dummy(x)
+
+#define shr_kr(none) \
+	shrq $8,			RKR;
+
+#define F(km, load_next_kr, op0, op1, op2, op3) \
+	op0 ## l RLR0d,			km ## d; \
+	roll RKRbl,			km ## d; \
+	rorq $32,			RLR0; \
+	movzbl km ## bh,		RT0d; \
+	movzbl km ## bl,		RT1d; \
+	roll $16,			km ## d; \
+	movl s1(RTAB,RT0,4),		RT0d; \
+	op1 ## l s2(RTAB,RT1,4),	RT0d; \
+	load_next_kr(kr_next); \
+	movzbl km ## bh,		RT1d; \
+	movzbl km ## bl,		km ## d; \
+	op2 ## l s3(RTAB,RT1,4),	RT0d; \
+	op3 ## l s4(RTAB,km,4),		RT0d; \
+	xorq RT0,			RLR0;
+
+#define F1(km, load_next_kr) \
+	F(##km, load_next_kr, add, xor, sub, add)
+#define F2(km, load_next_kr) \
+	F(##km, load_next_kr, xor, sub, add, xor)
+#define F3(km, load_next_kr) \
+	F(##km, load_next_kr, sub, add, xor, sub)
+
+#define get_round_km(n, km) \
+	movl Km+4*(n)(CTX), 		km;
+
+#define get_round_kr_enc(n) \
+	movq $0x1010101010101010,	RKR; \
+	\
+	/* merge rorl rk and rorl $16 */ \
+	xorq Kr+(n)(CTX),		RKR;
+
+#define get_round_kr_dec(n) \
+	movq $0x1010101010101010,	RKR; \
+	\
+	/* merge rorl rk and rorl $16 */ \
+	xorq Kr+(n - 7)(CTX),		RKR; \
+	bswapq				RKR;
+
+#define round_enc(n, FA, FB, fn1, fn2) \
+	get_round_km(n + 1, RX2d); \
+	FA(RX0, fn1); \
+	get_round_km(n + 2, RX0d); \
+	FB(RX2, fn2);
+
+#define round_enc_last(n, FXA, FXB) \
+	get_round_km(n + 1, RX2d); \
+	\
+	FXA(RX0, shr_kr); \
+	FXB(RX2, dummy);
+
+#define round_enc_1(n, FA, FB) \
+	round_enc(n, FA, FB, shr_kr, shr_kr)
+
+#define round_enc_2(n, FA, FB) \
+	round_enc(n, FA, FB, shr_kr, dummy)
+
+#define round_dec(n, FA, FB, fn1, fn2) \
+	get_round_km(n - 1, RX2d); \
+	FA(RX0, fn1); \
+	get_round_km(n - 2, RX0d); \
+	FB(RX2, fn2);
+
+#define round_dec_last(n, FXA, FXB) \
+	get_round_km(n - 1, RX2d); \
+	FXA(RX0, shr_kr); \
+	FXB(RX2, dummy);
+
+#define round_dec_1(n, FA, FB) \
+	round_dec(n, FA, FB, shr_kr, shr_kr)
+
+#define round_dec_2(n, FA, FB) \
+	round_dec(n, FA, FB, shr_kr, dummy)
+
+#define read_block() \
+	movq (RIO), 		RLR0; \
+	bswapq 			RLR0;
+
+#define write_block() \
+	bswapq 			RLR0; \
+	rorq $32,		RLR0; \
+	movq RLR0, 		(RIO);
+
+.align 8
+.global _gcry_cast5_amd64_encrypt_block
+.type   _gcry_cast5_amd64_encrypt_block,@function;
+
+_gcry_cast5_amd64_encrypt_block:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	pushq %rbp;
+	pushq %rbx;
+
+	movq %rsi, %r10;
+
+	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+	movq %rdx, RIO;
+	read_block();
+
+	get_round_km(0, RX0d);
+	get_round_kr_enc(0);
+	round_enc_1(0, F1, F2);
+	round_enc_1(2, F3, F1);
+	round_enc_1(4, F2, F3);
+	round_enc_2(6, F1, F2);
+	get_round_kr_enc(8);
+	round_enc_1(8, F3, F1);
+	round_enc_1(10, F2, F3);
+	round_enc_1(12, F1, F2);
+	round_enc_last(14, F3, F1);
+
+	movq %r10, RIO;
+	write_block();
+
+	popq %rbx;
+	popq %rbp;
+	ret;
+.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;
+
+.align 8
+.global _gcry_cast5_amd64_decrypt_block
+.type   _gcry_cast5_amd64_decrypt_block,@function;
+
+_gcry_cast5_amd64_decrypt_block:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	pushq %rbp;
+	pushq %rbx;
+
+	movq %rsi, %r10;
+
+	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+	movq %rdx, RIO;
+	read_block();
+
+	get_round_km(15, RX0d);
+	get_round_kr_dec(15);
+	round_dec_1(15, F1, F3);
+	round_dec_1(13, F2, F1);
+	round_dec_1(11, F3, F2);
+	round_dec_2(9, F1, F3);
+	get_round_kr_dec(7);
+	round_dec_1(7, F2, F1);
+	round_dec_1(5, F3, F2);
+	round_dec_1(3, F1, F3);
+	round_dec_last(1, F2, F1);
+
+	movq %r10, RIO;
+	write_block();
+
+	popq %rbx;
+	popq %rbp;
+	ret;
+.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;
+
+/**********************************************************************
+  4-way cast5, four blocks parallel
+ **********************************************************************/
+#define F_tail(rlr, rx, op1, op2, op3) \
+	movzbl rx ## bh,		RT0d; \
+	movzbl rx ## bl,		RT1d; \
+	roll $16,			rx ## d; \
+	movl s1(RTAB,RT0,4),		RT0d; \
+	op1 ## l s2(RTAB,RT1,4),	RT0d; \
+	movzbl rx ## bh,		RT1d; \
+	movzbl rx ## bl,		rx ## d; \
+	op2 ## l s3(RTAB,RT1,4),	RT0d; \
+	op3 ## l s4(RTAB,rx,4),		RT0d; \
+	xorq RT0,			rlr;
+
+#define F4(km, load_next_kr, op0, op1, op2, op3) \
+	movl km,			RX0d; \
+	op0 ## l RLR0d,			RX0d; \
+	roll RKRbl,			RX0d; \
+	rorq $32,			RLR0; \
+	\
+	movl km,			RX1d; \
+	op0 ## l RLR1d,			RX1d; \
+	roll RKRbl,			RX1d; \
+	rorq $32,			RLR1; \
+	\
+	movl km,			RX2d; \
+	op0 ## l RLR2d,			RX2d; \
+	roll RKRbl,			RX2d; \
+	rorq $32,			RLR2; \
+	\
+	F_tail(RLR0, RX0, op1, op2, op3); \
+	F_tail(RLR1, RX1, op1, op2, op3); \
+	F_tail(RLR2, RX2, op1, op2, op3); \
+	\
+	movl km,			RX0d; \
+	op0 ## l RLR3d,			RX0d; \
+	roll RKRbl,			RX0d; \
+	load_next_kr();			\
+	rorq $32,			RLR3; \
+	\
+	F_tail(RLR3, RX0, op1, op2, op3);
+
+#define F4_1(km, load_next_kr) \
+	F4(km, load_next_kr, add, xor, sub, add)
+#define F4_2(km, load_next_kr) \
+	F4(km, load_next_kr, xor, sub, add, xor)
+#define F4_3(km, load_next_kr) \
+	F4(km, load_next_kr, sub, add, xor, sub)
+
+#define round_enc4(n, FA, FB, fn1, fn2) \
+	get_round_km(n + 1, RKM1d); \
+	FA(RKM0d, fn1); \
+	get_round_km(n + 2, RKM0d); \
+	FB(RKM1d, fn2);
+
+#define round_enc_last4(n, FXA, FXB) \
+	get_round_km(n + 1, RKM1d); \
+	FXA(RKM0d, shr_kr); \
+	FXB(RKM1d, dummy);
+
+#define round_enc4_1(n, FA, FB) \
+	round_enc4(n, FA, FB, shr_kr, shr_kr);
+
+#define round_enc4_2(n, FA, FB) \
+	round_enc4(n, FA, FB, shr_kr, dummy);
+
+#define round_dec4(n, FA, FB, fn1, fn2) \
+	get_round_km(n - 1, RKM1d); \
+	FA(RKM0d, fn1); \
+	get_round_km(n - 2, RKM0d); \
+	FB(RKM1d, fn2);
+
+#define round_dec_last4(n, FXA, FXB) \
+	get_round_km(n - 1, RKM1d); \
+	FXA(RKM0d, shr_kr); \
+	FXB(RKM1d, dummy);
+
+#define round_dec4_1(n, FA, FB) \
+	round_dec4(n, FA, FB, shr_kr, shr_kr);
+
+#define round_dec4_2(n, FA, FB) \
+	round_dec4(n, FA, FB, shr_kr, dummy);
+
+#define inbswap_block4(a, b, c, d) \
+	bswapq 			a; \
+	bswapq 			b; \
+	bswapq 			c; \
+	bswapq 			d;
+
+#define outbswap_block4(a, b, c, d) \
+	bswapq 			a; \
+	bswapq 			b; \
+	bswapq 			c; \
+	bswapq 			d; \
+	rorq $32,		a; \
+	rorq $32,		b; \
+	rorq $32,		c; \
+	rorq $32,		d;
+
+.align 8
+.type   __cast5_enc_blk4,@function;
+
+__cast5_enc_blk4:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RLR0,RLR1,RLR2,RLR3: four input plaintext blocks
+	 * output:
+	 *	RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks
+	 */
+	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+	get_round_km(0, RKM0d);
+	get_round_kr_enc(0);
+	round_enc4_1(0, F4_1, F4_2);
+	round_enc4_1(2, F4_3, F4_1);
+	round_enc4_1(4, F4_2, F4_3);
+	round_enc4_2(6, F4_1, F4_2);
+	get_round_kr_enc(8);
+	round_enc4_1(8, F4_3, F4_1);
+	round_enc4_1(10, F4_2, F4_3);
+	round_enc4_1(12, F4_1, F4_2);
+	round_enc_last4(14, F4_3, F4_1);
+
+	outbswap_block4(RLR0, RLR1, RLR2, RLR3);
+	ret;
+.size __cast5_enc_blk4,.-__cast5_enc_blk4;
+
+.align 8
+.type   __cast5_dec_blk4,@function;
+
+__cast5_dec_blk4:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks
+	 * output:
+	 *	RLR0,RLR1,RLR2,RLR3: four output plaintext blocks
+	 */
+	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+	inbswap_block4(RLR0, RLR1, RLR2, RLR3);
+
+	get_round_km(15, RKM0d);
+	get_round_kr_dec(15);
+	round_dec4_1(15, F4_1, F4_3);
+	round_dec4_1(13, F4_2, F4_1);
+	round_dec4_1(11, F4_3, F4_2);
+	round_dec4_2(9, F4_1, F4_3);
+	get_round_kr_dec(7);
+	round_dec4_1(7, F4_2, F4_1);
+	round_dec4_1(5, F4_3, F4_2);
+	round_dec4_1(3, F4_1, F4_3);
+	round_dec_last4(1, F4_2, F4_1);
+
+	outbswap_block4(RLR0, RLR1, RLR2, RLR3);
+	ret;
+.size __cast5_dec_blk4,.-__cast5_dec_blk4;
+
+.align 8
+.global _gcry_cast5_amd64_ctr_enc
+.type   _gcry_cast5_amd64_ctr_enc,@function;
+_gcry_cast5_amd64_ctr_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+
+	pushq %rsi;
+	pushq %rdx;
+
+	/* load IV and byteswap */
+	movq (%rcx), RX0;
+	bswapq RX0;
+	movq RX0, RLR0;
+
+	/* construct IVs */
+	leaq 1(RX0), RLR1;
+	leaq 2(RX0), RLR2;
+	leaq 3(RX0), RLR3;
+	leaq 4(RX0), RX0;
+	bswapq RX0;
+
+	/* store new IV */
+	movq RX0, (%rcx);
+
+	call __cast5_enc_blk4;
+
+	popq %r14; /*src*/
+	popq %r13; /*dst*/
+
+	/* XOR key-stream with plaintext */
+	xorq 0 * 8(%r14), RLR0;
+	xorq 1 * 8(%r14), RLR1;
+	xorq 2 * 8(%r14), RLR2;
+	xorq 3 * 8(%r14), RLR3;
+	movq RLR0, 0 * 8(%r13);
+	movq RLR1, 1 * 8(%r13);
+	movq RLR2, 2 * 8(%r13);
+	movq RLR3, 3 * 8(%r13);
+
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret
+.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;
+
+.align 8
+.global _gcry_cast5_amd64_cbc_dec
+.type   _gcry_cast5_amd64_cbc_dec,@function;
+_gcry_cast5_amd64_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+
+	pushq %rcx;
+	pushq %rsi;
+	pushq %rdx;
+
+	/* load input */
+	movq 0 * 8(%rdx), RLR0;
+	movq 1 * 8(%rdx), RLR1;
+	movq 2 * 8(%rdx), RLR2;
+	movq 3 * 8(%rdx), RLR3;
+
+	call __cast5_dec_blk4;
+
+	popq RX0; /*src*/
+	popq RX1; /*dst*/
+	popq RX2; /*iv*/
+
+	movq 3 * 8(RX0), %r14;
+	xorq      (RX2), RLR0;
+	xorq 0 * 8(RX0), RLR1;
+	xorq 1 * 8(RX0), RLR2;
+	xorq 2 * 8(RX0), RLR3;
+	movq %r14, (RX2); /* store new IV */
+
+	movq RLR0, 0 * 8(RX1);
+	movq RLR1, 1 * 8(RX1);
+	movq RLR2, 2 * 8(RX1);
+	movq RLR3, 3 * 8(RX1);
+
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret;
+
+.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;
+
+.align 8
+.global _gcry_cast5_amd64_cfb_dec
+.type   _gcry_cast5_amd64_cfb_dec,@function;
+_gcry_cast5_amd64_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+
+	pushq %rsi;
+	pushq %rdx;
+
+	/* Load input */
+	movq (%rcx), RLR0;
+	movq 0 * 8(%rdx), RLR1;
+	movq 1 * 8(%rdx), RLR2;
+	movq 2 * 8(%rdx), RLR3;
+
+	inbswap_block4(RLR0, RLR1, RLR2, RLR3);
+
+	/* Update IV */
+	movq 3 * 8(%rdx), %rdx;
+	movq %rdx, (%rcx);
+
+	call __cast5_enc_blk4;
+
+	popq %rdx; /*src*/
+	popq %rcx; /*dst*/
+
+	xorq 0 * 8(%rdx), RLR0;
+	xorq 1 * 8(%rdx), RLR1;
+	xorq 2 * 8(%rdx), RLR2;
+	xorq 3 * 8(%rdx), RLR3;
+	movq RLR0, 0 * 8(%rcx);
+	movq RLR1, 1 * 8(%rcx);
+	movq RLR2, 2 * 8(%rcx);
+	movq RLR3, 3 * 8(%rcx);
+
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret;
+
+.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;
+
+#endif /*defined(USE_CAST5)*/
+#endif /*__x86_64*/
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 9905f5c..e3d7bc5 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -42,6 +42,14 @@
 #include "g10lib.h"
 #include "types.h"
 #include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__)
+# define USE_AMD64_ASM 1
+#endif
 
 #define CAST5_BLOCKSIZE 8
 
@@ -56,8 +64,12 @@ static void decrypt_block (void *c, byte *outbuf, const byte *inbuf);
 
 
 
+#define s1 _gcry_cast5_s1to4[0]
+#define s2 _gcry_cast5_s1to4[1]
+#define s3 _gcry_cast5_s1to4[2]
+#define s4 _gcry_cast5_s1to4[3]
 
-static const u32 s1[256] = {
+const u32 _gcry_cast5_s1to4[4][256] = { {
 0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f, 0x9c004dd3, 0x6003e540, 0xcf9fc949,
 0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0, 0x15c361d2, 0xc2e7661d, 0x22d4ff8e,
 0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3, 0xdf2f8656, 0x887ca41a, 0xa2d2bd2d,
@@ -90,8 +102,7 @@ static const u32 s1[256] = {
 0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db, 0xab838653, 0x6e2f1e23, 0x83719c9e,
 0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c, 0xe1e696ff, 0xb141ab08, 0x7cca89b9,
 0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c, 0x5ac9f049, 0xdd8f0f00, 0x5c8165bf
-};
-static const u32 s2[256] = {
+}, {
 0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a, 0xeec5207a, 0x55889c94, 0x72fc0651,
 0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef, 0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3,
 0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086, 0xef944459, 0xba83ccb3, 0xe0c3cdfb,
@@ -124,8 +135,7 @@ static const u32 s2[256] = {
 0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3, 0x230eabb0, 0x6438bc87, 0xf0b5b1fa,
 0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589, 0xa345415e, 0x5c038323, 0x3e5d3bb9,
 0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539, 0x73bfbe70, 0x83877605, 0x4523ecf1
-};
-static const u32 s3[256] = {
+}, {
 0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff, 0x369fe44b, 0x8c1fc644, 0xaececa90,
 0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806, 0xf0ad0548, 0xe13c8d83, 0x927010d5,
 0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820, 0xfade82e0, 0xa067268b, 0x8272792e,
@@ -158,8 +168,7 @@ static const u32 s3[256] = {
 0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f, 0x68458425, 0x99833be5, 0x600d457d,
 0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31, 0x9c305a00, 0x52bce688, 0x1b03588a,
 0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636, 0xa133c501, 0xe9d3531c, 0xee353783
-};
-static const u32 s4[256] = {
+}, {
 0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb, 0x64ad8c57, 0x85510443, 0xfa020ed1,
 0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43, 0x6497b7b1, 0xf3641f63, 0x241e4adf,
 0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30, 0xc0a5374f, 0x1d2d00d9, 0x24147b15,
@@ -192,7 +201,7 @@ static const u32 s4[256] = {
 0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee, 0xf65324e6, 0x6afce36c, 0x0316cc04,
 0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979, 0x932bcdf6, 0xb657c34d, 0x4edfd282,
 0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0, 0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2
-};
+} };
 static const u32 s5[256] = {
 0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, 0x1dd358f5, 0x44dd9d44, 0x1731167f,
 0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, 0x386381cb, 0xacf6243a, 0x69befd7a,
@@ -331,6 +340,53 @@ static const u32 s8[256] = {
 };
 
 
+#ifdef USE_AMD64_ASM
+
+/* Assembly implementations of CAST5. */
+extern void _gcry_cast5_amd64_encrypt_block(CAST5_context *c, byte *outbuf,
+					    const byte *inbuf);
+
+extern void _gcry_cast5_amd64_decrypt_block(CAST5_context *c, byte *outbuf,
+					    const byte *inbuf);
+
+/* These assembly implementations process four blocks in parallel. */
+extern void _gcry_cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out,
+				      const byte *in, byte *ctr);
+
+extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out,
+				      const byte *in, byte *iv);
+
+extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out,
+				      const byte *in, byte *iv);
+
+static void
+do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf);
+}
+
+static void
+do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf);
+}
+
+static void encrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+  CAST5_context *c = (CAST5_context *) context;
+  do_encrypt_block (c, outbuf, inbuf);
+  _gcry_burn_stack (2*8);
+}
+
+static void decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+  CAST5_context *c = (CAST5_context *) context;
+  _gcry_cast5_amd64_decrypt_block (c, outbuf, inbuf);
+  _gcry_burn_stack (2*8);
+}
+
+#else /*USE_AMD64_ASM*/
+
 #if defined(__GNUC__) && defined(__i386__)
 static inline u32
 rol(int n, u32 x)
@@ -463,6 +519,201 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   _gcry_burn_stack (20+4*sizeof(void*));
 }
 
+#endif /*!USE_AMD64_ASM*/
+
+
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size CAST5_BLOCKSIZE. */
+void
+_gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, 
+		    const void *inbuf_arg, unsigned int nblocks)
+{
+  CAST5_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char tmpbuf[CAST5_BLOCKSIZE];
+  int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE;
+
+  int i;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 8 * sizeof(void*);
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        _gcry_cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+        nblocks -= 4;
+        outbuf += 4 * CAST5_BLOCKSIZE;
+        inbuf  += 4 * CAST5_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+    /* TODO: use caching instead? */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* Encrypt the counter. */
+      do_encrypt_block(ctx, tmpbuf, ctr);
+      /* XOR the input with the encrypted counter and store in output.  */
+      buf_xor(outbuf, tmpbuf, inbuf, CAST5_BLOCKSIZE);
+      outbuf += CAST5_BLOCKSIZE;
+      inbuf  += CAST5_BLOCKSIZE;
+      /* Increment the counter.  */
+      for (i = CAST5_BLOCKSIZE; i > 0; i--)
+        {
+          ctr[i-1]++;
+          if (ctr[i-1])
+            break;
+        }
+    }
+
+  wipememory(tmpbuf, sizeof(tmpbuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
+		    const void *inbuf_arg, unsigned int nblocks)
+{
+  CAST5_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char savebuf[CAST5_BLOCKSIZE];
+  int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 8 * sizeof(void*);
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        _gcry_cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 4;
+        outbuf += 4 * CAST5_BLOCKSIZE;
+        inbuf  += 4 * CAST5_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* We need to save INBUF away because it may be identical to
+         OUTBUF.  */
+      memcpy(savebuf, inbuf, CAST5_BLOCKSIZE);
+
+      do_decrypt_block (ctx, outbuf, inbuf);
+
+      buf_xor(outbuf, outbuf, iv, CAST5_BLOCKSIZE);
+      memcpy(iv, savebuf, CAST5_BLOCKSIZE);
+      inbuf += CAST5_BLOCKSIZE;
+      outbuf += CAST5_BLOCKSIZE;
+    }
+
+  wipememory(savebuf, sizeof(savebuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
+		    const void *inbuf_arg, unsigned int nblocks)
+{
+  CAST5_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 8 * sizeof(void*);
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        _gcry_cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 4;
+        outbuf += 4 * CAST5_BLOCKSIZE;
+        inbuf  += 4 * CAST5_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      do_encrypt_block(ctx, iv, iv);
+      buf_xor_n_copy(outbuf, iv, inbuf, CAST5_BLOCKSIZE);
+      outbuf += CAST5_BLOCKSIZE;
+      inbuf  += CAST5_BLOCKSIZE;
+    }
+
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Run the self-tests for CAST5-CTR, tests IV increment of bulk CTR
+   encryption.  Returns NULL on success. */
+static const char *
+selftest_ctr (void)
+{
+  const int nblocks = 4+1;
+  const int blocksize = CAST5_BLOCKSIZE;
+  const int context_size = sizeof(CAST5_context);
+
+  return _gcry_selftest_helper_ctr("CAST5", &cast_setkey,
+           &encrypt_block, &_gcry_cast5_ctr_enc, nblocks, blocksize,
+	   context_size);
+}
+
+
+/* Run the self-tests for CAST5-CBC, tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char *
+selftest_cbc (void)
+{
+  const int nblocks = 4+2;
+  const int blocksize = CAST5_BLOCKSIZE;
+  const int context_size = sizeof(CAST5_context);
+
+  return _gcry_selftest_helper_cbc("CAST5", &cast_setkey,
+           &encrypt_block, &_gcry_cast5_cbc_dec, nblocks, blocksize,
+	   context_size);
+}
+
+
+/* Run the self-tests for CAST5-CFB, tests bulk CFB decryption.
+   Returns NULL on success. */
+static const char *
+selftest_cfb (void)
+{
+  const int nblocks = 4+2;
+  const int blocksize = CAST5_BLOCKSIZE;
+  const int context_size = sizeof(CAST5_context);
+
+  return _gcry_selftest_helper_cfb("CAST5", &cast_setkey,
+           &encrypt_block, &_gcry_cast5_cfb_dec, nblocks, blocksize,
+	   context_size);
+}
+
 
 static const char*
 selftest(void)
@@ -473,6 +724,7 @@ selftest(void)
     byte plain[8] = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF };
     byte cipher[8]= { 0x23, 0x8B, 0x4F, 0xE5, 0x84, 0x7E, 0x44, 0xB2 };
     byte buffer[8];
+    const char *r;
 
     cast_setkey( &c, key, 16 );
     encrypt_block( &c, buffer, plain );
@@ -507,6 +759,16 @@ selftest(void)
 
     }
 #endif
+
+    if ( (r = selftest_cbc ()) )
+      return r;
+
+    if ( (r = selftest_cfb ()) )
+      return r;
+
+    if ( (r = selftest_ctr ()) )
+      return r;
+
     return NULL;
 }
 
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 652d795..79ca755 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -718,6 +718,13 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
               h->bulk.ctr_enc = _gcry_aes_ctr_enc;
               break;
 #endif /*USE_AES*/
+#ifdef USE_CAST5
+	    case GCRY_CIPHER_CAST5:
+              h->bulk.cfb_dec = _gcry_cast5_cfb_dec;
+              h->bulk.cbc_dec = _gcry_cast5_cbc_dec;
+              h->bulk.ctr_enc = _gcry_cast5_ctr_enc;
+              break;
+#endif /*USE_CAST5*/
 #ifdef USE_CAMELLIA
 	    case GCRY_CIPHER_CAMELLIA128:
 	    case GCRY_CIPHER_CAMELLIA192:
diff --git a/configure.ac b/configure.ac
index 3fec8bc..113c71f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1195,6 +1195,13 @@ LIST_MEMBER(cast5, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo"
    AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo"
+      ;;
+   esac
 fi
 
 LIST_MEMBER(des, $enabled_ciphers)
diff --git a/src/cipher.h b/src/cipher.h
index 9d6cc01..1742003 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -95,6 +95,19 @@ void _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
                         void *outbuf_arg, const void *inbuf_arg,
                         unsigned int nblocks);
 
+/*-- cast5.c --*/
+void _gcry_cast5_cfb_dec (void *context, unsigned char *iv,
+			  void *outbuf_arg, const void *inbuf_arg,
+			  unsigned int nblocks);
+
+void _gcry_cast5_cbc_dec (void *context, unsigned char *iv,
+			  void *outbuf_arg, const void *inbuf_arg,
+			  unsigned int nblocks);
+
+void _gcry_cast5_ctr_enc (void *context, unsigned char *ctr,
+			  void *outbuf_arg, const void *inbuf_arg,
+			  unsigned int nblocks);
+
 /*-- camellia-glue.c --*/
 void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
                              void *outbuf_arg, const void *inbuf_arg,
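
A closing note on the CTR helper for reviewers skimming the assembly:
_gcry_cast5_amd64_ctr_enc treats the 8-byte IV as a single big-endian 64-bit
counter, derives the four per-block counters with leaq, and stores IV+4 back.
A rough C model of that bookkeeping (not the patch code itself; the names here
are made up for illustration):

  #include <stdint.h>

  /* Load the big-endian 8-byte counter, produce ctr..ctr+3 for the four
   * blocks, and write ctr+4 back; wrap-around follows plain 64-bit
   * arithmetic, like the bswapq/leaq sequence in the assembly. */
  static void
  ctr_step4 (unsigned char iv[8], uint64_t block_ctr[4])
  {
    uint64_t c = 0;
    int i;

    for (i = 0; i < 8; i++)        /* movq (%rcx), RX0; bswapq RX0 */
      c = (c << 8) | iv[i];

    for (i = 0; i < 4; i++)        /* movq RX0, RLR0; leaq 1..3(RX0), RLRn */
      block_ctr[i] = c + (uint64_t) i;

    c += 4;                        /* leaq 4(RX0), RX0; bswapq; movq to (%rcx) */
    for (i = 7; i >= 0; i--)
      {
        iv[i] = (unsigned char) (c & 0xff);
        c >>= 8;
      }
  }

  int
  main (void)
  {
    unsigned char iv[8] = { 0, 0, 0, 0, 0, 0, 0, 0xfe };
    uint64_t ctrs[4];
    ctr_step4 (iv, ctrs);   /* ctrs = 0xfe..0x101; iv now holds 0x102 */
    return 0;
  }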



