[PATCH 1/2] blowfish: add amd64 assembly implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Wed May 29 15:48:09 CEST 2013


* cipher/Makefile.am: Add 'blowfish-amd64.S'.
* cipher/blowfish-amd64.S: New file.
* cipher/blowfish.c (USE_AMD64_ASM): New macro.
[USE_AMD64_ASM] (_gcry_blowfish_amd64_do_encrypt)
(_gcry_blowfish_amd64_encrypt_block)
(_gcry_blowfish_amd64_decrypt_block, _gcry_blowfish_amd64_ctr_enc)
(_gcry_blowfish_amd64_cbc_dec, _gcry_blowfish_amd64_cfb_dec): New
prototypes.
[USE_AMD64_ASM] (do_encrypt, do_encrypt_block, do_decrypt_block)
(encrypt_block, decrypt_block): New functions.
(_gcry_blowfish_ctr_enc, _gcry_blowfish_cbc_dec)
(_gcry_blowfish_cfb_dec, selftest_ctr, selftest_cbc, selftest_cfb): New
functions.
(selftest): Call new bulk selftests.
* cipher/cipher.c (gcry_cipher_open) [USE_BLOWFISH]: Register Blowfish
bulk functions for ctr-enc, cbc-dec and cfb-dec.
* configure.ac (blowfish) [x86_64]: Add 'blowfish-amd64.lo'.
* src/cipher.h (_gcry_blowfish_ctr_enc, _gcry_blowfish_cbc_dec)
(_gcry_blowfish_cfb_dec): New prototypes.
--

Add non-parallel functions for small speed-up and 4-way parallel functions for
modes of operation that support parallel processing.

Speed old vs. new on AMD Phenom II X6 1055T:
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
BLOWFISH      1.21x   1.12x   1.17x   3.52x   1.18x   3.34x   1.16x   1.15x   3.38x   3.47x

Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge):
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
BLOWFISH      1.16x   1.10x   1.17x   2.98x   1.18x   2.88x   1.16x   1.15x   3.00x   3.02x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am      |    2 
 cipher/blowfish-amd64.S |  533 +++++++++++++++++++++++++++++++++++++++++++++++
 cipher/blowfish.c       |  271 ++++++++++++++++++++++++
 cipher/cipher.c         |    7 +
 configure.ac            |    7 +
 src/cipher.h            |   13 +
 6 files changed, 832 insertions(+), 1 deletion(-)
 create mode 100644 cipher/blowfish-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 687c599..c0a7593 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -54,7 +54,7 @@ rmd.h
 
 EXTRA_libcipher_la_SOURCES = \
 arcfour.c \
-blowfish.c \
+blowfish.c blowfish-amd64.S \
 cast5.c cast5-amd64.S \
 crc.c \
 des.c \
diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S
new file mode 100644
index 0000000..1008387
--- /dev/null
+++ b/cipher/blowfish-amd64.S
@@ -0,0 +1,533 @@
+/* blowfish-amd64.S  -  AMD64 assembly implementation of Blowfish cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_BLOWFISH)
+
+.text
+
+/* byte offsets of the BLOWFISH_context members: s0..s3 (256 4-byte words each), then p */
+#define s0	0
+#define s1	((s0) + 256 * 4)
+#define s2	((s1) + 256 * 4)
+#define s3	((s2) + 256 * 4)
+#define p	((s3) + 256 * 4)
+
+/* register macros: RX0..RX3 carry block data (64/32/8-bit views), RT0..RT3 are temporaries, RKEY holds the 4-way round key; note that RT1 and RIO are both %rsi */
+#define CTX %rdi
+#define RIO %rsi
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rcx
+#define RX3 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %ecx
+#define RX3d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %cl
+#define RX3bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %ch
+#define RX3bh %dh
+
+#define RT0 %rbp
+#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9
+
+#define RT0d %ebp
+#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d
+
+#define RKEY %r10
+
+/***********************************************************************
+ * 1-way blowfish: one block in RX0; round_enc/round_dec XOR the 8-byte key in RX3 into RX0, load the next 8 bytes of p and apply F() twice
+ ***********************************************************************/
+#define F() \
+	movzbl RX0bh,		RT1d; \
+	movzbl RX0bl,		RT3d; \
+	rorq $16,		RX0; \
+	movzbl RX0bh,		RT0d; \
+	movzbl RX0bl,		RT2d; \
+	rorq $16,		RX0; \
+	movl s0(CTX,RT0,4),	RT0d; \
+	addl s1(CTX,RT2,4),	RT0d; \
+	xorl s2(CTX,RT1,4),	RT0d; \
+	addl s3(CTX,RT3,4),	RT0d; \
+	xorq RT0,		RX0;
+
+#define load_roundkey_enc(n) \
+	movq p+4*(n)(CTX), 	RX3;
+
+#define add_roundkey_enc() \
+	xorq RX3, 		RX0;
+
+#define round_enc(n) \
+	add_roundkey_enc(); \
+	load_roundkey_enc(n); \
+	\
+	F(); \
+	F();
+
+#define load_roundkey_dec(n) \
+	movq p+4*(n-1)(CTX),	RX3; \
+	rorq $32,		RX3;
+
+#define add_roundkey_dec() \
+	xorq RX3, 		RX0;
+
+#define round_dec(n) \
+	add_roundkey_dec(); \
+	load_roundkey_dec(n); \
+	\
+	F(); \
+	F();
+
+#define read_block() \
+	movq (RIO), 		RX0; \
+	rorq $32, 		RX0; \
+	bswapq 			RX0;
+
+#define write_block() \
+	bswapq 			RX0; \
+	movq RX0, 		(RIO);
+
+.align 8
+.type   __blowfish_enc_blk1,@function;
+
+__blowfish_enc_blk1:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RX0: input plaintext block
+	 * output:
+	 *	RX0: output ciphertext block
+	 */
+	movq %rbp, %r11; /* RT0 is %rbp (callee-saved): keep it in %r11 */
+
+	load_roundkey_enc(0);
+	round_enc(2);
+	round_enc(4);
+	round_enc(6);
+	round_enc(8);
+	round_enc(10);
+	round_enc(12);
+	round_enc(14);
+	round_enc(16);
+	add_roundkey_enc();
+
+	movq %r11, %rbp; /* restore caller's %rbp */
+
+	ret;
+.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
+
+.align 8
+.globl  _gcry_blowfish_amd64_do_encrypt
+.type   _gcry_blowfish_amd64_do_encrypt,@function;
+
+_gcry_blowfish_amd64_do_encrypt:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: u32 *ret_xl
+	 *	%rdx: u32 *ret_xr
+	 */
+	movl (%rdx), RX0d;
+	shlq $32, RX0;
+	movl (%rsi), RT3d;
+	movq %rdx, %r10; /* keep ret_xr: %rdx is RX3, overwritten by the round keys */
+	orq RT3, RX0; /* RX0 = (*ret_xr << 32) | *ret_xl */
+	movq %rsi, RX2; /* keep ret_xl: %rsi is RT1, overwritten by F() */
+
+	call __blowfish_enc_blk1;
+
+	movl RX0d, (%r10); /* low half -> *ret_xr */
+	shrq $32, RX0;
+	movl RX0d, (RX2); /* high half -> *ret_xl */
+
+	ret;
+.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;
+
+.align 8
+.globl  _gcry_blowfish_amd64_encrypt_block
+.type   _gcry_blowfish_amd64_encrypt_block,@function;
+
+_gcry_blowfish_amd64_encrypt_block:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r10; /* keep dst: %rsi doubles as RIO/RT1 and gets overwritten */
+
+	movq %rdx, RIO;
+	read_block();
+
+	call __blowfish_enc_blk1;
+
+	movq %r10, RIO;
+	write_block();
+
+	ret;
+.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;
+
+.align 8
+.globl  _gcry_blowfish_amd64_decrypt_block
+.type   _gcry_blowfish_amd64_decrypt_block,@function;
+
+_gcry_blowfish_amd64_decrypt_block:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	movq %rbp, %r11; /* RT0 is %rbp (callee-saved): keep it in %r11 */
+
+	movq %rsi, %r10; /* keep dst: %rsi doubles as RIO/RT1 and gets overwritten */
+	movq %rdx, RIO;
+
+	read_block();
+
+	load_roundkey_dec(17);
+	round_dec(15);
+	round_dec(13);
+	round_dec(11);
+	round_dec(9);
+	round_dec(7);
+	round_dec(5);
+	round_dec(3);
+	round_dec(1);
+	add_roundkey_dec();
+
+	movq %r10, RIO;
+	write_block();
+
+	movq %r11, %rbp; /* restore caller's %rbp */
+
+	ret;
+.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;
+
+/**********************************************************************
+  4-way blowfish, four blocks parallel (RX0..RX3), round key preloaded in RKEY
+ **********************************************************************/
+#define F4(x) \
+	movzbl x ## bh,		RT1d; \
+	movzbl x ## bl,		RT3d; \
+	rorq $16,		x; \
+	movzbl x ## bh,		RT0d; \
+	movzbl x ## bl,		RT2d; \
+	rorq $16,		x; \
+	movl s0(CTX,RT0,4),	RT0d; \
+	addl s1(CTX,RT2,4),	RT0d; \
+	xorl s2(CTX,RT1,4),	RT0d; \
+	addl s3(CTX,RT3,4),	RT0d; \
+	xorq RT0,		x;
+
+#define add_preloaded_roundkey4() \
+	xorq RKEY,		RX0; \
+	xorq RKEY,		RX1; \
+	xorq RKEY,		RX2; \
+	xorq RKEY,		RX3;
+
+#define preload_roundkey_enc(n) \
+	movq p+4*(n)(CTX),	RKEY;
+
+#define add_roundkey_enc4(n) \
+	add_preloaded_roundkey4(); \
+	preload_roundkey_enc(n + 2);
+
+#define round_enc4(n) \
+	add_roundkey_enc4(n); \
+	\
+	F4(RX0); \
+	F4(RX1); \
+	F4(RX2); \
+	F4(RX3); \
+	\
+	F4(RX0); \
+	F4(RX1); \
+	F4(RX2); \
+	F4(RX3);
+
+#define preload_roundkey_dec(n) \
+	movq p+4*((n)-1)(CTX),	RKEY; \
+	rorq $32,		RKEY;
+
+#define add_roundkey_dec4(n) \
+	add_preloaded_roundkey4(); \
+	preload_roundkey_dec(n - 2);
+
+#define round_dec4(n) \
+	add_roundkey_dec4(n); \
+	\
+	F4(RX0); \
+	F4(RX1); \
+	F4(RX2); \
+	F4(RX3); \
+	\
+	F4(RX0); \
+	F4(RX1); \
+	F4(RX2); \
+	F4(RX3);
+
+#define inbswap_block4() \
+	rorq $32,		RX0; \
+	bswapq 			RX0; \
+	rorq $32,		RX1; \
+	bswapq 			RX1; \
+	rorq $32,		RX2; \
+	bswapq 			RX2; \
+	rorq $32,		RX3; \
+	bswapq 			RX3;
+
+#define inctrswap_block4() \
+	rorq $32,		RX0; \
+	rorq $32,		RX1; \
+	rorq $32,		RX2; \
+	rorq $32,		RX3;
+
+#define outbswap_block4() \
+	bswapq 			RX0; \
+	bswapq 			RX1; \
+	bswapq 			RX2; \
+	bswapq 			RX3;
+
+.align 8
+.type   __blowfish_enc_blk4,@function;
+
+__blowfish_enc_blk4:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks
+	 * output:
+	 *	RX0,RX1,RX2,RX3: four output ciphertext blocks (outbswapped, ready to store)
+	 */
+	preload_roundkey_enc(0);
+
+	round_enc4(0);
+	round_enc4(2);
+	round_enc4(4);
+	round_enc4(6);
+	round_enc4(8);
+	round_enc4(10);
+	round_enc4(12);
+	round_enc4(14);
+	add_preloaded_roundkey4();
+
+	outbswap_block4();
+
+	ret;
+.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;
+
+.align 8
+.type   __blowfish_dec_blk4,@function;
+
+__blowfish_dec_blk4:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RX0,RX1,RX2,RX3: four input ciphertext blocks (as loaded from memory)
+	 * output:
+	 *	RX0,RX1,RX2,RX3: four output plaintext blocks (outbswapped, ready to store)
+	 */
+	preload_roundkey_dec(17);
+
+	inbswap_block4();
+
+	round_dec4(17);
+	round_dec4(15);
+	round_dec4(13);
+	round_dec4(11);
+	round_dec4(9);
+	round_dec4(7);
+	round_dec4(5);
+	round_dec4(3);
+	add_preloaded_roundkey4();
+
+	outbswap_block4();
+
+	ret;
+.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;
+
+.align 8
+.globl  _gcry_blowfish_amd64_ctr_enc
+.type   _gcry_blowfish_amd64_ctr_enc,@function;
+_gcry_blowfish_amd64_ctr_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+	pushq %rbp; /* RT0 */
+	pushq %rbx; /* RX1 */
+	pushq %r12;
+	pushq %r13;
+
+	/* %r11-%r13 are not used by __blowfish_enc_blk4 */
+	movq %rcx, %r13; /*iv*/
+	movq %rdx, %r12; /*src*/
+	movq %rsi, %r11; /*dst*/
+
+	/* load IV and byteswap */
+	movq (%r13), RT0;
+	bswapq RT0;
+	movq RT0, RX0;
+
+	/* construct IVs */
+	leaq 1(RT0), RX1;
+	leaq 2(RT0), RX2;
+	leaq 3(RT0), RX3;
+	leaq 4(RT0), RT0;
+	bswapq RT0;
+
+	inctrswap_block4();
+
+	/* store new IV */
+	movq RT0, (%r13);
+
+	call __blowfish_enc_blk4;
+
+	/* XOR key-stream with plaintext */
+	xorq 0 * 8(%r12), RX0;
+	xorq 1 * 8(%r12), RX1;
+	xorq 2 * 8(%r12), RX2;
+	xorq 3 * 8(%r12), RX3;
+	movq RX0, 0 * 8(%r11);
+	movq RX1, 1 * 8(%r11);
+	movq RX2, 2 * 8(%r11);
+	movq RX3, 3 * 8(%r11);
+
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;
+
+.align 8
+.globl  _gcry_blowfish_amd64_cbc_dec
+.type   _gcry_blowfish_amd64_cbc_dec,@function;
+_gcry_blowfish_amd64_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+	pushq %rbp; /* RT0 */
+	pushq %rbx; /* RX1 */
+	pushq %r12;
+	pushq %r13;
+
+	/* %r11-%r13 are not used by __blowfish_dec_blk4 */
+	movq %rsi, %r11; /*dst*/
+	movq %rdx, %r12; /*src*/
+	movq %rcx, %r13; /*iv*/
+
+	/* load input */
+	movq 0 * 8(%r12), RX0;
+	movq 1 * 8(%r12), RX1;
+	movq 2 * 8(%r12), RX2;
+	movq 3 * 8(%r12), RX3;
+
+	call __blowfish_dec_blk4;
+
+	movq 3 * 8(%r12), RT0; /* read last ciphertext block before dst is written (dst may be src) */
+	xorq      (%r13), RX0;
+	xorq 0 * 8(%r12), RX1;
+	xorq 1 * 8(%r12), RX2;
+	xorq 2 * 8(%r12), RX3;
+	movq RT0, (%r13); /* store new IV */
+
+	movq RX0, 0 * 8(%r11);
+	movq RX1, 1 * 8(%r11);
+	movq RX2, 2 * 8(%r11);
+	movq RX3, 3 * 8(%r11);
+
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;
+
+.align 8
+.globl  _gcry_blowfish_amd64_cfb_dec
+.type   _gcry_blowfish_amd64_cfb_dec,@function;
+_gcry_blowfish_amd64_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+	pushq %rbp; /* RT0 */
+	pushq %rbx; /* RX1 */
+	pushq %r12;
+	pushq %r13;
+
+	/* %r11-%r13 are not used by __blowfish_enc_blk4 */
+	movq %rcx, %r13; /*iv*/
+	movq %rdx, %r12; /*src*/
+	movq %rsi, %r11; /*dst*/
+
+	/* Load input: the IV followed by the first three ciphertext blocks */
+	movq (%r13), RX0;
+	movq 0 * 8(%r12), RX1;
+	movq 1 * 8(%r12), RX2;
+	movq 2 * 8(%r12), RX3;
+
+	inbswap_block4();
+
+	/* Update IV: the fourth ciphertext block becomes the next IV */
+	movq 3 * 8(%r12), RT0;
+	movq RT0, (%r13);
+
+	call __blowfish_enc_blk4;
+
+	xorq 0 * 8(%r12), RX0;
+	xorq 1 * 8(%r12), RX1;
+	xorq 2 * 8(%r12), RX2;
+	xorq 3 * 8(%r12), RX3;
+	movq RX0, 0 * 8(%r11);
+	movq RX1, 1 * 8(%r11);
+	movq RX2, 2 * 8(%r11);
+	movq RX3, 3 * 8(%r11);
+
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret;
+.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;
+
+#endif /*defined(USE_BLOWFISH)*/
+#endif /*__x86_64*/
diff --git a/cipher/blowfish.c b/cipher/blowfish.c
index b4d2b9c..39d4051 100644
--- a/cipher/blowfish.c
+++ b/cipher/blowfish.c
@@ -36,10 +36,20 @@
 #include "types.h"
 #include "g10lib.h"
 #include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
 
 #define BLOWFISH_BLOCKSIZE 8
 #define BLOWFISH_ROUNDS 16
 
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (BLOWFISH_ROUNDS == 16)
+# define USE_AMD64_ASM 1
+#endif
+
+
 typedef struct {
     u32 s0[256];
     u32 s1[256];
@@ -240,6 +250,61 @@ static const u32 ps[BLOWFISH_ROUNDS+2] = {
     0xC0AC29B7,0xC97C50DD,0x3F84D5B5,0xB5470917,0x9216D5D9,0x8979FB1B };
 
 
+#ifdef USE_AMD64_ASM
+
+/* Assembly implementations of Blowfish (defined in blowfish-amd64.S). */
+extern void _gcry_blowfish_amd64_do_encrypt(BLOWFISH_context *c, u32 *ret_xl,
+					    u32 *ret_xr);
+
+extern void _gcry_blowfish_amd64_encrypt_block(BLOWFISH_context *c, byte *out,
+					       const byte *in);
+
+extern void _gcry_blowfish_amd64_decrypt_block(BLOWFISH_context *c, byte *out,
+					       const byte *in);
+
+/* These assembly implementations process four blocks in parallel. */
+extern void _gcry_blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out,
+					 const byte *in, byte *ctr);
+
+extern void _gcry_blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out,
+					 const byte *in, byte *iv);
+
+extern void _gcry_blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out,
+					 const byte *in, byte *iv);
+
+static void
+do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
+{
+  _gcry_blowfish_amd64_do_encrypt (bc, ret_xl, ret_xr);
+}
+
+static void
+do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_blowfish_amd64_encrypt_block (context, outbuf, inbuf);
+}
+
+static void
+do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_blowfish_amd64_decrypt_block (context, outbuf, inbuf);
+}
+
+static void encrypt_block (void *context , byte *outbuf, const byte *inbuf)
+{
+  BLOWFISH_context *c = (BLOWFISH_context *) context;
+  do_encrypt_block (c, outbuf, inbuf);
+  _gcry_burn_stack (2*8); /* NOTE(review): presumably two 8-byte return addresses (call into the asm + its nested call) - confirm */
+}
+
+static void decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+  BLOWFISH_context *c = (BLOWFISH_context *) context;
+  do_decrypt_block (c, outbuf, inbuf);
+  _gcry_burn_stack (2*8); /* NOTE(review): the asm decrypt routine makes no nested call; same depth as encrypt_block */
+}
+
+#else /*USE_AMD64_ASM*/
 
 #if BLOWFISH_ROUNDS != 16
 static inline u32
@@ -461,6 +526,201 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   _gcry_burn_stack (64);
 }
 
+#endif /*!USE_AMD64_ASM*/
+
+
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size BLOWFISH_BLOCKSIZE and is advanced by one per processed block. */
+void
+_gcry_blowfish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
+		    const void *inbuf_arg, unsigned int nblocks)
+{
+  BLOWFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char tmpbuf[BLOWFISH_BLOCKSIZE];
+  int burn_stack_depth = (64) + 2 * BLOWFISH_BLOCKSIZE;
+  int i;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 5 * sizeof(void*); /* asm routine: 4 pushed registers + 1 nested call */
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        _gcry_blowfish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+        nblocks -= 4;
+        outbuf += 4 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 4 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+    /* TODO: use caching instead? */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* Encrypt the counter. */
+      do_encrypt_block(ctx, tmpbuf, ctr);
+      /* XOR the input with the encrypted counter and store in output.  */
+      buf_xor(outbuf, tmpbuf, inbuf, BLOWFISH_BLOCKSIZE);
+      outbuf += BLOWFISH_BLOCKSIZE;
+      inbuf  += BLOWFISH_BLOCKSIZE;
+      /* Increment the counter (big-endian: carry moves from the last byte up).  */
+      for (i = BLOWFISH_BLOCKSIZE; i > 0; i--)
+        {
+          ctr[i-1]++;
+          if (ctr[i-1])
+            break;
+        }
+    }
+
+  wipememory(tmpbuf, sizeof(tmpbuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  IV is replaced by the last ciphertext block processed. */
+void
+_gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
+		    const void *inbuf_arg, unsigned int nblocks)
+{
+  BLOWFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char savebuf[BLOWFISH_BLOCKSIZE];
+  int burn_stack_depth = (64) + 2 * BLOWFISH_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 5 * sizeof(void*); /* asm routine: 4 pushed registers + 1 nested call */
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        _gcry_blowfish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 4;
+        outbuf += 4 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 4 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* We need to save INBUF away because it may be identical to
+         OUTBUF.  */
+      memcpy(savebuf, inbuf, BLOWFISH_BLOCKSIZE);
+
+      do_decrypt_block (ctx, outbuf, inbuf);
+
+      buf_xor(outbuf, outbuf, iv, BLOWFISH_BLOCKSIZE);
+      memcpy(iv, savebuf, BLOWFISH_BLOCKSIZE);
+      inbuf += BLOWFISH_BLOCKSIZE;
+      outbuf += BLOWFISH_BLOCKSIZE;
+    }
+
+  wipememory(savebuf, sizeof(savebuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  IV is updated in place as blocks are processed. */
+void
+_gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
+		    const void *inbuf_arg, unsigned int nblocks)
+{
+  BLOWFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = (64) + 2 * BLOWFISH_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 5 * sizeof(void*); /* asm routine: 4 pushed registers + 1 nested call */
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        _gcry_blowfish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 4;
+        outbuf += 4 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 4 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      do_encrypt_block(ctx, iv, iv); /* encrypt the IV in place */
+      buf_xor_n_copy(outbuf, iv, inbuf, BLOWFISH_BLOCKSIZE); /* presumably out = iv ^ in, then iv = in - confirm in bufhelp.h */
+      outbuf += BLOWFISH_BLOCKSIZE;
+      inbuf  += BLOWFISH_BLOCKSIZE;
+    }
+
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Run the self-tests for BLOWFISH-CTR, tests IV increment of bulk CTR
+   encryption.  Returns NULL on success. */
+static const char *
+selftest_ctr (void)
+{
+  const int nblocks = 4+1; /* with USE_AMD64_ASM: one 4-block chunk plus one block for the generic loop */
+  const int blocksize = BLOWFISH_BLOCKSIZE;
+  const int context_size = sizeof(BLOWFISH_context);
+
+  return _gcry_selftest_helper_ctr("BLOWFISH", &bf_setkey,
+           &encrypt_block, &_gcry_blowfish_ctr_enc, nblocks, blocksize,
+	   context_size);
+}
+
+
+/* Run the self-tests for BLOWFISH-CBC, tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char *
+selftest_cbc (void)
+{
+  const int nblocks = 4+2; /* with USE_AMD64_ASM: one 4-block chunk plus two blocks for the generic loop */
+  const int blocksize = BLOWFISH_BLOCKSIZE;
+  const int context_size = sizeof(BLOWFISH_context);
+
+  return _gcry_selftest_helper_cbc("BLOWFISH", &bf_setkey,
+           &encrypt_block, &_gcry_blowfish_cbc_dec, nblocks, blocksize,
+	   context_size);
+}
+
+
+/* Run the self-tests for BLOWFISH-CFB, tests bulk CFB decryption.
+   Returns NULL on success. */
+static const char *
+selftest_cfb (void)
+{
+  const int nblocks = 4+2; /* with USE_AMD64_ASM: one 4-block chunk plus two blocks for the generic loop */
+  const int blocksize = BLOWFISH_BLOCKSIZE;
+  const int context_size = sizeof(BLOWFISH_context);
+
+  return _gcry_selftest_helper_cfb("BLOWFISH", &bf_setkey,
+           &encrypt_block, &_gcry_blowfish_cfb_dec, nblocks, blocksize,
+	   context_size);
+}
+
 
 static const char*
 selftest(void)
@@ -471,6 +731,7 @@ selftest(void)
   byte plain3[] = { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 };
   byte key3[] = { 0x41, 0x79, 0x6E, 0xA0, 0x52, 0x61, 0x6E, 0xE4 };
   byte cipher3[] = { 0xE1, 0x13, 0xF4, 0x10, 0x2C, 0xFC, 0xCE, 0x43 };
+  const char *r;
 
   bf_setkey( (void *) &c,
              (const unsigned char*)"abcdefghijklmnopqrstuvwxyz", 26 );
@@ -488,6 +749,16 @@ selftest(void)
   decrypt_block( (void *) &c, buffer, buffer );
   if( memcmp( buffer, plain3, 8 ) )
     return "Blowfish selftest failed (4).";
+
+  if ( (r = selftest_cbc ()) )
+    return r;
+
+  if ( (r = selftest_cfb ()) )
+    return r;
+
+  if ( (r = selftest_ctr ()) )
+    return r;
+
   return NULL;
 }
 
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 79ca755..508f26f 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -718,6 +718,13 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
               h->bulk.ctr_enc = _gcry_aes_ctr_enc;
               break;
 #endif /*USE_AES*/
+#ifdef USE_BLOWFISH
+	    case GCRY_CIPHER_BLOWFISH:
+              h->bulk.cfb_dec = _gcry_blowfish_cfb_dec;
+              h->bulk.cbc_dec = _gcry_blowfish_cbc_dec;
+              h->bulk.ctr_enc = _gcry_blowfish_ctr_enc;
+              break;
+#endif /*USE_BLOWFISH*/
 #ifdef USE_CAST5
 	    case GCRY_CIPHER_CAST5:
               h->bulk.cfb_dec = _gcry_cast5_cfb_dec;
diff --git a/configure.ac b/configure.ac
index 113c71f..f13a91b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1189,6 +1189,13 @@ LIST_MEMBER(blowfish, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo"
    AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo"
+      ;;
+   esac
 fi
 
 LIST_MEMBER(cast5, $enabled_ciphers)
diff --git a/src/cipher.h b/src/cipher.h
index 1742003..ca595b0 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -95,6 +95,19 @@ void _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
                         void *outbuf_arg, const void *inbuf_arg,
                         unsigned int nblocks);
 
+/*-- blowfish.c --*/
+void _gcry_blowfish_cfb_dec (void *context, unsigned char *iv,
+			     void *outbuf_arg, const void *inbuf_arg,
+			     unsigned int nblocks);
+
+void _gcry_blowfish_cbc_dec (void *context, unsigned char *iv,
+			     void *outbuf_arg, const void *inbuf_arg,
+			     unsigned int nblocks);
+
+void _gcry_blowfish_ctr_enc (void *context, unsigned char *ctr,
+			     void *outbuf_arg, const void *inbuf_arg,
+			     unsigned int nblocks);
+
 /*-- cast5.c --*/
 void _gcry_cast5_cfb_dec (void *context, unsigned char *iv,
 			  void *outbuf_arg, const void *inbuf_arg,




More information about the Gcrypt-devel mailing list