[PATCH 2/3] camellia: add parallel processing for CFB decryption
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu May 23 13:15:46 CEST 2013
* cipher/camellia-aesni-avx-amd64.S
(_gcry_camellia_aesni_avx_cfb_dec): New function.
* cipher/camellia-glue.c (_gcry_camellia_aesni_avx_cfb_dec): New
prototype.
(_gcry_camellia_cfb_dec): New function.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
* cipher/cipher.c (gcry_cipher_open): Add bulk CFB decryption function
for Camellia.
* src/cipher.h (_gcry_camellia_cfb_dec): New prototype.
--
Patch makes Camellia-CFB decryption 4.7 times faster on Intel Sandy-Bridge.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-aesni-avx-amd64.S | 65 +++++++++++++++++++++++++++++++++
cipher/camellia-glue.c | 74 +++++++++++++++++++++++++++++++++++++
cipher/cipher.c | 1 +
src/cipher.h | 3 ++
4 files changed, 143 insertions(+)
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 2b1df17..95c96b8 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1116,5 +1116,70 @@ _gcry_camellia_aesni_avx_cbc_dec:
ret;
.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
+.align 8
+.global _gcry_camellia_aesni_avx_cfb_dec
+.type _gcry_camellia_aesni_avx_cfb_dec,@function;
+
+_gcry_camellia_aesni_avx_cfb_dec:
+ /* CFB-decrypt 16 blocks in one call.  input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (overwritten with the last src block)
+ */
+
+ subq $(16 * 16), %rsp; /* reserve 256 bytes of stack */
+ movq %rsp, %rax; /* NOTE(review): presumably scratch space for __camellia_enc_blk16; confirm */
+
+ /* inpack16_pre: XOR key material from key_table into IV and src[0..14] */
+ vmovq (key_table)(CTX), %xmm0;
+ vpshufb .Lpack_bswap RIP, %xmm0, %xmm0;
+ vpxor (%rcx), %xmm0, %xmm15; /* the IV is the first cipher input */
+ vmovdqu 15 * 16(%rdx), %xmm1;
+ vmovdqu %xmm1, (%rcx); /* store new IV */
+ vpxor 0 * 16(%rdx), %xmm0, %xmm14;
+ vpxor 1 * 16(%rdx), %xmm0, %xmm13;
+ vpxor 2 * 16(%rdx), %xmm0, %xmm12;
+ vpxor 3 * 16(%rdx), %xmm0, %xmm11;
+ vpxor 4 * 16(%rdx), %xmm0, %xmm10;
+ vpxor 5 * 16(%rdx), %xmm0, %xmm9;
+ vpxor 6 * 16(%rdx), %xmm0, %xmm8;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm7;
+ vpxor 8 * 16(%rdx), %xmm0, %xmm6;
+ vpxor 9 * 16(%rdx), %xmm0, %xmm5;
+ vpxor 10 * 16(%rdx), %xmm0, %xmm4;
+ vpxor 11 * 16(%rdx), %xmm0, %xmm3;
+ vpxor 12 * 16(%rdx), %xmm0, %xmm2;
+ vpxor 13 * 16(%rdx), %xmm0, %xmm1;
+ vpxor 14 * 16(%rdx), %xmm0, %xmm0; /* last use of the key material in %xmm0 */
+
+ call __camellia_enc_blk16;
+
+ addq $(16 * 16), %rsp; /* release the stack area reserved above */
+
+ vpxor 0 * 16(%rdx), %xmm7, %xmm7; /* XOR the cipher output with src blocks 0..15 */
+ vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+ vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+ vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+ vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+ vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+ vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+ vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+ vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+ vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+ vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+ vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+ vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+ vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+ vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ ret;
+.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
+
#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
#endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 4c724a9..f9bbb33 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -102,6 +102,11 @@ extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
+
+extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
#endif
static const char *selftest(void);
@@ -308,6 +313,58 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
+/* Bulk decryption of NBLOCKS complete blocks in CFB mode; IV is updated as
+ blocks are processed. Only intended for the bulk feature of cipher.c. */
+void
+_gcry_camellia_cfb_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ unsigned int nblocks)
+{
+ CAMELLIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+
+ /* Hand every full 16-block chunk to the AES-NI/AVX implementation. */
+ while (nblocks >= 16)
+ {
+ _gcry_camellia_aesni_avx_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ /* vzeroall: clear the AVX registers used by the 16-block routine. */
+ asm volatile ("vzeroall;\n":::);
+
+ if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
+ burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+ }
+
+ /* Fewer than 16 blocks remain; the generic loop below handles them. */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv); /* iv is both input and output */
+ buf_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE); /* NOTE(review): presumably also leaves the ciphertext block in iv; confirm */
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ }
+
+ _gcry_burn_stack(burn_stack_depth);
+}
+
/* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
encryption. Returns NULL on success. */
static const char*
@@ -336,6 +393,20 @@ selftest_cbc_128 (void)
context_size);
}
+/* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption with
+ 16+2 blocks (one 16-block chunk plus a remainder). Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 16+2;
+ const int blocksize = CAMELLIA_BLOCK_SIZE;
+ const int context_size = sizeof(CAMELLIA_context);
+
+ return _gcry_selftest_helper_cfb_128("CAMELLIA", &camellia_setkey,
+ &camellia_encrypt, &_gcry_camellia_cfb_dec, nblocks, blocksize,
+ context_size);
+}
+
static const char *
selftest(void)
{
@@ -411,6 +482,9 @@ selftest(void)
if ( (r = selftest_cbc_128 ()) )
return r;
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
return NULL;
}
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 20ac2c7..e9a652f 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -723,6 +723,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
case GCRY_CIPHER_CAMELLIA192:
case GCRY_CIPHER_CAMELLIA256:
h->bulk.cbc_dec = _gcry_camellia_cbc_dec;
+ h->bulk.cfb_dec = _gcry_camellia_cfb_dec;
h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
break;
#endif /*USE_CAMELLIA*/
diff --git a/src/cipher.h b/src/cipher.h
index 4e68487..f28990d 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -102,6 +102,9 @@ void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks);
+void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ unsigned int nblocks);
/*-- serpent.c --*/
void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,
More information about the Gcrypt-devel
mailing list