[PATCH 2/3] camellia: add parallel processing for CFB decryption

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu May 23 13:15:46 CEST 2013


* cipher/camellia-aesni-avx-amd64.S
(_gcry_camellia_aesni_avx_cfb_dec): New function.
* cipher/camellia-glue.c (_gcry_camellia_aesni_avx_cfb_dec): New
prototype.
(_gcry_camellia_cfb_dec): New function.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
* cipher/cipher.c (gry_cipher_open): Add bulk CFB decryption function
for Camellia.
* src/cipher.h (_gcry_camellia_cfb_dec): New prototype.
--

Patch makes Camellia-CFB decryption 4.7 times faster on Intel Sandy-Bridge.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/camellia-aesni-avx-amd64.S |   65 +++++++++++++++++++++++++++++++++
 cipher/camellia-glue.c            |   74 +++++++++++++++++++++++++++++++++++++
 cipher/cipher.c                   |    1 +
 src/cipher.h                      |    3 ++
 4 files changed, 143 insertions(+)

diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 2b1df17..95c96b8 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1116,5 +1116,70 @@ _gcry_camellia_aesni_avx_cbc_dec:
 	ret;
 .size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
 
+.align 8
+.global _gcry_camellia_aesni_avx_cfb_dec
+.type   _gcry_camellia_aesni_avx_cfb_dec, at function;
+
+_gcry_camellia_aesni_avx_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv
+	 */
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX), %xmm0;
+	vpshufb .Lpack_bswap RIP, %xmm0, %xmm0;
+	vpxor (%rcx), %xmm0, %xmm15;
+	vmovdqu 15 * 16(%rdx), %xmm1;
+	vmovdqu %xmm1, (%rcx); /* store new IV */
+	vpxor 0 * 16(%rdx), %xmm0, %xmm14;
+	vpxor 1 * 16(%rdx), %xmm0, %xmm13;
+	vpxor 2 * 16(%rdx), %xmm0, %xmm12;
+	vpxor 3 * 16(%rdx), %xmm0, %xmm11;
+	vpxor 4 * 16(%rdx), %xmm0, %xmm10;
+	vpxor 5 * 16(%rdx), %xmm0, %xmm9;
+	vpxor 6 * 16(%rdx), %xmm0, %xmm8;
+	vpxor 7 * 16(%rdx), %xmm0, %xmm7;
+	vpxor 8 * 16(%rdx), %xmm0, %xmm6;
+	vpxor 9 * 16(%rdx), %xmm0, %xmm5;
+	vpxor 10 * 16(%rdx), %xmm0, %xmm4;
+	vpxor 11 * 16(%rdx), %xmm0, %xmm3;
+	vpxor 12 * 16(%rdx), %xmm0, %xmm2;
+	vpxor 13 * 16(%rdx), %xmm0, %xmm1;
+	vpxor 14 * 16(%rdx), %xmm0, %xmm0;
+
+	call __camellia_enc_blk16;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
+
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
 #endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 4c724a9..f9bbb33 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -102,6 +102,11 @@ extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
 					     unsigned char *out,
 					     const unsigned char *in,
 					     unsigned char *iv);
+
+extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *iv);
 #endif
 
 static const char *selftest(void);
@@ -308,6 +313,58 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_camellia_cfb_dec(void *context, unsigned char *iv,
+                       void *outbuf_arg, const void *inbuf_arg,
+                       unsigned int nblocks)
+{
+  CAMELLIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      int did_use_aesni_avx = 0;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_camellia_aesni_avx_cfb_dec(ctx, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx = 1;
+        }
+
+      if (did_use_aesni_avx)
+        {
+          /* clear AVX registers */
+          asm volatile ("vzeroall;\n":::);
+
+          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
+            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv);
+      buf_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
+      outbuf += CAMELLIA_BLOCK_SIZE;
+      inbuf  += CAMELLIA_BLOCK_SIZE;
+    }
+
+  _gcry_burn_stack(burn_stack_depth);
+}
+
 /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
    encryption.  Returns NULL on success. */
 static const char*
@@ -336,6 +393,20 @@ selftest_cbc_128 (void)
 	   context_size);
 }
 
+/* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+  const int nblocks = 16+2;
+  const int blocksize = CAMELLIA_BLOCK_SIZE;
+  const int context_size = sizeof(CAMELLIA_context);
+
+  return _gcry_selftest_helper_cfb_128("CAMELLIA", &camellia_setkey,
+           &camellia_encrypt, &_gcry_camellia_cfb_dec, nblocks, blocksize,
+	   context_size);
+}
+
 static const char *
 selftest(void)
 {
@@ -411,6 +482,9 @@ selftest(void)
   if ( (r = selftest_cbc_128 ()) )
     return r;
 
+  if ( (r = selftest_cfb_128 ()) )
+    return r;
+
   return NULL;
 }
 
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 20ac2c7..e9a652f 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -723,6 +723,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
 	    case GCRY_CIPHER_CAMELLIA192:
 	    case GCRY_CIPHER_CAMELLIA256:
               h->bulk.cbc_dec = _gcry_camellia_cbc_dec;
+              h->bulk.cfb_dec = _gcry_camellia_cfb_dec;
               h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
               break;
 #endif /*USE_CAMELLIA*/
diff --git a/src/cipher.h b/src/cipher.h
index 4e68487..f28990d 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -102,6 +102,9 @@ void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
 void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
                              void *outbuf_arg, const void *inbuf_arg,
                              unsigned int nblocks);
+void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
+                             void *outbuf_arg, const void *inbuf_arg,
+                             unsigned int nblocks);
 
 /*-- serpent.c --*/
 void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,




More information about the Gcrypt-devel mailing list