[PATCH 1/3] rijndael: add parallel processing for CFB decryption with AES-NI
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu May 23 13:15:41 CEST 2013
* cipher/cipher-selftest.c (_gcry_selftest_helper_cfb_128): New
function for CFB selftests.
* cipher/cipher-selftest.h (_gcry_selftest_helper_cfb_128): New
prototype.
* cipher/rijndael.c [USE_AESNI] (do_aesni_enc_vec4): New function.
(_gcry_aes_cfb_dec) [USE_AESNI]: Add parallelized CFB decryption.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
--
CFB decryption can be parallelized for additional performance. On an Intel
Sandy Bridge processor, this change makes CFB decryption 4.6 times faster.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/cipher-selftest.c | 113 ++++++++++++++++++++++++++++++
cipher/cipher-selftest.h | 13 +++
cipher/rijndael.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 299 insertions(+), 1 deletion(-)
diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c
index 439f3ae..41eb405 100644
--- a/cipher/cipher-selftest.c
+++ b/cipher/cipher-selftest.c
@@ -160,6 +160,119 @@ _gcry_selftest_helper_cbc_128 (const char *cipher,
return NULL;
}
+/* Run the self-tests for <block cipher>-CFB-128, tests bulk CFB
+ decryption. Returns NULL on success, otherwise a static error string.
+
+ CIPHER is the cipher name inserted into the syslog messages.
+ SETKEY_FUNC is called once with the context and the fixed 16-byte
+ test key. ENCRYPT_ONE is called with the context, the ciphertext
+ buffer and the IV to build the reference ciphertext. BULK_CFB_DEC is
+ the bulk decryption routine under test; it is called with the
+ context, the IV, the output buffer, the input buffer and a block
+ count. NBLOCKS is the block count used for the multi-block test,
+ BLOCKSIZE the block length in bytes and CONTEXT_SIZE the size in
+ bytes of the cipher context to allocate.
+
+ The function first decrypts a single block and then NBLOCKS blocks;
+ after each run both the recovered plaintext and the updated IV are
+ compared against the manually computed reference. The work buffer
+ is released on every return path that follows the allocation. */
+const char *
+_gcry_selftest_helper_cfb_128 (const char *cipher,
+ gcry_cipher_setkey_t setkey_func,
+ gcry_cipher_encrypt_t encrypt_one,
+ gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec,
+ const int nblocks, const int blocksize,
+ const int context_size)
+{
+ int i, offs;
+ unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+ unsigned int ctx_aligned_size, memsize;
+
+ static const unsigned char key[16] ATTR_ALIGNED_16 = {
+ 0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33
+ };
+
+ /* Allocate buffers, align elements to 16 bytes. The context size is
+ rounded up to a multiple of 16 and 16 spare bytes are added so that
+ the start of the allocation can be moved to a 16-byte boundary.
+ Layout after alignment: ctx, iv, iv2, plaintext, plaintext2,
+ ciphertext. */
+ ctx_aligned_size = context_size + 15;
+ ctx_aligned_size -= ctx_aligned_size & 0xf;
+
+ memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
+
+ mem = gcry_calloc (1, memsize);
+ if (!mem)
+ return "failed to allocate memory";
+
+ offs = (16 - ((uintptr_t)mem & 15)) & 15;
+ ctx = (void*)(mem + offs);
+ iv = ctx + ctx_aligned_size;
+ iv2 = iv + blocksize;
+ plaintext = iv2 + blocksize;
+ plaintext2 = plaintext + nblocks * blocksize;
+ ciphertext = plaintext2 + nblocks * blocksize;
+
+ /* Initialize ctx with the test key. */
+ setkey_func (ctx, key, sizeof(key));
+
+ /* Test single block code path; both IV copies start out identical. */
+ memset(iv, 0xd3, blocksize);
+ memset(iv2, 0xd3, blocksize);
+ for (i = 0; i < blocksize; i++)
+ plaintext[i] = i;
+
+ /* CFB manually: encrypt the IV, then combine it with the plaintext.
+ NOTE(review): buf_xor_2dst is not visible here; presumably it XORs
+ the plaintext into ciphertext and stores the same result in iv,
+ making iv the next CFB IV -- confirm. */
+ encrypt_one (ctx, ciphertext, iv);
+ buf_xor_2dst (iv, ciphertext, plaintext, blocksize);
+
+ /* CFB decrypt one block with the routine under test. */
+ bulk_cfb_dec (ctx, iv2, plaintext2, ciphertext, 1);
+ if (memcmp(plaintext2, plaintext, blocksize))
+ {
+ gcry_free(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-128-CFB test failed (plaintext mismatch)", cipher);
+#endif
+ return "selftest for 128 bit CFB failed - see syslog for details";
+ }
+
+ if (memcmp(iv2, iv, blocksize))
+ {
+ gcry_free(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-128-CFB test failed (IV mismatch)", cipher);
+#endif
+ return "selftest for 128 bit CFB failed - see syslog for details";
+ }
+
+ /* Test parallelized code paths with fresh IVs and NBLOCKS blocks. */
+ memset(iv, 0xe6, blocksize);
+ memset(iv2, 0xe6, blocksize);
+
+ for (i = 0; i < nblocks * blocksize; i++)
+ plaintext[i] = i;
+
+ /* Create CFB ciphertext manually, one block at a time. */
+ for (i = 0; i < nblocks * blocksize; i+=blocksize)
+ {
+ encrypt_one (ctx, &ciphertext[i], iv);
+ buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize);
+ }
+
+ /* Decrypt using bulk CFB and compare result. */
+ bulk_cfb_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
+
+ if (memcmp(plaintext2, plaintext, nblocks * blocksize))
+ {
+ gcry_free(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-128-CFB test failed (plaintext mismatch, parallel path)",
+ cipher);
+#endif
+ return "selftest for 128 bit CFB failed - see syslog for details";
+ }
+ if (memcmp(iv2, iv, blocksize))
+ {
+ gcry_free(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-128-CFB test failed (IV mismatch, parallel path)", cipher);
+#endif
+ return "selftest for 128 bit CFB failed - see syslog for details";
+ }
+
+ gcry_free(mem);
+ return NULL;
+}
+
/* Run the self-tests for <block cipher>-CTR-128, tests IV increment of bulk CTR
encryption. Returns NULL on success. */
const char *
diff --git a/cipher/cipher-selftest.h b/cipher/cipher-selftest.h
index 89d79c2..30bc251 100644
--- a/cipher/cipher-selftest.h
+++ b/cipher/cipher-selftest.h
@@ -30,6 +30,11 @@ typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv,
const void *inbuf_arg,
unsigned int nblocks);
+typedef void (*gcry_cipher_bulk_cfb_dec_t)(void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ unsigned int nblocks);
+
typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv,
void *outbuf_arg,
const void *inbuf_arg,
@@ -43,6 +48,14 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey,
const int nblocks, const int blocksize,
const int context_size);
+/* Helper function for bulk CFB decryption selftest. Returns NULL on
+ success, otherwise a string describing the failure. */
+const char *
+_gcry_selftest_helper_cfb_128 (const char *cipher, gcry_cipher_setkey_t setkey,
+ gcry_cipher_encrypt_t encrypt_one,
+ gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec,
+ const int nblocks, const int blocksize,
+ const int context_size);
+
/* Helper function for bulk CTR encryption selftest */
const char *
_gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey,
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 4c81688..9f075ff 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -821,6 +821,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
}
+/* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4.
+ *
+ * xmm0 is overwritten: it holds the current round key, which is loaded
+ * from ctx->keyschenc in 16-byte steps. ctx->rounds is compared against
+ * 10 and 12 to jump to the final round early; otherwise the round keys up
+ * to offset 0xe0 are used. The aesenc/aesenclast instructions are emitted
+ * as raw .byte sequences through the local macros defined below, which are
+ * undefined again at the end of the function.
+ *
+ * NOTE(review): the round keys are read with movdqa, so keyschenc is
+ * presumably required to be 16-byte aligned -- confirm against the
+ * RIJNDAEL_context definition.
+ * NOTE(review): the jump label is called .Ldeclast although this is the
+ * encryption routine; the name does not match the operation. */
+static void
+do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
+#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
+#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
+#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
+#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
+#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
+#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
+#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ aesenclast_xmm0_xmm1
+ aesenclast_xmm0_xmm2
+ aesenclast_xmm0_xmm3
+ aesenclast_xmm0_xmm4
+ : /* no output */
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesenc_xmm0_xmm1
+#undef aesenc_xmm0_xmm2
+#undef aesenc_xmm0_xmm3
+#undef aesenc_xmm0_xmm4
+#undef aesenclast_xmm0_xmm1
+#undef aesenclast_xmm0_xmm2
+#undef aesenclast_xmm0_xmm3
+#undef aesenclast_xmm0_xmm4
+}
+
+
/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input
* and output through SSE registers xmm1 to xmm4. */
static void
@@ -1685,7 +1794,7 @@ rijndael_decrypt (void *context, byte *b, const byte *a)
/* Bulk decryption of complete blocks in CFB mode. Caller needs to
- make sure that IV is aligned on an unisgned lonhg boundary. This
+ make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
@@ -1716,6 +1825,50 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
else if (ctx->use_aesni)
{
aesni_prepare ();
+
+ /* CFB decryption can be parallelized */
+ for ( ;nblocks >= 4; nblocks -= 4)
+ {
+ asm volatile
+ ("movdqu (%[iv]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+
+ "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */
+ "movdqu %%xmm0, (%[iv])\n\t"
+ : /* No output */
+ : [inbuf] "r" (inbuf), [iv] "r" (iv)
+ : "memory");
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+ "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+ "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+ "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
for ( ;nblocks; nblocks-- )
{
do_aesni_cfb (ctx, 1, iv, outbuf, inbuf);
@@ -1723,6 +1876,7 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
inbuf += BLOCKSIZE;
}
aesni_cleanup ();
+ aesni_cleanup_2_5 ();
}
#endif /*USE_AESNI*/
else
@@ -2035,6 +2189,21 @@ selftest_cbc_128 (void)
}
+/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption.
+ Returns NULL on success.
+
+ The generic CFB helper is driven with 8+2 blocks of BLOCKSIZE bytes and
+ a context of sizeof(RIJNDAEL_context) bytes; rijndael_setkey,
+ rijndael_encrypt and _gcry_aes_cfb_dec are passed as the key setup,
+ single block encryption and bulk decryption callbacks. With the
+ AES-NI branch of _gcry_aes_cfb_dec, ten blocks give two passes of the
+ four-block loop followed by two single-block iterations. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 8+2;
+ const int blocksize = BLOCKSIZE;
+ const int context_size = sizeof(RIJNDAEL_context);
+
+ return _gcry_selftest_helper_cfb_128("AES", &rijndael_setkey,
+ &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize,
+ context_size);
+}
+
+
/* Run all the self-tests and return NULL on success. This function
is used for the on-the-fly self-tests. */
static const char *
@@ -2053,6 +2222,9 @@ selftest (void)
if ( (r = selftest_cbc_128 ()) )
return r;
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
return r;
}
More information about the Gcrypt-devel
mailing list