possible mpi-pow improvement
NIIBE Yutaka
gniibe at fsij.org
Fri Sep 6 10:41:37 CEST 2013
For the Yarom/Falkner flush+reload cache side-channel attack, we
changed the code so that it always calls the multiplication routine.
But this results in some performance regression.
That is unfortunate, so I am trying to recover the performance.
Attached is the patch I am testing. It needs more cleanup, but it
works for me with 'make check'.
About performance:
====================== original =====================
$ ./tests/benchmark rsa
Algorithm generate 100*sign 100*verify
------------------------------------------------
RSA 1024 bit 340ms 860ms 30ms
RSA 2048 bit 870ms 5510ms 110ms
RSA 3072 bit 6440ms 16930ms 210ms
RSA 4096 bit 17470ms 37270ms 360ms
Current fix:
Always call MUL, regardless of the value of E's bit.
====================== Always call MUL ===============
$ ./tests/benchmark rsa
Algorithm generate 100*sign 100*verify
------------------------------------------------
RSA 1024 bit 210ms 1180ms 30ms
RSA 2048 bit 2040ms 7450ms 110ms
RSA 3072 bit 21720ms 21960ms 210ms
RSA 4096 bit 25290ms 49680ms 360ms
My possible change:
====================== k-ary, MUL instead of SQR =====
Algorithm generate 100*sign 100*verify
------------------------------------------------
RSA 1024 bit 280ms 710ms 30ms
RSA 2048 bit 960ms 4410ms 110ms
RSA 3072 bit 17680ms 12990ms 220ms
RSA 4096 bit 12280ms 29550ms 360ms
Any comments are appreciated.
diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c
index 85d6fd8..4e75008 100644
--- a/mpi/mpi-pow.c
+++ b/mpi/mpi-pow.c
@@ -34,8 +34,36 @@
#include "longlong.h"
+static void
+mul_mod (mpi_ptr_t xp, mpi_size_t *xsize_p,
+ mpi_ptr_t rp, mpi_size_t rsize,
+ mpi_ptr_t sp, mpi_size_t ssize,
+ mpi_ptr_t mp, mpi_size_t msize,
+ struct karatsuba_ctx *karactx_p)
+{
+ if( ssize < KARATSUBA_THRESHOLD )
+ _gcry_mpih_mul ( xp, rp, rsize, sp, ssize );
+ else
+ _gcry_mpih_mul_karatsuba_case (xp, rp, rsize, sp, ssize, karactx_p);
+
+ if (rsize + ssize > msize)
+ {
+ _gcry_mpih_divrem (xp + msize, 0, xp, rsize + ssize, mp, msize);
+ *xsize_p = msize;
+ }
+ else
+ *xsize_p = rsize + ssize;
+}
+
+#define SIZE_G ((1 << (5 - 1)) - 1)
+
/****************
* RES = BASE ^ EXPO mod MOD
+ *
+ * Reference:
+ * Handbook of Applied Cryptography
+ * Algorithm 14.83: Modified left-to-right k-ary exponentiation
+ *
*/
void
gcry_mpi_powm (gcry_mpi_t res,
@@ -60,15 +88,28 @@ gcry_mpi_powm (gcry_mpi_t res,
unsigned int bp_nlimbs = 0;
unsigned int ep_nlimbs = 0;
unsigned int xp_nlimbs = 0;
- mpi_ptr_t tspace = NULL;
- mpi_size_t tsize = 0;
-
+ mpi_ptr_t g[SIZE_G];
+ mpi_size_t gsize[SIZE_G];
+ mpi_size_t W;
+ mpi_ptr_t g_k;
+ mpi_size_t g_k_size;
esize = expo->nlimbs;
msize = mod->nlimbs;
size = 2 * msize;
msign = mod->sign;
+ if (esize * BITS_PER_MPI_LIMB >= 512)
+ W = 5;
+ else if (esize * BITS_PER_MPI_LIMB >= 256)
+ W = 4;
+ else if (esize * BITS_PER_MPI_LIMB >= 128)
+ W = 3;
+ else if (esize * BITS_PER_MPI_LIMB >= 64)
+ W = 2;
+ else
+ W = 1;
+
esec = mpi_is_secure(expo);
msec = mpi_is_secure(mod);
bsec = mpi_is_secure(base);
@@ -167,18 +208,17 @@ gcry_mpi_powm (gcry_mpi_t res,
mpi_resize (res, size);
rp = res->d;
}
- MPN_COPY ( rp, bp, bsize );
- rsize = bsize;
- rsign = bsign;
/* Main processing. */
{
- mpi_size_t i;
+ mpi_size_t i, j;
mpi_ptr_t xp;
+ mpi_size_t xsize;
int c;
mpi_limb_t e;
mpi_limb_t carry_limb;
struct karatsuba_ctx karactx;
+ mpi_ptr_t tp;
xp_nlimbs = msec? (2 * (msize + 1)):0;
xp = xp_marker = mpi_alloc_limb_space( 2 * (msize + 1), msec );
@@ -186,11 +226,34 @@ gcry_mpi_powm (gcry_mpi_t res,
memset( &karactx, 0, sizeof karactx );
negative_result = (ep[0] & 1) && base->sign;
+ /* Precompute G[] */
+ if (W > 1) /* X^2 */
+ mul_mod (xp, &xsize, bp, bsize, bp, bsize, mp, msize, &karactx);
+ for (i = 1; i < (1 << (W - 1)); i++)
+ { /* Gk, k = 2*i + 1 */
+ if (i == 1)
+ {
+ g_k = bp;
+ g_k_size = bsize;
+ }
+ else
+ {
+ g_k = g[i-2];
+ g_k_size = gsize[i-2];
+ }
+
+ if (xsize >= g_k_size)
+ mul_mod (rp, &rsize, xp, xsize, g_k, g_k_size,
+ mp, msize, &karactx);
+ else
+ mul_mod (rp, &rsize, g_k, g_k_size, xp, xsize,
+ mp, msize, &karactx);
+ g[i-1] = mpi_alloc_limb_space (rsize, esec);
+ gsize[i-1] = rsize;
+ MPN_COPY (g[i-1], rp, rsize);
+ }
+
i = esize - 1;
- e = ep[i];
- count_leading_zeros (c, e);
- e = (e << c) << 1; /* Shift the expo bits to the left, lose msb. */
- c = BITS_PER_MPI_LIMB - 1 - c;
/* Main loop.
@@ -200,78 +263,137 @@ gcry_mpi_powm (gcry_mpi_t res,
_gcry_mpih_divmod. With 50% probability the result after this
loop will be in the area originally pointed by RP (==RES->d),
and with 50% probability in the area originally pointed to by XP. */
+ rsign = bsign;
+ if (W == 1)
+ {
+ rsize = bsize;
+ }
+ else
+ {
+ rsize = msize;
+ MPN_ZERO (rp, rsize);
+ }
+ MPN_COPY ( rp, bp, bsize );
+
+ e = ep[i];
+ count_leading_zeros (c, e);
+ e = (e << c) << 1;
+ c = BITS_PER_MPI_LIMB - 1 - c;
+
+ j = 0;
+
for (;;)
+ if (e == 0)
+ {
+ j += c;
+ i--;
+ if ( i < 0 )
+ {
+ c = 0;
+ break;
+ }
+
+ e = ep[i];
+ c = BITS_PER_MPI_LIMB;
+ }
+ else
+ {
+ int c0;
+ mpi_limb_t e0;
+
+ count_leading_zeros (c0, e);
+ e = (e << c0);
+ c -= c0;
+ j += c0;
+
+ if (c >= W)
+ {
+ e0 = (e >> (BITS_PER_MPI_LIMB - W));
+ e = (e << W);
+ c -= W;
+ }
+ else
+ {
+ i--;
+ if ( i < 0 )
+ {
+ e = (e >> (BITS_PER_MPI_LIMB - c));
+ break;
+ }
+
+ c0 = c;
+ e0 = (e >> (BITS_PER_MPI_LIMB - W))
+ | (ep[i] >> (BITS_PER_MPI_LIMB - W + c0));
+ e = (ep[i] << (W - c0));
+ c = BITS_PER_MPI_LIMB - W + c0;
+ }
+
+ count_trailing_zeros (c0, e0);
+ e0 = (e0 >> c0) >> 1;
+
+ for (j += W - c0; j; j--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
+
+ if (e0 == 0)
+ {
+ g_k = bp;
+ g_k_size = bsize;
+ }
+ else
+ {
+ g_k = g[e0 - 1];
+ g_k_size = gsize[e0 -1];
+ }
+
+ mul_mod (xp, &xsize, rp, rsize, g_k, g_k_size, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+
+ j = c0;
+ }
+
+ if (c != 0)
{
- while (c)
- {
- mpi_ptr_t tp;
- mpi_size_t xsize;
-
- /*mpih_mul_n(xp, rp, rp, rsize);*/
- if ( rsize < KARATSUBA_THRESHOLD )
- _gcry_mpih_sqr_n_basecase( xp, rp, rsize );
- else
- {
- if ( !tspace )
- {
- tsize = 2 * rsize;
- tspace = mpi_alloc_limb_space( tsize, 0 );
- }
- else if ( tsize < (2*rsize) )
- {
- _gcry_mpi_free_limb_space (tspace, 0);
- tsize = 2 * rsize;
- tspace = mpi_alloc_limb_space (tsize, 0 );
- }
- _gcry_mpih_sqr_n (xp, rp, rsize, tspace);
- }
-
- xsize = 2 * rsize;
- if ( xsize > msize )
- {
- _gcry_mpih_divrem(xp + msize, 0, xp, xsize, mp, msize);
- xsize = msize;
- }
-
- tp = rp; rp = xp; xp = tp;
- rsize = xsize;
-
- /* To mitigate the Yarom/Falkner flush+reload cache
- * side-channel attack on the RSA secret exponent, we do
- * the multiplication regardless of the value of the
- * high-bit of E. But to avoid this performance penalty
- * we do it only if the exponent has been stored in secure
- * memory and we can thus assume it is a secret exponent. */
- if (esec || (mpi_limb_signed_t)e < 0)
- {
- /*mpih_mul( xp, rp, rsize, bp, bsize );*/
- if( bsize < KARATSUBA_THRESHOLD )
- _gcry_mpih_mul ( xp, rp, rsize, bp, bsize );
- else
- _gcry_mpih_mul_karatsuba_case (xp, rp, rsize, bp, bsize,
- &karactx);
-
- xsize = rsize + bsize;
- if ( xsize > msize )
- {
- _gcry_mpih_divrem(xp + msize, 0, xp, xsize, mp, msize);
- xsize = msize;
- }
- }
- if ( (mpi_limb_signed_t)e < 0 )
- {
- tp = rp; rp = xp; xp = tp;
- rsize = xsize;
- }
- e <<= 1;
- c--;
- }
+ j += c;
+ count_trailing_zeros (c, e);
+ e = (e >> c);
+ j -= c;
+ }
- i--;
- if ( i < 0 )
- break;
- e = ep[i];
- c = BITS_PER_MPI_LIMB;
+ while (j--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
+
+ if (e != 0)
+ {
+ if ((e>>1) == 0)
+ {
+ g_k = bp;
+ g_k_size = bsize;
+ }
+ else
+ {
+ g_k = g[(e>>1) - 1];
+ g_k_size = gsize[(e>>1) -1];
+ }
+
+ mul_mod (xp, &xsize, rp, rsize, g_k, g_k_size, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+
+ for (; c; c--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
}
/* We shifted MOD, the modulo reduction argument, left
@@ -308,6 +430,8 @@ gcry_mpi_powm (gcry_mpi_t res,
MPN_NORMALIZE (rp, rsize);
_gcry_mpih_release_karatsuba_ctx (&karactx );
+ for (i = 0; i < (1 << (W - 1)) - 1; i++)
+ _gcry_mpi_free_limb_space( g[i], esec ? gsize[i] : 0 );
}
/* Fixup for negative results. */
@@ -333,6 +457,4 @@ gcry_mpi_powm (gcry_mpi_t res,
_gcry_mpi_free_limb_space( ep_marker, ep_nlimbs );
if (xp_marker)
_gcry_mpi_free_limb_space( xp_marker, xp_nlimbs );
- if (tspace)
- _gcry_mpi_free_limb_space( tspace, 0 );
}
--
More information about the Gcrypt-devel
mailing list