possible mpi-pow improvement
NIIBE Yutaka
gniibe at fsij.org
Fri Sep 6 10:41:37 CEST 2013
For the Yarom/Falkner flush+reload cache side-channel attack, we
changed the code so that it always calls the multiplication routine.
But this results in some performance regression.
That is unfortunate, so I am trying to recover the performance.
Attached is the patch I am testing. It needs more cleanup, but it
works for me with 'make check'.
About performance:
====================== original =====================
$ ./tests/benchmark rsa
Algorithm generate 100*sign 100*verify
------------------------------------------------
RSA 1024 bit 340ms 860ms 30ms
RSA 2048 bit 870ms 5510ms 110ms
RSA 3072 bit 6440ms 16930ms 210ms
RSA 4096 bit 17470ms 37270ms 360ms
Current fix:
Always call MUL, regardless of the value of E's bit.
====================== Always call MUL ===============
$ ./tests/benchmark rsa
Algorithm generate 100*sign 100*verify
------------------------------------------------
RSA 1024 bit 210ms 1180ms 30ms
RSA 2048 bit 2040ms 7450ms 110ms
RSA 3072 bit 21720ms 21960ms 210ms
RSA 4096 bit 25290ms 49680ms 360ms
My possible change:
====================== k-ary, MUL instead of SQR =====
Algorithm generate 100*sign 100*verify
------------------------------------------------
RSA 1024 bit 280ms 710ms 30ms
RSA 2048 bit 960ms 4410ms 110ms
RSA 3072 bit 17680ms 12990ms 220ms
RSA 4096 bit 12280ms 29550ms 360ms
Any comments are appreciated.
diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c
index 85d6fd8..4e75008 100644
--- a/mpi/mpi-pow.c
+++ b/mpi/mpi-pow.c
@@ -34,8 +34,36 @@
#include "longlong.h"
+static void
+mul_mod (mpi_ptr_t xp, mpi_size_t *xsize_p,
+ mpi_ptr_t rp, mpi_size_t rsize,
+ mpi_ptr_t sp, mpi_size_t ssize,
+ mpi_ptr_t mp, mpi_size_t msize,
+ struct karatsuba_ctx *karactx_p)
+{
+ if( ssize < KARATSUBA_THRESHOLD )
+ _gcry_mpih_mul ( xp, rp, rsize, sp, ssize );
+ else
+ _gcry_mpih_mul_karatsuba_case (xp, rp, rsize, sp, ssize, karactx_p);
+
+ if (rsize + ssize > msize)
+ {
+ _gcry_mpih_divrem (xp + msize, 0, xp, rsize + ssize, mp, msize);
+ *xsize_p = msize;
+ }
+ else
+ *xsize_p = rsize + ssize;
+}
+
+#define SIZE_G ((1 << (5 - 1)) - 1)
+
/****************
* RES = BASE ^ EXPO mod MOD
+ *
+ * Reference:
+ * Handbook of Applied Cryptography
+ * Algorithm 14.83: Modified left-to-right k-ary exponentiation
+ *
*/
void
gcry_mpi_powm (gcry_mpi_t res,
@@ -60,15 +88,28 @@ gcry_mpi_powm (gcry_mpi_t res,
unsigned int bp_nlimbs = 0;
unsigned int ep_nlimbs = 0;
unsigned int xp_nlimbs = 0;
- mpi_ptr_t tspace = NULL;
- mpi_size_t tsize = 0;
-
+ mpi_ptr_t g[SIZE_G];
+ mpi_size_t gsize[SIZE_G];
+ mpi_size_t W;
+ mpi_ptr_t g_k;
+ mpi_size_t g_k_size;
esize = expo->nlimbs;
msize = mod->nlimbs;
size = 2 * msize;
msign = mod->sign;
+ if (esize * BITS_PER_MPI_LIMB >= 512)
+ W = 5;
+ else if (esize * BITS_PER_MPI_LIMB >= 256)
+ W = 4;
+ else if (esize * BITS_PER_MPI_LIMB >= 128)
+ W = 3;
+ else if (esize * BITS_PER_MPI_LIMB >= 64)
+ W = 2;
+ else
+ W = 1;
+
esec = mpi_is_secure(expo);
msec = mpi_is_secure(mod);
bsec = mpi_is_secure(base);
@@ -167,18 +208,17 @@ gcry_mpi_powm (gcry_mpi_t res,
mpi_resize (res, size);
rp = res->d;
}
- MPN_COPY ( rp, bp, bsize );
- rsize = bsize;
- rsign = bsign;
/* Main processing. */
{
- mpi_size_t i;
+ mpi_size_t i, j;
mpi_ptr_t xp;
+ mpi_size_t xsize;
int c;
mpi_limb_t e;
mpi_limb_t carry_limb;
struct karatsuba_ctx karactx;
+ mpi_ptr_t tp;
xp_nlimbs = msec? (2 * (msize + 1)):0;
xp = xp_marker = mpi_alloc_limb_space( 2 * (msize + 1), msec );
@@ -186,11 +226,34 @@ gcry_mpi_powm (gcry_mpi_t res,
memset( &karactx, 0, sizeof karactx );
negative_result = (ep[0] & 1) && base->sign;
+ /* Precompute G[] */
+ if (W > 1) /* X^2 */
+ mul_mod (xp, &xsize, bp, bsize, bp, bsize, mp, msize, &karactx);
+ for (i = 1; i < (1 << (W - 1)); i++)
+ { /* Gk, k = 2*i + 1 */
+ if (i == 1)
+ {
+ g_k = bp;
+ g_k_size = bsize;
+ }
+ else
+ {
+ g_k = g[i-2];
+ g_k_size = gsize[i-2];
+ }
+
+ if (xsize >= g_k_size)
+ mul_mod (rp, &rsize, xp, xsize, g_k, g_k_size,
+ mp, msize, &karactx);
+ else
+ mul_mod (rp, &rsize, g_k, g_k_size, xp, xsize,
+ mp, msize, &karactx);
+ g[i-1] = mpi_alloc_limb_space (rsize, esec);
+ gsize[i-1] = rsize;
+ MPN_COPY (g[i-1], rp, rsize);
+ }
+
i = esize - 1;
- e = ep[i];
- count_leading_zeros (c, e);
- e = (e << c) << 1; /* Shift the expo bits to the left, lose msb. */
- c = BITS_PER_MPI_LIMB - 1 - c;
/* Main loop.
@@ -200,78 +263,137 @@ gcry_mpi_powm (gcry_mpi_t res,
_gcry_mpih_divmod. With 50% probability the result after this
loop will be in the area originally pointed by RP (==RES->d),
and with 50% probability in the area originally pointed to by XP. */
+ rsign = bsign;
+ if (W == 1)
+ {
+ rsize = bsize;
+ }
+ else
+ {
+ rsize = msize;
+ MPN_ZERO (rp, rsize);
+ }
+ MPN_COPY ( rp, bp, bsize );
+
+ e = ep[i];
+ count_leading_zeros (c, e);
+ e = (e << c) << 1;
+ c = BITS_PER_MPI_LIMB - 1 - c;
+
+ j = 0;
+
for (;;)
+ if (e == 0)
+ {
+ j += c;
+ i--;
+ if ( i < 0 )
+ {
+ c = 0;
+ break;
+ }
+
+ e = ep[i];
+ c = BITS_PER_MPI_LIMB;
+ }
+ else
+ {
+ int c0;
+ mpi_limb_t e0;
+
+ count_leading_zeros (c0, e);
+ e = (e << c0);
+ c -= c0;
+ j += c0;
+
+ if (c >= W)
+ {
+ e0 = (e >> (BITS_PER_MPI_LIMB - W));
+ e = (e << W);
+ c -= W;
+ }
+ else
+ {
+ i--;
+ if ( i < 0 )
+ {
+ e = (e >> (BITS_PER_MPI_LIMB - c));
+ break;
+ }
+
+ c0 = c;
+ e0 = (e >> (BITS_PER_MPI_LIMB - W))
+ | (ep[i] >> (BITS_PER_MPI_LIMB - W + c0));
+ e = (ep[i] << (W - c0));
+ c = BITS_PER_MPI_LIMB - W + c0;
+ }
+
+ count_trailing_zeros (c0, e0);
+ e0 = (e0 >> c0) >> 1;
+
+ for (j += W - c0; j; j--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
+
+ if (e0 == 0)
+ {
+ g_k = bp;
+ g_k_size = bsize;
+ }
+ else
+ {
+ g_k = g[e0 - 1];
+ g_k_size = gsize[e0 -1];
+ }
+
+ mul_mod (xp, &xsize, rp, rsize, g_k, g_k_size, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+
+ j = c0;
+ }
+
+ if (c != 0)
{
- while (c)
- {
- mpi_ptr_t tp;
- mpi_size_t xsize;
-
- /*mpih_mul_n(xp, rp, rp, rsize);*/
- if ( rsize < KARATSUBA_THRESHOLD )
- _gcry_mpih_sqr_n_basecase( xp, rp, rsize );
- else
- {
- if ( !tspace )
- {
- tsize = 2 * rsize;
- tspace = mpi_alloc_limb_space( tsize, 0 );
- }
- else if ( tsize < (2*rsize) )
- {
- _gcry_mpi_free_limb_space (tspace, 0);
- tsize = 2 * rsize;
- tspace = mpi_alloc_limb_space (tsize, 0 );
- }
- _gcry_mpih_sqr_n (xp, rp, rsize, tspace);
- }
-
- xsize = 2 * rsize;
- if ( xsize > msize )
- {
- _gcry_mpih_divrem(xp + msize, 0, xp, xsize, mp, msize);
- xsize = msize;
- }
-
- tp = rp; rp = xp; xp = tp;
- rsize = xsize;
-
- /* To mitigate the Yarom/Falkner flush+reload cache
- * side-channel attack on the RSA secret exponent, we do
- * the multiplication regardless of the value of the
- * high-bit of E. But to avoid this performance penalty
- * we do it only if the exponent has been stored in secure
- * memory and we can thus assume it is a secret exponent. */
- if (esec || (mpi_limb_signed_t)e < 0)
- {
- /*mpih_mul( xp, rp, rsize, bp, bsize );*/
- if( bsize < KARATSUBA_THRESHOLD )
- _gcry_mpih_mul ( xp, rp, rsize, bp, bsize );
- else
- _gcry_mpih_mul_karatsuba_case (xp, rp, rsize, bp, bsize,
- &karactx);
-
- xsize = rsize + bsize;
- if ( xsize > msize )
- {
- _gcry_mpih_divrem(xp + msize, 0, xp, xsize, mp, msize);
- xsize = msize;
- }
- }
- if ( (mpi_limb_signed_t)e < 0 )
- {
- tp = rp; rp = xp; xp = tp;
- rsize = xsize;
- }
- e <<= 1;
- c--;
- }
+ j += c;
+ count_trailing_zeros (c, e);
+ e = (e >> c);
+ j -= c;
+ }
- i--;
- if ( i < 0 )
- break;
- e = ep[i];
- c = BITS_PER_MPI_LIMB;
+ while (j--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
+
+ if (e != 0)
+ {
+ if ((e>>1) == 0)
+ {
+ g_k = bp;
+ g_k_size = bsize;
+ }
+ else
+ {
+ g_k = g[(e>>1) - 1];
+ g_k_size = gsize[(e>>1) -1];
+ }
+
+ mul_mod (xp, &xsize, rp, rsize, g_k, g_k_size, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+
+ for (; c; c--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
}
/* We shifted MOD, the modulo reduction argument, left
@@ -308,6 +430,8 @@ gcry_mpi_powm (gcry_mpi_t res,
MPN_NORMALIZE (rp, rsize);
_gcry_mpih_release_karatsuba_ctx (&karactx );
+ for (i = 0; i < (1 << (W - 1)) - 1; i++)
+ _gcry_mpi_free_limb_space( g[i], esec ? gsize[i] : 0 );
}
/* Fixup for negative results. */
@@ -333,6 +457,4 @@ gcry_mpi_powm (gcry_mpi_t res,
_gcry_mpi_free_limb_space( ep_marker, ep_nlimbs );
if (xp_marker)
_gcry_mpi_free_limb_space( xp_marker, xp_nlimbs );
- if (tspace)
- _gcry_mpi_free_limb_space( tspace, 0 );
}
--
More information about the Gcrypt-devel
mailing list