possible mpi-pow improvement
NIIBE Yutaka
gniibe at fsij.org
Wed Oct 2 02:41:52 CEST 2013
On 2013-10-01 at 13:31 +0200, Werner Koch wrote:
> Thus your change is even an improvement for the general case.
Yes, at least on my machine.
> Can you please change your patch to conditionally include the k-ary
> multiply but enable it right away.
Sure. Here is the current version, which is ready to push.
Since then, I modified the selection of the value W (window size) for
64-bit machines, and added some comments.  (With this selection, a
2048-bit exponent uses W = 5, i.e. 15 precomputed odd powers.)
With the compilation option -DUSE_ALGORITHM_SIMPLE_EXPONENTIATION, we can
use the old implementation.
Please test it out.
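For a quick sanity check, something along these lines might do (just a
sketch against the public libgcrypt API, not part of the patch; the file
name test-powm.c is arbitrary, build with: gcc test-powm.c -lgcrypt):

#include <stdio.h>
#include <gcrypt.h>

int
main (void)
{
  gcry_mpi_t b, e, m, r;

  /* Initialize libgcrypt before any other use.  */
  if (!gcry_check_version (NULL))
    return 1;

  b = gcry_mpi_set_ui (NULL, 7);
  e = gcry_mpi_set_ui (NULL, 560);
  m = gcry_mpi_set_ui (NULL, 561);
  r = gcry_mpi_new (0);

  /* r := 7^560 mod 561; 561 is a Carmichael number, so r must be 1.  */
  gcry_mpi_powm (r, b, e, m);
  printf ("%s\n", gcry_mpi_cmp_ui (r, 1) == 0 ? "ok" : "FAIL");

  gcry_mpi_release (b);
  gcry_mpi_release (e);
  gcry_mpi_release (m);
  gcry_mpi_release (r);
  return 0;
}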
* mpi/mpi-pow.c (gcry_mpi_powm): New implementation of left-to-right
k-ary exponentiation.
--
Signed-off-by: NIIBE Yutaka <gniibe at fsij.org>
To mitigate the Yarom/Falkner flush+reload cache side-channel attack, we
changed the code so that it always calls the multiplication routine
(even when it could be skipped to obtain the result).  This caused some
performance regression.
This change recovers that performance by using a more efficient algorithm.
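To illustrate the idea (this sketch is not part of the patch; powm_kary,
mulmod and K are made-up names): the exponent is scanned from the most
significant end in fixed K-bit windows, squaring K times and then
multiplying unconditionally once per window.  The patch implements the
modified variant (HAC Algorithm 14.83), which precomputes only the odd
powers BASE^3, ^5, ^7, ... and strips trailing zero bits from each
window; for brevity the sketch below precomputes all 2^K powers and
works on 64-bit integers.

#include <stdint.h>
#include <stdio.h>

#define K 4                      /* window size, analogous to W in the patch */

/* Assumes M < 2^32 so that A * B does not overflow uint64_t.  */
static uint64_t
mulmod (uint64_t a, uint64_t b, uint64_t m)
{
  return (a * b) % m;
}

static uint64_t
powm_kary (uint64_t base, uint64_t expo, uint64_t mod)
{
  uint64_t tbl[1 << K];          /* tbl[w] = BASE^w mod MOD */
  uint64_t r = 1 % mod;
  int i, j;

  tbl[0] = 1 % mod;
  for (i = 1; i < (1 << K); i++)
    tbl[i] = mulmod (tbl[i - 1], base % mod, mod);

  /* Scan the exponent from the most significant K-bit window down.  */
  for (i = (63 / K) * K; i >= 0; i -= K)
    {
      unsigned int w = (unsigned int)((expo >> i) & ((1 << K) - 1));

      for (j = 0; j < K; j++)    /* R := R^(2^K) mod MOD */
        r = mulmod (r, r, mod);
      /* Multiply even when the window is zero, matching the
         always-multiply mitigation described above.  */
      r = mulmod (r, tbl[w], mod);
    }
  return r;
}

int
main (void)
{
  /* 2^10 mod 1000 == 24.  */
  printf ("%u\n", (unsigned int) powm_kary (2, 10, 1000));
  return 0;
}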
---
mpi/mpi-pow.c | 454 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 454 insertions(+)
diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c
index 85d6fd8..469c382 100644
--- a/mpi/mpi-pow.c
+++ b/mpi/mpi-pow.c
@@ -34,6 +34,14 @@
#include "longlong.h"
+/*
+ * When you need the old implementation, please add the compilation option
+ * -DUSE_ALGORITHM_SIMPLE_EXPONENTIATION
+ * or expose this line:
+#define USE_ALGORITHM_SIMPLE_EXPONENTIATION 1
+ */
+
+#if defined(USE_ALGORITHM_SIMPLE_EXPONENTIATION)
/****************
* RES = BASE ^ EXPO mod MOD
*/
@@ -336,3 +344,449 @@ gcry_mpi_powm (gcry_mpi_t res,
if (tspace)
_gcry_mpi_free_limb_space( tspace, 0 );
}
+#else
+/**
+ * Internal function to compute
+ *
+ * X = R * S mod M
+ *
+ * and set the size of X at the pointer XSIZE_P.
+ * Use karatsuba structure at KARACTX_P.
+ *
+ * Condition:
+ * RSIZE >= SSIZE
+ * Enough space for X is allocated beforehand.
+ *
+ * For generic cases, we can/should use gcry_mpi_mulm.
+ * This function is used for this specific internal case.
+ */
+static void
+mul_mod (mpi_ptr_t xp, mpi_size_t *xsize_p,
+ mpi_ptr_t rp, mpi_size_t rsize,
+ mpi_ptr_t sp, mpi_size_t ssize,
+ mpi_ptr_t mp, mpi_size_t msize,
+ struct karatsuba_ctx *karactx_p)
+{
+ if( ssize < KARATSUBA_THRESHOLD )
+ _gcry_mpih_mul ( xp, rp, rsize, sp, ssize );
+ else
+ _gcry_mpih_mul_karatsuba_case (xp, rp, rsize, sp, ssize, karactx_p);
+
+ if (rsize + ssize > msize)
+ {
+ _gcry_mpih_divrem (xp + msize, 0, xp, rsize + ssize, mp, msize);
+ *xsize_p = msize;
+ }
+ else
+ *xsize_p = rsize + ssize;
+}
+
+#define SIZE_B_2I3 ((1 << (5 - 1)) - 1)
+
+/****************
+ * RES = BASE ^ EXPO mod MOD
+ *
+ * To mitigate the Yarom/Falkner flush+reload cache side-channel
+ * attack on the RSA secret exponent, we don't use the square
+ * routine but multiplication.
+ *
+ * Reference:
+ * Handbook of Applied Cryptography
+ * Algorithm 14.83: Modified left-to-right k-ary exponentiation
+ */
+void
+gcry_mpi_powm (gcry_mpi_t res,
+ gcry_mpi_t base, gcry_mpi_t expo, gcry_mpi_t mod)
+{
+ /* Pointer to the limbs of the arguments, their size and signs. */
+ mpi_ptr_t rp, ep, mp, bp;
+ mpi_size_t esize, msize, bsize, rsize;
+ int msign, bsign, rsign;
+ /* Flags telling the secure allocation status of the arguments. */
+ int esec, msec, bsec;
+ /* Size of the result including space for temporary values. */
+ mpi_size_t size;
+ /* Helper. */
+ int mod_shift_cnt;
+ int negative_result;
+ mpi_ptr_t mp_marker = NULL;
+ mpi_ptr_t bp_marker = NULL;
+ mpi_ptr_t ep_marker = NULL;
+ mpi_ptr_t xp_marker = NULL;
+ unsigned int mp_nlimbs = 0;
+ unsigned int bp_nlimbs = 0;
+ unsigned int ep_nlimbs = 0;
+ unsigned int xp_nlimbs = 0;
+ mpi_ptr_t b_2i3[SIZE_B_2I3]; /* Pre-computed array: BASE^3, ^5, ^7, ... */
+ mpi_size_t b_2i3size[SIZE_B_2I3];
+ mpi_size_t W;
+ mpi_ptr_t base_u;
+ mpi_size_t base_u_size;
+
+ esize = expo->nlimbs;
+ msize = mod->nlimbs;
+ size = 2 * msize;
+ msign = mod->sign;
+
+ if (esize * BITS_PER_MPI_LIMB > 512)
+ W = 5;
+ else if (esize * BITS_PER_MPI_LIMB > 256)
+ W = 4;
+ else if (esize * BITS_PER_MPI_LIMB > 128)
+ W = 3;
+ else if (esize * BITS_PER_MPI_LIMB > 64)
+ W = 2;
+ else
+ W = 1;
+
+ esec = mpi_is_secure(expo);
+ msec = mpi_is_secure(mod);
+ bsec = mpi_is_secure(base);
+
+ rp = res->d;
+ ep = expo->d;
+
+ if (!msize)
+ _gcry_divide_by_zero();
+
+ if (!esize)
+ {
+ /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 depending
+ on whether MOD equals 1. */
+ res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1;
+ if (res->nlimbs)
+ {
+ RESIZE_IF_NEEDED (res, 1);
+ rp = res->d;
+ rp[0] = 1;
+ }
+ res->sign = 0;
+ goto leave;
+ }
+
+ /* Normalize MOD (i.e. make its most significant bit set) as
+ required by mpn_divrem. This will make the intermediate values
+ in the calculation slightly larger, but the correct result is
+ obtained after a final reduction using the original MOD value. */
+ mp_nlimbs = msec? msize:0;
+ mp = mp_marker = mpi_alloc_limb_space(msize, msec);
+ count_leading_zeros (mod_shift_cnt, mod->d[msize-1]);
+ if (mod_shift_cnt)
+ _gcry_mpih_lshift (mp, mod->d, msize, mod_shift_cnt);
+ else
+ MPN_COPY( mp, mod->d, msize );
+
+ bsize = base->nlimbs;
+ bsign = base->sign;
+ if (bsize > msize)
+ {
+ /* The base is larger than the modulus. Reduce it.
+
+ Allocate (BSIZE + 1) with space for remainder and quotient.
+ (The quotient is (bsize - msize + 1) limbs.) */
+ bp_nlimbs = bsec ? (bsize + 1):0;
+ bp = bp_marker = mpi_alloc_limb_space( bsize + 1, bsec );
+ MPN_COPY ( bp, base->d, bsize );
+ /* We don't care about the quotient, store it above the
+ * remainder, at BP + MSIZE. */
+ _gcry_mpih_divrem( bp + msize, 0, bp, bsize, mp, msize );
+ bsize = msize;
+ /* Canonicalize the base, since we are going to multiply with it
+ quite a few times. */
+ MPN_NORMALIZE( bp, bsize );
+ }
+ else
+ bp = base->d;
+
+ if (!bsize)
+ {
+ res->nlimbs = 0;
+ res->sign = 0;
+ goto leave;
+ }
+
+
+ /* Make BASE, EXPO and MOD not overlap with RES. */
+ if ( rp == bp )
+ {
+ /* RES and BASE are identical. Allocate temp. space for BASE. */
+ gcry_assert (!bp_marker);
+ bp_nlimbs = bsec? bsize:0;
+ bp = bp_marker = mpi_alloc_limb_space( bsize, bsec );
+ MPN_COPY(bp, rp, bsize);
+ }
+ if ( rp == ep )
+ {
+ /* RES and EXPO are identical. Allocate temp. space for EXPO. */
+ ep_nlimbs = esec? esize:0;
+ ep = ep_marker = mpi_alloc_limb_space( esize, esec );
+ MPN_COPY(ep, rp, esize);
+ }
+ if ( rp == mp )
+ {
+ /* RES and MOD are identical. Allocate temporary space for MOD.*/
+ gcry_assert (!mp_marker);
+ mp_nlimbs = msec?msize:0;
+ mp = mp_marker = mpi_alloc_limb_space( msize, msec );
+ MPN_COPY(mp, rp, msize);
+ }
+
+ /* Copy base to the result. */
+ if (res->alloced < size)
+ {
+ mpi_resize (res, size);
+ rp = res->d;
+ }
+
+ /* Main processing. */
+ {
+ mpi_size_t i, j;
+ mpi_ptr_t xp;
+ mpi_size_t xsize;
+ int c;
+ mpi_limb_t e;
+ mpi_limb_t carry_limb;
+ struct karatsuba_ctx karactx;
+ mpi_ptr_t tp;
+
+ xp_nlimbs = msec? (2 * (msize + 1)):0;
+ xp = xp_marker = mpi_alloc_limb_space( 2 * (msize + 1), msec );
+
+ memset( &karactx, 0, sizeof karactx );
+ negative_result = (ep[0] & 1) && bsign;
+
+ /* Precompute B_2I3[], BASE^(2 * i + 3), BASE^3, ^5, ^7, ... */
+ if (W > 1) /* X := BASE^2 */
+ mul_mod (xp, &xsize, bp, bsize, bp, bsize, mp, msize, &karactx);
+ for (i = 0; i < (1 << (W - 1)) - 1; i++)
+ { /* B_2I3[i] = BASE^(2 * i + 3) */
+ if (i == 0)
+ {
+ base_u = bp;
+ base_u_size = bsize;
+ }
+ else
+ {
+ base_u = b_2i3[i-1];
+ base_u_size = b_2i3size[i-1];
+ }
+
+ if (xsize >= base_u_size)
+ mul_mod (rp, &rsize, xp, xsize, base_u, base_u_size,
+ mp, msize, &karactx);
+ else
+ mul_mod (rp, &rsize, base_u, base_u_size, xp, xsize,
+ mp, msize, &karactx);
+ b_2i3[i] = mpi_alloc_limb_space (rsize, esec);
+ b_2i3size[i] = rsize;
+ MPN_COPY (b_2i3[i], rp, rsize);
+ }
+
+ i = esize - 1;
+
+ /* Main loop.
+
+ Make the result be pointed to alternately by XP and RP. This
+ helps us avoid block copying, which would otherwise be
+ necessary with the overlap restrictions of
+ _gcry_mpih_divmod. With 50% probability the result after this
+ loop will be in the area originally pointed by RP (==RES->d),
+ and with 50% probability in the area originally pointed to by XP. */
+ rsign = 0;
+ if (W == 1)
+ {
+ rsize = bsize;
+ }
+ else
+ {
+ rsize = msize;
+ MPN_ZERO (rp, rsize);
+ }
+ MPN_COPY ( rp, bp, bsize );
+
+ e = ep[i];
+ count_leading_zeros (c, e);
+ e = (e << c) << 1;
+ c = BITS_PER_MPI_LIMB - 1 - c;
+
+ j = 0;
+
+ for (;;)
+ if (e == 0)
+ {
+ j += c;
+ i--;
+ if ( i < 0 )
+ {
+ c = 0;
+ break;
+ }
+
+ e = ep[i];
+ c = BITS_PER_MPI_LIMB;
+ }
+ else
+ {
+ int c0;
+ mpi_limb_t e0;
+
+ count_leading_zeros (c0, e);
+ e = (e << c0);
+ c -= c0;
+ j += c0;
+
+ if (c >= W)
+ {
+ e0 = (e >> (BITS_PER_MPI_LIMB - W));
+ e = (e << W);
+ c -= W;
+ }
+ else
+ {
+ i--;
+ if ( i < 0 )
+ {
+ e = (e >> (BITS_PER_MPI_LIMB - c));
+ break;
+ }
+
+ c0 = c;
+ e0 = (e >> (BITS_PER_MPI_LIMB - W))
+ | (ep[i] >> (BITS_PER_MPI_LIMB - W + c0));
+ e = (ep[i] << (W - c0));
+ c = BITS_PER_MPI_LIMB - W + c0;
+ }
+
+ count_trailing_zeros (c0, e0);
+ e0 = (e0 >> c0) >> 1;
+
+ for (j += W - c0; j; j--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
+
+ if (e0 == 0)
+ {
+ base_u = bp;
+ base_u_size = bsize;
+ }
+ else
+ {
+ base_u = b_2i3[e0 - 1];
+ base_u_size = b_2i3size[e0 -1];
+ }
+
+ mul_mod (xp, &xsize, rp, rsize, base_u, base_u_size,
+ mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+
+ j = c0;
+ }
+
+ if (c != 0)
+ {
+ j += c;
+ count_trailing_zeros (c, e);
+ e = (e >> c);
+ j -= c;
+ }
+
+ while (j--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
+
+ if (e != 0)
+ {
+ if ((e>>1) == 0)
+ {
+ base_u = bp;
+ base_u_size = bsize;
+ }
+ else
+ {
+ base_u = b_2i3[(e>>1) - 1];
+ base_u_size = b_2i3size[(e>>1) -1];
+ }
+
+ mul_mod (xp, &xsize, rp, rsize, base_u, base_u_size,
+ mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+
+ for (; c; c--)
+ {
+ mul_mod (xp, &xsize, rp, rsize, rp, rsize, mp, msize, &karactx);
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
+ }
+
+ /* We shifted MOD, the modulo reduction argument, left
+ MOD_SHIFT_CNT steps. Adjust the result by reducing it with the
+ original MOD.
+
+ Also make sure the result is put in RES->d (where it already
+ might be, see above). */
+ if ( mod_shift_cnt )
+ {
+ carry_limb = _gcry_mpih_lshift( res->d, rp, rsize, mod_shift_cnt);
+ rp = res->d;
+ if ( carry_limb )
+ {
+ rp[rsize] = carry_limb;
+ rsize++;
+ }
+ }
+ else if (res->d != rp)
+ {
+ MPN_COPY (res->d, rp, rsize);
+ rp = res->d;
+ }
+
+ if ( rsize >= msize )
+ {
+ _gcry_mpih_divrem(rp + msize, 0, rp, rsize, mp, msize);
+ rsize = msize;
+ }
+
+ /* Remove any leading zero words from the result. */
+ if ( mod_shift_cnt )
+ _gcry_mpih_rshift( rp, rp, rsize, mod_shift_cnt);
+ MPN_NORMALIZE (rp, rsize);
+
+ _gcry_mpih_release_karatsuba_ctx (&karactx );
+ for (i = 0; i < (1 << (W - 1)) - 1; i++)
+ _gcry_mpi_free_limb_space( b_2i3[i], esec ? b_2i3size[i] : 0 );
+ }
+
+ /* Fixup for negative results. */
+ if ( negative_result && rsize )
+ {
+ if ( mod_shift_cnt )
+ _gcry_mpih_rshift( mp, mp, msize, mod_shift_cnt);
+ _gcry_mpih_sub( rp, mp, msize, rp, rsize);
+ rsize = msize;
+ rsign = msign;
+ MPN_NORMALIZE(rp, rsize);
+ }
+ gcry_assert (res->d == rp);
+ res->nlimbs = rsize;
+ res->sign = rsign;
+
+ leave:
+ if (mp_marker)
+ _gcry_mpi_free_limb_space( mp_marker, mp_nlimbs );
+ if (bp_marker)
+ _gcry_mpi_free_limb_space( bp_marker, bp_nlimbs );
+ if (ep_marker)
+ _gcry_mpi_free_limb_space( ep_marker, ep_nlimbs );
+ if (xp_marker)
+ _gcry_mpi_free_limb_space( xp_marker, xp_nlimbs );
+}
+#endif
--
1.7.10.4