[PATCH] Improve performance of generic SHA256 implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu Jan 14 21:34:27 CET 2016
* cipher/sha256.c (R): Let caller do variable shuffling.
(Chro, Maj, Sum0, Sum1): Convert from inline functions to macros.
(W, I): New.
(transform_blk): Unroll round loop; inline message expansion to rounds
to make message expansion buffer smaller.
--
Benchmark on Cortex-A8 (armv6, 1008 Mhz):
Before:
| nanosecs/byte mebibytes/sec cycles/byte
SHA256 | 27.63 ns/B 34.52 MiB/s 27.85 c/B
After (1.31x faster):
| nanosecs/byte mebibytes/sec cycles/byte
SHA256 | 20.97 ns/B 45.48 MiB/s 21.13 c/B
Benchmark on Cortex-A8 (armv7, 1008 Mhz):
Before:
| nanosecs/byte mebibytes/sec cycles/byte
SHA256 | 24.18 ns/B 39.43 MiB/s 24.38 c/B
After (1.13x faster):
| nanosecs/byte mebibytes/sec cycles/byte
SHA256 | 21.28 ns/B 44.82 MiB/s 21.45 c/B
Benchmark on Intel Core i5-4570 (i386, 3.2 Ghz):
Before:
| nanosecs/byte mebibytes/sec cycles/byte
SHA256 | 5.78 ns/B 164.9 MiB/s 18.51 c/B
After (1.06x faster)
| nanosecs/byte mebibytes/sec cycles/byte
SHA256 | 5.41 ns/B 176.1 MiB/s 17.33 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/sha256.c | 167 +++++++++++++++++++++++++++----------------------------
1 file changed, 82 insertions(+), 85 deletions(-)
diff --git a/cipher/sha256.c b/cipher/sha256.c
index bc326e0..624c296 100644
--- a/cipher/sha256.c
+++ b/cipher/sha256.c
@@ -180,44 +180,27 @@ sha224_init (void *context, unsigned int flags)
{ \
t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + (k) + (w); \
t2 = Sum0((a)) + Maj((a),(b),(c)); \
- h = g; \
- g = f; \
- f = e; \
- e = d + t1; \
- d = c; \
- c = b; \
- b = a; \
- a = t1 + t2; \
+ d += t1; \
+ h = t1 + t2; \
} while (0)
/* (4.2) same as SHA-1's F1. */
-static inline u32
-Cho (u32 x, u32 y, u32 z)
-{
- return (z ^ (x & (y ^ z)));
-}
+#define Cho(x, y, z) (z ^ (x & (y ^ z)))
/* (4.3) same as SHA-1's F3 */
-static inline u32
-Maj (u32 x, u32 y, u32 z)
-{
- return ((x & y) | (z & (x|y)));
-}
+#define Maj(x, y, z) ((x & y) + (z & (x ^ y)))
/* (4.4) */
-static inline u32
-Sum0 (u32 x)
-{
- return (ror (x, 2) ^ ror (x, 13) ^ ror (x, 22));
-}
+#define Sum0(x) (ror (x, 2) ^ ror (x, 13) ^ ror (x, 22))
/* (4.5) */
-static inline u32
-Sum1 (u32 x)
-{
- return (ror (x, 6) ^ ror (x, 11) ^ ror (x, 25));
-}
+#define Sum1(x) (ror (x, 6) ^ ror (x, 11) ^ ror (x, 25))
+/* Message expansion */
+#define W(i) ( w[i&0x0f] = S1(w[(i-2) &0x0f]) \
+ + w[(i-7) &0x0f] \
+ + S0(w[(i-15)&0x0f]) \
+ + w[(i-16)&0x0f] )
static unsigned int
transform_blk (void *ctx, const unsigned char *data)
@@ -243,8 +226,9 @@ transform_blk (void *ctx, const unsigned char *data)
};
u32 a,b,c,d,e,f,g,h,t1,t2;
- u32 w[64];
- int i;
+ u32 w[16];
+
+#define I(i) (w[i] = buf_get_be32(data + i * 4))
a = hd->h0;
b = hd->h1;
@@ -255,60 +239,73 @@ transform_blk (void *ctx, const unsigned char *data)
g = hd->h6;
h = hd->h7;
- for (i=0; i < 16; i++)
- w[i] = buf_get_be32(data + i * 4);
- for (; i < 64; i++)
- w[i] = S1(w[i-2]) + w[i-7] + S0(w[i-15]) + w[i-16];
-
- for (i=0; i < 64;)
- {
-#if 0
- R(a,b,c,d,e,f,g,h,K[i],w[i]);
- i++;
-#else
- t1 = h + Sum1 (e) + Cho (e, f, g) + K[i] + w[i];
- t2 = Sum0 (a) + Maj (a, b, c);
- d += t1;
- h = t1 + t2;
-
- t1 = g + Sum1 (d) + Cho (d, e, f) + K[i+1] + w[i+1];
- t2 = Sum0 (h) + Maj (h, a, b);
- c += t1;
- g = t1 + t2;
-
- t1 = f + Sum1 (c) + Cho (c, d, e) + K[i+2] + w[i+2];
- t2 = Sum0 (g) + Maj (g, h, a);
- b += t1;
- f = t1 + t2;
-
- t1 = e + Sum1 (b) + Cho (b, c, d) + K[i+3] + w[i+3];
- t2 = Sum0 (f) + Maj (f, g, h);
- a += t1;
- e = t1 + t2;
-
- t1 = d + Sum1 (a) + Cho (a, b, c) + K[i+4] + w[i+4];
- t2 = Sum0 (e) + Maj (e, f, g);
- h += t1;
- d = t1 + t2;
-
- t1 = c + Sum1 (h) + Cho (h, a, b) + K[i+5] + w[i+5];
- t2 = Sum0 (d) + Maj (d, e, f);
- g += t1;
- c = t1 + t2;
-
- t1 = b + Sum1 (g) + Cho (g, h, a) + K[i+6] + w[i+6];
- t2 = Sum0 (c) + Maj (c, d, e);
- f += t1;
- b = t1 + t2;
-
- t1 = a + Sum1 (f) + Cho (f, g, h) + K[i+7] + w[i+7];
- t2 = Sum0 (b) + Maj (b, c, d);
- e += t1;
- a = t1 + t2;
-
- i += 8;
-#endif
- }
+ R(a, b, c, d, e, f, g, h, K[0], I(0));
+ R(h, a, b, c, d, e, f, g, K[1], I(1));
+ R(g, h, a, b, c, d, e, f, K[2], I(2));
+ R(f, g, h, a, b, c, d, e, K[3], I(3));
+ R(e, f, g, h, a, b, c, d, K[4], I(4));
+ R(d, e, f, g, h, a, b, c, K[5], I(5));
+ R(c, d, e, f, g, h, a, b, K[6], I(6));
+ R(b, c, d, e, f, g, h, a, K[7], I(7));
+ R(a, b, c, d, e, f, g, h, K[8], I(8));
+ R(h, a, b, c, d, e, f, g, K[9], I(9));
+ R(g, h, a, b, c, d, e, f, K[10], I(10));
+ R(f, g, h, a, b, c, d, e, K[11], I(11));
+ R(e, f, g, h, a, b, c, d, K[12], I(12));
+ R(d, e, f, g, h, a, b, c, K[13], I(13));
+ R(c, d, e, f, g, h, a, b, K[14], I(14));
+ R(b, c, d, e, f, g, h, a, K[15], I(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
hd->h0 += a;
hd->h1 += b;
@@ -319,7 +316,7 @@ transform_blk (void *ctx, const unsigned char *data)
hd->h6 += g;
hd->h7 += h;
- return /*burn_stack*/ 74*4+32;
+ return /*burn_stack*/ 26*4+32;
}
#undef S0
#undef S1
More information about the Gcrypt-devel
mailing list