Micro optimizations for sha3. Doubled the performance on x86_64.

author Niels Möller <nisse@lysator.liu.se>

Tue, 13 Nov 2012 19:39:45 +0000 (20:39 +0100)

committer Niels Möller <nisse@lysator.liu.se>

Tue, 13 Nov 2012 19:39:45 +0000 (20:39 +0100)
author Niels Möller <nisse@lysator.liu.se>
Tue, 13 Nov 2012 19:39:45 +0000 (20:39 +0100)
committer Niels Möller <nisse@lysator.liu.se>
Tue, 13 Nov 2012 19:39:45 +0000 (20:39 +0100)
diff --git a/ChangeLog b/ChangeLog

index a7f1653b37e5b31bd80a85ed2d53bdebba37341b..4aa393a29915ffdf0e137a61ba93d0b84e2828e9 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
  2012-11-13  Niels Möller  <nisse@lysator.liu.se>
  
+       * sha3-permute.c (sha3_permute): Micro optimizations. Partial
+       unrolling. Use lookup table for the permutation. On an x86_64,
+       execution time reduced from appr. 13000 cycles to appr. 6000.
+
         * examples/nettle-benchmark.c (TIME_CYCLES): New macro.
         (bench_sha1_compress, bench_salsa20_core): Use it.
         (bench_sha3_permute): New function.
diff --git a/sha3-permute.c b/sha3-permute.c

index fb6f55f8a295a46c1e894f17faccb28467e979de..e3714a2e3c7bc39d3fa4a8d5499d5b76efdcb2d5 100644 (file)
--- a/sha3-permute.c
+++ b/sha3-permute.c
@@ -47,6 +47,15 @@ sha3_permute (struct sha3_state *state)
        18,  2, 61, 56, 14,
      };
  
+  static const unsigned char perm[25] =
+    {
+       0,10,20, 5,15,
+      16, 1,11,21, 6,
+       7,17, 2,12,22,
+      23, 8,18, 3,13,
+      14,24, 9,19, 4
+    };
+
    static const uint64_t rc[SHA3_ROUNDS] = {
      0x0000000000000001, 0x0000000000008082,
      0x800000000000808A, 0x8000000080008000,
@@ -62,39 +71,47 @@ sha3_permute (struct sha3_state *state)
      0x0000000080000001, 0x8000000080008008,
    };
    unsigned i;
+
+#define A state->a
+
    for (i = 0; i < SHA3_ROUNDS; i++)
      {
        uint64_t C[5], D[5], B[25];
        unsigned x, y;
  
        /* theta step */
+      C[0] = A[0] ^ A[5+0] ^ A[10+0] ^ A[15+0] ^ A[20+0];
+      C[1] = A[1] ^ A[5+1] ^ A[10+1] ^ A[15+1] ^ A[20+1];
+      C[2] = A[2] ^ A[5+2] ^ A[10+2] ^ A[15+2] ^ A[20+2];
+      C[3] = A[3] ^ A[5+3] ^ A[10+3] ^ A[15+3] ^ A[20+3];
+      C[4] = A[4] ^ A[5+4] ^ A[10+4] ^ A[15+4] ^ A[20+4];
+
+      D[0] = C[4] ^ ROTL64(1, C[1]);
+      D[1] = C[0] ^ ROTL64(1, C[2]);
+      D[2] = C[1] ^ ROTL64(1, C[3]);
+      D[3] = C[2] ^ ROTL64(1, C[4]);
+      D[4] = C[3] ^ ROTL64(1, C[0]);
+
        for (x = 0; x < 5; x++)
-       C[x] = state->a[x] ^ state->a[5+x] ^ state->a[10+x]
-         ^ state->a[15+x] ^ state->a[20+x];
-      for (x = 0; x < 5; x++)
-       /* Use the simplest indexing expressions in the argument to
-          the ROTL64 macro */
-       D[(x+4)%5] = C[(x+3)%5] ^ ROTL64(1, C[x]);
-      for (x = 0; x < 5; x++)
-       for (y = 0; y < 5; y++)
-         state->a[x +5*y] ^= D[x];
+       for (y = 0; y < 25; y += 5)
+         A[y + x] ^= D[x];
  
-      /* rho step */
+      /* rho and pi steps */
        for (x = 0; x < 25; x++)
-       state->a[x] = ROTL64 (rot[x], state->a[x]);
-      
-      /* pi step */
-      for (x = 0; x < 5; x++)
-       for (y = 0; y < 5; y++)
-         /* B[y,2*x+3*y] = B[y+5*(2*x + 3*y)]= B[10*x + 16*y] */
-         B[(10*x+16*y) % 25] = state->a[x+5*y];
+       B[perm[x]] = ROTL64 (rot[x], A[x]);
  
        /* chi step */
-      for (x = 0; x < 5; x++)
-       for (y = 0; y < 5; y++)
-         state->a[x+5*y] = B[x+5*y] ^ (~B[(x+1)%5 + 5*y] & B[(x+2)%5+5*y]);
-
+      for (y = 0; y < 25; y += 5)
+       {
+         A[y]   = B[y]   ^ (~B[y+1] & B[y+2]);
+         A[y+1] = B[y+1] ^ (~B[y+2] & B[y+3]);
+         A[y+2] = B[y+2] ^ (~B[y+3] & B[y+4]);
+         A[y+3] = B[y+3] ^ (~B[y+4] & B[y+0]);
+         A[y+4] = B[y+4] ^ (~B[y+0] & B[y+1]);
+       }
+         
        /* iota step */
-      state->a[0] ^= rc[i];
+      A[0] ^= rc[i];
      }
+#undef A
  }
author	Niels Möller <nisse@lysator.liu.se>
	Tue, 13 Nov 2012 19:39:45 +0000 (20:39 +0100)
committer	Niels Möller <nisse@lysator.liu.se>
	Tue, 13 Nov 2012 19:39:45 +0000 (20:39 +0100)
ChangeLog		patch | blob | blame | history
sha3-permute.c		patch | blob | blame | history