Added x86_64 assembly for _salsa20_core.

author Niels Möller <nisse@lysator.liu.se>

Mon, 29 Oct 2012 13:29:34 +0000 (14:29 +0100)

committer Niels Möller <nisse@lysator.liu.se>

Mon, 29 Oct 2012 13:29:34 +0000 (14:29 +0100)
author Niels Möller <nisse@lysator.liu.se>
Mon, 29 Oct 2012 13:29:34 +0000 (14:29 +0100)
committer Niels Möller <nisse@lysator.liu.se>
Mon, 29 Oct 2012 13:29:34 +0000 (14:29 +0100)
diff --git a/ChangeLog b/ChangeLog

index 49f223123b06978a01b0e39425b2c50c5c1b6e4a..0f0a0ae98abe9a1cc7b33e379f8f63bd7be4ac34 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2012-10-29  Niels Möller  <nisse@lysator.liu.se>
+
+       * x86_64/salsa20-core-internal.asm: New file.
+       * configure.ac: Added salsa20-core-internal.asm.
+       * examples/nettle-benchmark.c (bench_salsa20_core): New function.
+
  2012-10-27  Niels Möller  <nisse@lysator.liu.se>
  
         * testsuite/Makefile.in (TS_SOURCES, CXX_SOURCES): Include sources
diff --git a/configure.ac b/configure.ac

index 998ed5570cd499c2d4679102295d3115a28f8485..b5f3571b002629416b54dee6ab0eec79e123ba6e 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -237,7 +237,8 @@ if test "x$enable_assembler" = xyes ; then
      found=no
      for tmp_f in aes-encrypt-internal.asm aes-decrypt-internal.asm \
                  arcfour-crypt.asm camellia-crypt-internal.asm \
-                md5-compress.asm memxor.asm salsa20-crypt.asm \
+                md5-compress.asm memxor.asm \
+                salsa20-crypt.asm salsa20-core-internal.asm \
                  serpent-encrypt.asm serpent-decrypt.asm \
                  sha1-compress.asm machine.m4; do
  #       echo "Looking for $srcdir/$asm_path/$tmp_f"
diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c

index 3d0786862ec8cbb1b7edf0c4801b56c0cadd93ec..7446fbcec2c73a177bfd84e833a27e77055d230d 100644 (file)
--- a/examples/nettle-benchmark.c
+++ b/examples/nettle-benchmark.c
@@ -47,6 +47,7 @@
  #include "des.h"
  #include "gcm.h"
  #include "memxor.h"
+#include "salsa20.h"
  #include "serpent.h"
  #include "sha.h"
  #include "twofish.h"
@@ -563,10 +564,10 @@ compare_double(const void *ap, const void *bp)
  }
  
  /* Try to get accurate cycle times for assembler functions. */
+#if WITH_CYCLE_COUNTER
  static void
  bench_sha1_compress(void)
  {
-#if WITH_CYCLE_COUNTER
    uint32_t state[_SHA1_DIGEST_LENGTH];
    uint8_t data[BENCH_ITERATIONS * SHA1_DATA_SIZE];
    uint32_t start_lo, start_hi, end_lo, end_hi;
@@ -594,9 +595,42 @@ bench_sha1_compress(void)
  
    qsort(count, 5, sizeof(double), compare_double);
    printf("sha1_compress: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);  
-#endif
  }
  
+/* Time _nettle_salsa20_core (20 rounds); print per-call cycles of the middle of five sorted runs. */
+static void
+bench_salsa20_core(void)
+{
+  uint32_t state[_SALSA20_INPUT_LENGTH];
+  uint32_t start_lo, start_hi, end_lo, end_hi;
+  double count[5];
+  unsigned i, j;
+
+  /* Zero the block so the core never reads indeterminate stack contents. */
+  for (i = 0; i < _SALSA20_INPUT_LENGTH; i++)
+    state[i] = 0;
+
+  for (j = 0; j < 5; j++)
+    {
+      i = 0; /* Reset before reading the counter, outside the timed span. */
+      GET_CYCLE_COUNTER(start_hi, start_lo);
+      for (; i < BENCH_ITERATIONS; i++)
+       _nettle_salsa20_core(state, state, 20);
+      GET_CYCLE_COUNTER(end_hi, end_lo);
+      /* Two-word subtraction end - start, borrowing from the high word. */
+      end_hi -= (start_hi + (start_lo > end_lo));
+      end_lo -= start_lo;
+      count[j] = ldexp(end_hi, 32) + end_lo;
+    }
+
+  qsort(count, 5, sizeof(double), compare_double);
+  printf("salsa20_core: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);  
+}
+#else
+#define bench_sha1_compress()
+#define bench_salsa20_core()
+#endif
+
  #if WITH_OPENSSL
  # define OPENSSL(x) x,
  #else
@@ -684,7 +718,7 @@ main(int argc, char **argv)
      }
  #endif
    bench_sha1_compress();
-
+  bench_salsa20_core();
    time_overhead();
  
    header();
diff --git a/x86_64/salsa20-core-internal.asm b/x86_64/salsa20-core-internal.asm

new file mode 100644 (file)

index 0000000..81ca2cc
--- /dev/null
+++ b/x86_64/salsa20-core-internal.asm
@@ -0,0 +1,98 @@
+C nettle, low-level cryptographics library
+C 
+C Copyright (C) 2012 Niels Möller
+C  
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C 
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C 
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+define(<DST>, <%rdi>)
+define(<SRC>, <%rsi>)
+define(<COUNT>, <%rdx>)
+define(<X0>, <%xmm0>)
+define(<X1>, <%xmm1>)
+define(<X2>, <%xmm2>)
+define(<X3>, <%xmm3>)
+define(<T0>, <%xmm4>)
+define(<T1>, <%xmm5>)
+define(<M0101>, <%xmm6>)
+define(<M0110>, <%xmm7>)
+define(<M0011>, <%xmm8>)
+C Register roles in this file: DST, SRC and COUNT are the three arguments; X0-X3 hold the 16 state words; M* are lane masks.
+include_src(<x86_64/salsa20.m4>)
+C QROUND and SWAP used below are not defined in this file; presumably they come from salsa20.m4.
+       C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+       .text
+       ALIGN(4)
+PROLOGUE(_nettle_salsa20_core)
+       W64_ENTRY(3, 9) 
+       C NOTE(review): W64_ENTRY and W64_EXIT are defined elsewhere; presumably Win64 ABI glue for 3 arguments and 9 xmm registers.
+       C Load mask registers. movd sets only lane 0 to all-ones; each pshufd copies lane 0 into the lanes marked 1 in the mask name (lane 0 leftmost).
+       mov     $-1, %eax
+       movd    %eax, M0101
+       pshufd  $0x09, M0101, M0011     C 01 01 00 00
+       pshufd  $0x41, M0101, M0110     C 01 00 00 01
+       pshufd  $0x22, M0101, M0101     C 01 00 01 00
+       C Load the 16 input words; movups does not require SRC to be 16-byte aligned.
+       movups  (SRC), X0
+       movups  16(SRC), X1
+       movups  32(SRC), X2
+       movups  48(SRC), X3
+
+       C The original rows are now diagonals.
+       SWAP(X0, X1, M0101)
+       SWAP(X2, X3, M0101)
+       SWAP(X1, X3, M0110)
+       SWAP(X0, X2, M0011)     
+       C Each pass of the loop below contains two QROUND invocations, so the round count is halved.
+       shrl    $1, XREG(COUNT)
+       C NOTE(review): a rounds value below 2 leaves a zero count here, and decl/jnz would then wrap instead of exiting.
+       ALIGN(4)
+.Loop:
+       QROUND(X0, X1, X2, X3)
+       pshufd  $0x93, X1, X1   C       11 00 01 10 (least sign. left)
+       pshufd  $0x4e, X2, X2   C       10 11 00 01
+       pshufd  $0x39, X3, X3   C       01 10 11 00
+
+       QROUND(X0, X3, X2, X1)
+
+       C Inverse rotation of the rows
+       pshufd  $0x39, X1, X1   C       01 10 11 00
+       pshufd  $0x4e, X2, X2   C       10 11 00 01
+       pshufd  $0x93, X3, X3   C       11 00 01 10
+
+       decl    XREG(COUNT)
+       jnz     .Loop
+       C Same four SWAPs as before the loop, in effectively reverse order; presumably this restores the row layout.
+       SWAP(X0, X2, M0011)     
+       SWAP(X1, X3, M0110)
+       SWAP(X0, X1, M0101)
+       SWAP(X2, X3, M0101)
+       C Output is state plus input, word by word. SRC is re-read only at offsets DST has not yet been written to, so DST equal to SRC works.
+       movups  (SRC), T0
+       movups  16(SRC), T1
+       paddd   T0, X0
+       paddd   T1, X1
+       movups  X0,(DST)
+       movups  X1,16(DST)
+       movups  32(SRC), T0
+       movups  48(SRC), T1
+       paddd   T0, X2
+       paddd   T1, X3
+       movups  X2,32(DST)
+       movups  X3,48(DST)
+       
+       W64_EXIT(3, 9)
+       ret
+EPILOGUE(_nettle_salsa20_core)
author	Niels Möller <nisse@lysator.liu.se>
	Mon, 29 Oct 2012 13:29:34 +0000 (14:29 +0100)
committer	Niels Möller <nisse@lysator.liu.se>
	Mon, 29 Oct 2012 13:29:34 +0000 (14:29 +0100)
ChangeLog		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
examples/nettle-benchmark.c		patch \| blob \| blame \| history
x86_64/salsa20-core-internal.asm	[new file with mode: 0644]	patch \| blob