Two-way interleaving of salsa20 on Neon

author Niels Möller <nisse@lysator.liu.se>

Mon, 6 Jul 2020 08:57:25 +0000 (10:57 +0200)

committer Niels Möller <nisse@lysator.liu.se>

Mon, 6 Jul 2020 21:14:59 +0000 (23:14 +0200)
author Niels Möller <nisse@lysator.liu.se>
Mon, 6 Jul 2020 08:57:25 +0000 (10:57 +0200)
committer Niels Möller <nisse@lysator.liu.se>
Mon, 6 Jul 2020 21:14:59 +0000 (23:14 +0200)
diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm

new file mode 100644 (file)

index 0000000..cdb6133
--- /dev/null
+++ b/arm/neon/salsa20-2core.asm
@@ -0,0 +1,206 @@
+C arm/neon/salsa20-2core.asm
+
+ifelse(<
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+       .file "salsa20-2core.asm"
+       .fpu    neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+C State, even elements in X, odd elements in Y
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<Y0>, <q8>)
+define(<Y1>, <q9>)
+define(<Y2>, <q10>)
+define(<Y3>, <q11>)
+define(<T0>, <q12>)
+define(<T1>, <q13>)
+define(<T2>, <q14>)
+define(<T3>, <q15>)
+
+       .text
+       .align 4
+.Lcount1:
+       .int 1,0,0,0
+
+       C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_salsa20_2core)
+       vldm    SRC, {X0,X1,X2,X3}
+       adr     r12, .Lcount1
+
+       vmov    Y3, X0
+       vld1.64 {Y1}, [r12]
+       vmov    Y0, X1
+       vadd.i64 Y1, Y1, X2     C Increment counter
+       vmov    Y2, X3
+
+       vtrn.32 X0, Y3          C X0:  0  0  2  2  Y3:  1  1  3  3
+       vtrn.32 X1, Y0          C X1:  4  4  6  6  Y0:  5  5  7  7
+       vtrn.32 X2, Y1          C X2:  8  8 10 10  Y1:  9  9 11 11
+       vtrn.32 X3, Y2          C X3: 12 12 14 14  Y2: 13 13 15 15
+
+       C Swap, to get
+       C X0:  0 10  Y0:  5 15
+       C X1:  4 14  Y1:  9  3
+       C X2:  8  2  Y2: 13  7
+       C X3: 12  6  Y3:  1 11
+       vswp    D1REG(X0), D1REG(X2)
+       vswp    D1REG(X1), D1REG(X3)
+       vswp    D1REG(Y0), D1REG(Y2)
+       vswp    D1REG(Y1), D1REG(Y3)
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0: A0  B0  A10 B10  Y0: A5  B5  A15 B15
+C X1: A4  B4  A14 B14  Y1: A9  B9  A3  B3
+C X2: A8  B8  A2  B2   Y2: A13 B13 A7  B7
+C X3: A12 B12 A6  B6   Y3: A1  B1  A11 B11
+
+       vadd.i32        T0, X0, X3
+       vshl.i32        T1, T0, #7
+        vadd.i32       T2, Y0, Y3
+       vsri.u32        T1, T0, #25
+        vshl.i32       T3, T2, #7
+       veor            X1, X1, T1
+        vsri.u32       T3, T2, #25
+       vadd.i32        T0, X1, X0
+        veor           Y1, Y1, T3
+       vshl.i32        T1, T0, #9
+        vadd.i32       T2, Y1, Y0
+       vsri.u32        T1, T0, #23
+        vshl.i32       T3, T2, #9
+       veor            X2, X2, T1
+        vsri.u32       T3, T2, #23
+       vadd.i32        T0, X2, X1
+        veor           Y2, Y2, T3
+       vshl.i32        T1, T0, #13
+        vadd.i32       T2, Y2, Y1
+       vsri.u32        T1, T0, #19
+        vshl.i32       T3, T2, #13
+       veor            X3, X3, T1
+        vsri.u32       T3, T2, #19
+       vadd.i32        T0, X3, X2
+        veor           Y3, Y3, T3
+       vshl.i32        T1, T0, #18
+        vadd.i32       T2, Y3, Y2
+         vext.32       Y1, Y1, Y1, #2
+       vsri.u32        T1, T0, #14
+        vshl.i32       T3, T2, #18
+         vext.32       Y2, Y2, Y2, #2
+       veor            X0, X0, T1
+        vsri.u32       T3, T2, #14
+         vext.32       X3, X3, X3, #2
+        veor           Y0, Y0, T3
+
+C Register layout:
+C X0: A0  B0  A10 B10  Y0: A5  B5  A15 B15
+C Y1: A3  B3   A9  B9  X1: A4  B4  A14 B14 (Y1 swapped)
+C X2: A2  B2   A8  B8  Y2: A7  B7  A13 B13 (X2, Y2 swapped)
+C Y3: A1  B1  A11 B11  X3: A6  B6  A12 B12 (X3 swapped)
+
+       vadd.i32        T0, X0, Y1
+         vext.32       X2, X2, X2, #2
+       vshl.i32        T1, T0, #7
+        vadd.i32       T2, Y0, X1
+       vsri.u32        T1, T0, #25
+        vshl.i32       T3, T2, #7
+       veor            Y3, Y3, T1
+        vsri.u32       T3, T2, #25
+       vadd.i32        T0, Y3, X0
+        veor           X3, X3, T3
+       vshl.i32        T1, T0, #9
+        vadd.i32       T2, X3, Y0
+       vsri.u32        T1, T0, #23
+        vshl.i32       T3, T2, #9
+       veor            X2, X2, T1
+        vsri.u32       T3, T2, #23
+       vadd.i32        T0, X2, Y3
+        veor           Y2, Y2, T3
+       vshl.i32        T1, T0, #13
+        vadd.i32       T2, Y2, X3
+       vsri.u32        T1, T0, #19
+        vshl.i32       T3, T2, #13
+       veor            Y1, Y1, T1
+        vsri.u32       T3, T2, #19
+       vadd.i32        T0, Y1, X2
+        veor           X1, X1, T3
+         vext.32       X2, X2, X2, #2
+       vshl.i32        T1, T0, #18
+        vadd.i32       T2, X1, Y2
+         vext.32       Y1, Y1, Y1, #2
+       vsri.u32        T1, T0, #14
+          subs         ROUNDS, ROUNDS, #2
+        vshl.i32       T3, T2, #18
+         vext.32       X3, X3, X3, #2
+       veor            X0, X0, T1
+        vsri.u32       T3, T2, #14
+         vext.32       Y2, Y2, Y2, #2
+        veor           Y0, Y0, T3
+
+       bhi             .Loop
+
+C Inverse swaps and transpositions
+
+       vswp    D1REG(X0), D1REG(X2)
+       vswp    D1REG(X1), D1REG(X3)
+       vswp    D1REG(Y0), D1REG(Y2)
+       vswp    D1REG(Y1), D1REG(Y3)
+
+       vldm    SRC, {T0,T1,T2,T3}
+
+       vtrn.32 X0, Y3
+       vtrn.32 X1, Y0
+       vtrn.32 X2, Y1
+       vtrn.32 X3, Y2
+
+C Add in the original context
+       vadd.i32        X0, X0, T0
+       vadd.i32        X1, X1, T1
+       vadd.i32        X2, X2, T2
+       vadd.i32        X3, X3, T3
+
+       vstmia  DST!, {X0,X1,X2,X3}
+       vld1.64 {X0}, [r12]
+       vadd.i32        T0, T0, Y3
+       vadd.i64        T2, T2, X0
+       vadd.i32        T1, T1, Y0
+       vadd.i32        T2, T2, Y1
+       vadd.i32        T3, T3, Y2
+
+       vstm    DST, {T0,T1,T2,T3}
+       bx      lr
+EPILOGUE(_nettle_salsa20_2core)
diff --git a/configure.ac b/configure.ac

index 1c0b7393e0744997533de167d97ccda56b7e7302..3f6c2f3b5306334e1a01ff95923049120d0382d3 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -455,7 +455,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
  # Assembler files which generate additional object files if they are used.
  asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
    aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
-  chacha-core-internal-2.asm \
+  chacha-core-internal-2.asm salsa20-2core.asm \
    salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
    sha3-permute-2.asm sha512-compress-2.asm \
    umac-nh-n-2.asm umac-nh-2.asm"
@@ -573,6 +573,7 @@ AH_VERBATIM([HAVE_NATIVE],
  #undef HAVE_NATIVE_ecc_secp521r1_redc
  #undef HAVE_NATIVE_gcm_hash8
  #undef HAVE_NATIVE_salsa20_core
+#undef HAVE_NATIVE_salsa20_2core
  #undef HAVE_NATIVE_sha1_compress
  #undef HAVE_NATIVE_sha256_compress
  #undef HAVE_NATIVE_sha512_compress
diff --git a/salsa20-crypt.c b/salsa20-crypt.c

index 770b3b4c529383cbf4ea2e51c057cc0022b5c3b6..b25cfc3df562811c5cb13127bf67245ee8f8539d 100644 (file)
--- a/salsa20-crypt.c
+++ b/salsa20-crypt.c
@@ -57,7 +57,30 @@ salsa20_crypt(struct salsa20_ctx *ctx,
  {
    if (!length)
      return;
-  
+
+#if HAVE_NATIVE_salsa20_2core
+  uint32_t x[2*_SALSA20_INPUT_LENGTH];  /* keystream for two consecutive blocks */
+  while (length > SALSA20_BLOCK_SIZE)   /* more than one block left: use 2core */
+    {
+      _salsa20_2core (x, ctx->input, 20);
+      ctx->input[8] += 2;
+      ctx->input[9] += (ctx->input[8] < 2);  /* carry into the high counter word */
+      if (length <= 2 * SALSA20_BLOCK_SIZE)  /* "<=": an exact pair must end here, else the tail adds a block */
+       {
+         memxor3 (c, m, x, length);
+         return;
+       }
+      memxor3 (c, m, x, 2*SALSA20_BLOCK_SIZE);
+
+      length -= 2*SALSA20_BLOCK_SIZE;
+      c += 2*SALSA20_BLOCK_SIZE;
+      m += 2*SALSA20_BLOCK_SIZE;
+    }
+  _salsa20_core (x, ctx->input, 20);  /* here 0 < length <= SALSA20_BLOCK_SIZE */
+  ctx->input[9] += (++ctx->input[8] == 0);
+  memxor3 (c, m, x, length);
+  return;
+#else
    for (;;)
      {
        uint32_t x[_SALSA20_INPUT_LENGTH];
@@ -79,4 +102,5 @@ salsa20_crypt(struct salsa20_ctx *ctx,
        c += SALSA20_BLOCK_SIZE;
        m += SALSA20_BLOCK_SIZE;
    }
+#endif
  }
diff --git a/salsa20-internal.h b/salsa20-internal.h

index e056b8d3b9a18f216bd7911aad273dacfc1c24ef..fc1bb3102eb10e1628ca83218345330dbd9d03d2 100644 (file)
--- a/salsa20-internal.h
+++ b/salsa20-internal.h
@@ -38,8 +38,12 @@
  #include "nettle-types.h"
  
  #define _salsa20_core _nettle_salsa20_core
+#define _salsa20_2core _nettle_salsa20_2core
  
  void
  _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds);
  
+void
+_salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
  #endif /* NETTLE_SALSA20_INTERNAL_H_INCLUDED */
diff --git a/salsa20r12-crypt.c b/salsa20r12-crypt.c

index 20aecfc0480199dbda5e8dcd872c8353bfc03e47..41e32d8bf9887df1cab58b05feb544da8c8e8a6a 100644 (file)
--- a/salsa20r12-crypt.c
+++ b/salsa20r12-crypt.c
@@ -55,13 +55,35 @@ salsa20r12_crypt(struct salsa20_ctx *ctx,
                  uint8_t *c,
                  const uint8_t *m)
  {
-  uint32_t x[_SALSA20_INPUT_LENGTH];
-
    if (!length)
      return;
    
+#if HAVE_NATIVE_salsa20_2core
+  uint32_t x[2*_SALSA20_INPUT_LENGTH];  /* keystream for two consecutive blocks */
+  while (length > SALSA20_BLOCK_SIZE)   /* more than one block left: use 2core */
+    {
+      _salsa20_2core (x, ctx->input, 12);
+      ctx->input[8] += 2;
+      ctx->input[9] += (ctx->input[8] < 2);  /* carry into the high counter word */
+      if (length <= 2 * SALSA20_BLOCK_SIZE)  /* "<=": an exact pair must end here, else the tail adds a block */
+       {
+         memxor3 (c, m, x, length);
+         return;
+       }
+      memxor3 (c, m, x, 2*SALSA20_BLOCK_SIZE);
+
+      length -= 2*SALSA20_BLOCK_SIZE;
+      c += 2*SALSA20_BLOCK_SIZE;
+      m += 2*SALSA20_BLOCK_SIZE;
+    }
+  _salsa20_core (x, ctx->input, 12);  /* here 0 < length <= SALSA20_BLOCK_SIZE */
+  ctx->input[9] += (++ctx->input[8] == 0);
+  memxor3 (c, m, x, length);
+  return;
+#else
    for (;;)
      {
+      uint32_t x[_SALSA20_INPUT_LENGTH];
  
        _salsa20_core (x, ctx->input, 12);
  
@@ -80,4 +102,5 @@ salsa20r12_crypt(struct salsa20_ctx *ctx,
        c += SALSA20_BLOCK_SIZE;
        m += SALSA20_BLOCK_SIZE;
      }
+#endif
  }
author	Niels Möller <nisse@lysator.liu.se>
	Mon, 6 Jul 2020 08:57:25 +0000 (10:57 +0200)
committer	Niels Möller <nisse@lysator.liu.se>
	Mon, 6 Jul 2020 21:14:59 +0000 (23:14 +0200)
arm/neon/salsa20-2core.asm	[new file with mode: 0644]	patch \| blob
configure.ac		patch \| blob \| blame \| history
salsa20-crypt.c		patch \| blob \| blame \| history
salsa20-internal.h		patch \| blob \| blame \| history
salsa20r12-crypt.c		patch \| blob \| blame \| history