--- /dev/null
+++ b/arm/neon/chacha-3core.asm
+C arm/neon/chacha-3core.asm
+
+ifelse(<
+ Copyright (C) 2020 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+ .file "chacha-3core.asm"
+ .fpu neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+C State, X, Y and Z representing consecutive blocks
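+C Each of X0-X3 holds one row of the first block's 4x4 state matrix
+C (constants, key, key, counter+nonce); the Y and Z registers hold
+C the same rows for the second and third block, which differ only in
+C the counter.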
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<Y0>, <q8>)
+define(<Y1>, <q9>)
+define(<Y2>, <q10>)
+define(<Y3>, <q11>)
+define(<Z0>, <q12>)
+define(<Z1>, <q13>)
+define(<Z2>, <q14>)
+define(<Z3>, <q15>)
+
+define(<T0>, <q4>)
+define(<T1>, <q5>)
+define(<T2>, <q6>)
+define(<T3>, <q7>)
+
+ .text
+ .align 4
+.Lcount1:
+ .int 1,0,0,0
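+C Loaded into Z3 below and added with vadd.i64 (vadd.i32 in the
+C 32-bit counter entry point) to step the block counter in word 12.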
+
+ C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_3core)
+ vldm SRC, {X0,X1,X2,X3}
+ vpush {q4,q5,q6,q7}
+ adr r12, .Lcount1
+ vld1.64 {Z3}, [r12]
+
+ vadd.i64 Y3, X3, Z3 C Increment 64-bit counter
+ vadd.i64 Z3, Y3, Z3
+
+.Lshared_entry:
+ vmov Y0, X0
+ vmov Z0, X0
+ vmov Y1, X1
+ vmov Z1, X1
+ vmov Y2, X2
+ vmov Z2, X2
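+C Save the incremented counter rows; they are popped again after the
+C rounds and added to rows 3 of the second and third block.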
+ vpush {Z3}
+ vpush {Y3}
+
+.Loop:
+ C Interleave three blocks. Note that with this scheduling,
+ C only two temporaries, T0 and T1, are needed.
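+C Each iteration applies one column round and one diagonal round
+C (two of the ROUNDS rounds) to all three blocks. Per block, the
+C quarter round on rows a, b, c, d is
+C   a += b; d ^= a; d lrot 16
+C   c += d; b ^= c; b lrot 12
+C   a += b; d ^= a; d lrot 8
+C   c += d; b ^= c; b lrot 7
+C Rotates use vshl+vsri pairs, except lrot 16 which is vrev32.16.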
+ vadd.i32 X0, X0, X1
+ veor X3, X3, X0
+ vrev32.16 X3, X3 C lrot 16
+ vadd.i32 Y0, Y0, Y1
+ vadd.i32 X2, X2, X3
+ veor Y3, Y3, Y0
+ veor T0, X1, X2
+ vrev32.16 Y3, Y3 C lrot 16
+ vadd.i32 Z0, Z0, Z1
+ vshl.i32 X1, T0, #12
+ vadd.i32 Y2, Y2, Y3
+ veor Z3, Z3, Z0
+ vsri.u32 X1, T0, #20
+ veor T0, Y1, Y2
+ vrev32.16 Z3, Z3 C lrot 16
+ vadd.i32 X0, X0, X1
+ vshl.i32 Y1, T0, #12
+ vadd.i32 Z2, Z2, Z3
+ veor T1, X3, X0
+ vsri.u32 Y1, T0, #20
+ veor T0, Z1, Z2
+ vshl.i32 X3, T1, #8
+ vsri.u32 X3, T1, #24
+ vadd.i32 Y0, Y0, Y1
+ vshl.i32 Z1, T0, #12
+ vadd.i32 X2, X2, X3
+ veor T1, Y3, Y0
+ vsri.u32 Z1, T0, #20
+ veor T0, X1, X2
+ vshl.i32 Y3, T1, #8
+ vsri.u32 Y3, T1, #24
+ vadd.i32 Z0, Z0, Z1
+ vshl.i32 X1, T0, #7
+ vadd.i32 Y2, Y2, Y3
+ veor T1, Z3, Z0
+ vsri.u32 X1, T0, #25
+ veor T0, Y1, Y2
+ vshl.i32 Z3, T1, #8
+ vsri.u32 Z3, T1, #24
+ vshl.i32 Y1, T0, #7
+ vadd.i32 Z2, Z2, Z3
+ vsri.u32 Y1, T0, #25
+ veor T0, Z1, Z2
+ vshl.i32 Z1, T0, #7
+ vsri.u32 Z1, T0, #25
+
+ vext.32 X1, X1, X1, #1
+ vext.32 X2, X2, X2, #2
+ vext.32 X3, X3, X3, #3
+
+ vext.32 Y1, Y1, Y1, #1
+ vext.32 Y2, Y2, Y2, #2
+ vext.32 Y3, Y3, Y3, #3
+
+ vext.32 Z1, Z1, Z1, #1
+ vext.32 Z2, Z2, Z2, #2
+ vext.32 Z3, Z3, Z3, #3
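+C Rotate rows 1-3 of each block so that the second half round below
+C operates on the diagonals; the vext instructions after it (with
+C shuffle counts 3, 2, 1) rotate the rows back into place.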
+
+ vadd.i32 X0, X0, X1
+ veor X3, X3, X0
+ vrev32.16 X3, X3 C lrot 16
+ vadd.i32 Y0, Y0, Y1
+ vadd.i32 X2, X2, X3
+ veor Y3, Y3, Y0
+ veor T0, X1, X2
+ vrev32.16 Y3, Y3 C lrot 16
+ vadd.i32 Z0, Z0, Z1
+ vshl.i32 X1, T0, #12
+ vadd.i32 Y2, Y2, Y3
+ veor Z3, Z3, Z0
+ vsri.u32 X1, T0, #20
+ veor T0, Y1, Y2
+ vrev32.16 Z3, Z3 C lrot 16
+ vadd.i32 X0, X0, X1
+ vshl.i32 Y1, T0, #12
+ vadd.i32 Z2, Z2, Z3
+ veor T1, X3, X0
+ vsri.u32 Y1, T0, #20
+ veor T0, Z1, Z2
+ vshl.i32 X3, T1, #8
+ vsri.u32 X3, T1, #24
+ vadd.i32 Y0, Y0, Y1
+ vshl.i32 Z1, T0, #12
+ vadd.i32 X2, X2, X3
+ veor T1, Y3, Y0
+ vsri.u32 Z1, T0, #20
+ veor T0, X1, X2
+ vshl.i32 Y3, T1, #8
+ vsri.u32 Y3, T1, #24
+ vadd.i32 Z0, Z0, Z1
+ vshl.i32 X1, T0, #7
+ vadd.i32 Y2, Y2, Y3
+ veor T1, Z3, Z0
+ vsri.u32 X1, T0, #25
+ veor T0, Y1, Y2
+ vshl.i32 Z3, T1, #8
+ vsri.u32 Z3, T1, #24
+ vshl.i32 Y1, T0, #7
+ vadd.i32 Z2, Z2, Z3
+ vsri.u32 Y1, T0, #25
+ veor T0, Z1, Z2
+ vshl.i32 Z1, T0, #7
+ vsri.u32 Z1, T0, #25
+
+ subs ROUNDS, ROUNDS, #2
+
+ vext.32 X1, X1, X1, #3
+ vext.32 X2, X2, X2, #2
+ vext.32 X3, X3, X3, #1
+
+ vext.32 Y1, Y1, Y1, #3
+ vext.32 Y2, Y2, Y2, #2
+ vext.32 Y3, Y3, Y3, #1
+
+ vext.32 Z1, Z1, Z1, #3
+ vext.32 Z2, Z2, Z2, #2
+ vext.32 Z3, Z3, Z3, #1
+
+ bhi .Loop
+
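+C Feed forward: add the input state to each block. Rows 3 of the
+C second and third block use the incremented counters saved at
+C .Lshared_entry.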
+ vldm SRC, {T0,T1,T2,T3}
+ vadd.i32 X0, X0, T0
+ vadd.i32 Y0, Y0, T0
+ vadd.i32 Z0, Z0, T0
+ vadd.i32 X1, X1, T1
+ vadd.i32 Y1, Y1, T1
+ vadd.i32 Z1, Z1, T1
+ vadd.i32 X2, X2, T2
+ vadd.i32 Y2, Y2, T2
+ vadd.i32 Z2, Z2, T2
+
+ vpop {T0, T1} C updated counters
+ vadd.i32 X3, X3, T3
+ vadd.i32 Y3, Y3, T0
+ vadd.i32 Z3, Z3, T1
+
+ vpop {q4,q5,q6,q7}
+
+ vstmia DST!, {X0,X1,X2,X3}
+ vstmia DST!, {Y0,Y1,Y2,Y3}
+ vstm DST, {Z0,Z1,Z2,Z3}
+ bx lr
+EPILOGUE(_nettle_chacha_3core)
+
+PROLOGUE(_nettle_chacha_3core32)
+ vldm SRC, {X0,X1,X2,X3}
+ vpush {q4,q5,q6,q7}
+ adr r12, .Lcount1
+ vld1.64 {Z3}, [r12]
+
+ vadd.i32 Y3, X3, Z3 C Increment 32-bit counter
+ vadd.i32 Z3, Y3, Z3
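+C vadd.i32 adds within each 32-bit lane, so the counter in word 12
+C wraps without carrying into word 13, as required for the variant
+C with a 32-bit block counter (word 13 belongs to the nonce).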
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_3core32)
#define CHACHA_ROUNDS 20
+#if HAVE_NATIVE_chacha_3core
+void
+chacha_crypt(struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src)
+{
+ uint32_t x[3*_CHACHA_STATE_LENGTH];
+
+ if (!length)
+ return;
+
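+  /* Generate three blocks per iteration. A final chunk of more than
+     two but at most three blocks is xored with its exact length and
+     we return from inside the loop; a tail of at most two blocks
+     falls through to the single-block code below.  */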
+ while (length > 2*CHACHA_BLOCK_SIZE)
+ {
+ _chacha_3core (x, ctx->state, CHACHA_ROUNDS);
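+      /* Advance the 64-bit block counter by 3, carrying into word 13.  */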
+ ctx->state[12] += 3;
+ ctx->state[13] += (ctx->state[12] < 3);
+ if (length <= 3*CHACHA_BLOCK_SIZE)
+ {
+ memxor3 (dst, src, x, length);
+ return;
+ }
+ memxor3 (dst, src, x, 3*CHACHA_BLOCK_SIZE);
+
+ length -= 3*CHACHA_BLOCK_SIZE;
+ dst += 3*CHACHA_BLOCK_SIZE;
+ src += 3*CHACHA_BLOCK_SIZE;
+ }
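+  /* At most two blocks, possibly partial, remain.  */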
+ _chacha_core (x, ctx->state, CHACHA_ROUNDS);
+ ctx->state[13] += (++ctx->state[12] == 0);
+
+ if (length > CHACHA_BLOCK_SIZE)
+ {
+ _chacha_core (x + _CHACHA_STATE_LENGTH, ctx->state, CHACHA_ROUNDS);
+ ctx->state[13] += (++ctx->state[12] == 0);
+ }
+ memxor3 (dst, src, x, length);
+}
+#else
void
chacha_crypt(struct chacha_ctx *ctx,
size_t length,
m += CHACHA_BLOCK_SIZE;
}
}
+#endif
void
chacha_crypt32(struct chacha_ctx *ctx,
# Assembler files which generate additional object files if they are used.
asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
- chacha-core-internal-2.asm salsa20-2core.asm \
+ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
sha3-permute-2.asm sha512-compress-2.asm \
umac-nh-n-2.asm umac-nh-2.asm"
[/* Define to 1 each of the following for which a native (ie. CPU specific)
implementation of the corresponding routine exists. */
#undef HAVE_NATIVE_chacha_core
+#undef HAVE_NATIVE_chacha_3core
#undef HAVE_NATIVE_ecc_curve25519_modp
#undef HAVE_NATIVE_ecc_curve448_modp
#undef HAVE_NATIVE_ecc_secp192r1_modp