From: Niels Möller <nisse@lysator.liu.se>
Date: Mon, 13 Jul 2020 19:43:37 +0000 (+0200)
Subject: Three-way interleaving of chacha on Neon
X-Git-Tag: nettle_3.7rc1~89
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7a9d3f59ae3cf0690135d951580516cdafc4db5d;p=thirdparty%2Fnettle.git

Three-way interleaving of chacha on Neon
---

diff --git a/ChangeLog b/ChangeLog
index 40104add..8b496f1a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2020-07-13  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm/neon/chacha-3core.asm: New file, 3-way interleaving of
+	chacha.
+
 2020-07-11  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/chacha-test.c (test_main): Delete obsolete tests for
diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
new file mode 100644
index 00000000..b73df2f1
--- /dev/null
+++ b/arm/neon/chacha-3core.asm
@@ -0,0 +1,242 @@
+C arm/neon/chacha-3core.asm
+
+ifelse(<
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+	.file "chacha-3core.asm"
+	.fpu	neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+C State, X, Y and Z representing consecutive blocks
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<Y0>, <q8>)
+define(<Y1>, <q9>)
+define(<Y2>, <q10>)
+define(<Y3>, <q11>)
+define(<Z0>, <q12>)
+define(<Z1>, <q13>)
+define(<Z2>, <q14>)
+define(<Z3>, <q15>)
+
+define(<T0>, <q4>)
+define(<T1>, <q5>)
+define(<T2>, <q6>)
+define(<T3>, <q7>)
+
+	.text
+	.align 4
+.Lcount1:
+	.int 1,0,0,0
+
+	C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_3core)
+	vldm	SRC, {X0,X1,X2,X3}
+	vpush	{q4,q5,q6,q7}
+	adr	r12, .Lcount1
+	vld1.64	{Z3}, [r12]
+
+	vadd.i64	Y3, X3, Z3	C Increment 64-bit counter
+	vadd.i64	Z3, Y3, Z3
+
+.Lshared_entry:
+	vmov	Y0, X0
+	vmov	Z0, X0
+	vmov	Y1, X1
+	vmov	Z1, X1
+	vmov	Y2, X2
+	vmov	Z2, X2
+	vpush	{Z3}
+	vpush	{Y3}
+
+.Loop:
+	C Interleave three blocks. Note that with this scheduling,
+	C only two temporaries, T0 and T1, are needed.
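+	C
+	C Each half of the double round below is the standard ChaCha
+	C quarter round,
+	C	a += b; d ^= a; d <<<= 16;
+	C	c += d; b ^= c; b <<<= 12;
+	C	a += b; d ^= a; d <<<= 8;
+	C	c += d; b ^= c; b <<<= 7;
+	C applied to all four columns of all three blocks. Rows 1-3 are
+	C rotated by the vext instructions between the two halves, the
+	C 16-bit rotate is a single vrev32.16, and the remaining rotates
+	C are vshl/vsri pairs.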
+	vadd.i32	X0, X0, X1
+	veor	X3, X3, X0
+	vrev32.16	X3, X3	C lrot 16
+	vadd.i32	Y0, Y0, Y1
+	vadd.i32	X2, X2, X3
+	veor	Y3, Y3, Y0
+	veor	T0, X1, X2
+	vrev32.16	Y3, Y3	C lrot 16
+	vadd.i32	Z0, Z0, Z1
+	vshl.i32	X1, T0, #12
+	vadd.i32	Y2, Y2, Y3
+	veor	Z3, Z3, Z0
+	vsri.u32	X1, T0, #20
+	veor	T0, Y1, Y2
+	vrev32.16	Z3, Z3	C lrot 16
+	vadd.i32	X0, X0, X1
+	vshl.i32	Y1, T0, #12
+	vadd.i32	Z2, Z2, Z3
+	veor	T1, X3, X0
+	vsri.u32	Y1, T0, #20
+	veor	T0, Z1, Z2
+	vshl.i32	X3, T1, #8
+	vsri.u32	X3, T1, #24
+	vadd.i32	Y0, Y0, Y1
+	vshl.i32	Z1, T0, #12
+	vadd.i32	X2, X2, X3
+	veor	T1, Y3, Y0
+	vsri.u32	Z1, T0, #20
+	veor	T0, X1, X2
+	vshl.i32	Y3, T1, #8
+	vsri.u32	Y3, T1, #24
+	vadd.i32	Z0, Z0, Z1
+	vshl.i32	X1, T0, #7
+	vadd.i32	Y2, Y2, Y3
+	veor	T1, Z3, Z0
+	vsri.u32	X1, T0, #25
+	veor	T0, Y1, Y2
+	vshl.i32	Z3, T1, #8
+	vsri.u32	Z3, T1, #24
+	vshl.i32	Y1, T0, #7
+	vadd.i32	Z2, Z2, Z3
+	vsri.u32	Y1, T0, #25
+	veor	T0, Z1, Z2
+	vshl.i32	Z1, T0, #7
+	vsri.u32	Z1, T0, #25
+
+	vext.32	X1, X1, X1, #1
+	vext.32	X2, X2, X2, #2
+	vext.32	X3, X3, X3, #3
+
+	vext.32	Y1, Y1, Y1, #1
+	vext.32	Y2, Y2, Y2, #2
+	vext.32	Y3, Y3, Y3, #3
+
+	vext.32	Z1, Z1, Z1, #1
+	vext.32	Z2, Z2, Z2, #2
+	vext.32	Z3, Z3, Z3, #3
+
+	vadd.i32	X0, X0, X1
+	veor	X3, X3, X0
+	vrev32.16	X3, X3	C lrot 16
+	vadd.i32	Y0, Y0, Y1
+	vadd.i32	X2, X2, X3
+	veor	Y3, Y3, Y0
+	veor	T0, X1, X2
+	vrev32.16	Y3, Y3	C lrot 16
+	vadd.i32	Z0, Z0, Z1
+	vshl.i32	X1, T0, #12
+	vadd.i32	Y2, Y2, Y3
+	veor	Z3, Z3, Z0
+	vsri.u32	X1, T0, #20
+	veor	T0, Y1, Y2
+	vrev32.16	Z3, Z3	C lrot 16
+	vadd.i32	X0, X0, X1
+	vshl.i32	Y1, T0, #12
+	vadd.i32	Z2, Z2, Z3
+	veor	T1, X3, X0
+	vsri.u32	Y1, T0, #20
+	veor	T0, Z1, Z2
+	vshl.i32	X3, T1, #8
+	vsri.u32	X3, T1, #24
+	vadd.i32	Y0, Y0, Y1
+	vshl.i32	Z1, T0, #12
+	vadd.i32	X2, X2, X3
+	veor	T1, Y3, Y0
+	vsri.u32	Z1, T0, #20
+	veor	T0, X1, X2
+	vshl.i32	Y3, T1, #8
+	vsri.u32	Y3, T1, #24
+	vadd.i32	Z0, Z0, Z1
+	vshl.i32	X1, T0, #7
+	vadd.i32	Y2, Y2, Y3
+	veor	T1, Z3, Z0
+	vsri.u32	X1, T0, #25
+	veor	T0, Y1, Y2
+	vshl.i32	Z3, T1, #8
+	vsri.u32	Z3, T1, #24
+	vshl.i32	Y1, T0, #7
+	vadd.i32	Z2, Z2, Z3
+	vsri.u32	Y1, T0, #25
+	veor	T0, Z1, Z2
+	vshl.i32	Z1, T0, #7
+	vsri.u32	Z1, T0, #25
+
+	subs	ROUNDS, ROUNDS, #2
+
+	vext.32	X1, X1, X1, #3
+	vext.32	X2, X2, X2, #2
+	vext.32	X3, X3, X3, #1
+
+	vext.32	Y1, Y1, Y1, #3
+	vext.32	Y2, Y2, Y2, #2
+	vext.32	Y3, Y3, Y3, #1
+
+	vext.32	Z1, Z1, Z1, #3
+	vext.32	Z2, Z2, Z2, #2
+	vext.32	Z3, Z3, Z3, #1
+
+	bhi	.Loop
+
+	vldm	SRC, {T0,T1,T2,T3}
+	vadd.i32	X0, X0, T0
+	vadd.i32	Y0, Y0, T0
+	vadd.i32	Z0, Z0, T0
+	vadd.i32	X1, X1, T1
+	vadd.i32	Y1, Y1, T1
+	vadd.i32	Z1, Z1, T1
+	vadd.i32	X2, X2, T2
+	vadd.i32	Y2, Y2, T2
+	vadd.i32	Z2, Z2, T2
+
+	vpop	{T0, T1}	C updated counters
+	vadd.i32	X3, X3, T3
+	vadd.i32	Y3, Y3, T0
+	vadd.i32	Z3, Z3, T1
+
+	vpop	{q4,q5,q6,q7}
+
+	vstmia	DST!, {X0,X1,X2,X3}
+	vstmia	DST!, {Y0,Y1,Y2,Y3}
+	vstm	DST, {Z0,Z1,Z2,Z3}
+	bx	lr
+EPILOGUE(_nettle_chacha_3core)
+
+PROLOGUE(_nettle_chacha_3core32)
+	vldm	SRC, {X0,X1,X2,X3}
+	vpush	{q4,q5,q6,q7}
+	adr	r12, .Lcount1
+	vld1.64	{Z3}, [r12]
+
+	vadd.i32	Y3, X3, Z3	C Increment 32-bit counter
+	vadd.i32	Z3, Y3, Z3
+	b	.Lshared_entry
+EPILOGUE(_nettle_chacha_3core32)
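[Editorial note, not part of the patch: the contract assumed of
_chacha_3core below is that of three consecutive _chacha_core calls
with the block counter stepped in between. A minimal portable sketch;
chacha_3core_ref is a hypothetical name used only for illustration,
while _chacha_core and _CHACHA_STATE_LENGTH come from the existing
nettle headers.

  #include <string.h>

  #include "chacha.h"		/* _CHACHA_STATE_LENGTH */
  #include "chacha-internal.h"	/* _chacha_core */

  /* Writes three consecutive 64-byte keystream blocks to dst.  */
  static void
  chacha_3core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    uint32_t s[_CHACHA_STATE_LENGTH];
    unsigned i;

    memcpy (s, src, sizeof (s));
    for (i = 0; i < 3; i++)
      {
        _chacha_core (dst + i*_CHACHA_STATE_LENGTH, s, rounds);
        /* 64-bit counter in words 12-13; the _chacha_3core32 variant
           instead lets word 12 wrap mod 2^32.  */
        s[13] += (++s[12] == 0);
      }
  }

This is why chacha_crypt below can advance the saved counter with
ctx->state[12] += 3, carrying into ctx->state[13] when the addition
wraps, which is detected by ctx->state[12] < 3.]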
diff --git a/chacha-crypt.c b/chacha-crypt.c
index 1797bd02..59d808d1 100644
--- a/chacha-crypt.c
+++ b/chacha-crypt.c
@@ -54,6 +54,45 @@
 
 #define CHACHA_ROUNDS 20
 
+#if HAVE_NATIVE_chacha_3core
+void
+chacha_crypt(struct chacha_ctx *ctx,
+	     size_t length,
+	     uint8_t *dst,
+	     const uint8_t *src)
+{
+  uint32_t x[3*_CHACHA_STATE_LENGTH];
+
+  if (!length)
+    return;
+
+  while (length > 2*CHACHA_BLOCK_SIZE)
+    {
+      _chacha_3core (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 3;
+      ctx->state[13] += (ctx->state[12] < 3);
+      if (length <= 3*CHACHA_BLOCK_SIZE)
+	{
+	  memxor3 (dst, src, x, length);
+	  return;
+	}
+      memxor3 (dst, src, x, 3*CHACHA_BLOCK_SIZE);
+
+      length -= 3*CHACHA_BLOCK_SIZE;
+      dst += 3*CHACHA_BLOCK_SIZE;
+      src += 3*CHACHA_BLOCK_SIZE;
+    }
+  _chacha_core (x, ctx->state, CHACHA_ROUNDS);
+  ctx->state[13] += (++ctx->state[12] == 0);
+
+  if (length > CHACHA_BLOCK_SIZE)
+    {
+      _chacha_core (x + _CHACHA_STATE_LENGTH, ctx->state, CHACHA_ROUNDS);
+      ctx->state[13] += (++ctx->state[12] == 0);
+    }
+  memxor3 (dst, src, x, length);
+}
+#else
 void
 chacha_crypt(struct chacha_ctx *ctx,
 	     size_t length,
@@ -85,6 +124,7 @@ chacha_crypt(struct chacha_ctx *ctx,
       m += CHACHA_BLOCK_SIZE;
     }
 }
+#endif
 
 void
 chacha_crypt32(struct chacha_ctx *ctx,
diff --git a/chacha-internal.h b/chacha-internal.h
index 1bca8e74..cc90b132 100644
--- a/chacha-internal.h
+++ b/chacha-internal.h
@@ -39,8 +39,13 @@
 #include "nettle-types.h"
 
 #define _chacha_core _nettle_chacha_core
+#define _chacha_3core _nettle_chacha_3core
 
 void
 _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds);
 
+/* Functions available only in some configurations */
+void
+_chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
 #endif /* NETTLE_CHACHA_INTERNAL_H_INCLUDED */
diff --git a/configure.ac b/configure.ac
index a01eb7d3..3136c1a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -455,7 +455,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 # Assembler files which generate additional object files if they are used.
 asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
-  chacha-core-internal-2.asm salsa20-2core.asm \
+  chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
   umac-nh-n-2.asm umac-nh-2.asm"
@@ -559,6 +559,7 @@ AH_VERBATIM([HAVE_NATIVE],
 [/* Define to 1 each of the following for which a native (ie. CPU specific)
    implementation of the corresponding routine exists.  */
 #undef HAVE_NATIVE_chacha_core
+#undef HAVE_NATIVE_chacha_3core
 #undef HAVE_NATIVE_ecc_curve25519_modp
 #undef HAVE_NATIVE_ecc_curve448_modp
 #undef HAVE_NATIVE_ecc_secp192r1_modp
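[Usage note, also not part of the patch: any chacha_crypt call of more
than two blocks now reaches _chacha_3core on builds where configure
selects arm/neon/chacha-3core.asm. A minimal sketch against the public
nettle API; the all-zero key and nonce are chosen arbitrarily, and the
program links with -lnettle.

  #include <stdio.h>
  #include <nettle/chacha.h>

  int
  main (void)
  {
    struct chacha_ctx ctx;
    uint8_t key[CHACHA_KEY_SIZE] = { 0 };
    uint8_t nonce[CHACHA_NONCE_SIZE] = { 0 };
    /* Three blocks, so length is above the 2*CHACHA_BLOCK_SIZE
       threshold and the 3-way path is taken once.  */
    uint8_t msg[3*CHACHA_BLOCK_SIZE] = { 0 };
    uint8_t out[sizeof (msg)];

    chacha_set_key (&ctx, key);
    chacha_set_nonce (&ctx, nonce);
    chacha_crypt (&ctx, sizeof (msg), out, msg);

    /* Encrypting zeros exposes the raw keystream.  */
    printf ("first keystream byte: %02x\n", out[0]);
    return 0;
  }

Output must match the generic C path bit for bit, which is what the
existing testsuite/chacha-test.c checks across block-count boundaries.]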