git.ipfire.org Git - thirdparty/nettle.git/commitdiff
ppc: New assembly for chacha_4core, doing four blocks in parallel.
author Niels Möller <nisse@lysator.liu.se>
Mon, 30 Nov 2020 18:54:10 +0000 (19:54 +0100)
committer Niels Möller <nisse@lysator.liu.se>
Mon, 30 Nov 2020 19:02:57 +0000 (20:02 +0100)
* chacha-crypt.c: (_nettle_chacha_crypt_4core)
(_nettle_chacha_crypt32_4core): New functions.
* chacha-internal.h: Add prototypes for _nettle_chacha_4core and
related functions.
* configure.ac (asm_nettle_optional_list): Add chacha-4core.asm.
* powerpc64/fat/chacha-4core.asm: New file.
* powerpc64/p7/chacha-4core.asm: New file.
* fat-ppc.c (fat_init): When altivec is available, use
_nettle_chacha_crypt_4core and _nettle_chacha_crypt32_4core
instead of _2core variants.

ChangeLog
chacha-crypt.c
chacha-internal.h
configure.ac
fat-ppc.c
powerpc64/fat/chacha-4core.asm [new file with mode: 0644]
powerpc64/p7/chacha-4core.asm [new file with mode: 0644]
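
The change plugs in behind nettle's unchanged public ChaCha interface, so callers need no source changes: on POWER machines with altivec/VSX, chacha_crypt and chacha_crypt32 now process four blocks per inner iteration. A minimal caller-side sketch (the helper name encrypt_buffer is illustrative, not part of nettle):

    #include <stdint.h>
    #include <nettle/chacha.h>

    /* key is CHACHA_KEY_SIZE (32) bytes, nonce is CHACHA_NONCE_SIZE (8) bytes. */
    static void
    encrypt_buffer (const uint8_t *key, const uint8_t *nonce,
                    size_t length, uint8_t *dst, const uint8_t *src)
    {
      struct chacha_ctx ctx;
      chacha_set_key (&ctx, key);
      chacha_set_nonce (&ctx, nonce);
      chacha_crypt (&ctx, length, dst, src);  /* runs the 4core path where available */
    }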

index f123ba1987ac4de491730c8c9c9510a8dea8ba0e..d47c138e78ea5b99fffcbc6ff76d15c2c7cd1bd4 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
 2020-11-30  Niels Möller  <nisse@lysator.liu.se>
 
+       * chacha-crypt.c: (_nettle_chacha_crypt_4core)
+       (_nettle_chacha_crypt32_4core): New functions.
+       * chacha-internal.h: Add prototypes for _nettle_chacha_4core and
+       related functions.
+       * configure.ac (asm_nettle_optional_list): Add chacha-4core.asm.
+       * powerpc64/fat/chacha-4core.asm: New file.
+       * powerpc64/p7/chacha-4core.asm: New file.
+       * fat-ppc.c (fat_init): When altivec is available, use
+       _nettle_chacha_crypt_4core and _nettle_chacha_crypt32_4core
+       instead of _2core variants.
+
        * chacha-crypt.c (_nettle_chacha_crypt32_3core): Fix bug in
        handling of counter; this function should not propagate any carry.
 
index a13898f1aec18c7e9a75efa0b63dbc85864a1a09..d3af5f5874b3b4769ceb6e15dacecd39a8f933ea 100644 (file)
--- a/chacha-crypt.c
+++ b/chacha-crypt.c
 
 #define CHACHA_ROUNDS 20
 
-#if HAVE_NATIVE_chacha_3core
+#if HAVE_NATIVE_chacha_4core
+#define _nettle_chacha_crypt_4core chacha_crypt
+#define _nettle_chacha_crypt32_4core chacha_crypt32
+#elif HAVE_NATIVE_chacha_3core
 #define _nettle_chacha_crypt_3core chacha_crypt
 #define _nettle_chacha_crypt32_3core chacha_crypt32
-#elif HAVE_NATIVE_chacha_2core
-#define _nettle_chacha_crypt_2core chacha_crypt
-#define _nettle_chacha_crypt32_2core chacha_crypt32
-#elif !(HAVE_NATIVE_fat_chacha_3core || HAVE_NATIVE_fat_chacha_2core)
+#elif !(HAVE_NATIVE_fat_chacha_4core || HAVE_NATIVE_fat_chacha_3core)
 #define _nettle_chacha_crypt_1core chacha_crypt
 #define _nettle_chacha_crypt32_1core chacha_crypt32
 #endif
 
+#if HAVE_NATIVE_chacha_4core || HAVE_NATIVE_fat_chacha_4core
+void
+_nettle_chacha_crypt_4core(struct chacha_ctx *ctx,
+                          size_t length,
+                          uint8_t *dst,
+                          const uint8_t *src)
+{
+  uint32_t x[4*_CHACHA_STATE_LENGTH];
+
+  if (!length)
+    return;
+
+  while (length > 2*CHACHA_BLOCK_SIZE)
+    {
+      _nettle_chacha_4core (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 4;
+      ctx->state[13] += (ctx->state[12] < 4);
+      if (length <= 4*CHACHA_BLOCK_SIZE)
+       {
+         memxor3 (dst, src, x, length);
+         return;
+       }
+      memxor3 (dst, src, x, 4*CHACHA_BLOCK_SIZE);
+
+      length -= 4*CHACHA_BLOCK_SIZE;
+      dst += 4*CHACHA_BLOCK_SIZE;
+      src += 4*CHACHA_BLOCK_SIZE;
+    }
+  if (length > CHACHA_BLOCK_SIZE)
+    {
+      _nettle_chacha_2core (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 2;
+      ctx->state[13] += (ctx->state[12] < 2);
+    }
+  else
+    {
+      _nettle_chacha_core (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[13] += (++ctx->state[12] == 0);
+    }
+  memxor3 (dst, src, x, length);
+}
+#endif
+
 #if HAVE_NATIVE_chacha_3core || HAVE_NATIVE_fat_chacha_3core
 void
 _nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
@@ -108,7 +151,7 @@ _nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
 }
 #endif
 
-#if HAVE_NATIVE_chacha_2core || HAVE_NATIVE_fat_chacha_2core
+#if 0
 void
 _nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
                           size_t length,
@@ -143,7 +186,7 @@ _nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
 }
 #endif
 
-#if !(HAVE_NATIVE_chacha_3core || HAVE_NATIVE_chacha_2core)
+#if !(HAVE_NATIVE_chacha_4core || HAVE_NATIVE_chacha_3core)
 void
 _nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
                           size_t length,
@@ -177,6 +220,47 @@ _nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
 }
 #endif
 
+#if HAVE_NATIVE_chacha_4core || HAVE_NATIVE_fat_chacha_4core
+void
+_nettle_chacha_crypt32_4core(struct chacha_ctx *ctx,
+                            size_t length,
+                            uint8_t *dst,
+                            const uint8_t *src)
+{
+  uint32_t x[4*_CHACHA_STATE_LENGTH];
+
+  if (!length)
+    return;
+
+  while (length > 2*CHACHA_BLOCK_SIZE)
+    {
+      _nettle_chacha_4core32 (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 4;
+      if (length <= 4*CHACHA_BLOCK_SIZE)
+       {
+         memxor3 (dst, src, x, length);
+         return;
+       }
+      memxor3 (dst, src, x, 4*CHACHA_BLOCK_SIZE);
+
+      length -= 4*CHACHA_BLOCK_SIZE;
+      dst += 4*CHACHA_BLOCK_SIZE;
+      src += 4*CHACHA_BLOCK_SIZE;
+    }
+  if (length > CHACHA_BLOCK_SIZE)
+    {
+      _nettle_chacha_2core32 (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 2;
+    }
+  else
+    {
+      _nettle_chacha_core (x, ctx->state, CHACHA_ROUNDS);
+      ++ctx->state[12];
+    }
+  memxor3 (dst, src, x, length);
+}
+#endif
+
 #if HAVE_NATIVE_chacha_3core || HAVE_NATIVE_fat_chacha_3core
 void
 _nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
@@ -218,7 +302,7 @@ _nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
 }
 #endif
 
-#if HAVE_NATIVE_chacha_2core || HAVE_NATIVE_fat_chacha_2core
+#if 0
 void
 _nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
                             size_t length,
@@ -252,7 +336,7 @@ _nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
 }
 #endif
 
-#if !(HAVE_NATIVE_chacha_3core || HAVE_NATIVE_chacha_2core)
+#if !(HAVE_NATIVE_chacha_4core || HAVE_NATIVE_chacha_3core)
 void
 _nettle_chacha_crypt32_1core(struct chacha_ctx *ctx,
                             size_t length,
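
The only difference between the new _4core and _4core32 glue above is the counter: chacha_crypt treats state words 12 and 13 as a single 64-bit block counter, so advancing by 4, 2 or 1 must carry into word 13, while chacha_crypt32 keeps a 32-bit counter and deliberately lets word 12 wrap. A hedged C sketch of the idiom (not part of the patch; counter_add is an illustrative name):

    #include <stdint.h>

    /* After state[12] += n, the low word wrapped exactly when the unsigned sum
       is now smaller than n; that comparison is the carry into state[13]. */
    static void
    counter_add (uint32_t state[16], uint32_t n, int carry_into_high)
    {
      state[12] += n;
      if (carry_into_high)
        state[13] += (state[12] < n);
    }
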
index d92a67792e79c10ca3194d95b66efccb0dfd7a3b..897fdc16898506886421ffecee087b65bf01099c 100644 (file)
--- a/chacha-internal.h
+++ b/chacha-internal.h
@@ -55,6 +55,12 @@ _nettle_chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds);
 void
 _nettle_chacha_3core32(uint32_t *dst, const uint32_t *src, unsigned rounds);
 
+void
+_nettle_chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
+void
+_nettle_chacha_4core32(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
 void
 _nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
                           size_t length,
@@ -62,13 +68,13 @@ _nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
                           const uint8_t *src);
 
 void
-_nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
                           size_t length,
                           uint8_t *dst,
                           const uint8_t *src);
 
 void
-_nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt_4core(struct chacha_ctx *ctx,
                           size_t length,
                           uint8_t *dst,
                           const uint8_t *src);
@@ -80,13 +86,13 @@ _nettle_chacha_crypt32_1core(struct chacha_ctx *ctx,
                             const uint8_t *src);
 
 void
-_nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
                             size_t length,
                             uint8_t *dst,
                             const uint8_t *src);
 
 void
-_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt32_4core(struct chacha_ctx *ctx,
                             size_t length,
                             uint8_t *dst,
                             const uint8_t *src);
index 6fafaa776bab2993c985f3647b08d6fde71cea94..776a9a6141211f39adae6ec539613658164d8ea4 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -499,8 +499,9 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 # Assembler files which generate additional object files if they are used.
 asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
-  chacha-2core.asm chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
-  salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
+  chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
+  salsa20-2core.asm salsa20-core-internal-2.asm \
+  sha1-compress-2.asm sha256-compress-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
   umac-nh-n-2.asm umac-nh-2.asm"
 
@@ -609,8 +610,10 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_chacha_core
 #undef HAVE_NATIVE_chacha_2core
 #undef HAVE_NATIVE_chacha_3core
+#undef HAVE_NATIVE_chacha_4core
 #undef HAVE_NATIVE_fat_chacha_2core
 #undef HAVE_NATIVE_fat_chacha_3core
+#undef HAVE_NATIVE_fat_chacha_4core
 #undef HAVE_NATIVE_ecc_curve25519_modp
 #undef HAVE_NATIVE_ecc_curve448_modp
 #undef HAVE_NATIVE_ecc_secp192r1_modp
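
For each file in asm_nettle_optional_list that configure finds usable, it defines the corresponding HAVE_NATIVE_* symbol (HAVE_NATIVE_fat_* for fat builds). A hedged illustration of what a non-fat POWER build is then expected to see, combining the config.h symbol with the mapping already shown in the chacha-crypt.c hunk above:

    /* config.h, assuming powerpc64/p7/chacha-4core.asm was accepted: */
    #define HAVE_NATIVE_chacha_4core 1

    /* chacha-crypt.c then exports the 4-way code under the public names: */
    #define _nettle_chacha_crypt_4core chacha_crypt
    #define _nettle_chacha_crypt32_4core chacha_crypt32
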
index 8d4a703d3a40763c4fcf1350adac62874b2ca8e5..847af14f7407c6af01d4a5beddf68c1d901f5d38 100644 (file)
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -214,8 +214,8 @@ fat_init (void)
       if (verbose)
        fprintf (stderr, "libnettle: enabling altivec code.\n");
       _nettle_chacha_core_vec = _nettle_chacha_core_altivec;
-      nettle_chacha_crypt_vec = _nettle_chacha_crypt_2core;
-      nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_2core;
+      nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+      nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
     }
   else
     {
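
In a fat build, the public chacha_crypt instead resolves through an internal function pointer that fat_init fills in once, after runtime CPU feature detection. A simplified sketch of that pattern (select_chacha and have_altivec are illustrative names; the real file wires this up through nettle's fat-setup machinery):

    typedef void chacha_crypt_func (struct chacha_ctx *ctx, size_t length,
                                    uint8_t *dst, const uint8_t *src);

    static chacha_crypt_func *nettle_chacha_crypt_vec;  /* chosen at startup */

    static void
    select_chacha (int have_altivec)
    {
      /* With altivec, use the new 4-blocks-per-call assembly; otherwise
         fall back to the portable one-block C implementation. */
      nettle_chacha_crypt_vec = have_altivec
        ? _nettle_chacha_crypt_4core
        : _nettle_chacha_crypt_1core;
    }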
diff --git a/powerpc64/fat/chacha-4core.asm b/powerpc64/fat/chacha-4core.asm
new file mode 100644 (file)
index 0000000..bd6be1b
--- /dev/null
+++ b/powerpc64/fat/chacha-4core.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/chacha-4core.asm
+
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure
+
+include_src(`powerpc64/p7/chacha-4core.asm')
diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm
new file mode 100644 (file)
index 0000000..49a801b
--- /dev/null
+++ b/powerpc64/p7/chacha-4core.asm
@@ -0,0 +1,231 @@
+C powerpc64/p7/chacha-4core.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+C Arguments
+define(`DST', `r3')
+define(`SRC', `r4')
+define(`ROUNDS', `r5')
+
+C Working state in v0,...,v15
+
+define(`ROT16', v16)
+define(`ROT12', v17)
+define(`ROT8', v18)
+define(`ROT7', v19)
+
+C During the loop, used to save the original values for last 4 words
+C of each block. Also used as temporaries for transpose.
+define(`T0', `v20')
+define(`T1', `v21')
+define(`T2', `v22')
+define(`T3', `v23')
+
+C ChaCha quarter round, applied to four blocks in parallel
+define(`QR',`
+       vadduwm $1, $1, $2
+       vxor    $4, $4, $1
+       vrlw    $4, $4, ROT16
+       vadduwm $3, $3, $4
+       vxor    $2, $2, $3
+       vrlw    $2, $2, ROT12
+       vadduwm $1, $1, $2
+       vxor    $4, $4, $1
+       vrlw    $4, $4, ROT8
+       vadduwm $3, $3, $4
+       vxor    $2, $2, $3
+       vrlw    $2, $2, ROT7
+ ')
+
+define(`TRANSPOSE',`
+       vmrghw  T0, $1, $3      C A0 A2 B0 B2
+       vmrghw  T1, $2, $4      C A1 A3 B1 B3
+       vmrglw  T2, $1, $3      C C0 C2 D0 D2
+       vmrglw  T3, $2, $4      C C1 C3 D1 D3
+
+       vmrghw  $1, T0, T1      C A0 A1 A2 A3
+       vmrglw  $2, T0, T1      C B0 B1 B2 B3
+       vmrghw  $3, T2, T3      C C0 C1 C2 C3
+       vmrglw  $4, T2, T3      C D0 D1 D2 D3
+')
+
+       C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_4core)
+
+       vspltisw T2, 1          C Apply counter carries
+
+.Lshared_entry:
+
+       li      r6, 0x10        C set up some...
+       li      r7, 0x20        C ...useful...
+       li      r8, 0x30        C ...offsets
+
+       addi    SP, SP, -0x40   C Save callee-save registers
+       stvx    v20, 0, SP
+       stvx    v21, r6, SP
+       stvx    v22, r7, SP
+       stvx    v23, r8, SP
+
+       vspltisw ROT16, -16     C -16 works: vspltisw immediates are 5-bit signed, and vrlw uses only the low 5 bits
+       vspltisw ROT12, 12
+       vspltisw ROT8, 8
+       vspltisw ROT7, 7
+
+C Load state and splat
+       lxvw4x  VSR(v0),  0, SRC        C "expa ..."
+       lxvw4x  VSR(v4),  r6, SRC       C key
+       lxvw4x  VSR(v8),  r7, SRC       C key
+       lxvw4x  VSR(v12), r8, SRC       C cnt and nonce
+
+       vspltw  v1, v0, 1
+       vspltw  v2, v0, 2
+       vspltw  v3, v0, 3
+       vspltw  v0, v0, 0
+       vspltw  v5, v4, 1
+       vspltw  v6, v4, 2
+       vspltw  v7, v4, 3
+       vspltw  v4, v4, 0
+       vspltw  v9,  v8, 1
+       vspltw  v10, v8, 2
+       vspltw  v11, v8, 3
+       vspltw  v8,  v8, 0
+       vspltw  v13, v12, 1
+       vspltw  v14, v12, 2
+       vspltw  v15, v12, 3
+       vspltw  v12, v12, 0
+
+       ld      r9, .Lcnts@got(r2)
+       lxvw4x  VSR(T0), 0, r9  C increments
+       vaddcuw T1, v12, T0     C compute carry-out
+       vadduwm v12, v12, T0    C low adds
+       vand    T1, T1, T2      C discard carries for 32-bit counter variant
+       vadduwm v13, v13, T1    C apply carries
+
+       C Save the original last four state words (counter and nonce) of all four blocks.
+       vor     T0, v12, v12
+       vor     T1, v13, v13
+       vor     T2, v14, v14
+       vor     T3, v15, v15
+
+       srdi    ROUNDS, ROUNDS, 1
+       mtctr   ROUNDS
+.Loop:
+       QR(v0, v4,  v8, v12)
+       QR(v1, v5,  v9, v13)
+       QR(v2, v6, v10, v14)
+       QR(v3, v7, v11, v15)
+       QR(v0, v5, v10, v15)
+       QR(v1, v6, v11, v12)
+       QR(v2, v7,  v8, v13)
+       QR(v3, v4,  v9, v14)
+       bdnz    .Loop
+
+       C Add in saved original words, including counters, before
+       C transpose.
+       vadduwm v12, v12, T0
+       vadduwm v13, v13, T1
+       vadduwm v14, v14, T2
+       vadduwm v15, v15, T3
+
+       TRANSPOSE(v0, v1, v2, v3)
+       TRANSPOSE(v4, v5, v6, v7)
+       TRANSPOSE(v8, v9, v10, v11)
+       TRANSPOSE(v12, v13, v14, v15)
+
+       lxvw4x  VSR(T0),  0, SRC
+       lxvw4x  VSR(T1), r6, SRC
+       lxvw4x  VSR(T2), r7, SRC
+
+       vadduwm v0, v0, T0
+       vadduwm v1, v1, T0
+       vadduwm v2, v2, T0
+       vadduwm v3, v3, T0
+
+       vadduwm v4, v4, T1
+       vadduwm v5, v5, T1
+       vadduwm v6, v6, T1
+       vadduwm v7, v7, T1
+
+       vadduwm v8, v8, T2
+       vadduwm v9, v9, T2
+       vadduwm v10, v10, T2
+       vadduwm v11, v11, T2
+
+       stxvw4x VSR(v0), 0, DST
+       stxvw4x VSR(v4), r6, DST
+       stxvw4x VSR(v8), r7, DST
+       stxvw4x VSR(v12), r8, DST
+
+       addi    DST, DST, 64
+
+       stxvw4x VSR(v1), 0, DST
+       stxvw4x VSR(v5), r6, DST
+       stxvw4x VSR(v9), r7, DST
+       stxvw4x VSR(v13), r8, DST
+
+       addi    DST, DST, 64
+
+       stxvw4x VSR(v2), 0, DST
+       stxvw4x VSR(v6), r6, DST
+       stxvw4x VSR(v10), r7, DST
+       stxvw4x VSR(v14), r8, DST
+
+       addi    DST, DST, 64
+
+       stxvw4x VSR(v3), 0, DST
+       stxvw4x VSR(v7), r6, DST
+       stxvw4x VSR(v11), r7, DST
+       stxvw4x VSR(v15), r8, DST
+
+       C Restore callee-save registers
+       lvx     v20, 0, SP
+       lvx     v21, r6, SP
+       lvx     v22, r7, SP
+       lvx     v23, r8, SP
+       addi    SP, SP, 0x40
+
+       blr
+EPILOGUE(_nettle_chacha_4core)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_4core32)
+       vspltisw T2, 0          C Ignore counter carries
+       b       .Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+       .section .rodata
+       ALIGN(16)
+.Lcnts: .long  0,1,2,3         C increments
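
For readers less used to altivec: each of the 16 ChaCha state words gets its own vector register, splatted so that lane i holds that word for block i. One vadduwm/vxor/vrlw sequence therefore advances the same quarter-round step of all four blocks at once, and the final TRANSPOSE regroups the lanes into four contiguous 64-byte output blocks. A plain C reference for what a single QR invocation computes per block (the standard ChaCha quarter round; names are illustrative):

    #include <stdint.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* One ChaCha quarter round; QR(a, b, c, d) above does exactly this,
       but on four blocks in parallel, one block per vector lane. */
    static void
    quarter_round (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
      *a += *b; *d ^= *a; *d = ROTL32 (*d, 16);
      *c += *d; *b ^= *c; *b = ROTL32 (*b, 12);
      *a += *b; *d ^= *a; *d = ROTL32 (*d, 8);
      *c += *d; *b ^= *c; *b = ROTL32 (*b, 7);
    }

Each pass through .Loop runs four column quarter rounds followed by four diagonal quarter rounds, i.e. one ChaCha double round, which is why ROUNDS is halved with srdi before being moved to the count register.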