[Arm64] Optimize Poly1305
author     Mamone Tarsha <maamoun.tk@googlemail.com>
           Tue, 18 Jan 2022 17:29:32 +0000 (19:29 +0200)
committer  Mamone Tarsha <maamoun.tk@googlemail.com>
           Tue, 18 Jan 2022 17:29:32 +0000 (19:29 +0200)
Makefile.in
arm64/asimd/poly1305-2core.asm [new file with mode: 0644]
arm64/fat/poly1305-2core.asm [new file with mode: 0644]
chacha-poly1305.c
configure.ac
fat-arm64.c
fat-setup.h
poly1305-aes.c
poly1305-internal.h
poly1305-update.c [new file with mode: 0644]

diff --git a/Makefile.in b/Makefile.in
index 0590c370b26f1b340452c6458b7f1f166107ee10..4fd02bf6be745a0847333e6c6daaa256ecc70650 100644
@@ -102,6 +102,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
                 siv-cmac.c siv-cmac-aes128.c siv-cmac-aes256.c \
                 cnd-memcpy.c \
                 chacha-crypt.c chacha-core-internal.c \
+                poly1305-update.c \
                 chacha-poly1305.c chacha-poly1305-meta.c \
                 chacha-set-key.c chacha-set-nonce.c \
                 ctr.c ctr16.c des.c des3.c \
@@ -606,7 +607,7 @@ distdir: $(DISTFILES)
        set -e; for d in sparc32 sparc64 x86 \
                x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
                arm arm/neon arm/v6 arm/fat \
-               arm64 arm64/crypto arm64/fat \
+               arm64 arm64/asimd arm64/crypto arm64/fat \
                powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat \
                s390x s390x/vf s390x/msa s390x/msa_x1 s390x/msa_x2 s390x/msa_x4 s390x/fat ; do \
          mkdir "$(distdir)/$$d" ; \
diff --git a/arm64/asimd/poly1305-2core.asm b/arm64/asimd/poly1305-2core.asm
new file mode 100644
index 0000000..d624cde
--- /dev/null
+++ b/arm64/asimd/poly1305-2core.asm
@@ -0,0 +1,351 @@
+C arm64/asimd/poly1305-2core.asm
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`CTX', `x0')
+define(`DATA', `x1')
+define(`LEN', `x2')
+define(`T4', `w3')
+
+C Working state
+define(`H0', `v1')
+define(`H1', `v2')
+define(`H2', `v3')
+define(`H3', `v4')
+define(`H4', `v0')
+
+define(`R0', `v16')
+define(`R1', `v17')
+define(`R2', `v18')
+define(`R3', `v19')
+define(`R4', `v20')
+
+define(`S1', `v21')
+define(`S2', `v22')
+define(`S3', `v23')
+define(`S4', `v24')
+
+define(`C0', `v25')
+define(`C1', `v26')
+define(`C2',  `v27')
+define(`C3',  `v28')
+define(`C4',  `v29')
+
+define(`T4W',  `v5')
+define(`MASK26',  `v6')
+define(`H2TBL',  `v7')
+
+C Multiply both horizontal parts of the state by the key and reduce both products
+define(`MUL_REDC', `
+       umull   C0.2d, H0.2s, R0.2s
+       umull   C1.2d, H1.2s, R0.2s
+       umull   C2.2d, H2.2s, R0.2s
+       umull   C3.2d, H3.2s, R0.2s
+       umull   C4.2d, H4.2s, R0.2s
+
+       umlal   C0.2d, H4.2s, S1.2s
+       umlal   C1.2d, H0.2s, R1.2s
+       umlal   C2.2d, H1.2s, R1.2s
+       umlal   C3.2d, H2.2s, R1.2s
+       umlal   C4.2d, H3.2s, R1.2s
+
+       umlal   C0.2d, H3.2s, S2.2s
+       umlal   C1.2d, H4.2s, S2.2s
+       umlal   C2.2d, H0.2s, R2.2s
+       umlal   C3.2d, H1.2s, R2.2s
+       umlal   C4.2d, H2.2s, R2.2s
+
+       umlal   C0.2d, H2.2s, S3.2s
+       umlal   C1.2d, H3.2s, S3.2s
+       umlal   C2.2d, H4.2s, S3.2s
+       umlal   C3.2d, H0.2s, R3.2s
+       umlal   C4.2d, H1.2s, R3.2s
+
+       umlal   C0.2d, H1.2s, S4.2s
+       umlal   C1.2d, H2.2s, S4.2s
+       umlal   C2.2d, H3.2s, S4.2s
+       umlal   C3.2d, H4.2s, S4.2s
+       umlal   C4.2d, H0.2s, R4.2s
+
+       C -- Reduction phase --
+
+       C carry h0 -> h1
+       C carry h3 -> h4
+       ushr    H1.2d, C0.2d, #26
+       ushr    H4.2d, C3.2d, #26
+       add             H1.2d, H1.2d, C1.2d
+       add             H4.2d, H4.2d, C4.2d
+       and             H0.16b, C0.16b, MASK26.16b
+       and             H3.16b, C3.16b, MASK26.16b
+
+       C carry h1 -> h2
+       C carry h4 -> h0
+       ushr    C1.2d, H1.2d, #26
+       ushr    C4.2d, H4.2d, #26
+       add             H2.2d, C2.2d, C1.2d
+       add             H0.2d, H0.2d, C4.2d
+       and             H1.16b, H1.16b, MASK26.16b
+       and             H4.16b, H4.16b, MASK26.16b
+
+       C carry h4*4 -> h0
+       C carry h2 -> h3
+       shl             C4.2d, C4.2d, #2
+       ushr    C2.2d, H2.2d, #26
+       add             H0.2d, H0.2d, C4.2d
+       add             H3.2d, H3.2d, C2.2d
+       and             H2.16b, H2.16b, MASK26.16b
+
+       C carry h0 -> h1
+       C carry h3 -> h4
+       ushr    C0.2d, H0.2d, #26
+       ushr    C3.2d, H3.2d, #26
+       add             H1.2d, H1.2d, C0.2d
+       add             H4.2d, H4.2d, C3.2d
+       and             H0.16b, H0.16b, MASK26.16b
+       and             H3.16b, H3.16b, MASK26.16b
+       ')
+
+       .text
+       C void _nettle_poly1305_2core(struct poly1305_ctx *ctx, const uint8_t *m, size_t len, unsigned t4)
+
+PROLOGUE(_nettle_poly1305_2core)
+       adr             x4, .mask26
+       ld1             {MASK26.2d}, [x4]
+       adr             x4, .h2tbl
+       ld1             {H2TBL.16b}, [x4]
+
+       C Shift and replicate T4 across vector
+       lsl             T4, T4, #24
+       dup             T4W.4s, T4
+
+       C If the buffer holds only two blocks, process them one at a time
+       cmp             LEN, #32
+       b.eq    L2B
+
+       C This procedure processes two blocks horizontally across the
+       C vector registers. To keep the two parts of the state separate,
+       C the state is stored in the first lane of each vector register
+       C and the second lane is initialized with zeros. On each
+       C iteration, two blocks are added to the two parts and both
+       C parts are multiplied by r^2, except on the last iteration,
+       C where the first part is multiplied by r^2 and the second part
+       C by r. This maintains the correct sequence of key powers for
+       C each multiplication of consecutive blocks.
+
+       C Load key and cached multiples
+       ld4             {R0.s, R1.s, R2.s, R3.s}[0], [CTX], #16
+       ld1             {R4.s}[0], [CTX], #4
+       ld4             {S1.s, S2.s, S3.s, S4.s}[0], [CTX], #16
+
+       C -- Calculate r^2 = r*r --
+
+       ins             H0.s[0], R0.s[0]
+       ins             H1.s[0], R1.s[0]
+       ins             H2.s[0], R2.s[0]
+       ins             H3.s[0], R3.s[0]
+       ins             H4.s[0], R4.s[0]
+
+       MUL_REDC()
+
+       C Broadcast r^2 across both parts of the key vectors
+       dup             R0.4s, H0.s[0]
+       dup             R1.4s, H1.s[0]
+       dup             R2.4s, H2.s[0]
+       dup             R3.4s, H3.s[0]
+       dup             R4.4s, H4.s[0]
+
+       C Calculate S = R*5
+       shl             S1.4s, R1.4s, #2
+       shl             S2.4s, R2.4s, #2
+       shl             S3.4s, R3.4s, #2
+       shl             S4.4s, R4.4s, #2
+       add             S1.4s, S1.4s, R1.4s
+       add             S2.4s, S2.4s, R2.4s
+       add             S3.4s, S3.4s, R3.4s
+       add             S4.4s, S4.4s, R4.4s
+
+       C Initialize the second part of the state with zeros
+       eor             H0.16b, H0.16b, H0.16b
+       eor             H1.16b, H1.16b, H1.16b
+       eor             H2.16b, H2.16b, H2.16b
+       eor             H3.16b, H3.16b, H3.16b
+       eor             H4.16b, H4.16b, H4.16b
+
+       C Load state
+       ld4             {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+       ld1             {H3.s}[0], [CTX]
+
+       C Iterate over every pair of blocks, leaving the final pair for the end.
+       sub             LEN, LEN, #32
+L2B_loop:
+       C Load two blocks
+       ld1             {C3.16b, C4.16b}, [DATA], #32
+
+       C Permute the two blocks and lay them out horizontally
+       zip1    C0.2d, C3.2d, C4.2d
+       tbl             C2.16b, { C3.16b, C4.16b }, H2TBL.16b
+       zip2    C4.2d, C3.2d, C4.2d
+
+       ushr    C1.2d, C0.2d, #26
+       ushr    C2.2d, C2.2d, #4
+       ushr    C3.2d, C4.2d, #14
+       ushr    C4.2d, C4.2d, #40
+
+       and             C0.16b, C0.16b, MASK26.16b
+       and             C1.16b, C1.16b, MASK26.16b
+       and             C2.16b, C2.16b, MASK26.16b
+       and             C3.16b, C3.16b, MASK26.16b
+       orr             C4.16b, C4.16b, T4W.16b
+
+       add             H0.2d, H0.2d, C0.2d
+       add             H1.2d, H1.2d, C1.2d
+       add             H2.2d, H2.2d, C2.2d
+       add             H3.2d, H3.2d, C3.2d
+       add             H4.2d, H4.2d, C4.2d
+
+       xtn             H0.2s, H0.2d
+       xtn             H1.2s, H1.2d
+       xtn             H2.2s, H2.2d
+       xtn             H3.2s, H3.2d
+       xtn             H4.2s, H4.2d
+
+       MUL_REDC()
+
+       subs    LEN, LEN, #32
+       b.ne    L2B_loop
+
+       C Set the first part of key to r^2 and the second part to r
+       sub             CTX, CTX, #52
+       ld4             {R0.s, R1.s, R2.s, R3.s}[1], [CTX], #16
+       ld1             {R4.s}[1], [CTX], #4
+       ld4             {S1.s, S2.s, S3.s, S4.s}[1], [CTX], #16
+
+       ld1             {C3.16b, C4.16b}, [DATA]
+
+       zip1    C0.2d, C3.2d, C4.2d
+       tbl             C2.16b, { C3.16b, C4.16b }, H2TBL.16b
+       zip2    C4.2d, C3.2d, C4.2d
+
+       ushr    C1.2d, C0.2d, #26
+       ushr    C2.2d, C2.2d, #4
+       ushr    C3.2d, C4.2d, #14
+       ushr    C4.2d, C4.2d, #40
+
+       and             C0.16b, C0.16b, MASK26.16b
+       and             C1.16b, C1.16b, MASK26.16b
+       and             C2.16b, C2.16b, MASK26.16b
+       and             C3.16b, C3.16b, MASK26.16b
+       orr             C4.16b, C4.16b, T4W.16b
+
+       add             H0.2d, H0.2d, C0.2d
+       add             H1.2d, H1.2d, C1.2d
+       add             H2.2d, H2.2d, C2.2d
+       add             H3.2d, H3.2d, C3.2d
+       add             H4.2d, H4.2d, C4.2d
+
+       xtn             H0.2s, H0.2d
+       xtn             H1.2s, H1.2d
+       xtn             H2.2s, H2.2d
+       xtn             H3.2s, H3.2d
+       xtn             H4.2s, H4.2d
+
+       MUL_REDC()
+
+       C Combine both state parts
+       dup             C0.2d, H0.d[1]
+       dup             C1.2d, H1.d[1]
+       dup             C2.2d, H2.d[1]
+       dup             C3.2d, H3.d[1]
+       dup             C4.2d, H4.d[1]
+
+       add             H0.2d, H0.2d, C0.2d
+       add             H1.2d, H1.2d, C1.2d
+       add             H2.2d, H2.2d, C2.2d
+       add             H3.2d, H3.2d, C3.2d
+       add             H4.2d, H4.2d, C4.2d
+
+       b               Ldone
+
+       C Process the two blocks one at a time
+L2B:
+       ld4             {R0.s, R1.s, R2.s, R3.s}[0], [CTX], #16
+       ld1             {R4.s}[0], [CTX], #4
+       ld4             {S1.s, S2.s, S3.s, S4.s}[0], [CTX], #16
+       ld4             {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+       ld1             {H3.s}[0], [CTX]
+       sub             CTX, CTX, #16
+L1B_loop:
+       ld1             {C0.16b}, [DATA], #16
+
+       tbl             C2.16b, { C0.16b }, H2TBL.16b
+       ext             C4.16b, C0.16b, C0.16b, #8
+
+       ushr    C1.2d, C0.2d, #26
+       ushr    C2.2d, C2.2d, #4
+       ushr    C3.2d, C4.2d, #14
+       ushr    C4.2d, C4.2d, #40
+
+       and             C0.16b, C0.16b, MASK26.16b
+       and             C1.16b, C1.16b, MASK26.16b
+       and             C2.16b, C2.16b, MASK26.16b
+       and             C3.16b, C3.16b, MASK26.16b
+       orr             C4.16b, C4.16b, T4W.16b
+
+       add             H0.2d, H0.2d, C0.2d
+       add             H1.2d, H1.2d, C1.2d
+       add             H2.2d, H2.2d, C2.2d
+       add             H3.2d, H3.2d, C3.2d
+       add             H4.2d, H4.2d, C4.2d
+
+       xtn             H0.2s, H0.2d
+       xtn             H1.2s, H1.2d
+       xtn             H2.2s, H2.2d
+       xtn             H3.2s, H3.2d
+       xtn             H4.2s, H4.2d
+
+       MUL_REDC()
+
+       subs    LEN, LEN, #16
+       b.ne    L1B_loop
+
+Ldone:
+       C Store state
+       st4             {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+       st1             {H3.s}[0], [CTX]
+
+       ret
+EPILOGUE(_nettle_poly1305_2core)
+
+.align 4
+.mask26: .quad 0x0000000003FFFFFF,0x0000000003FFFFFF
+.h2tbl: .byte  0x06,0x07,0x08,0x09,0x00,0x00,0x00,0x00,0x16,0x17,0x18,0x19,0x00,0x00,0x00,0x00
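
As a reference for the arithmetic the assembly above vectorizes, here is a scalar C sketch of the radix-2^26 representation and the multiply-reduce step. This is illustrative only, not Nettle's code: block_to_limbs and mul_redc are made-up names, but the limb split mirrors the ushr/and/tbl sequence and the multiplication mirrors one lane of MUL_REDC, with s[i] = 5*r[i] folding the high limbs back (since 2^130 is congruent to 5 mod 2^130-5).

#include <stdint.h>
#include <string.h>

#define MASK26 0x3ffffffUL

/* Split a 16-byte block into five 26-bit limbs; t4 is the 2^128 bit
   (1 for a full block, as in _nettle_poly1305_block). */
static void
block_to_limbs (uint32_t h[5], const uint8_t m[16], uint32_t t4)
{
  uint64_t lo, hi;
  memcpy (&lo, m, 8);     /* little-endian host assumed, like the asm */
  memcpy (&hi, m + 8, 8);
  h[0] = lo & MASK26;
  h[1] = (lo >> 26) & MASK26;
  h[2] = ((lo >> 52) | (hi << 12)) & MASK26;  /* the tbl + ushr #4 step */
  h[3] = (hi >> 14) & MASK26;
  h[4] = (uint32_t) (hi >> 40) | (t4 << 24);  /* the orr with T4W */
}

/* h = (h * r) mod 2^130-5 over 26-bit limbs; s[i] = 5*r[i], s[0] unused. */
static void
mul_redc (uint32_t h[5], const uint32_t r[5], const uint32_t s[5])
{
  uint64_t c[5], t;
  int i;
  c[0] = (uint64_t) h[0]*r[0] + (uint64_t) h[4]*s[1] + (uint64_t) h[3]*s[2]
    + (uint64_t) h[2]*s[3] + (uint64_t) h[1]*s[4];
  c[1] = (uint64_t) h[1]*r[0] + (uint64_t) h[0]*r[1] + (uint64_t) h[4]*s[2]
    + (uint64_t) h[3]*s[3] + (uint64_t) h[2]*s[4];
  c[2] = (uint64_t) h[2]*r[0] + (uint64_t) h[1]*r[1] + (uint64_t) h[0]*r[2]
    + (uint64_t) h[4]*s[3] + (uint64_t) h[3]*s[4];
  c[3] = (uint64_t) h[3]*r[0] + (uint64_t) h[2]*r[1] + (uint64_t) h[1]*r[2]
    + (uint64_t) h[0]*r[3] + (uint64_t) h[4]*s[4];
  c[4] = (uint64_t) h[4]*r[0] + (uint64_t) h[3]*r[1] + (uint64_t) h[2]*r[2]
    + (uint64_t) h[1]*r[3] + (uint64_t) h[0]*r[4];
  /* Same carry chain as the reduction phase; the h4 -> h0 carry is
     multiplied by 5, condensing the asm's add plus shl #2/add pair. */
  t = c[0] >> 26; c[0] &= MASK26; c[1] += t;
  t = c[3] >> 26; c[3] &= MASK26; c[4] += t;
  t = c[1] >> 26; c[1] &= MASK26; c[2] += t;
  t = c[4] >> 26; c[4] &= MASK26; c[0] += t + (t << 2);
  t = c[2] >> 26; c[2] &= MASK26; c[3] += t;
  t = c[0] >> 26; c[0] &= MASK26; c[1] += t;
  t = c[3] >> 26; c[3] &= MASK26; c[4] += t;
  for (i = 0; i < 5; i++)
    h[i] = (uint32_t) c[i];
}

Every product is below 2^26 * 5*2^26 < 2^55, so the five-term sums fit comfortably in 64 bits, which is why the asm can accumulate with umlal without intermediate reductions.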
diff --git a/arm64/fat/poly1305-2core.asm b/arm64/fat/poly1305-2core.asm
new file mode 100644
index 0000000..f548630
--- /dev/null
+++ b/arm64/fat/poly1305-2core.asm
@@ -0,0 +1,35 @@
+C arm64/fat/poly1305-2core.asm
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_poly1305_2core) picked up by configure
+
+include_src(`arm64/asimd/poly1305-2core.asm')
diff --git a/chacha-poly1305.c b/chacha-poly1305.c
index 7a423e1e627ac12cd221468063f8c3e9c5f292e4..521b441a64d1be85afe8bed04be838390e71bd71 100644
@@ -90,14 +90,11 @@ chacha_poly1305_set_nonce (struct chacha_poly1305_ctx *ctx,
   ctx->auth_size = ctx->data_size = ctx->index = 0;
 }
 
-/* FIXME: Duplicated in poly1305-aes128.c */
-#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->poly1305, (data), 1)
-
 static void
 poly1305_update (struct chacha_poly1305_ctx *ctx,
                 size_t length, const uint8_t *data)
 {
-  MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+  ctx->index = _nettle_poly1305_update(&ctx->poly1305, ctx->block, ctx->index, length, data);
 }
 
 static void
diff --git a/configure.ac b/configure.ac
index da72f908ac9e251c01d7923fe487f2f416bd99b5..0b4a358c8aca3ef0384aee92e70ef19076fcff3a 100644
@@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon,
   AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
   [enable_arm_neon=auto])
 
+AC_ARG_ENABLE(arm64-asimd,
+  AC_HELP_STRING([--enable-arm64-asimd], [Enable Arm64 advanced SIMD. (default=no)]),,
+  [enable_arm64_asimd=no])
+
 AC_ARG_ENABLE(arm64-crypto,
   AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. (default=no)]),,
   [enable_arm64_crypto=no])
@@ -511,8 +515,11 @@ if test "x$enable_assembler" = xyes ; then
         if test "x$enable_fat" = xyes ; then
           asm_path="arm64/fat $asm_path"
           OPT_NETTLE_SOURCES="fat-arm64.c $OPT_NETTLE_SOURCES"
-          FAT_TEST_LIST="none aes pmull sha1 sha2"
+          FAT_TEST_LIST="none asimd aes pmull sha1 sha2"
         else
+          if test "$enable_arm64_asimd" = yes ; then
+            asm_path="arm64/asimd $asm_path"
+          fi
           if test "$enable_arm64_crypto" = yes ; then
             asm_path="arm64/crypto $asm_path"
           fi
@@ -597,6 +604,7 @@ asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm cpu-facility.asm
   aes256-encrypt-2.asm aes256-decrypt-2.asm \
   cbc-aes128-encrypt-2.asm cbc-aes192-encrypt-2.asm cbc-aes256-encrypt-2.asm \
   chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
+  poly1305-2core.asm \
   salsa20-2core.asm salsa20-core-internal-2.asm \
   sha1-compress-2.asm sha256-compress-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
@@ -730,6 +738,8 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_fat_chacha_2core
 #undef HAVE_NATIVE_fat_chacha_3core
 #undef HAVE_NATIVE_fat_chacha_4core
+#undef HAVE_NATIVE_poly1305_2core
+#undef HAVE_NATIVE_fat_poly1305_2core
 #undef HAVE_NATIVE_ecc_curve25519_modp
 #undef HAVE_NATIVE_ecc_curve448_modp
 #undef HAVE_NATIVE_ecc_secp192r1_modp
diff --git a/fat-arm64.c b/fat-arm64.c
index fcb2ece815e499855773bf0bd2726e4e37742be8..be3d0b1edf59c8aad5d072053eb63d82bb819058 100644
@@ -53,6 +53,7 @@
 #include "aes.h"
 #include "gcm.h"
 #include "gcm-internal.h"
+#include "poly1305.h"
 #include "fat-setup.h"
 
 /* Defines from arch/arm64/include/uapi/asm/hwcap.h in Linux kernel */
@@ -74,6 +75,7 @@
 
 struct arm64_features
 {
+  int have_asimd;
   int have_aes;
   int have_pmull;
   int have_sha1;
@@ -87,6 +89,7 @@ static void
 get_arm64_features (struct arm64_features *features)
 {
   const char *s;
+  features->have_asimd = 0;
   features->have_aes = 0;
   features->have_pmull = 0;
   features->have_sha1 = 0;
@@ -99,7 +102,9 @@ get_arm64_features (struct arm64_features *features)
        const char *sep = strchr (s, ',');
        size_t length = sep ? (size_t) (sep - s) : strlen(s);
 
-       if (MATCH (s, length, "aes", 3))
+       if (MATCH (s, length, "asimd", 5))
+         features->have_asimd = 1;
+  else if (MATCH (s, length, "aes", 3))
          features->have_aes = 1;
   else if (MATCH (s, length, "pmull", 5))
          features->have_pmull = 1;
@@ -115,6 +120,8 @@ get_arm64_features (struct arm64_features *features)
     {
 #if USE_GETAUXVAL
       unsigned long hwcap = getauxval(AT_HWCAP);
+      features->have_asimd
+       = ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD);
       features->have_aes
        = ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES));
       features->have_pmull
@@ -166,6 +173,22 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
 DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
 DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, arm64)
 
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, asimd);
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(_nettle_poly1305_update, poly1305_update_func)
+DECLARE_FAT_FUNC_VAR(poly1305_update, poly1305_update_func, 1core)
+DECLARE_FAT_FUNC_VAR(poly1305_update, poly1305_update_func, 2core)
+
 static void CONSTRUCTOR
 fat_init (void)
 {
@@ -176,8 +199,9 @@ fat_init (void)
 
   verbose = getenv (ENV_VERBOSE) != NULL;
   if (verbose)
-    fprintf (stderr, "libnettle: cpu features:%s%s%s%s\n",
-            features.have_aes ? " aes instructions" : "",
+    fprintf (stderr, "libnettle: cpu features:%s%s%s%s%s\n",
+            features.have_asimd ? " advanced simd" : "",
+            features.have_aes ? " aes instructions" : "",
             features.have_pmull ? " polynomial multiply long instructions (PMULL/PMULL2)" : "",
        features.have_sha1 ? " sha1 instructions" : "",
        features.have_sha2 ? " sha2 instructions" : "");
@@ -243,6 +267,22 @@ fat_init (void)
     {
       _nettle_sha256_compress_vec = _nettle_sha256_compress_c;
     }
+  if (features.have_asimd)
+    {
+      if (verbose)
+       fprintf (stderr, "libnettle: enabling advanced simd code.\n");
+      _nettle_chacha_core_vec = _nettle_chacha_core_asimd;
+      nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+      nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
+      _nettle_poly1305_update_vec = _nettle_poly1305_update_2core;
+    }
+  else
+    {
+      _nettle_chacha_core_vec = _nettle_chacha_core_c;
+      nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core;
+      nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core;
+      _nettle_poly1305_update_vec = _nettle_poly1305_update_1core;
+    }
 }
 
 DEFINE_FAT_FUNC(nettle_aes128_encrypt, void,
@@ -290,3 +330,29 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
 DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
                (uint32_t *state, const uint8_t *input, const uint32_t *k),
                (state, input, k))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+               (uint32_t *dst, const uint32_t *src, unsigned rounds),
+               (dst, src, rounds))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt, void,
+               (struct chacha_ctx *ctx,
+                size_t length,
+                uint8_t *dst,
+                const uint8_t *src),
+               (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt32, void,
+               (struct chacha_ctx *ctx,
+                size_t length,
+                uint8_t *dst,
+                const uint8_t *src),
+               (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_poly1305_update, unsigned,
+               (struct poly1305_ctx *ctx,
+                uint8_t *block,
+                unsigned pos,
+                size_t length,
+                const uint8_t *data),
+               (ctx, block, pos, length, data))
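
For readers new to Nettle's fat machinery: the DECLARE_FAT_FUNC/DEFINE_FAT_FUNC pairs above resolve to an indirect call through a function pointer that fat_init repoints once at load time. Roughly hand-expanded for the new entry point (a sketch of the effect, not the literal macro output):

/* Sketch only: the _vec pointer is what fat_init assigns to either
   _nettle_poly1305_update_2core or _nettle_poly1305_update_1core. */
unsigned
_nettle_poly1305_update (struct poly1305_ctx *ctx, uint8_t *block,
                         unsigned pos, size_t length, const uint8_t *data)
{
  return _nettle_poly1305_update_vec (ctx, block, pos, length, data);
}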
diff --git a/fat-setup.h b/fat-setup.h
index 64b272440fde181902a80422f85cd809d1ddc425..677824cee476fe4e2e319223b58af85c3e2129ad 100644
@@ -196,6 +196,10 @@ typedef void chacha_crypt_func(struct chacha_ctx *ctx,
                               uint8_t *dst,
                               const uint8_t *src);
 
+struct poly1305_ctx;
+typedef unsigned poly1305_update_func(struct poly1305_ctx *ctx, uint8_t *block, unsigned pos,
+                              size_t length, const uint8_t *data);
+
 struct aes128_ctx;
 typedef void aes128_set_key_func (struct aes128_ctx *ctx, const uint8_t *key);
 typedef void aes128_invert_key_func (struct aes128_ctx *dst, const struct aes128_ctx *src);
diff --git a/poly1305-aes.c b/poly1305-aes.c
index a4050254bb9ff15d4ad553f72bec26231a2bdcc6..935ea6389b393c9d89eaeb63e05c601147647014 100644
@@ -56,13 +56,11 @@ poly1305_aes_set_nonce (struct poly1305_aes_ctx *ctx,
   memcpy (ctx->nonce, nonce, POLY1305_AES_NONCE_SIZE);
 }
 
-#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->pctx, (data), 1)
-
 void
 poly1305_aes_update (struct poly1305_aes_ctx *ctx,
                     size_t length, const uint8_t *data)
 {
-  MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+  ctx->index = _nettle_poly1305_update(&ctx->pctx, ctx->block, ctx->index, length, data);
 }
 
 void
diff --git a/poly1305-internal.h b/poly1305-internal.h
index 9932d5245eee33b097aced314accade9fcc08ac2..b55f19fed402cab082b5a7d4619adddd03161ba3 100644
@@ -53,6 +53,19 @@ void _nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s)
 /* Process one block. */
 void _nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m,
                             unsigned high);
+unsigned _nettle_poly1305_update(struct poly1305_ctx *ctx, uint8_t *block, unsigned pos, size_t length,
+                            const uint8_t *data);
+
+/* Functions available only in some configurations */
+unsigned
+_nettle_poly1305_update_2core(struct poly1305_ctx *ctx,
+                            uint8_t *block, unsigned pos,
+                            size_t length, const uint8_t *data);
+
+unsigned
+_nettle_poly1305_update_1core(struct poly1305_ctx *ctx,
+                            uint8_t *block, unsigned pos,
+                            size_t length, const uint8_t *data);
 
 #ifdef __cplusplus
 }
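
These declarations share one buffering contract: block is a caller-owned POLY1305_BLOCK_SIZE buffer holding pos leftover bytes, whole blocks are consumed directly from data, any tail is copied back into block, and the new leftover count (always less than POLY1305_BLOCK_SIZE) is returned. A minimal caller sketch, modeled on the poly1305-aes.c change above (the context name is illustrative):

#include <stddef.h>
#include <stdint.h>
#include "poly1305.h"
#include "poly1305-internal.h"

struct my_mac_ctx      /* illustrative; poly1305_aes_ctx carries the same trio */
{
  struct poly1305_ctx pctx;
  uint8_t block[POLY1305_BLOCK_SIZE];
  unsigned index;      /* bytes currently buffered, < POLY1305_BLOCK_SIZE */
};

static void
my_mac_update (struct my_mac_ctx *ctx, size_t length, const uint8_t *data)
{
  ctx->index = _nettle_poly1305_update (&ctx->pctx, ctx->block,
                                        ctx->index, length, data);
}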
diff --git a/poly1305-update.c b/poly1305-update.c
new file mode 100644
index 0000000..180aa16
--- /dev/null
+++ b/poly1305-update.c
@@ -0,0 +1,114 @@
+/* poly1305-update.c
+
+   Copyright (C) 2021 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <string.h>
+
+#include "poly1305.h"
+#include "poly1305-internal.h"
+
+#if HAVE_NATIVE_poly1305_2core
+#define _nettle_poly1305_update_2core _nettle_poly1305_update
+#elif !HAVE_NATIVE_fat_poly1305_2core
+#define _nettle_poly1305_update_1core _nettle_poly1305_update
+#endif
+
+#if HAVE_NATIVE_poly1305_2core || HAVE_NATIVE_fat_poly1305_2core
+void _nettle_poly1305_2core(struct poly1305_ctx *ctx, const uint8_t *m, size_t len, unsigned t4);
+unsigned
+_nettle_poly1305_update_2core(struct poly1305_ctx *ctx,
+                          uint8_t *block, unsigned pos,
+                          size_t length, const uint8_t *data)
+{
+  if (pos)
+  {
+    if (pos + length < POLY1305_BLOCK_SIZE)
+    {
+      memcpy (block + pos, data, length);
+      return pos + length;
+    }
+    else
+    {
+      unsigned left = POLY1305_BLOCK_SIZE - pos;
+      memcpy (block + pos, data, left);
+      data += left;
+      length -= left;
+      _nettle_poly1305_block(ctx, block, 1);
+    }
+  }
+  if (length >= 2*POLY1305_BLOCK_SIZE)
+  {
+    size_t rlen = length & -(2*POLY1305_BLOCK_SIZE);
+    _nettle_poly1305_2core(ctx, data, rlen, 1);
+    data += rlen;
+    length -= rlen;
+  }
+  if (length >= POLY1305_BLOCK_SIZE)
+  {
+    _nettle_poly1305_block(ctx, data, 1);
+    data += POLY1305_BLOCK_SIZE;
+    length -= POLY1305_BLOCK_SIZE;
+  }
+  memcpy (block, data, length);
+  return length;
+}
+#endif
+#if !HAVE_NATIVE_poly1305_2core
+unsigned
+_nettle_poly1305_update_1core(struct poly1305_ctx *ctx,
+                          uint8_t *block, unsigned pos,
+                          size_t length, const uint8_t *data)
+{
+  if (pos)
+  {
+    if (pos + length < POLY1305_BLOCK_SIZE)
+    {
+      memcpy (block + pos, data, length);
+      return pos + length;
+    }
+    else
+    {
+      unsigned left = POLY1305_BLOCK_SIZE - pos;
+      memcpy (block + pos, data, left);
+      data += left;
+      length -= left;
+      _nettle_poly1305_block(ctx, block, 1);
+    }
+  }
+  for (; length >= POLY1305_BLOCK_SIZE; length -= POLY1305_BLOCK_SIZE, data += POLY1305_BLOCK_SIZE)
+    _nettle_poly1305_block(ctx, data, 1);
+  memcpy (block, data, length);
+  return length;
+}
+#endif
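
Finally, the reason the last pair of blocks is multiplied by (r^2, r) while every earlier pair uses (r^2, r^2) is plain polynomial bookkeeping: the two lanes hold the odd- and even-indexed blocks, and the exponents must line up with the Horner evaluation h = (h + m_i)*r that the one-block path computes. A toy check over a small prime standing in for 2^130-5 (all names and constants illustrative):

#include <assert.h>
#include <stdint.h>

#define P 1000003ULL   /* toy stand-in for 2^130-5 */

static uint64_t
mulm (uint64_t a, uint64_t b)
{
  return (a * b) % P;
}

int
main (void)
{
  uint64_t m[4] = { 11, 22, 33, 44 };
  uint64_t r = 12345, r2 = mulm (r, r);

  /* Sequential Horner evaluation, as the one-block path computes it. */
  uint64_t h = 0;
  for (int i = 0; i < 4; i++)
    h = mulm ((h + m[i]) % P, r);

  /* Two-lane schedule: both lanes scaled by r^2 per iteration, final
     iteration scaled by r^2 (lane 0) and r (lane 1), then summed. */
  uint64_t l0 = mulm ((mulm (m[0], r2) + m[2]) % P, r2);  /* m1*r^4 + m3*r^2 */
  uint64_t l1 = mulm ((mulm (m[1], r2) + m[3]) % P, r);   /* m2*r^3 + m4*r   */

  assert ((l0 + l1) % P == h);
  return 0;
}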