From: Niels Möller <nisse@lysator.liu.se>
Date: Mon, 13 Jul 2020 19:43:37 +0000 (+0200)
Subject: Three-way interleaving of chacha on Neon
X-Git-Tag: nettle_3.7rc1~89
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7a9d3f59ae3cf0690135d951580516cdafc4db5d;p=thirdparty%2Fnettle.git

Three-way interleaving of chacha on Neon
---

diff --git a/ChangeLog b/ChangeLog
index 40104add..8b496f1a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2020-07-13  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm/neon/chacha-3core.asm: New file, 3-way interleaving of
+	chacha.
+
 2020-07-11  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/chacha-test.c (test_main): Delete obsolete tests for
diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
new file mode 100644
index 00000000..b73df2f1
--- /dev/null
+++ b/arm/neon/chacha-3core.asm
@@ -0,0 +1,242 @@
+C arm/neon/chacha-3core.asm
+
+ifelse(<
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+	.file "chacha-3core.asm"
+	.fpu	neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+C State, X, Y and Z representing consecutive blocks
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<Y0>, <q8>)
+define(<Y1>, <q9>)
+define(<Y2>, <q10>)
+define(<Y3>, <q11>)
+define(<Z0>, <q12>)
+define(<Z1>, <q13>)
+define(<Z2>, <q14>)
+define(<Z3>, <q15>)
+
+define(<T0>, <q4>)
+define(<T1>, <q5>)
+define(<T2>, <q6>)
+define(<T3>, <q7>)
+
+	.text
+	.align 4
+.Lcount1:
+	.int 1,0,0,0
+
+	C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_3core)
+	vldm	SRC, {X0,X1,X2,X3}
+	vpush	{q4,q5,q6,q7}
+	adr	r12, .Lcount1
+	vld1.64	{Z3}, [r12]
+
+	vadd.i64	Y3, X3, Z3	C Increment 64-bit counter
+	vadd.i64	Z3, Y3, Z3
+
+.Lshared_entry:
+	vmov	Y0, X0
+	vmov	Z0, X0
+	vmov	Y1, X1
+	vmov	Z1, X1
+	vmov	Y2, X2
+	vmov	Z2, X2
+	vpush	{Z3}
+	vpush	{Y3}
+
+.Loop:
+	C Interleave three blocks. Note that with this scheduling,
+	C only two temporaries, T0 and T1, are needed.
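+	C
+	C Each half of the double round below is the standard ChaCha
+	C quarter round,
+	C	a += b; d ^= a; d <<<= 16;
+	C	c += d; b ^= c; b <<<= 12;
+	C	a += b; d ^= a; d <<<= 8;
+	C	c += d; b ^= c; b <<<= 7;
+	C applied to all four columns of all three blocks. Rows 1-3 are
+	C rotated by the vext instructions between the two halves, the
+	C 16-bit rotate is a single vrev32.16, and the remaining rotates
+	C are vshl/vsri pairs.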
+	vadd.i32	X0, X0, X1
+	veor	X3, X3, X0
+	vrev32.16	X3, X3	C lrot 16
+	vadd.i32	Y0, Y0, Y1
+	vadd.i32	X2, X2, X3
+	veor	Y3, Y3, Y0
+	veor	T0, X1, X2
+	vrev32.16	Y3, Y3	C lrot 16
+	vadd.i32	Z0, Z0, Z1
+	vshl.i32	X1, T0, #12
+	vadd.i32	Y2, Y2, Y3
+	veor	Z3, Z3, Z0
+	vsri.u32	X1, T0, #20
+	veor	T0, Y1, Y2
+	vrev32.16	Z3, Z3	C lrot 16
+	vadd.i32	X0, X0, X1
+	vshl.i32	Y1, T0, #12
+	vadd.i32	Z2, Z2, Z3
+	veor	T1, X3, X0
+	vsri.u32	Y1, T0, #20
+	veor	T0, Z1, Z2
+	vshl.i32	X3, T1, #8
+	vsri.u32	X3, T1, #24
+	vadd.i32	Y0, Y0, Y1
+	vshl.i32	Z1, T0, #12
+	vadd.i32	X2, X2, X3
+	veor	T1, Y3, Y0
+	vsri.u32	Z1, T0, #20
+	veor	T0, X1, X2
+	vshl.i32	Y3, T1, #8
+	vsri.u32	Y3, T1, #24
+	vadd.i32	Z0, Z0, Z1
+	vshl.i32	X1, T0, #7
+	vadd.i32	Y2, Y2, Y3
+	veor	T1, Z3, Z0
+	vsri.u32	X1, T0, #25
+	veor	T0, Y1, Y2
+	vshl.i32	Z3, T1, #8
+	vsri.u32	Z3, T1, #24
+	vshl.i32	Y1, T0, #7
+	vadd.i32	Z2, Z2, Z3
+	vsri.u32	Y1, T0, #25
+	veor	T0, Z1, Z2
+	vshl.i32	Z1, T0, #7
+	vsri.u32	Z1, T0, #25
+
+	vext.32	X1, X1, X1, #1
+	vext.32	X2, X2, X2, #2
+	vext.32	X3, X3, X3, #3
+
+	vext.32	Y1, Y1, Y1, #1
+	vext.32	Y2, Y2, Y2, #2
+	vext.32	Y3, Y3, Y3, #3
+
+	vext.32	Z1, Z1, Z1, #1
+	vext.32	Z2, Z2, Z2, #2
+	vext.32	Z3, Z3, Z3, #3
+
+	vadd.i32	X0, X0, X1
+	veor	X3, X3, X0
+	vrev32.16	X3, X3	C lrot 16
+	vadd.i32	Y0, Y0, Y1
+	vadd.i32	X2, X2, X3
+	veor	Y3, Y3, Y0
+	veor	T0, X1, X2
+	vrev32.16	Y3, Y3	C lrot 16
+	vadd.i32	Z0, Z0, Z1
+	vshl.i32	X1, T0, #12
+	vadd.i32	Y2, Y2, Y3
+	veor	Z3, Z3, Z0
+	vsri.u32	X1, T0, #20
+	veor	T0, Y1, Y2
+	vrev32.16	Z3, Z3	C lrot 16
+	vadd.i32	X0, X0, X1
+	vshl.i32	Y1, T0, #12
+	vadd.i32	Z2, Z2, Z3
+	veor	T1, X3, X0
+	vsri.u32	Y1, T0, #20
+	veor	T0, Z1, Z2
+	vshl.i32	X3, T1, #8
+	vsri.u32	X3, T1, #24
+	vadd.i32	Y0, Y0, Y1
+	vshl.i32	Z1, T0, #12
+	vadd.i32	X2, X2, X3
+	veor	T1, Y3, Y0
+	vsri.u32	Z1, T0, #20
+	veor	T0, X1, X2
+	vshl.i32	Y3, T1, #8
+	vsri.u32	Y3, T1, #24
+	vadd.i32	Z0, Z0, Z1
+	vshl.i32	X1, T0, #7
+	vadd.i32	Y2, Y2, Y3
+	veor	T1, Z3, Z0
+	vsri.u32	X1, T0, #25
+	veor	T0, Y1, Y2
+	vshl.i32	Z3, T1, #8
+	vsri.u32	Z3, T1, #24
+	vshl.i32	Y1, T0, #7
+	vadd.i32	Z2, Z2, Z3
+	vsri.u32	Y1, T0, #25
+	veor	T0, Z1, Z2
+	vshl.i32	Z1, T0, #7
+	vsri.u32	Z1, T0, #25
+
+	subs	ROUNDS, ROUNDS, #2
+
+	vext.32	X1, X1, X1, #3
+	vext.32	X2, X2, X2, #2
+	vext.32	X3, X3, X3, #1
+
+	vext.32	Y1, Y1, Y1, #3
+	vext.32	Y2, Y2, Y2, #2
+	vext.32	Y3, Y3, Y3, #1
+
+	vext.32	Z1, Z1, Z1, #3
+	vext.32	Z2, Z2, Z2, #2
+	vext.32	Z3, Z3, Z3, #1
+
+	bhi	.Loop
+
+	vldm	SRC, {T0,T1,T2,T3}
+	vadd.i32	X0, X0, T0
+	vadd.i32	Y0, Y0, T0
+	vadd.i32	Z0, Z0, T0
+	vadd.i32	X1, X1, T1
+	vadd.i32	Y1, Y1, T1
+	vadd.i32	Z1, Z1, T1
+	vadd.i32	X2, X2, T2
+	vadd.i32	Y2, Y2, T2
+	vadd.i32	Z2, Z2, T2
+
+	vpop	{T0, T1}	C updated counters
+	vadd.i32	X3, X3, T3
+	vadd.i32	Y3, Y3, T0
+	vadd.i32	Z3, Z3, T1
+
+	vpop	{q4,q5,q6,q7}
+
+	vstmia	DST!, {X0,X1,X2,X3}
+	vstmia	DST!, {Y0,Y1,Y2,Y3}
+	vstm	DST, {Z0,Z1,Z2,Z3}
+	bx	lr
+EPILOGUE(_nettle_chacha_3core)
+
+PROLOGUE(_nettle_chacha_3core32)
+	vldm	SRC, {X0,X1,X2,X3}
+	vpush	{q4,q5,q6,q7}
+	adr	r12, .Lcount1
+	vld1.64	{Z3}, [r12]
+
+	vadd.i32	Y3, X3, Z3	C Increment 32-bit counter
+	vadd.i32	Z3, Y3, Z3
+	b	.Lshared_entry
+EPILOGUE(_nettle_chacha_3core32)
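[Editorial note, not part of the patch: the contract assumed of
_chacha_3core below is that of three consecutive _chacha_core calls
with the block counter stepped in between. A minimal portable sketch;
chacha_3core_ref is a hypothetical name used only for illustration,
while _chacha_core and _CHACHA_STATE_LENGTH come from the existing
nettle headers.

  #include <string.h>

  #include "chacha.h"		/* _CHACHA_STATE_LENGTH */
  #include "chacha-internal.h"	/* _chacha_core */

  /* Writes three consecutive 64-byte keystream blocks to dst.  */
  static void
  chacha_3core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    uint32_t s[_CHACHA_STATE_LENGTH];
    unsigned i;

    memcpy (s, src, sizeof (s));
    for (i = 0; i < 3; i++)
      {
        _chacha_core (dst + i*_CHACHA_STATE_LENGTH, s, rounds);
        /* 64-bit counter in words 12-13; the _chacha_3core32 variant
           instead lets word 12 wrap mod 2^32.  */
        s[13] += (++s[12] == 0);
      }
  }

This is why chacha_crypt below can advance the saved counter with
ctx->state[12] += 3, carrying into ctx->state[13] when the addition
wraps, which is detected by ctx->state[12] < 3.]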
diff --git a/chacha-crypt.c b/chacha-crypt.c
index 1797bd02..59d808d1 100644
--- a/chacha-crypt.c
+++ b/chacha-crypt.c
@@ -54,6 +54,45 @@
 
 #define CHACHA_ROUNDS 20
 
+#if HAVE_NATIVE_chacha_3core
+void
+chacha_crypt(struct chacha_ctx *ctx,
+	     size_t length,
+	     uint8_t *dst,
+	     const uint8_t *src)
+{
+  uint32_t x[3*_CHACHA_STATE_LENGTH];
+
+  if (!length)
+    return;
+
+  while (length > 2*CHACHA_BLOCK_SIZE)
+    {
+      _chacha_3core (x, ctx->state, CHACHA_ROUNDS);
+      ctx->state[12] += 3;
+      ctx->state[13] += (ctx->state[12] < 3);
+      if (length <= 3*CHACHA_BLOCK_SIZE)
+	{
+	  memxor3 (dst, src, x, length);
+	  return;
+	}
+      memxor3 (dst, src, x, 3*CHACHA_BLOCK_SIZE);
+
+      length -= 3*CHACHA_BLOCK_SIZE;
+      dst += 3*CHACHA_BLOCK_SIZE;
+      src += 3*CHACHA_BLOCK_SIZE;
+    }
+  _chacha_core (x, ctx->state, CHACHA_ROUNDS);
+  ctx->state[13] += (++ctx->state[12] == 0);
+
+  if (length > CHACHA_BLOCK_SIZE)
+    {
+      _chacha_core (x + _CHACHA_STATE_LENGTH, ctx->state, CHACHA_ROUNDS);
+      ctx->state[13] += (++ctx->state[12] == 0);
+    }
+  memxor3 (dst, src, x, length);
+}
+#else
 void
 chacha_crypt(struct chacha_ctx *ctx,
 	     size_t length,
@@ -85,6 +124,7 @@ chacha_crypt(struct chacha_ctx *ctx,
       m += CHACHA_BLOCK_SIZE;
     }
 }
+#endif
 
 void
 chacha_crypt32(struct chacha_ctx *ctx,
diff --git a/chacha-internal.h b/chacha-internal.h
index 1bca8e74..cc90b132 100644
--- a/chacha-internal.h
+++ b/chacha-internal.h
@@ -39,8 +39,13 @@
 #include "nettle-types.h"
 
 #define _chacha_core _nettle_chacha_core
+#define _chacha_3core _nettle_chacha_3core
 
 void
 _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds);
 
+/* Functions available only in some configurations */
+void
+_chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
 #endif /* NETTLE_CHACHA_INTERNAL_H_INCLUDED */
diff --git a/configure.ac b/configure.ac
index a01eb7d3..3136c1a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -455,7 +455,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 # Assembler files which generate additional object files if they are used.
 asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
-  chacha-core-internal-2.asm salsa20-2core.asm \
+  chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
   umac-nh-n-2.asm umac-nh-2.asm"
@@ -559,6 +559,7 @@ AH_VERBATIM([HAVE_NATIVE],
 [/* Define to 1 each of the following for which a native (ie. CPU specific)
    implementation of the corresponding routine exists.  */
 #undef HAVE_NATIVE_chacha_core
+#undef HAVE_NATIVE_chacha_3core
 #undef HAVE_NATIVE_ecc_curve25519_modp
 #undef HAVE_NATIVE_ecc_curve448_modp
 #undef HAVE_NATIVE_ecc_secp192r1_modp
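[Usage note, also not part of the patch: any chacha_crypt call of more
than two blocks now reaches _chacha_3core on builds where configure
selects arm/neon/chacha-3core.asm. A minimal sketch against the public
nettle API; the all-zero key and nonce are chosen arbitrarily, and the
program links with -lnettle.

  #include <stdio.h>
  #include <nettle/chacha.h>

  int
  main (void)
  {
    struct chacha_ctx ctx;
    uint8_t key[CHACHA_KEY_SIZE] = { 0 };
    uint8_t nonce[CHACHA_NONCE_SIZE] = { 0 };
    /* Three blocks, so length is above the 2*CHACHA_BLOCK_SIZE
       threshold and the 3-way path is taken once.  */
    uint8_t msg[3*CHACHA_BLOCK_SIZE] = { 0 };
    uint8_t out[sizeof (msg)];

    chacha_set_key (&ctx, key);
    chacha_set_nonce (&ctx, nonce);
    chacha_crypt (&ctx, sizeof (msg), out, msg);

    /* Encrypting zeros exposes the raw keystream.  */
    printf ("first keystream byte: %02x\n", out[0]);
    return 0;
  }

Output must match the generic C path bit for bit, which is what the
existing testsuite/chacha-test.c checks across block-count boundaries.]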