From c5e153420709f19add41808289e2093780105161 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Niels=20M=C3=B6ller?= Date: Tue, 26 Mar 2013 14:02:34 +0100 Subject: [PATCH] ARM assembly for salsa20. --- ChangeLog | 4 + armv7/salsa20-core-internal.asm | 181 ++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 armv7/salsa20-core-internal.asm diff --git a/ChangeLog b/ChangeLog index 23dfa22d..d4375996 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2013-03-26 Niels Möller + + * armv7/salsa20-core-internal.asm: New file. 45% speedup. + 2013-03-25 Niels Möller From Martin Storsjö: diff --git a/armv7/salsa20-core-internal.asm b/armv7/salsa20-core-internal.asm new file mode 100644 index 00000000..fe26e5c5 --- /dev/null +++ b/armv7/salsa20-core-internal.asm @@ -0,0 +1,181 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. 
+
+	.file "salsa20-core-internal.asm"
+	.fpu	neon
+
+define(<DST>, <r0>)	C Output pointer; 16 words are stored here
+define(<SRC>, <r1>)	C Input pointer; 16 words are read from here
+define(<ROUNDS>, <r2>)	C Round count, decremented by 2 per loop pass
+
+define(<X0>, <q0>)	C X0-X3: the four 4-word rows of the state
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<T0>, <q8>)	C T0, T1: temporaries
+define(<T1>, <q9>)
+define(<M0101>, <q10>)	C M*: word-select masks loaded from .Lmasks
+define(<M0110>, <q11>)
+define(<M0011>, <q12>)
+define(<S1>, <q13>)	C S1-S3: copies of input rows 1-3 for the final add
+define(<S2>, <q14>)
+define(<S3>, <q15>)
+C QROUND(a, b, c, d): b ^= (a+d) <<< 7, c ^= (a+b) <<< 9, d ^= (b+c) <<< 13, a ^= (c+d) <<< 18. Uses T0, T1.
+define(<QROUND>, <
+	vadd.i32	T0, $1, $4
+	vshl.i32	T1, T0, #7
+	vshr.u32	T0, T0, #25
+	veor		$2, $2, T0
+	veor		$2, $2, T1
+
+	vadd.i32	T0, $1, $2
+	vshl.i32	T1, T0, #9
+	vshr.u32	T0, T0, #23
+	veor		$3, $3, T0
+	veor		$3, $3, T1
+
+	vadd.i32	T0, $2, $3
+	vshl.i32	T1, T0, #13
+	vshr.u32	T0, T0, #19
+	veor		$4, $4, T0
+	veor		$4, $4, T1
+
+	vadd.i32	T0, $3, $4
+	vshl.i32	T1, T0, #18
+	vshr.u32	T0, T0, #14
+	veor		$1, $1, T0
+	veor		$1, $1, T1
+>)
+
+	.text
+	.align 4
+.Lmasks:
+	.int	0,-1, 0,-1	C M0101: selects words 1 and 3 of a row
+	.int	0,-1,-1, 0	C M0110: selects words 1 and 2 of a row
+	.int	0, 0,-1,-1	C M0011: selects words 2 and 3 of a row
+
+	C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+	C In: r0 = dst, r1 = src, r2 = rounds. Writes r2, r12, q0-q3, q8-q15 and the flags.
+PROLOGUE(_nettle_salsa20_core)
+	vldm	SRC, {X0,X1,X2,X3}
+	C NOTE(review): the word layouts below presumably assume little-endian loads - confirm.
+	C Input rows:
+	C	 0  1  2  3	X0
+	C	 4  5  6  7	X1
+	C	 8  9 10 11	X2
+	C	12 13 14 15	X3
+	C Permuted to:
+	C	 0  5 10 15
+	C	 4  9 14  3
+	C	 8 13  2  7
+	C	12  1  6 11
+
+	C FIXME: Construct in some other way?
+	adr	r12, .Lmasks
+	vldm	r12, {M0101, M0110, M0011}
+
+	vmov	S1, X1		C Keep the unpermuted rows 1-3 for the final add
+	vmov	S2, X2
+	vmov	S3, X3
+
+	C Swaps in columns 1, 3:
+	C	 0  5  2  7	X0 ^
+	C	 4  1  6  3	T0 v
+	C	 8 13 10 15	T1  ^
+	C	12  9 14 11	X3  v
+	vmov	T0, X1
+	vmov	T1, X2
+	vbit	T0, X0, M0101
+	vbit	X0, X1, M0101
+	vbit	T1, X3, M0101
+	vbit	X3, X2, M0101
+
+	C Swaps in columns 1, 2:
+	C	 0  5  2  7	X0
+	C	 4  9 14  3	X1 ^
+	C	 8 13 10 15	T1 |
+	C	12  1  6 11	X3 v
+	vmov	X1, T0
+	vbit	X1, X3, M0110
+	vbit	X3, T0, M0110
+
+	C Swaps in columns 2, 3:
+	C	 0  5 10 15	X0 ^
+	C	 4  9 14  3	X1 |
+	C	 8 13  2  7	X2 v
+	C	12  1  6 11	X3
+	vmov	X2, T1
+	vbit	X2, X0, M0011
+	vbit	X0, T1, M0011
+
+.Loop:
+	QROUND(X0, X1, X2, X3)
+
+	C Rotate rows, to get
+	C	 0  5 10 15
+	C	 3  4  9 14  >>> 1
+	C	 2  7  8 13  >>> 2
+	C	 1  6 11 12  >>> 3
+	vext.32	X1, X1, X1, #3
+	vext.32	X2, X2, X2, #2
+	vext.32	X3, X3, X3, #1
+
+	QROUND(X0, X3, X2, X1)
+
+	subs	ROUNDS, ROUNDS, #2	C Two QROUNDs done per pass
+	C Inverse rotation (the vext instructions leave the flags from subs intact)
+	vext.32	X1, X1, X1, #1
+	vext.32	X2, X2, X2, #2
+	vext.32	X3, X3, X3, #3
+
+	bhi	.Loop			C Unsigned test: repeat while ROUNDS was > 2 before the subs
+
+	C Inverse swaps
+	vmov	T1, X2
+	vbit	T1, X0, M0011
+	vbit	X0, X2, M0011
+
+	vmov	T0, X1
+	vbit	T0, X3, M0110
+	vbit	X3, X1, M0110
+
+	vmov	X1, T0
+	vmov	X2, T1
+	vbit	X1, X0, M0101
+	vbit	X0, T0, M0101
+	vbit	X2, X3, M0101
+	vbit	X3, T1, M0101
+
+	vld1.64	{T0}, [SRC]		C Input row 0 was not kept in a register; reload it
+	vadd.u32	X0, X0, T0
+	vadd.u32	X1, X1, S1
+	vadd.u32	X2, X2, S2
+	vadd.u32	X3, X3, S3
+
+	vstm	DST, {X0,X1,X2,X3}
+	bx	lr
+EPILOGUE(_nettle_salsa20_core)
+
+divert(-1)
+define salsastate
+p/x $q0.u32
+p/x $q1.u32
+p/x $q2.u32
+p/x $q3.u32
+end
-- 
2.47.2