--- /dev/null
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2012 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
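+C Register usage: CTX, LENGTH, DST and SRC hold the four function
+C arguments (ELF x86_64 calling convention; W64_ENTRY/W64_EXIT below
+C adapt this for the Windows ABI). X0-X3 hold the working state, T0,
+C T1, T64, POS and COUNT are temporaries, and M0101, M0110 and M0011
+C are the word-swap masks set up at the start of the function.
+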
+define(<CTX>, <%rdi>)
+define(<LENGTH>, <%rsi>)
+define(<DST>, <%rdx>)
+define(<SRC>, <%rcx>)
+define(<T64>, <%r8>)
+define(<POS>, <%r9>)
+define(<X0>, <%xmm0>)
+define(<X1>, <%xmm1>)
+define(<X2>, <%xmm2>)
+define(<X3>, <%xmm3>)
+define(<T0>, <%xmm4>)
+define(<T1>, <%xmm5>)
+define(<M0101>, <%xmm6>)
+define(<M0110>, <%xmm7>)
+define(<M0011>, <%xmm8>)
+define(<COUNT>, <%rax>)
+
+C Possible improvements:
+C
+C Do two blocks (or more) at a time in parallel, to avoid limitations
+C due to data dependencies.
+C
+C Avoid redoing the permutation of the input for each block (all but
+C the two counter words are constant). Could also keep the input in
+C registers.
+
+C QROUND(x0, x1, x2, x3)
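+C Does four salsa20 quarter rounds in parallel, one in each 32-bit
+C lane of the xmm arguments, using T0 and T1 as scratch:
+C
+C   x1 ^= (x0 + x3) <<< 7
+C   x2 ^= (x1 + x0) <<< 9
+C   x3 ^= (x2 + x1) <<< 13
+C   x0 ^= (x3 + x2) <<< 18
+C
+C Each rotate is done as a left shift and a right shift of a copy,
+C combined with xor.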
+define(<QROUND>, <
+ movaps $4, T0 C 0
+ paddd $1, T0 C 1
+ movaps T0, T1 C 2
+ pslld <$>7, T0 C 2
+ psrld <$>25, T1 C 3
+ pxor T0, $2 C 3
+ pxor T1, $2 C 4
+
+ movaps $1, T0 C 0
+ paddd $2, T0 C 5
+ movaps T0, T1 C 6
+ pslld <$>9, T0 C 6
+ psrld <$>23, T1 C 7
+ pxor T0, $3 C 7
+ pxor T1, $3 C 8
+
+ movaps $2, T0 C 0
+ paddd $3, T0 C 9
+ movaps T0, T1 C 10
+ pslld <$>13, T0 C 10
+ psrld <$>19, T1 C 11
+ pxor T0, $4 C 11
+ pxor T1, $4 C 12
+
+ movaps $3, T0 C 0
+ paddd $4, T0 C 13
+ movaps T0, T1 C 14
+ pslld <$>18, T0 C 14
+ psrld <$>14, T1 C 15
+ pxor T0, $1 C 15
+ pxor T1, $1 C 16
+>)
+
+C SWAP(x0, x1, mask)
+C Swaps the bits of x0 and x1 that are selected by the mask
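+C This is a masked xor-swap: with t the original x0, it computes
+C x0 = (x0 ^ x1) & mask, x1 ^= x0, x0 ^= t, so that the selected bits
+C are exchanged and all other bits are unchanged. Uses T0 as scratch.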
+define(<SWAP>, <
+ movaps $1, T0
+ pxor $2, $1
+ pand $3, $1
+ pxor $1, $2
+ pxor T0, $1
+>)
+
+ .file "salsa20.asm"
+
+ C salsa20_crypt(struct salsa20_ctx *ctx, unsigned length,
+ C uint8_t *dst, const uint8_t *src)
+ .text
+ ALIGN(4)
+PROLOGUE(nettle_salsa20_crypt)
+ W64_ENTRY(4, 9)
+
+ test LENGTH, LENGTH
+ jz .Lend
+
+ C Load mask registers
+ mov $-1, XREG(COUNT)
+ movd XREG(COUNT), M0101
+	pshufd	$0x09, M0101, M0011	C 0 0 1 1 (all-ones words marked 1, low word left)
+	pshufd	$0x41, M0101, M0110	C 0 1 1 0
+	pshufd	$0x22, M0101, M0101	C 0 1 0 1
+
+.Lblock_loop:
+ movups (CTX), X0
+ movups 16(CTX), X1
+ movups 32(CTX), X2
+ movups 48(CTX), X3
+
+ C On input, each xmm register is one row. We start with
+ C
+ C 0 1 2 3
+ C 4 5 6 7
+ C 8 9 10 11
+ C 12 13 14 15
+ C
+ C Diagrams are in little-endian order, with least significant word to
+ C the left. We rotate the columns, to get instead
+ C
+ C 0 5 10 15
+ C 4 9 14 3
+ C 8 13 2 7
+ C 12 1 6 11
+ C
+ C The original rows are now diagonals.
+ SWAP(X0, X1, M0101)
+ SWAP(X2, X3, M0101)
+ SWAP(X1, X3, M0110)
+ SWAP(X0, X2, M0011)
+
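+	C Each iteration of the loop below does one column round followed
+	C by one row round; ten iterations give the 20 rounds of salsa20.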
+ movl $10, XREG(COUNT)
+ ALIGN(4)
+.Loop:
+ QROUND(X0, X1, X2, X3)
+ C For the row operations, we first rotate the rows, to get
+ C
+ C 0 5 10 15
+ C 3 4 9 14
+ C 2 7 8 13
+ C 1 6 11 12
+ C
+	C Now the original rows are turned into columns. (This is the
+	C SIMD hack described in djb's papers.)
+
+ pshufd $0x93, X1, X1 C 11 00 01 10 (least sign. left)
+ pshufd $0x4e, X2, X2 C 10 11 00 01
+ pshufd $0x39, X3, X3 C 01 10 11 00
+
+ QROUND(X0, X3, X2, X1)
+
+ C Inverse rotation of the rows
+ pshufd $0x39, X1, X1 C 01 10 11 00
+ pshufd $0x4e, X2, X2 C 10 11 00 01
+ pshufd $0x93, X3, X3 C 11 00 01 10
+
+ decl XREG(COUNT)
+ jnz .Loop
+
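+	C Undo the initial permutation, back to row order, then add the
+	C original input words to get the keystream block.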
+ SWAP(X0, X2, M0011)
+ SWAP(X1, X3, M0110)
+ SWAP(X0, X1, M0101)
+ SWAP(X2, X3, M0101)
+
+ movups (CTX), T0
+ movups 16(CTX), T1
+ paddd T0, X0
+ paddd T1, X1
+ movups 32(CTX), T0
+ movups 48(CTX), T1
+ paddd T0, X2
+ paddd T1, X3
+
+ C Increment block counter
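+	C (the 64-bit value in state words 8 and 9, at byte offset 32)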
+ incq 32(CTX)
+
+ cmp $64, LENGTH
+ jc .Lfinal_xor
+
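+	C A full 64-byte block remains. Xor the four 16-byte pieces with
+	C the keystream, highest piece first, so that the .Lxor3, .Lxor2
+	C and .Lxor1 entry points can be reused for the final block.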
+ movups 48(SRC), T1
+ pxor T1, X3
+ movups X3, 48(DST)
+.Lxor3:
+ movups 32(SRC), T0
+ pxor T0, X2
+ movups X2, 32(DST)
+.Lxor2:
+ movups 16(SRC), T1
+ pxor T1, X1
+ movups X1, 16(DST)
+.Lxor1:
+ movups (SRC), T0
+ pxor T0, X0
+ movups X0, (DST)
+
+ lea 64(SRC), SRC
+ lea 64(DST), DST
+ sub $64, LENGTH
+ ja .Lblock_loop
+.Lend:
+ W64_EXIT(4, 9)
+ ret
+
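+C Final block, with 0 < LENGTH < 64. Jump into the .LxorN chain above
+C for any complete 16-byte pieces, and hand the keystream for a
+C trailing partial piece to .Lpartial in T0.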
+.Lfinal_xor:
+ cmp $32, LENGTH
+ jz .Lxor2
+ jc .Llt32
+ cmp $48, LENGTH
+ jz .Lxor3
+ jc .Llt48
+ movaps X3, T0
+ call .Lpartial
+ jmp .Lxor3
+.Llt48:
+ movaps X2, T0
+ call .Lpartial
+ jmp .Lxor2
+.Llt32:
+ cmp $16, LENGTH
+ jz .Lxor1
+ jc .Llt16
+ movaps X1, T0
+ call .Lpartial
+ jmp .Lxor1
+.Llt16:
+ movaps X0, T0
+ call .Lpartial
+ jmp .Lend
+
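+C Xor the trailing LENGTH mod 16 bytes of SRC with the keystream piece
+C in T0 and store to DST. POS is set to LENGTH & -16, the offset of
+C the partial piece, which is then processed 8, 4, 2 and finally 1
+C byte at a time. Clobbers T0, T64, POS and COUNT.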
+.Lpartial:
+ mov LENGTH, POS
+ and $-16, POS
+ test $8, LENGTH
+ jz .Llt8
+ movq T0, T64
+ xor (SRC, POS), T64
+ mov T64, (DST, POS)
+ lea 8(POS), POS
+ pshufd $0xee, T0, T0 C 10 11 10 11
+.Llt8:
+ movq T0, T64
+ test $4, LENGTH
+ jz .Llt4
+ mov XREG(T64), XREG(COUNT)
+ xor (SRC, POS), XREG(COUNT)
+ mov XREG(COUNT), (DST, POS)
+ lea 4(POS), POS
+ shr $32, T64
+.Llt4:
+ test $2, LENGTH
+ jz .Llt2
+ mov WREG(T64), WREG(COUNT)
+ xor (SRC, POS), WREG(COUNT)
+ mov WREG(COUNT), (DST, POS)
+ lea 2(POS), POS
+ shr $16, XREG(T64)
+.Llt2:
+ test $1, LENGTH
+ jz .Lpartial_done
+ xor (SRC, POS), LREG(T64)
+ mov LREG(T64), (DST, POS)
+.Lpartial_done:
+ ret
+
+EPILOGUE(nettle_salsa20_crypt)