Reorganized x86_64 memxor.

author Niels Möller <nisse@lysator.liu.se>

Mon, 24 Nov 2014 19:17:59 +0000 (20:17 +0100)

committer Niels Möller <nisse@lysator.liu.se>

Mon, 24 Nov 2014 19:17:59 +0000 (20:17 +0100)
author Niels Möller <nisse@lysator.liu.se>
Mon, 24 Nov 2014 19:17:59 +0000 (20:17 +0100)
committer Niels Möller <nisse@lysator.liu.se>
Mon, 24 Nov 2014 19:17:59 +0000 (20:17 +0100)
diff --git a/ChangeLog b/ChangeLog

index 34f9ac436ef1bb9785d669ad15572a5e5eb65910..e36bc8217e4b76bfb34f78820fba5d11ad6e247b 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2014-11-24  Niels Möller  <nisse@lysator.liu.se>
+
+       * x86_64/memxor3.asm (memxor3): New file, code moved from old
+       memxor.asm.
+       * x86_64/memxor.asm (memxor): Rewritten, no longer jumps into
+       memxor3.
+
+       * configure.ac (asm_replace_list): Added memxor.asm and
+       memxor3.asm.
+
  2014-10-23  Niels Möller  <nisse@lysator.liu.se>
  
         * configure.ac (IF_ASM): New substituted variable.
diff --git a/configure.ac b/configure.ac

index 78bcce2eec92b4e8c1cf6717181aa3cf3a389d99..76beb3160853abde4dfaafcb90fcc35a558c6d2f 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -272,7 +272,8 @@ fi
  # to a new object file).
  asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
                 arcfour-crypt.asm camellia-crypt-internal.asm \
-               md5-compress.asm poly1305-internal.asm \
+               md5-compress.asm memxor.asm memxor3.asm \
+               poly1305-internal.asm \
                 chacha-core-internal.asm \
                 salsa20-crypt.asm salsa20-core-internal.asm \
                 serpent-encrypt.asm serpent-decrypt.asm \
diff --git a/x86_64/memxor.asm b/x86_64/memxor.asm

index e14e31a13f4617bc202bb66239703caedee4f47d..69d6cfe382a12e8a0d2ace1d1019a9db44d23f24 100644 (file)
--- a/x86_64/memxor.asm
+++ b/x86_64/memxor.asm
@@ -1,7 +1,7 @@
  C x86_64/memxor.asm
  
  ifelse(<
-   Copyright (C) 2010, Niels Möller
+   Copyright (C) 2010, 2014, Niels Möller
  
     This file is part of GNU Nettle.
  
@@ -32,9 +32,8 @@ ifelse(<
  
  C Register usage:
  define(<DST>, <%rax>) C Originally in %rdi
-define(<AP>, <%rsi>)
-define(<BP>, <%rdx>)
-define(<N>, <%r10>)
+define(<SRC>, <%rsi>)
+define(<N>, <%rdx>)
  define(<TMP>, <%r8>)
  define(<TMP2>, <%r9>)
  define(<CNT>, <%rdi>)
@@ -53,20 +52,7 @@ define(<USE_SSE2>, <no>)
  
  PROLOGUE(nettle_memxor)
         W64_ENTRY(3, 0)
-       mov     %rdx, %r10
-       mov     %rdi, %rdx
-       jmp     .Lmemxor3_entry
-EPILOGUE(nettle_memxor)
  
-       C memxor3(void *dst, const void *a, const void *b, size_t n)
-       C                 %rdi              %rsi              %rdx      %rcx
-       ALIGN(16)
-       
-PROLOGUE(nettle_memxor3)
-       W64_ENTRY(4, 0)
-       C %cl needed for shift count, so move away N
-       mov     %rcx, N
-.Lmemxor3_entry:
         test    N, N
         C Get number of unaligned bytes at the end
         C %rdi is used as CNT, %rax as DST and as return value
@@ -87,9 +73,8 @@ PROLOGUE(nettle_memxor3)
  .Lalign_loop:
         
         sub     $1, N
-       movb    (AP, N), LREG(TMP)
-       xorb    (BP, N), LREG(TMP)
-       movb    LREG(TMP), (DST, N)
+       movb    (SRC, N), LREG(TMP)
+       xorb    LREG(TMP), (DST, N)
         sub     $1, CNT
         jnz     .Lalign_loop
  
@@ -98,83 +83,7 @@ ifelse(USE_SSE2, yes, <
         cmp     $16, N
         jnc     .Lsse2_case
  >)
-       C Check for the case that AP and BP have the same alignment,
-       C but different from DST.
-       mov     AP, TMP
-       sub     BP, TMP
-       test    $7, TMP
-       jnz     .Lno_shift_case
-       mov     AP, %rcx
-       sub     DST, %rcx
-       and     $7, %rcx
-       jz      .Lno_shift_case
-       sub     %rcx, AP
-       sub     %rcx, BP
-       shl     $3, %rcx
-
-       C Unrolling, with aligned values alternating in S0 and S1
-       test    $8, N
-       jnz     .Lshift_odd
-       mov     (AP, N), S1
-       xor     (BP, N), S1
-       jmp     .Lshift_next
-
-.Lshift_odd:
-       mov     -8(AP, N), S1
-       mov     (AP, N), S0
-       xor     -8(BP, N), S1
-       xor     (BP, N), S0
-       mov     S1, TMP
-       shr     %cl, TMP
-       neg     %cl
-       shl     %cl, S0
-       neg     %cl
-       
-       or      S0, TMP
-       mov     TMP, -8(DST, N)
-       sub     $8, N
-       jz      .Ldone
-       jmp     .Lshift_next
-
-       ALIGN(16)
  
-.Lshift_loop:
-       mov     8(AP, N), S0
-       xor     8(BP, N), S0
-       mov     S0, TMP
-       shr     %cl, TMP
-       neg     %cl
-       shl     %cl, S1
-       neg     %cl
-       or      S1, TMP
-       mov     TMP, 8(DST, N)
-
-       mov     (AP, N), S1
-       xor     (BP, N), S1
-       mov     S1, TMP
-       shr     %cl, TMP
-       neg     %cl
-       shl     %cl, S0
-       neg     %cl
-       or      S0, TMP
-       mov     TMP, (DST, N)
-.Lshift_next:
-       sub     $16, N
-       C FIXME: Handle the case N == 16 specially,
-       C like in the non-shifted case? 
-C      ja      .Lshift_loop
-C      jz      .Ldone
-       jnc     .Lshift_loop
-
-       add     $15, N
-       jnc     .Ldone
-
-       shr     $3, %rcx
-       add     %rcx, AP
-       add     %rcx, BP
-       jmp     .Lfinal_loop
-       
-.Lno_shift_case:
         C Next destination word is -8(DST, N)
         C Setup for unrolling
         test    $8, N
@@ -183,21 +92,18 @@ C  jz      .Ldone
         sub     $8, N
         jz      .Lone_word
  
-       mov     (AP, N), TMP
-       xor     (BP, N), TMP
-       mov     TMP, (DST, N)
+       mov     (SRC, N), TMP
+       xor     TMP, (DST, N)
         
         jmp     .Lword_next
  
         ALIGN(16)
  
  .Lword_loop:
-       mov     8(AP, N), TMP
-       mov     (AP, N), TMP2
-       xor     8(BP, N), TMP
-       xor     (BP, N), TMP2
-       mov     TMP, 8(DST, N)
-       mov     TMP2, (DST, N)
+       mov     8(SRC, N), TMP
+       mov     (SRC, N), TMP2
+       xor     TMP, 8(DST, N)
+       xor     TMP2, (DST, N)
  
  .Lword_next:
         sub     $16, N
@@ -205,33 +111,28 @@ C         jz      .Ldone
         jnz     .Lfinal
  
         C Final operation is word aligned
-       mov     8(AP, N), TMP
-       xor     8(BP, N), TMP
-       mov     TMP, 8(DST, N)
+       mov     8(SRC, N), TMP
+       xor     TMP, 8(DST, N)
         
  .Lone_word:
-       mov     (AP, N), TMP
-       xor     (BP, N), TMP
-       mov     TMP, (DST, N)
+       mov     (SRC, N), TMP
+       xor     TMP, (DST, N)
  
-       C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-       W64_EXIT(4, 0)
+       W64_EXIT(3, 0)
         ret
  
  .Lfinal:
         add     $15, N
  
  .Lfinal_loop:
-       movb    (AP, N), LREG(TMP)
-       xorb    (BP, N), LREG(TMP)
-       movb    LREG(TMP), (DST, N)
+       movb    (SRC, N), LREG(TMP)
+       xorb    LREG(TMP), (DST, N)
  .Lfinal_next:
         sub     $1, N
         jnc     .Lfinal_loop
  
  .Ldone:
-       C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-       W64_EXIT(4, 0)
+       W64_EXIT(3, 0)
         ret
  
  ifelse(USE_SSE2, yes, <
@@ -241,15 +142,14 @@ ifelse(USE_SSE2, yes, <
         test    $8, TMP
         jz      .Lsse2_next
         sub     $8, N
-       mov     (AP, N), TMP
-       xor     (BP, N), TMP
-       mov     TMP, (DST, N)
+       mov     (SRC, N), TMP
+       xor     TMP, (DST, N)
         jmp     .Lsse2_next
  
         ALIGN(16)
  .Lsse2_loop:
-       movdqu  (AP, N), %xmm0
-       movdqu  (BP, N), %xmm1
+       movdqu  (SRC, N), %xmm0
+       movdqa  (DST, N), %xmm1
         pxor    %xmm0, %xmm1
         movdqa  %xmm1, (DST, N)
  .Lsse2_next:
@@ -261,14 +161,13 @@ ifelse(USE_SSE2, yes, <
         jnz     .Lfinal         
  
         C Final operation is aligned
-       movdqu  (AP), %xmm0
-       movdqu  (BP), %xmm1
+       movdqu  (SRC), %xmm0
+       movdqa  (DST), %xmm1
         pxor    %xmm0, %xmm1
         movdqa  %xmm1, (DST)
-       C ENTRY might have been 3 args, too, but it doesn't matter for the exit
-       W64_EXIT(4, 0)
+
+       W64_EXIT(3, 0)
         ret
  >)     
-       
  
-EPILOGUE(nettle_memxor3)
+EPILOGUE(nettle_memxor)
diff --git a/x86_64/memxor3.asm b/x86_64/memxor3.asm

new file mode 100644 (file)

index 0000000..8ff3e79
--- /dev/null
+++ b/x86_64/memxor3.asm
@@ -0,0 +1,263 @@
+C x86_64/memxor3.asm
+
+ifelse(<
+   Copyright (C) 2010, 2014 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+define(<DST>, <%rax>) C Originally in %rdi
+define(<AP>, <%rsi>)
+define(<BP>, <%rdx>)
+define(<N>, <%r10>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+define(<S1>, <%rdi>) C Overlaps with CNT 
+
+define(<USE_SSE2>, <no>)
+
+       .file "memxor3.asm"
+
+       .text
+
+       C memxor3(void *dst, const void *a, const void *b, size_t n)
+       C                 %rdi              %rsi              %rdx      %rcx
+       ALIGN(16)
+       
+PROLOGUE(nettle_memxor3)
+       W64_ENTRY(4, 0)
+       C %cl needed for shift count, so move away N
+       mov     %rcx, N
+.Lmemxor3_entry:
+       test    N, N
+       C Get number of unaligned bytes at the end
+       C %rdi is used as CNT, %rax as DST and as return value
+       mov     %rdi, %rax
+       jz      .Ldone
+       add     N, CNT
+       and     $7, CNT
+       
+       jz      .Laligned
+
+       cmp     $8, N
+       jc      .Lfinal_next
+
+       C FIXME: Instead of this loop, could try cmov with memory
+       C destination, as a sequence of one 8-bit, one 16-bit and one
+       C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+       C that step has to use a conditional).
+.Lalign_loop:
+       
+       sub     $1, N
+       movb    (AP, N), LREG(TMP)
+       xorb    (BP, N), LREG(TMP)
+       movb    LREG(TMP), (DST, N)
+       sub     $1, CNT
+       jnz     .Lalign_loop
+
+.Laligned:
+ifelse(USE_SSE2, yes, <
+       cmp     $16, N
+       jnc     .Lsse2_case
+>)
+       C Check for the case that AP and BP have the same alignment,
+       C but different from DST.
+       mov     AP, TMP
+       sub     BP, TMP
+       test    $7, TMP
+       jnz     .Lno_shift_case
+       mov     AP, %rcx
+       sub     DST, %rcx
+       and     $7, %rcx
+       jz      .Lno_shift_case
+       sub     %rcx, AP
+       sub     %rcx, BP
+       shl     $3, %rcx
+
+       C Unrolling, with aligned values alternating in S0 and S1
+       test    $8, N
+       jnz     .Lshift_odd
+       mov     (AP, N), S1
+       xor     (BP, N), S1
+       jmp     .Lshift_next
+
+.Lshift_odd:
+       mov     -8(AP, N), S1
+       mov     (AP, N), S0
+       xor     -8(BP, N), S1
+       xor     (BP, N), S0
+       mov     S1, TMP
+       shr     %cl, TMP
+       neg     %cl
+       shl     %cl, S0
+       neg     %cl
+       
+       or      S0, TMP
+       mov     TMP, -8(DST, N)
+       sub     $8, N
+       jz      .Ldone
+       jmp     .Lshift_next
+
+       ALIGN(16)
+
+.Lshift_loop:
+       mov     8(AP, N), S0
+       xor     8(BP, N), S0
+       mov     S0, TMP
+       shr     %cl, TMP
+       neg     %cl
+       shl     %cl, S1
+       neg     %cl
+       or      S1, TMP
+       mov     TMP, 8(DST, N)
+
+       mov     (AP, N), S1
+       xor     (BP, N), S1
+       mov     S1, TMP
+       shr     %cl, TMP
+       neg     %cl
+       shl     %cl, S0
+       neg     %cl
+       or      S0, TMP
+       mov     TMP, (DST, N)
+.Lshift_next:
+       sub     $16, N
+       C FIXME: Handle the case N == 16 specially,
+       C like in the non-shifted case? 
+C      ja      .Lshift_loop
+C      jz      .Ldone
+       jnc     .Lshift_loop
+
+       add     $15, N
+       jnc     .Ldone
+
+       shr     $3, %rcx
+       add     %rcx, AP
+       add     %rcx, BP
+       jmp     .Lfinal_loop
+       
+.Lno_shift_case:
+       C Next destination word is -8(DST, N)
+       C Setup for unrolling
+       test    $8, N
+       jz      .Lword_next
+
+       sub     $8, N
+       jz      .Lone_word
+
+       mov     (AP, N), TMP
+       xor     (BP, N), TMP
+       mov     TMP, (DST, N)
+       
+       jmp     .Lword_next
+
+       ALIGN(16)
+
+.Lword_loop:
+       mov     8(AP, N), TMP
+       mov     (AP, N), TMP2
+       xor     8(BP, N), TMP
+       xor     (BP, N), TMP2
+       mov     TMP, 8(DST, N)
+       mov     TMP2, (DST, N)
+
+.Lword_next:
+       sub     $16, N
+       ja      .Lword_loop     C Not zero and no carry
+       jnz     .Lfinal
+
+       C Final operation is word aligned
+       mov     8(AP, N), TMP
+       xor     8(BP, N), TMP
+       mov     TMP, 8(DST, N)
+       
+.Lone_word:
+       mov     (AP, N), TMP
+       xor     (BP, N), TMP
+       mov     TMP, (DST, N)
+
+       C Paired with the 4-argument W64_ENTRY above; memxor no longer jumps in here
+       W64_EXIT(4, 0)
+       ret
+
+.Lfinal:
+       add     $15, N
+
+.Lfinal_loop:
+       movb    (AP, N), LREG(TMP)
+       xorb    (BP, N), LREG(TMP)
+       movb    LREG(TMP), (DST, N)
+.Lfinal_next:
+       sub     $1, N
+       jnc     .Lfinal_loop
+
+.Ldone:
+       C Paired with the 4-argument W64_ENTRY above; memxor no longer jumps in here
+       W64_EXIT(4, 0)
+       ret
+
+ifelse(USE_SSE2, yes, <
+
+.Lsse2_case:
+       lea     (DST, N), TMP
+       test    $8, TMP
+       jz      .Lsse2_next
+       sub     $8, N
+       mov     (AP, N), TMP
+       xor     (BP, N), TMP
+       mov     TMP, (DST, N)
+       jmp     .Lsse2_next
+
+       ALIGN(16)
+.Lsse2_loop:
+       movdqu  (AP, N), %xmm0
+       movdqu  (BP, N), %xmm1
+       pxor    %xmm0, %xmm1
+       movdqa  %xmm1, (DST, N)
+.Lsse2_next:
+       sub     $16, N
+       ja      .Lsse2_loop
+       
+       C FIXME: See if we can do a full word first, before the
+       C byte-wise final loop.
+       jnz     .Lfinal         
+
+       C Final operation is aligned
+       movdqu  (AP), %xmm0
+       movdqu  (BP), %xmm1
+       pxor    %xmm0, %xmm1
+       movdqa  %xmm1, (DST)
+       C Paired with the 4-argument W64_ENTRY above; memxor no longer jumps in here
+       W64_EXIT(4, 0)
+       ret
+>)     
+       
+
+EPILOGUE(nettle_memxor3)
author	Niels Möller <nisse@lysator.liu.se>
	Mon, 24 Nov 2014 19:17:59 +0000 (20:17 +0100)
committer	Niels Möller <nisse@lysator.liu.se>
	Mon, 24 Nov 2014 19:17:59 +0000 (20:17 +0100)
ChangeLog		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
x86_64/memxor.asm		patch \| blob \| blame \| history
x86_64/memxor3.asm	[new file with mode: 0644]	patch \| blob