memcpy (x->b, Z.b, sizeof(Z));
}
# elif GCM_TABLE_BITS == 8
-# if HAVE_NATIVE_gcm_gf_mul_8
+# if HAVE_NATIVE_gcm_hash8
-#define gcm_gf_mul _nettle_gcm_gf_mul_8
+#define gcm_hash _nettle_gcm_hash8
void
-gcm_gf_mul (union gcm_block *x, const union gcm_block *y);
-# else /* !HAVE_NATIVE_gcm_gf_mul_8 */
+_nettle_gcm_hash8 (const struct gcm_key *key, union gcm_block *x,
+ size_t length, const uint8_t *data);
+# else /* !HAVE_NATIVE_gcm_hash8 */
static const uint16_t
shift_table[0x100] = {
W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e),
gcm_gf_shift_8(&Z);
gcm_gf_add(x, &Z, &table[x->b[0]]);
}
-# endif /* ! HAVE_NATIVE_gcm_gf_mul_8 */
+# endif /* ! HAVE_NATIVE_gcm_hash8 */
# else /* GCM_TABLE_BITS != 8 */
# error Unsupported table size.
# endif /* GCM_TABLE_BITS != 8 */
#endif
}
+#ifndef gcm_hash
static void
gcm_hash(const struct gcm_key *key, union gcm_block *x,
size_t length, const uint8_t *data)
gcm_gf_mul (x, key->h);
}
}
+#endif /* !gcm_hash */
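For reference, the generic fallback guarded by the new #ifndef gcm_hash follows the usual GHASH pattern: xor a full block into X, multiply by the hash subkey, and absorb a trailing partial block by xoring only the bytes present. A minimal sketch, assuming the memxor() and GCM_BLOCK_SIZE definitions from the surrounding gcm.c (the hunk above elides the actual loop body, and gcm_hash_sketch is an illustrative name, not the function in the tree):

  static void
  gcm_hash_sketch (const struct gcm_key *key, union gcm_block *x,
                   size_t length, const uint8_t *data)
  {
    /* Full 16-byte blocks: X <- (X xor block) * H */
    for (; length >= GCM_BLOCK_SIZE;
         length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE)
      {
        memxor (x->b, data, GCM_BLOCK_SIZE);
        gcm_gf_mul (x, key->h);
      }
    /* Partial final block: xoring only length bytes is the same as
       xoring a zero-padded block. */
    if (length > 0)
      {
        memxor (x->b, data, length);
        gcm_gf_mul (x, key->h);
      }
  }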
static void
gcm_hash_sizes(const struct gcm_key *key, union gcm_block *x,
C nettle, low-level cryptographic library
C
C Copyright (C) 2013, Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02110-1301, USA.
C Register usage:
-define(<XP>, <%rdi>)
-define(<TABLE>, <%rsi>)
-define(<XW>, <%rax>)
-define(<CNT>, <%ecx>)
-define(<Z0>, <%rdx>)
-define(<Z1>, <%r8>)
-define(<T0>, <%r9>)
-define(<T1>, <%r10>)
-define(<T2>, <%r11>)
-define(<SHIFT_TABLE>, <%rbx>)
-
-C The C code is 12.5 c/byte, slower than sha1 (10.6), while this code runs
-C at 10.2, slightly faster. Benchmarked on a low-end AMD E-350.
-
- .file "gcm-gf-mul-8.asm"
-
- C void _gcm_gf_mul_8(union gcm_block *x, const union gcm_block *table)
+define(<KEY>, <%rdi>)
+define(<XP>, <%rsi>)
+define(<LENGTH>, <%rdx>)
+define(<SRC>, <%rcx>)
+define(<X0>, <%rax>)
+define(<X1>, <%rbx>)
+define(<CNT>, <%ebp>)
+define(<T0>, <%r8>)
+define(<T1>, <%r9>)
+define(<T2>, <%r10>)
+define(<Z0>, <%r11>)
+define(<Z1>, <%r12>)
+define(<SHIFT_TABLE>, <%r13>)
+
+ .file "gcm-hash8.asm"
+
+ C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
+ C size_t length, const uint8_t *data)
+
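+ C KEY points at the per-key table of 256 precomputed 16-byte
+ C products b*H (GCM_TABLE_BITS == 8); byte indices are scaled
+ C by 16 (shl $4) before each lookup.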
.text
ALIGN(16)
-PROLOGUE(_nettle_gcm_gf_mul_8)
- W64_ENTRY(2, 0)
+PROLOGUE(_nettle_gcm_hash8)
+ W64_ENTRY(4, 0)
push %rbx
- mov 8(XP), XW
- rol $8, XW
- movzbl LREG(XW), XREG(T0)
- shl $4, T0
- mov (TABLE, T0), Z0
- mov 8(TABLE, T0), Z1
+ push %rbp
+ push %r12
+ push %r13
+ sub $16, LENGTH
lea .Lshift_table(%rip), SHIFT_TABLE
- movl $7, CNT
- call .Lmul_word
- mov (XP), XW
- movl $8, CNT
- call .Lmul_word
- mov Z0, (XP)
- mov Z1, 8(XP)
- W64_EXIT(2, 0)
- pop %rbx
- ret
+ mov (XP), X0
+ mov 8(XP), X1
+ jc .Lfinal
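+ C LENGTH was pre-decremented by 16 above (no flag-setting
+ C instructions in between); carry set means the input is
+ C shorter than one full block.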
+ALIGN(16)
+.Lblock_loop:
+
+ xor (SRC), X0
+ xor 8(SRC), X1
+
+.Lblock_mul:
+ rol $8, X1
+ movzbl LREG(X1), XREG(T1)
+ shl $4, T1
+ mov (KEY, T1), Z0
+ mov 8(KEY, T1), Z1
-.Lmul_word:
C shift Z1, Z0, transforming
C +-----------------------+-----------------------+
C |15 14 13 12 11 10 09 08|07 06 05 04 03 02 01 00|
C +-----------------------+-----------------------+
C into
C +-----------------------+-----------------------+
C |14 13 12 11 10 09 08 07|06 05 04 03 02 01 00   |
C +-----------------------+-----------------+-----+
C xor                                       |T[15]|
C                                           +-----+
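+ C Each step folds one byte: the byte shifted out at the top is
+ C reduced via shift_table into the low 16 bits, while the next
+ C byte of X selects a key table entry to xor in:
+ C   Z <- (Z << 8) xor shift_table[Z >> 120] xor T[next X byte]
+ C The lookup above consumed one byte of X1; .Loop_X1 and .Loop_X0
+ C fold 7 bytes each, and a final unrolled step below handles the
+ C last byte of X0: 1 + 7 + 7 + 1 = 16 bytes per block.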
+
+ mov $7, CNT
+
+ALIGN(16)
+.Loop_X1:
mov Z1, T1
+ shr $56, T1
+ shl $8, Z1
mov Z0, T0
- shl $8, Z1 C Use shld?
shl $8, Z0
- shr $56, T1
shr $56, T0
movzwl (SHIFT_TABLE, T1, 2), XREG(T1)
- rol $8, XW
+ xor T1, Z0
+ rol $8, X1
+ movzbl LREG(X1), XREG(T2)
+ shl $4, T2
+ xor (KEY, T2), Z0
add T0, Z1
+ xor 8(KEY, T2), Z1
+ decl CNT
+ jne .Loop_X1
+
+ mov $7, CNT
+
+ALIGN(16)
+.Loop_X0:
+ mov Z1, T1
+ shr $56, T1
+ shl $8, Z1
+ mov Z0, T0
+ shl $8, Z0
+ shr $56, T0
+ movzwl (SHIFT_TABLE, T1, 2), XREG(T1)
xor T1, Z0
- movzbl LREG(XW), XREG(T2)
+ rol $8, X0
+ movzbl LREG(X0), XREG(T2)
shl $4, T2
- xor (TABLE, T2), Z0
- xor 8(TABLE, T2), Z1
+ xor (KEY, T2), Z0
+ add T0, Z1
+ xor 8(KEY, T2), Z1
decl CNT
- jne .Lmul_word
+ jne .Loop_X0
+
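+ C Last byte of X0: same fold as the loops above, but the table
+ C entry and shifted Z are xored directly into X0/X1 to form the
+ C new hash state.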
+ mov Z1, T1
+ shr $56, T1
+ shl $8, Z1
+ mov Z0, T0
+ shl $8, Z0
+ shr $56, T0
+ movzwl (SHIFT_TABLE, T1, 2), XREG(T1)
+ xor T1, Z0
+ rol $8, X0
+ movzbl LREG(X0), XREG(T2)
+ shl $4, T2
+ mov (KEY, T2), X0
+ xor Z0, X0
+ add T0, Z1
+ mov 8(KEY, T2), X1
+ xor Z1, X1
+
+ add $16, SRC
+ sub $16, LENGTH
+ jnc .Lblock_loop
+
+.Lfinal:
+ add $16, LENGTH
+ jnz .Lpartial
+
+ mov X0, (XP)
+ mov X1, 8(XP)
+
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ W64_EXIT(4, 0)
+ ret
+
+.Lpartial:
+ C Read and xor partial block, then jump back into the loop
+ C with LENGTH == 0.
+
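+ C Only LENGTH bytes are xored into the state, which matches
+ C GHASH's zero-padding of the final partial block.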
+ cmp $8, LENGTH
+ jc .Llt8
+
+ C 8 <= LENGTH < 16
+ xor (SRC), X0
+ add $8, SRC
+ sub $8, LENGTH
+ jz .Lblock_mul
+ call .Lread_bytes
+ xor T0, X1
+ jmp .Lblock_mul
+
+.Llt8: C 0 < LENGTH < 8
+ call .Lread_bytes
+ xor T0, X0
+ jmp .Lblock_mul
+
+C Read 0 < LENGTH < 8 bytes at SRC, result in T0
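+C The bytes land in T0 in little-endian order with the high bytes
+C zero, as a zero-padded little-endian load would produce.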
+.Lread_bytes:
+ xor T0, T0
+ sub $1, SRC
+ALIGN(16)
+.Lread_loop:
+ shl $8, T0
+ orb (SRC, LENGTH), LREG(T0)
+ sub $1, LENGTH
+ jnz .Lread_loop
ret
-
-EPILOGUE(_nettle_gcm_gf_mul_8)
+EPILOGUE(_nettle_gcm_hash8)
define(<W>, <0x$2$1>)
.section .rodata
.hword W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e)
.hword W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae)
.hword W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be)
-
-