git.ipfire.org Git - thirdparty/nettle.git/commitdiff
New x86_64 assembly for gcm hashing.
author Niels Möller <nisse@lysator.liu.se>
Sun, 22 Sep 2013 18:48:57 +0000 (20:48 +0200)
committer Niels Möller <nisse@lysator.liu.se>
Sun, 22 Sep 2013 18:48:57 +0000 (20:48 +0200)
ChangeLog
configure.ac
gcm.c
x86_64/gcm-hash8.asm [moved from x86_64/gcm-gf-mul-8.asm with 66% similarity]

index 78ffcbb7e0b06bd2dc7a4bae999dbe8969f9e217..46bb9b56aa10ce6e15bedc34c2712bce9e8907be 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2013-09-22  Niels Möller  <nisse@lysator.liu.se>
+
+       * x86_64/gcm-hash8.asm: New file.
+       * x86_64/gcm-gf-mul-8.asm: Deleted.
+
+       * configure.ac (asm_nettle_optional_list): Look for gcm-hash8.asm,
+       not gcm-gf-mul-8.asm.
+       * gcm.c [HAVE_NATIVE_gcm_hash8]: Make use of (optional) assembly
+       implementation.
+
 2013-09-21  Niels Möller  <nisse@lysator.liu.se>
 
        * Makefile.in (des.po): Add same dependencies as for des.o.
index 72da6d78de566f0c8d00d3cb0ecb06345543f4e5..037fbe97278767962ae71cf17c4f9e726fac5860 100644 (file)
@@ -266,7 +266,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
                sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
 
 # Assembler files which generate additional object files if they are used.
-asm_nettle_optional_list="gcm-gf-mul-8.asm"
+asm_nettle_optional_list="gcm-hash8.asm"
 asm_hogweed_optional_list=""
 if test "x$enable_public_key" = "xyes" ; then
   asm_hogweed_optional_list="ecc-192-modp.asm ecc-224-modp.asm \
@@ -347,7 +347,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_384_redc
 #undef HAVE_NATIVE_ecc_521_modp
 #undef HAVE_NATIVE_ecc_521_redc
-#undef HAVE_NATIVE_gcm_gf_mul_8])
+#undef HAVE_NATIVE_gcm_hash8])
 
 # Besides getting correct dependencies, the explicit rules also tell
 # make that the .s files "ought to exist", so they are preferred over
diff --git a/gcm.c b/gcm.c
index 7b387bacfac88826a8038fc05c2437fb4b1b244e..c9ea30bc1a696849b33daa8dd1c4782854afe4e0 100644 (file)
--- a/gcm.c
+++ b/gcm.c
@@ -214,12 +214,13 @@ gcm_gf_mul (union gcm_block *x, const union gcm_block *table)
   memcpy (x->b, Z.b, sizeof(Z));
 }
 # elif GCM_TABLE_BITS == 8
-#  if HAVE_NATIVE_gcm_gf_mul_8
+#  if HAVE_NATIVE_gcm_hash8
 
-#define gcm_gf_mul _nettle_gcm_gf_mul_8
+#define gcm_hash _nettle_gcm_hash8
 void
-gcm_gf_mul (union gcm_block *x, const union gcm_block *y);
-#  else /* !HAVE_NATIVE_gcm_gf_mul_8 */
+_nettle_gcm_hash8 (const struct gcm_key *key, union gcm_block *x,
+                  size_t length, const uint8_t *data);
+#  else /* !HAVE_NATIVE_gcm_hash8 */
 static const uint16_t
 shift_table[0x100] = {
   W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e),
@@ -310,7 +311,7 @@ gcm_gf_mul (union gcm_block *x, const union gcm_block *table)
   gcm_gf_shift_8(&Z);
   gcm_gf_add(x, &Z, &table[x->b[0]]);
 }
-#  endif /* ! HAVE_NATIVE_gcm_gf_mul_8 */
+#  endif /* ! HAVE_NATIVE_gcm_hash8 */
 # else /* GCM_TABLE_BITS != 8 */
 #  error Unsupported table size. 
 # endif /* GCM_TABLE_BITS != 8 */
@@ -353,6 +354,7 @@ gcm_set_key(struct gcm_key *key,
 #endif
 }
 
+#ifndef gcm_hash
 static void
 gcm_hash(const struct gcm_key *key, union gcm_block *x,
         size_t length, const uint8_t *data)
@@ -369,6 +371,7 @@ gcm_hash(const struct gcm_key *key, union gcm_block *x,
       gcm_gf_mul (x, key->h);
     }
 }
+#endif /* !gcm_hash */
 
 static void
 gcm_hash_sizes(const struct gcm_key *key, union gcm_block *x,
similarity index 66%
rename from x86_64/gcm-gf-mul-8.asm
rename to x86_64/gcm-hash8.asm
index 2296ba8738bac4840ad6b28729b0f5a0fd091e85..3a3f012eafa15fe0206d244f4e310ea77c57815e 100644 (file)
@@ -1,17 +1,17 @@
 C nettle, low-level cryptographics library
-C 
+C
 C Copyright (C) 2013, Niels Möller
-C  
+C
 C The nettle library is free software; you can redistribute it and/or modify
 C it under the terms of the GNU Lesser General Public License as published by
 C the Free Software Foundation; either version 2.1 of the License, or (at your
 C option) any later version.
-C 
+C
 C The nettle library is distributed in the hope that it will be useful, but
 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 C License for more details.
-C 
+C
 C You should have received a copy of the GNU Lesser General Public License
 C along with the nettle library; see the file COPYING.LIB.  If not, write to
 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
@@ -19,47 +19,51 @@ C MA 02111-1301, USA.
 
 C Register usage:
 
-define(<XP>, <%rdi>)
-define(<TABLE>, <%rsi>)
-define(<XW>, <%rax>)
-define(<CNT>, <%ecx>)
-define(<Z0>, <%rdx>)
-define(<Z1>, <%r8>)
-define(<T0>, <%r9>)
-define(<T1>, <%r10>)
-define(<T2>, <%r11>)
-define(<SHIFT_TABLE>, <%rbx>)
-       
-C The C code is 12.5 c/byte, slower than sha1 (10.6), while this code runs
-C at 10.2, slightly faster. Benchmarked on a low-end AMD E-350.
-
-       .file "gcm-gf-mul-8.asm"
-       
-       C void _gcm_gf_mul_8(union gcm_block *x, const union gcm_block *table)
+define(<KEY>, <%rdi>)
+define(<XP>, <%rsi>)
+define(<LENGTH>, <%rdx>)
+define(<SRC>, <%rcx>)
+define(<X0>, <%rax>)
+define(<X1>, <%rbx>)
+define(<CNT>, <%ebp>)
+define(<T0>, <%r8>)
+define(<T1>, <%r9>)
+define(<T2>, <%r10>)
+define(<Z0>, <%r11>)
+define(<Z1>, <%r12>)
+define(<SHIFT_TABLE>, <%r13>)
+
+       .file "gcm-hash8.asm"
+
+       C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
+       C                size_t length, const uint8_t *data)
+
        .text
        ALIGN(16)
-PROLOGUE(_nettle_gcm_gf_mul_8)
-       W64_ENTRY(2, 0)
+PROLOGUE(_nettle_gcm_hash8)
+       W64_ENTRY(4, 0)
        push    %rbx
-       mov     8(XP), XW
-       rol     $8, XW
-       movzbl  LREG(XW), XREG(T0)
-       shl     $4, T0
-       mov     (TABLE, T0), Z0
-       mov     8(TABLE, T0), Z1
+       push    %rbp
+       push    %r12
+       push    %r13
+       sub     $16, LENGTH
        lea     .Lshift_table(%rip), SHIFT_TABLE
-       movl    $7, CNT
-       call    .Lmul_word
-       mov     (XP), XW
-       movl    $8, CNT
-       call    .Lmul_word
-       mov     Z0, (XP)
-       mov     Z1, 8(XP)
-       W64_EXIT(2, 0)
-       pop     %rbx
-       ret
+       mov     (XP), X0
+       mov     8(XP), X1
+       jc      .Lfinal
+ALIGN(16)
+.Lblock_loop:
+
+       xor (SRC), X0
+       xor 8(SRC), X1
+
+.Lblock_mul:
+       rol     $8, X1
+       movzbl  LREG(X1), XREG(T1)
+       shl     $4, T1
+       mov     (KEY, T1), Z0
+       mov     8(KEY, T1), Z1
 
-.Lmul_word:
        C shift Z1, Z0, transforming
        C +-----------------------+-----------------------+
        C |15 14 13 12 11 10 09 08|07 06 05 04 03 02 01 00|
@@ -70,25 +74,118 @@ PROLOGUE(_nettle_gcm_gf_mul_8)
        C +-----------------------+-----------------+-----+
        C                               xor         |T[15]|
        C                                           +-----+
+
+       mov     $7, CNT
+
+ALIGN(16)
+.Loop_X1:
        mov     Z1, T1
+       shr     $56, T1
+       shl     $8, Z1
        mov     Z0, T0
-       shl     $8, Z1          C Use shld?
        shl     $8, Z0
-       shr     $56, T1
        shr     $56, T0
        movzwl  (SHIFT_TABLE, T1, 2), XREG(T1)
-       rol     $8, XW
+       xor     T1, Z0
+       rol     $8, X1
+       movzbl  LREG(X1), XREG(T2)
+       shl     $4, T2
+       xor     (KEY, T2), Z0
        add     T0, Z1
+       xor     8(KEY, T2), Z1
+       decl    CNT
+       jne     .Loop_X1
+
+       mov     $7, CNT
+
+ALIGN(16)
+.Loop_X0:
+       mov     Z1, T1
+       shr     $56, T1
+       shl     $8, Z1
+       mov     Z0, T0
+       shl     $8, Z0
+       shr     $56, T0
+       movzwl  (SHIFT_TABLE, T1, 2), XREG(T1)
        xor     T1, Z0
-       movzbl  LREG(XW), XREG(T2)
+       rol     $8, X0
+       movzbl  LREG(X0), XREG(T2)
        shl     $4, T2
-       xor     (TABLE, T2), Z0
-       xor     8(TABLE, T2), Z1
+       xor     (KEY, T2), Z0
+       add     T0, Z1
+       xor     8(KEY, T2), Z1
        decl    CNT
-       jne     .Lmul_word
+       jne     .Loop_X0
+
+       mov     Z1, T1
+       shr     $56, T1
+       shl     $8, Z1
+       mov     Z0, T0
+       shl     $8, Z0
+       shr     $56, T0
+       movzwl  (SHIFT_TABLE, T1, 2), XREG(T1)
+       xor     T1, Z0
+       rol     $8, X0
+       movzbl  LREG(X0), XREG(T2)
+       shl     $4, T2
+       mov     (KEY, T2), X0
+       xor     Z0, X0
+       add     T0, Z1
+       mov     8(KEY, T2), X1
+       xor     Z1, X1
+
+       add     $16, SRC
+       sub     $16, LENGTH
+       jnc     .Lblock_loop
+
+.Lfinal:
+       add     $16, LENGTH
+       jnz     .Lpartial
+
+       mov     X0, (XP)
+       mov     X1, 8(XP)
+
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       W64_EXIT(4, 0)
+       ret
+
+.Lpartial:
+       C Read and xor partial block, then jump back into the loop
+       C with LENGTH == 0.
+
+       cmp     $8, LENGTH
+       jc      .Llt8
+
+       C       8 <= LENGTH < 16
+       xor     (SRC), X0
+       add     $8, SRC
+       sub     $8, LENGTH
+       jz      .Lblock_mul
+       call    .Lread_bytes
+       xor     T0, X1
+       jmp     .Lblock_mul
+
+.Llt8: C 0 < LENGTH < 8
+       call    .Lread_bytes
+       xor     T0, X0
+       jmp     .Lblock_mul
+
+C Read 0 < LENGTH < 8 bytes at SRC, result in T0
+.Lread_bytes:
+       xor     T0, T0
+       sub     $1, SRC
+ALIGN(16)
+.Lread_loop:
+       shl     $8, T0
+       orb     (SRC, LENGTH), LREG(T0)
+.Lread_next:
+       sub     $1, LENGTH
+       jnz     .Lread_loop
        ret
-       
-EPILOGUE(_nettle_gcm_gf_mul_8)
+EPILOGUE(_nettle_gcm_hash8)
 
 define(<W>, <0x$2$1>)
        .section .rodata
@@ -126,5 +223,3 @@ define(<W>, <0x$2$1>)
 .hword W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e)
 .hword W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae)
 .hword W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be)
-       
-