--- /dev/null
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02110-1301, USA.
+
+define(<OUT>, <%rdi>)
+define(<ITERS>, <%rsi>)
+define(<KEY>, <%rdx>)
+define(<LENGTH>, <%rcx>)
+define(<MSG>, <%r8>)
+
+define(<XM0>, <%xmm0>)
+define(<XM1>, <%xmm1>)
+define(<XM2>, <%xmm2>)
+define(<XM3>, <%xmm3>)
+define(<XK0>, <%xmm4>)
+define(<XK1>, <%xmm5>)
+define(<XK2>, <%xmm6>)
+define(<XK3>, <%xmm7>)
+define(<XT0>, <%xmm8>)
+define(<XT1>, <%xmm9>)
+define(<XT2>, <%xmm10>)
+define(<XT3>, <%xmm11>)
+define(<XY0>, <%xmm12>)
+define(<XY1>, <%xmm13>)
+
+C Copy [0,1,2,3] to [1,1,3,3]
+define(<HI2LO>, <pshufd <$>0xf5,>)
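+C pmuludq multiplies only the low 32-bit half of each 64-bit lane,
+C so HI2LO is used to copy the odd (high) words down before the
+C products of the odd-indexed sums are formed.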
+C FIXME: Would be nice if we could force the key array to be 16-byte
+C aligned.
+
+ .file "umac-nh-n.asm"
+
+ C umac_nh_n(uint64_t *out, unsigned n, const uint32_t *key,
+ C unsigned length, const uint8_t *msg)
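+	C The n outputs are independent NH hashes of the same
+	C message, with the key window advanced four 32-bit words
+	C per hash (UMAC's Toeplitz construction). A scalar sketch
+	C of the computation (illustrative C, not part of this
+	C file; LE_READ_UINT32 is nettle's little-endian read
+	C macro):
+	C
+	C   for (j = 0; j < n; j++)
+	C     out[j] = 0;
+	C   for (; length > 0; length -= 32, msg += 32, key += 8)
+	C     for (i = 0; i < 4; i++)
+	C       {
+	C         uint32_t a = LE_READ_UINT32 (msg + 4*i);
+	C         uint32_t b = LE_READ_UINT32 (msg + 4*i + 16);
+	C         for (j = 0; j < n; j++)
+	C           out[j] += (uint64_t)(a + key[4*j + i])
+	C                       * (b + key[4*j + i + 4]);
+	C       }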
+ .text
+ ALIGN(4)
+PROLOGUE(_nettle_umac_nh_n)
+ W64_ENTRY(5, 14)
+ pxor XY0, XY0
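+	C Dispatch on the iteration count: n == 2 -> .Lnh2,
+	C n == 3 -> .Lnh3, anything larger falls through to the
+	C four-way code.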
+ cmp $3, ITERS
+ jc .Lnh2
+ je .Lnh3
+
+.Lnh4:
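+	C Four NH iterations in parallel, with key offsets 0, 4,
+	C 8 and 12 words. XY0 accumulates [y0, y1]; XY1, zeroed
+	C by the copy below (XY0 is still zero there), accumulates
+	C [y2, y3].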
+ movups (KEY), XK0
+ movups 16(KEY), XT2
+ movups 32(KEY), XK2
+ lea 48(KEY), KEY
+	C Leave XK2 untouched, and put permuted keys in XK0, XK1, XT2, XT3
+ movaps XK0, XT0
+ movaps XK0, XK1
+
+ punpcklqdq XT2, XK0 C [0,1,4,5]
+ punpckhqdq XT2, XK1 C [2,3,6,7]
+ movaps XT2, XT3
+ punpcklqdq XK2, XT2 C [4,5, 8, 9]
+ punpckhqdq XK2, XT3 C [6,7,10,11]
+
+ movaps XY0, XY1
+
+.Loop4:
+ movups (MSG), XT0
+ movups 16(MSG), XT1
+
+ pshufd $0xee, XT1, XM3 C [6,7,6,7]
+ pshufd $0x44, XT1, XM2 C [4,5,4,5]
+ pshufd $0xee, XT0, XM1 C [2,3,2,3]
+ pshufd $0x44, XT0, XM0 C [0,1,0,1]
+
+ paddd XM0, XK0
+ paddd XM1, XK1
+ paddd XM2, XT2
+ paddd XM3, XT3
+
+ HI2LO XK0, XT0
+ HI2LO XT2, XT1
+ pmuludq XK0, XT2
+ pmuludq XT0, XT1
+ paddq XT2, XY0
+ paddq XT1, XY0
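+	C One pmuludq covers the even-indexed sums (low dwords),
+	C the other the odd-indexed sums that HI2LO copied down.
+	C Together with the block below, this accumulates the
+	C four products of the chunk for iterations 0 and 1
+	C into XY0.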
+
+ HI2LO XK1, XT0
+ HI2LO XT3, XT1
+ pmuludq XK1, XT3
+ pmuludq XT0, XT1
+ paddq XT3, XY0
+ paddq XT1, XY0
+
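+	C Iterations 2 and 3 (key offsets 8 and 12 words), same
+	C pattern, accumulated into XY1. Thanks to the 4-word
+	C Toeplitz shift, the key vectors interleaved here also
+	C serve as the iteration 0,1 keys of the next chunk.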
+ movaps XK2, XK0
+ movaps XK2, XK1
+ movups (KEY), XT2
+ movups 16(KEY), XK2
+ punpcklqdq XT2, XK0 C [ 8, 9,12,13]
+ punpckhqdq XT2, XK1 C [10,11,14,15]
+ movaps XT2, XT3
+
+ punpcklqdq XK2, XT2 C [12,13,16,17]
+ punpckhqdq XK2, XT3 C [14,15,18,19]
+
+ paddd XK0, XM0
+ paddd XK1, XM1
+ paddd XT2, XM2
+ paddd XT3, XM3
+
+ HI2LO XM0, XT0
+ HI2LO XM2, XT1
+ pmuludq XM0, XM2
+ pmuludq XT0, XT1
+ paddq XM2, XY1
+ paddq XT1, XY1
+
+ HI2LO XM1, XT0
+ HI2LO XM3, XT1
+ pmuludq XM1, XM3
+ pmuludq XT0, XT1
+ paddq XM3, XY1
+ paddq XT1, XY1
+
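+	C LENGTH counts message bytes; each pass consumes 32, and
+	C ja loops while the subtraction leaves a nonzero result
+	C without borrow.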
+ subl $32, XREG(LENGTH)
+ lea 32(MSG), MSG
+ lea 32(KEY), KEY
+ ja .Loop4
+
+ movups XY0, (OUT)
+ movups XY1, 16(OUT)
+
+ W64_EXIT(5, 14)
+ ret
+
+.Lnh3:
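+	C Three NH iterations. Iterations 0 and 1 are computed as
+	C in .Lnh2 and accumulate into XY0 = [y0, y1]; iteration 2
+	C keeps two partial sums in the halves of XY1, folded
+	C together after the loop.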
+ movups (KEY), XK0
+ movups 16(KEY), XK1
+ movaps XY0, XY1
+.Loop3:
+ lea 32(KEY), KEY
+ movups (MSG), XT0
+ movups 16(MSG), XT1
+ movups (KEY), XK2
+ movups 16(KEY), XK3
+ pshufd $0xee, XT1, XM3 C [6,7,6,7]
+ pshufd $0x44, XT1, XM2 C [4,5,4,5]
+ pshufd $0xee, XT0, XM1 C [2,3,2,3]
+ pshufd $0x44, XT0, XM0 C [0,1,0,1]
+
+ C Iteration 2
+ paddd XK2, XT0
+ paddd XK3, XT1
+ HI2LO XT0, XT2
+ HI2LO XT1, XT3
+ pmuludq XT0, XT1
+ pmuludq XT2, XT3
+ paddq XT1, XY1
+ paddq XT3, XY1
+
+ C Iteration 0,1
+ movaps XK0, XT0
+ punpcklqdq XK1, XK0 C [0,1,4,5]
+ punpckhqdq XK1, XT0 C [2,3,6,7]
+ paddd XK0, XM0
+ paddd XT0, XM1
+ movaps XK2, XK0
+ movaps XK1, XT0
+ punpcklqdq XK2, XK1 C [4,5,8,9]
+ punpckhqdq XK2, XT0 C [6,7,10,11]
+ paddd XK1, XM2
+ paddd XT0, XM3
+
+ HI2LO XM0, XT0
+ HI2LO XM2, XT1
+ pmuludq XM0, XM2
+ pmuludq XT0, XT1
+ paddq XM2, XY0
+ paddq XT1, XY0
+
+ HI2LO XM1, XT0
+ HI2LO XM3, XT1
+ pmuludq XM1, XM3
+ pmuludq XT0, XT1
+ paddq XM3, XY0
+ paddq XT1, XY0
+ subl $32, XREG(LENGTH)
+ lea 32(MSG), MSG
+ movaps XK2, XK0
+ movaps XK3, XK1
+
+ ja .Loop3
+
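+	C Fold the two partial sums of y2: pshufd $0xe moves the
+	C high qword of XY1 down, paddq adds it to the low one,
+	C and movlpd stores the single 64-bit result.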
+ pshufd $0xe, XY1, XT0
+ paddq XT0, XY1
+ movups XY0, (OUT)
+ movlpd XY1, 16(OUT)
+
+ W64_EXIT(5, 14)
+ ret
+
+.Lnh2:
+	C Explode message as [0,1,0,1] [2,3,2,3] [4,5,4,5] [6,7,6,7]
+	C Interleave keys as [0,1,4,5] [2,3,6,7] [4,5,8,9] [6,7,10,11]
+ movups (KEY), XK0
+ lea 16(KEY), KEY
+.Loop2:
+ movups (MSG), XM0
+ movups 16(MSG), XM1
+ pshufd $0xee, XM1, XM3 C [6,7,6,7]
+ pshufd $0x44, XM1, XM2 C [4,5,4,5]
+ pshufd $0xee, XM0, XM1 C [2,3,2,3]
+ pshufd $0x44, XM0, XM0 C [0,1,0,1]
+
+ movups (KEY), XK1
+ movups 16(KEY), XK2
+ movaps XK0, XT0
+ punpcklqdq XK1, XK0 C [0,1,4,5]
+ punpckhqdq XK1, XT0 C [2,3,6,7]
+ paddd XK0, XM0
+ paddd XT0, XM1
+ movaps XK2, XK0
+ movaps XK1, XT0
+ punpcklqdq XK2, XK1 C [4,5,8,9]
+ punpckhqdq XK2, XT0 C [6,7,10,11]
+ paddd XK1, XM2
+ paddd XT0, XM3
+
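+	C Same multiply-accumulate pattern as the loops above:
+	C four products per 32-byte chunk for each iteration,
+	C accumulated into XY0 = [y0, y1].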
+ HI2LO XM0, XT0
+ HI2LO XM2, XT1
+ pmuludq XM0, XM2
+ pmuludq XT0, XT1
+ paddq XM2, XY0
+ paddq XT1, XY0
+
+ HI2LO XM1, XT0
+ HI2LO XM3, XT1
+ pmuludq XM1, XM3
+ pmuludq XT0, XT1
+ paddq XM3, XY0
+ paddq XT1, XY0
+ subl $32, XREG(LENGTH)
+ lea 32(MSG), MSG
+ lea 32(KEY), KEY
+
+ ja .Loop2
+
+ movups XY0, (OUT)
+.Lend:
+ W64_EXIT(5, 14)
+ ret
+EPILOGUE(_nettle_umac_nh_n)