x86_64 sha3: Go via memory for moves between general registers and xmm registers.

author Niels Möller <nisse@lysator.liu.se>

Thu, 11 Apr 2013 06:28:16 +0000 (08:28 +0200)

committer Niels Möller <nisse@lysator.liu.se>

Thu, 11 Apr 2013 06:28:16 +0000 (08:28 +0200)
author Niels Möller <nisse@lysator.liu.se>
Thu, 11 Apr 2013 06:28:16 +0000 (08:28 +0200)
committer Niels Möller <nisse@lysator.liu.se>
Thu, 11 Apr 2013 06:28:16 +0000 (08:28 +0200)
diff --git a/ChangeLog b/ChangeLog

index 09a744f82f89a097844a98d57e7835601cdc31f4..6aab71e6295b49b9241a00afbb0570b76ce0ae0c 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2013-04-11  Niels Möller  <nisse@lysator.liu.se>
+
+       * x86_64/sha3-permute.asm: Go via memory for moves between general
+       registers and xmm registers.
+
  2013-04-06  Niels Möller  <nisse@lysator.liu.se>
  
         From Edgar E. Iglesias:
diff --git a/x86_64/sha3-permute.asm b/x86_64/sha3-permute.asm

index 360a1f44e02d1367f476a1a2d1b9a245e30f3ddf..f58d787c33bce318a5374616e662c94d67a189c6 100644 (file)
--- a/x86_64/sha3-permute.asm
+++ b/x86_64/sha3-permute.asm
@@ -72,9 +72,21 @@ define(<STATE>, <OFFSET($1)(CTX)>)
  
  define(<SWAP64>, <pshufd       <$>0x4e,>)
  
+define(<DIRECT_MOVQ>, <no>)
+
+C MOVQ(src, dst), for moves between a general register and an xmm
+C register.
+
+ifelse(DIRECT_MOVQ, yes, <
  C movq calls that are equal to the corresponding movd,
  C where the Apple assembler requires them to be written as movd.
-define(<MOVQ>, <movd>)
+define(<MOVQ>, <movd   $1, $2>)
+>, <
+C Moving via (cached) memory is generally faster.
+define(<MOVQ>, <
+       movq    $1, (CTX)
+       movq    (CTX), $2
+>)>)
  
  C ROTL64(rot, register, temp)
  C Caller needs to or together the result.
@@ -151,12 +163,12 @@ PROLOGUE(nettle_sha3_permute)
  
         SWAP64  C34, C34                C Holds C4, C3
         movdqa  C12, D34
-       MOVQ    C0, D12
+       MOVQ(C0, D12)
         punpcklqdq      C12, D12        C Holds C0, C1
         punpckhqdq      C34, D34        C Holds C2, C3
         punpcklqdq      D12, C34        C Holds C4, C0
-       MOVQ    C34, D0
-       MOVQ    C12, T0
+       MOVQ(C34, D0)
+       MOVQ(C12, T0)
         rolq    $1, T0
         xorq    T0, D0
  
@@ -240,8 +252,8 @@ PROLOGUE(nettle_sha3_permute)
         C   `-_________-^`-^
         
         rolq    $36, A05
-       MOVQ    A05, W0
-       MOVQ    A0607, A05
+       MOVQ(A05, W0)
+       MOVQ(A0607, A05)
         rolq    $44, A05                C Done A05
         ROTL64(6, A0607, W1)
         por     A0607, W1
@@ -264,8 +276,8 @@ PROLOGUE(nettle_sha3_permute)
  
         rolq    $42, A10                C 42 + 25 = 3 (mod 64)
         SWAP64  A1112, W0
-       MOVQ    A10, A1112
-       MOVQ    W0, A10
+       MOVQ(A10, A1112)
+       MOVQ(W0, A10)
         rolq    $43, A10                C Done A10
  
         punpcklqdq      A1314, A1112
@@ -289,8 +301,8 @@ PROLOGUE(nettle_sha3_permute)
  
         SWAP64  A1819, W0
         rolq    $41, A15
-       MOVQ    A15, W1
-       MOVQ    A1819, A15
+       MOVQ(A15, W1)
+       MOVQ(A1819, A15)
         rolq    $21, A15                C Done A15
         SWAP64  A1617, A1819
         ROTL64(45, A1617, W2)
@@ -312,7 +324,7 @@ PROLOGUE(nettle_sha3_permute)
         C    \_______/
  
         rolq    $18, A20
-       MOVQ    A20, W0
+       MOVQ(A20, W0)
         SWAP64  A2324, W1
         movd    W1, A20
         rolq    $14, A20                C Done A20
@@ -390,21 +402,21 @@ PROLOGUE(nettle_sha3_permute)
         C Swap (A05, A10) <->  A0102, and (A15, A20) <->  A0304,
         C and also copy to C12 and C34 while at it.
         
-       MOVQ    A05, C12
-       MOVQ    A15, C34
-       MOVQ    A10, W0
-       MOVQ    A20, W1
+       MOVQ(A05, C12)
+       MOVQ(A15, C34)
+       MOVQ(A10, W0)
+       MOVQ(A20, W1)
         movq    A00, C0
         punpcklqdq      W0, C12
         punpcklqdq      W1, C34
-       MOVQ    A0102, A05
-       MOVQ    A0304, A15
+       MOVQ(A0102, A05)
+       MOVQ(A0304, A15)
         psrldq  $8, A0102
         psrldq  $8, A0304
         xorq    A05, C0
         xorq    A15, C0
-       MOVQ    A0102, A10
-       MOVQ    A0304, A20
+       MOVQ(A0102, A10)
+       MOVQ(A0304, A20)
  
         movdqa  C12, A0102
         movdqa  C34, A0304
author	Niels Möller <nisse@lysator.liu.se>
	Thu, 11 Apr 2013 06:28:16 +0000 (08:28 +0200)
committer	Niels Möller <nisse@lysator.liu.se>
	Thu, 11 Apr 2013 06:28:16 +0000 (08:28 +0200)
ChangeLog		patch | blob | blame | history
x86_64/sha3-permute.asm		patch | blob | blame | history