From: Niels Möller <nisse@lysator.liu.se>
Date: Thu, 11 Apr 2013 06:28:16 +0000 (+0200)
Subject: x86_64 sha3: Go via memory for moves between general registers and xmm registers.
X-Git-Tag: nettle_2.7_release_20130424~64
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fb709927da0c6088e80cf21fd988542e5e711866;p=thirdparty%2Fnettle.git

x86_64 sha3: Go via memory for moves between general registers and xmm registers.
---

diff --git a/ChangeLog b/ChangeLog
index 09a744f8..6aab71e6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2013-04-11  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/sha3-permute.asm: Go via memory for moves between general
+	registers and xmm registers.
+
 2013-04-06  Niels Möller  <nisse@lysator.liu.se>
 
 	From Edgar E. Iglesias:
diff --git a/x86_64/sha3-permute.asm b/x86_64/sha3-permute.asm
index 360a1f44..f58d787c 100644
--- a/x86_64/sha3-permute.asm
+++ b/x86_64/sha3-permute.asm
@@ -72,9 +72,21 @@ define(<STATE>, <OFFSET($1)(CTX)>)
 
 define(<SWAP64>, <pshufd	<$>0x4e,>)
 
+define(<DIRECT_MOVQ>, <no>)
+
+C MOVQ(src, dst), for moves between a general register and an xmm
+C register.
+
+ifelse(DIRECT_MOVQ, yes, <
 C movq calls that are equal to the corresponding movd,
 C where the Apple assembler requires them to be written as movd.
-define(<MOVQ>, <movd>)
+define(<MOVQ>, <movd	$1, $2>)
+>, <
+C Moving via (cached) memory is generally faster.
+define(<MOVQ>, <
+	movq	$1, (CTX)
+	movq	(CTX), $2
+>)>)
 
 C ROTL64(rot, register, temp)
 C Caller needs to or together the result.
@@ -151,12 +163,12 @@ PROLOGUE(nettle_sha3_permute)
 
 	SWAP64	C34, C34		C Holds C4, C3
 	movdqa	C12, D34
-	MOVQ	C0, D12
+	MOVQ(C0, D12)
 	punpcklqdq	C12, D12	C Holds C0, C1
 	punpckhqdq	C34, D34	C Holds C2, C3
 	punpcklqdq	D12, C34	C Holds	C4, C0
-	MOVQ	C34, D0
-	MOVQ	C12, T0
+	MOVQ(C34, D0)
+	MOVQ(C12, T0)
 	rolq	$1, T0
 	xorq	T0, D0
 
@@ -240,8 +252,8 @@ PROLOGUE(nettle_sha3_permute)
 	C   `-_________-^`-^
 	
 	rolq	$36, A05
-	MOVQ	A05, W0
-	MOVQ	A0607, A05
+	MOVQ(A05, W0)
+	MOVQ(A0607, A05)
 	rolq	$44, A05		C Done A05
 	ROTL64(6, A0607, W1)
 	por	A0607, W1
@@ -264,8 +276,8 @@ PROLOGUE(nettle_sha3_permute)
 
 	rolq	$42, A10		C 42 + 25 = 3 (mod 64)
 	SWAP64	A1112, W0
-	MOVQ	A10, A1112
-	MOVQ	W0, A10
+	MOVQ(A10, A1112)
+	MOVQ(W0, A10)
 	rolq	$43, A10		C Done A10
 
 	punpcklqdq	A1314, A1112
@@ -289,8 +301,8 @@ PROLOGUE(nettle_sha3_permute)
 
 	SWAP64	A1819, W0
 	rolq	$41, A15
-	MOVQ	A15, W1
-	MOVQ	A1819, A15
+	MOVQ(A15, W1)
+	MOVQ(A1819, A15)
 	rolq	$21, A15		C Done A15
 	SWAP64	A1617, A1819
 	ROTL64(45, A1617, W2)
@@ -312,7 +324,7 @@ PROLOGUE(nettle_sha3_permute)
 	C    \_______/
 
 	rolq	$18, A20
-	MOVQ	A20, W0
+	MOVQ(A20, W0)
 	SWAP64	A2324, W1
 	movd	W1, A20
 	rolq	$14, A20		C Done A20
@@ -390,21 +402,21 @@ PROLOGUE(nettle_sha3_permute)
 	C Swap (A05, A10) <->  A0102, and (A15, A20) <->  A0304,
 	C and also copy to C12 and C34 while at it.
 	
-	MOVQ	A05, C12
-	MOVQ	A15, C34
-	MOVQ	A10, W0
-	MOVQ	A20, W1
+	MOVQ(A05, C12)
+	MOVQ(A15, C34)
+	MOVQ(A10, W0)
+	MOVQ(A20, W1)
 	movq	A00, C0
 	punpcklqdq	W0, C12
 	punpcklqdq	W1, C34
-	MOVQ	A0102, A05
-	MOVQ	A0304, A15
+	MOVQ(A0102, A05)
+	MOVQ(A0304, A15)
 	psrldq	$8, A0102
 	psrldq	$8, A0304
 	xorq	A05, C0
 	xorq	A15, C0
-	MOVQ	A0102, A10
-	MOVQ	A0304, A20
+	MOVQ(A0102, A10)
+	MOVQ(A0304, A20)
 
 	movdqa	C12, A0102
 	movdqa	C34, A0304