From: Niels Möller Date: Thu, 11 Apr 2013 06:28:16 +0000 (+0200) Subject: x86_64 sha3: Go via memory for moves between general registers and xmm registers. X-Git-Tag: nettle_2.7_release_20130424~64 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fb709927da0c6088e80cf21fd988542e5e711866;p=thirdparty%2Fnettle.git x86_64 sha3: Go via memory for moves between general registers and xmm registers. --- diff --git a/ChangeLog b/ChangeLog index 09a744f8..6aab71e6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2013-04-11 Niels Möller + + * x86_64/sha3-permute.asm: Go via memory for moves between general + registers and xmm registers. + 2013-04-06 Niels Möller From Edgar E. Iglesias: diff --git a/x86_64/sha3-permute.asm b/x86_64/sha3-permute.asm index 360a1f44..f58d787c 100644 --- a/x86_64/sha3-permute.asm +++ b/x86_64/sha3-permute.asm @@ -72,9 +72,21 @@ define(, ) define(<SWAP64>, <pshufd <$>0x4e,>) +define(<DIRECT_MOVQ>, <no>) + +C MOVQ(src, dst), for moves between a general register and an xmm +C register. + +ifelse(DIRECT_MOVQ, yes, < C movq calls that are equal to the corresponding movd, C where the Apple assembler requires them to be written as movd. -define(<MOVQ>, <movd>) +define(<MOVQ>, <movd $1, $2>) +>, < +C Moving via (cached) memory is generally faster. +define(<MOVQ>, < + movq $1, (CTX) + movq (CTX), $2 +>)>) C ROTL64(rot, register, temp) C Caller needs to or together the result. 
@@ -151,12 +163,12 @@ PROLOGUE(nettle_sha3_permute) SWAP64 C34, C34 C Holds C4, C3 movdqa C12, D34 - MOVQ C0, D12 + MOVQ(C0, D12) punpcklqdq C12, D12 C Holds C0, C1 punpckhqdq C34, D34 C Holds C2, C3 punpcklqdq D12, C34 C Holds C4, C0 - MOVQ C34, D0 - MOVQ C12, T0 + MOVQ(C34, D0) + MOVQ(C12, T0) rolq $1, T0 xorq T0, D0 @@ -240,8 +252,8 @@ PROLOGUE(nettle_sha3_permute) C `-_________-^`-^ rolq $36, A05 - MOVQ A05, W0 - MOVQ A0607, A05 + MOVQ(A05, W0) + MOVQ(A0607, A05) rolq $44, A05 C Done A05 ROTL64(6, A0607, W1) por A0607, W1 @@ -264,8 +276,8 @@ PROLOGUE(nettle_sha3_permute) rolq $42, A10 C 42 + 25 = 3 (mod 64) SWAP64 A1112, W0 - MOVQ A10, A1112 - MOVQ W0, A10 + MOVQ(A10, A1112) + MOVQ(W0, A10) rolq $43, A10 C Done A10 punpcklqdq A1314, A1112 @@ -289,8 +301,8 @@ PROLOGUE(nettle_sha3_permute) SWAP64 A1819, W0 rolq $41, A15 - MOVQ A15, W1 - MOVQ A1819, A15 + MOVQ(A15, W1) + MOVQ(A1819, A15) rolq $21, A15 C Done A15 SWAP64 A1617, A1819 ROTL64(45, A1617, W2) @@ -312,7 +324,7 @@ PROLOGUE(nettle_sha3_permute) C \_______/ rolq $18, A20 - MOVQ A20, W0 + MOVQ(A20, W0) SWAP64 A2324, W1 movd W1, A20 rolq $14, A20 C Done A20 @@ -390,21 +402,21 @@ PROLOGUE(nettle_sha3_permute) C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304, C and also copy to C12 and C34 while at it. - MOVQ A05, C12 - MOVQ A15, C34 - MOVQ A10, W0 - MOVQ A20, W1 + MOVQ(A05, C12) + MOVQ(A15, C34) + MOVQ(A10, W0) + MOVQ(A20, W1) movq A00, C0 punpcklqdq W0, C12 punpcklqdq W1, C34 - MOVQ A0102, A05 - MOVQ A0304, A15 + MOVQ(A0102, A05) + MOVQ(A0304, A15) psrldq $8, A0102 psrldq $8, A0304 xorq A05, C0 xorq A15, C0 - MOVQ A0102, A10 - MOVQ A0304, A20 + MOVQ(A0102, A10) + MOVQ(A0304, A20) movdqa C12, A0102 movdqa C34, A0304