+2013-04-11 Niels Möller <nisse@lysator.liu.se>
+
+ * x86_64/sha3-permute.asm: Go via memory for moves between general
+ registers and xmm registers.
+
2013-04-06 Niels Möller <nisse@lysator.liu.se>
From Edgar E. Iglesias:
define(<SWAP64>, <pshufd <$>0x4e,>)
+define(<DIRECT_MOVQ>, <no>)
+
+C MOVQ(src, dst), for moves between a general register and an xmm
+C register.
+C
+C The expansion is selected by DIRECT_MOVQ. When it is yes, MOVQ
+C expands to a single movd from src to dst. Otherwise it expands to
+C a movq storing src to (CTX) followed by a movq loading dst from
+C (CTX). The second form overwrites the quadword stored at (CTX), so
+C callers must not rely on that memory being preserved across MOVQ.
+
+ifelse(DIRECT_MOVQ, yes, <
C movq calls that are equal to the corresponding movd,
C where the Apple assembler requires them to be written as movd.
-define(<MOVQ>, <movd>)
+define(<MOVQ>, <movd $1, $2>)
+>, <
+C Moving via (cached) memory is generally faster.
+define(<MOVQ>, <
+ movq $1, (CTX)
+ movq (CTX), $2
+>)>)
C ROTL64(rot, register, temp)
C Caller needs to or together the result.
SWAP64 C34, C34 C Holds C4, C3
movdqa C12, D34
- MOVQ C0, D12
+ MOVQ(C0, D12)
punpcklqdq C12, D12 C Holds C0, C1
punpckhqdq C34, D34 C Holds C2, C3
punpcklqdq D12, C34 C Holds C4, C0
- MOVQ C34, D0
- MOVQ C12, T0
+ MOVQ(C34, D0)
+ MOVQ(C12, T0)
rolq $1, T0
xorq T0, D0
C `-_________-^`-^
rolq $36, A05
- MOVQ A05, W0
- MOVQ A0607, A05
+ MOVQ(A05, W0)
+ MOVQ(A0607, A05)
rolq $44, A05 C Done A05
ROTL64(6, A0607, W1)
por A0607, W1
rolq $42, A10 C 42 + 25 = 3 (mod 64)
SWAP64 A1112, W0
- MOVQ A10, A1112
- MOVQ W0, A10
+ MOVQ(A10, A1112)
+ MOVQ(W0, A10)
rolq $43, A10 C Done A10
punpcklqdq A1314, A1112
SWAP64 A1819, W0
rolq $41, A15
- MOVQ A15, W1
- MOVQ A1819, A15
+ MOVQ(A15, W1)
+ MOVQ(A1819, A15)
rolq $21, A15 C Done A15
SWAP64 A1617, A1819
ROTL64(45, A1617, W2)
C \_______/
rolq $18, A20
- MOVQ A20, W0
+ MOVQ(A20, W0)
SWAP64 A2324, W1
movd W1, A20
rolq $14, A20 C Done A20
C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
C and also copy to C12 and C34 while at it.
- MOVQ A05, C12
- MOVQ A15, C34
- MOVQ A10, W0
- MOVQ A20, W1
+ MOVQ(A05, C12)
+ MOVQ(A15, C34)
+ MOVQ(A10, W0)
+ MOVQ(A20, W1)
movq A00, C0
punpcklqdq W0, C12
punpcklqdq W1, C34
- MOVQ A0102, A05
- MOVQ A0304, A15
+ MOVQ(A0102, A05)
+ MOVQ(A0304, A15)
psrldq $8, A0102
psrldq $8, A0304
xorq A05, C0
xorq A15, C0
- MOVQ A0102, A10
- MOVQ A0304, A20
+ MOVQ(A0102, A10)
+ MOVQ(A0304, A20)
movdqa C12, A0102
movdqa C34, A0304