From: Niels Möller
Date: Thu, 13 Dec 2012 08:53:22 +0000 (+0100)
Subject: Rewrote x86_64 sha3-permute.asm.
X-Git-Tag: nettle_2.6_release_20130116~15
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=588017df4e98257211876479e0cbf11a8efebc87;p=thirdparty%2Fnettle.git

Rewrote x86_64 sha3-permute.asm.
---

diff --git a/ChangeLog b/ChangeLog
index c9270851..4db6a8cf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2012-12-13  Niels Möller
+
+	* x86_64/sha3-permute.asm: Rewrote, to keep all state in
+	registers. 2400 cycles on x86_64, only slightly faster than the
+	current C code.
+
 2012-12-09  Niels Möller
 
 	* sha3-permute.c (sha3_permute): Rewrote to do permutation in
diff --git a/x86_64/sha3-permute.asm b/x86_64/sha3-permute.asm
index 2f742331..34cf6bcc 100644
--- a/x86_64/sha3-permute.asm
+++ b/x86_64/sha3-permute.asm
@@ -20,41 +20,59 @@ C MA 02111-1301, USA.
 define(, <%rdi>)	C 25 64-bit values, 200 bytes.
 define(<COUNT>, <%r8>)	C Avoid clobbering %rsi, for W64.
-define(<C01>, <%xmm0>)
-define(<C23>, <%xmm1>)
-define(<C4>, <%rdx>)
+define(<A00>, <%rax>)
+define(<A0102>, <%xmm0>)
+define(<A0304>, <%xmm1>)
-define(<T01>, <%xmm2>)
-define(<T23>, <%xmm3>)
-define(<T4>, <%r9>)
-define(<D12>, <%xmm4>)
-define(<D34>, <%xmm5>)
-define(<D0>, <%r10>)
-define(, <%xmm6>)
-define(, <%xmm7>)
+define(<A05>, <%rcx>)
+define(<A0607>, <%xmm2>)
+define(<A0809>, <%xmm3>)
+
+define(<A10>, <%rdx>)
+define(<A1112>, <%xmm4>)
+define(<A1314>, <%xmm5>)
-define(<RC_END>, <%r11>)
+define(<A15>, <%rbp>)
+define(<A1617>, <%xmm6>)
+define(<A1819>, <%xmm7>)
+
+define(<A20>, <%r9>)
+define(<A2122>, <%xmm8>)
+define(<A2324>, <%xmm9>)
-define(<FRAME_SIZE>, <200>)
+define(<C0>, <%r10>)
+define(<C12>, <%xmm10>)
+define(<C34>, <%xmm11>)
-define(, )
-define(, )
-define(, )
+define(<D0>, <%r11>)
+define(<D12>, <%xmm12>)
+define(<D34>, <%xmm13>)
-	C FIXME: Possible optimizations.
+C Wide temporaries
+define(<W0>, <%xmm14>)
+define(<W1>, <%xmm15>)
+define(<W2>, <%xmm12>)	C Overlap D12
+define(<W3>, <%xmm13>)	C Overlap D34
-	C * Compute the parity vector C at the end of the chi step.
-	C   This avoids one pass over the data.
-
-	C * Micro optimizations with register use and scheduling.
+define(<T0>, <%r12>)
+define(<T1>, <%r13>)
+define(<T2>, <%r11>)	C Overlap D0
+define(<T3>, <%r10>)	C Overlap C0
+
+define(<RC>, <%r14>)
+
+define(, )
+define(, )
-	C * Try different order during the permutation step, maybe
-	C   doing sequential writes rather than sequential reads.
+define(<SWAP64>, <pshufd	<$>0x4e,>)
-	C * Try to do the permutation and the chi step, without
-	C   storing intermediate values? That would reducing the
-	C   number of passes over the data. We still need a copy, but
-	C   we would let the theta step produce that copy.
+C ROTL64(rot, register, temp)
+C Caller needs to or together the result.
+define(<ROTL64>, <
+	movdqa	$2, $3
+	psllq	<$>$1, $2
+	psrlq	<$>eval(64-$1), $3
+>)
 	.file "sha3-permute.asm"
@@ -62,351 +80,416 @@ define(, )
 	.text
 	ALIGN(4)
 PROLOGUE(nettle_sha3_permute)
-	W64_ENTRY(1, 8)
-	subq	$FRAME_SIZE, %rsp
-	movl	$24, XREG(COUNT)
-	negq	COUNT
+	W64_ENTRY(1, 16)
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
-	lea	.rc_end(%rip), RC_END
+	movl	$24, XREG(COUNT)
+	lea	.rc-8(%rip), RC
+	movq	STATE(0), A00
+	movups	STATE(1), A0102
+	movups	STATE(3), A0304
+	movq	A00, C0
+
+	movq	STATE(5), A05
+	movdqa	A0102, C12
+	movups	STATE(6), A0607
+	movdqa	A0304, C34
+	movups	STATE(8), A0809
+	xorq	A05, C0
+
+	movq	STATE(10), A10
+	pxor	A0607, C12
+	movups	STATE(11), A1112
+	pxor	A0809, C34
+	movups	STATE(13), A1314
+	xorq	A10, C0
+
+	movq	STATE(15), A15
+	pxor	A1112, C12
+	movups	STATE(16), A1617
+	pxor	A1314, C34
+	movups	STATE(18), A1819
+	xorq	A15, C0
+
+	movq	STATE(20), A20
+	pxor	A1617, C12
+	movups	STATE(21), A2122
+	pxor	A1819, C34
+	movups	STATE(23), A2324
+	xorq	A20, C0
+	pxor	A2122, C12
+	pxor	A2324, C34
+	ALIGN(4)
 .Loop:
-	C theta step
-	C Compute parity vector C[0,...,4].
-	movups	A(0), C01
-	movups	A(2), C23
-	movq	A(4), C4
-
-	movups	A(5), T01
-	movups	A(7), T23
-	xorq	A(9), C4	C C[4] ^= A[9]
-
-	pxor	T01, C01	C C[0,1] ^= A[5,6]
-	movups	A(10), T01
-	pxor	T23, C23	C C[2,3] ^= A[7,8]
-	movups	A(12), T23
-	xorq	A(14), C4	C C[4] ^= A[14]
-
-	pxor	T01, C01	C C[0,1] ^= A[10,11]
-	movups	A(15), T01
-	pxor	T23, C23	C C[2,3] ^= A[12,13]
-	movups	A(17), T23
-	xorq	A(19), C4	C C[4] ^= A[19]
-
-	pxor	T01, C01	C C[0,1] ^= A[15,16]
-	movups	A(20), T01
-	pxor	T23, C23	C C[2,3] ^= A[17,18]
-	movups	A(22), T23
-	xorq	A(24), C4	C C[4] ^= A[24]
-
-	pxor	T01, C01	C C[0,1] ^= A[20,21]
-	pxor	T23, C23	C C[2,3] ^= A[22,23]
-
-	C Combine parity bits:
-	C D[0] = C[4] ^ ROTL64(1, C[1])
-	C D[1,2] = C[0,1] ^ ROTL64(1, C[2,3])
-	C D[3,4] = C[2,3] ^ ROTL64(1, C[4,0])
+	C The theta step. Combine parity bits, then xor to state.
+	C D0 = C4 ^ (C1 <<< 1)
+	C D1 = C0 ^ (C2 <<< 1)
+	C D2 = C1 ^ (C3 <<< 1)
+	C D3 = C2 ^ (C4 <<< 1)
+	C D4 = C3 ^ (C0 <<< 1)
+
+	C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
+	C D34, and (C4, C0) in C34.
-	C Copy to D0, D12, D34, rotate original
-	movdqa	C01, D12
-	movdqa	C23, D34
-	movdqa	C01, T01
-	movdqa	C23, T23
-	psllq	$1, T01
-	psllq	$1, T23
-	psrlq	$63, C01
-	psrlq	$63, C23
-	movq	C4, D0
-	rolq	$1, C4
-	por	T01, C01
-	por	T23, C23
-
-	C Move around, putting
-	C T4 <-- ROTL(1,C1), T40 <-- ROTL(1,C[4,0])
-	movq	C4, T40
-	punpcklqdq	C01, T40
-	psrldq	$8, C01
-	movd	C01, T4		C Really a movq!
-
-	pxor	C23, D12
-	xorq	T4, D0
-	pxor	T40, D34
-
-	C xor D on top of state
-	xorq	D0, A(0)
-	movups	A(1), T01
-	movups	A(3), T23
-	pxor	D12, T01
-	pxor	D34, T23
-	movups	T01, A(1)
-	movups	T23, A(3)
+	C Notes on "unpack" instructions:
+	C   punpckhqdq 01, 23 gives 31
+	C   punpcklqdq 01, 23 gives 20
+
+	SWAP64	C34, C34	C Holds C4, C3
+	movdqa	C12, D34
+	movq	C0, D12
+	punpcklqdq	C12, D12	C Holds C0, C1
+	punpckhqdq	C34, D34	C Holds C2, C3
+	punpcklqdq	D12, C34	C Holds C4, C0
+	movq	C34, D0
+	movq	C12, T0
+	rolq	$1, T0
+	xorq	T0, D0
+
+	C Can use C12 as temporary
+	movdqa	D34, W0
+	movdqa	D34, W1
+	psllq	$1, W0
+	psrlq	$63, W1
+	pxor	W0, D12
+	pxor	W1, D12		C Done D12
-	xorq	D0, A(5)
-	movups	A(6), T01
-	movups	A(8), T23
-	pxor	D12, T01
-	pxor	D34, T23
-	movups	T01, A(6)
-	movups	T23, A(8)
-
-	xorq	D0, A(10)
-	movups	A(11), T01
-	movups	A(13), T23
-	pxor	D12, T01
-	pxor	D34, T23
-	movups	T01, A(11)
-	movups	T23, A(13)
-
-	xorq	D0, A(15)
-	movups	A(16), T01
-	movups	A(18), T23
-	pxor	D12, T01
-	pxor	D34, T23
-	movups	T01, A(16)
-	movups	T23, A(18)
-
-	xorq	D0, A(20)
-	movups	A(21), T01
-	movups	A(23), T23
-	pxor	D12, T01
-	pxor	D34, T23
-	movups	T01, A(21)
-	movups	T23, A(23)
-
-	C rho and pi steps: Rotate and permute
-	movq	A(0), C4	C rot 0, perm 0
-	movq	A(1), T4	C rot 1, perm 10
-	movq	C4, B(0)
-	rolq	$1, T4
-	movq	A(2), C4	C rot 62, perm 20
-	movq	T4, B(10)
-	rolq	$62, C4
-	movq	A(3), T4	C rot 28, perm 5
-	movq	C4, B(20)
-	rolq	$28, T4
-	movq	A(4), C4	C rot 27, perm 15
-	movq	T4, B(5)
-	rolq	$27, C4
-	movq	A(5), T4	C rot 36, perm 16
-	movq	C4, B(15)
-	rolq	$36, T4
-	movq	A(6), C4	C rot 44, perm 1
-	movq	T4, B(16)
-	rolq	$44, C4
-	movq	A(7), T4	C rot 6, perm 11
-	movq	C4, B(1)
-	rolq	$6, T4
-	movq	A(8), C4	C rot 55, perm 21
-	movq	T4, B(11)
-	rolq	$55, C4
-	movq	A(9), T4	C rot 20, perm 6
-	movq	C4, B(21)
-	rolq	$20, T4
-	movq	A(10), C4	C rot 3, perm 7
-	movq	T4, B(6)
-	rolq	$3, C4
-	movq	A(11), T4	C rot 10, perm 17
-	movq	C4, B(7)
-	rolq	$10, T4
-	movq	A(12), C4	C rot 43, perm 2
-	movq	T4, B(17)
-	rolq	$43, C4
-	movq	A(13), T4	C rot 25, perm 12
-	movq	C4, B(2)
-	rolq	$25, T4
-	movq	A(14), C4	C rot 39, perm 22
-	movq	T4, B(12)
-	rolq	$39, C4
-	movq	A(15), T4	C rot 41, perm 23
-	movq	C4, B(22)
-	rolq	$41, T4
-	movq	A(16), C4	C rot 45, perm 8
-	movq	T4, B(23)
-	rolq	$45, C4
-	movq	A(17), T4	C rot 15, perm 18
-	movq	C4, B(8)
-	rolq	$15, T4
-	movq	A(18), C4	C rot 21, perm 3
-	movq	T4, B(18)
-	rolq	$21, C4
-	movq	A(19), T4	C rot 8, perm 13
-	movq	C4, B(3)
-	rolq	$8, T4
-	movq	A(20), C4	C rot 18, perm 14
-	movq	T4, B(13)
-	rolq	$18, C4
-	movq	A(21), T4	C rot 2, perm 24
-	movq	C4, B(14)
-	rolq	$2, T4
-	movq	A(22), C4	C rot 61, perm 9
-	movq	T4, B(24)
-	rolq	$61, C4
-	movq	A(23), T4	C rot 56, perm 19
-	movq	C4, B(9)
-	rolq	$56, T4
-	movq	A(24), C4	C rot 14, perm 4
-	movq	T4, B(19)
-	rolq	$14, C4
-	movq	C4, B(4)
-
-	C chi step
-	C Read with some overlap, pairs C01, D12, D34
-	C Then also construct pairs C23 and T40.
-
-	C We do the operations as
-	C A01 = B01 ^ (~B12 & B23)
-	C A12 = B12 ^ (~B23 & B34)
-	C A34 = B34 ^ (~B40 & B01)
-
-	C Where we store only the low 64 bits of A01, and add in the
-	C round key if applicable.
+	movdqa	C34, C12
+	psrlq	$63, C34
+	psllq	$1, C12
+	pxor	C34, D34
+	pxor	C12, D34	C Done D34
+
+	xorq	D0, A00
+	xorq	D0, A05
+	xorq	D0, A10
+	xorq	D0, A15
+	xorq	D0, A20
+	pxor	D12, A0102
+	pxor	D12, A0607
+	pxor	D12, A1112
+	pxor	D12, A1617
+	pxor	D12, A2122
+	pxor	D34, A0304
+	pxor	D34, A0809
+	pxor	D34, A1314
+	pxor	D34, A1819
+	pxor	D34, A2324
+
+	C theta step done, no C, D or W temporaries alive.
+
+	C rho and pi steps. When doing the permutations, also
+	C transpose the matrix.
-	movups	B(0), C01
-	movups	B(1), D12
-	movups	B(3), D34
-
-	pshufd	$0x4e, D34, D43
-	movdqa	D43, T40
-	punpcklqdq	C01, T40	C Get 40
-	movdqa	D12, C23
-	punpckhqdq	D43, C23	C Get 23
-
-	pandn	C01, T40
-	pxor	D34, T40
-	movups	T40, A(3)
-
-	movdqa	D12, T40
-	pandn	C23, T40
-	pxor	C01, T40
-
-	movd	T40, T4		C Really movq!
-	xorq	(RC_END, COUNT, 8), T4
-	movq	T4, A(0)
-
-	pandn	D34, C23
-	pxor	D12, C23
-	movups	C23, A(1)
-
-
-	movups	B(5), C01
-	movups	B(6), D12
-	movups	B(8), D34
-
-	pshufd	$0x4e, D34, D43
-	movdqa	D43, T40
-	punpcklqdq	C01, T40	C Get 40
-	movdqa	D12, C23
-	punpckhqdq	D43, C23	C Get 23
-
-	pandn	C01, T40
-	pxor	D34, T40
-	movups	T40, A(8)
-
-	movdqa	D12, T40
-	pandn	C23, T40
-	pxor	C01, T40
-
-	movq	T40, A(5)
-
-	pandn	D34, C23
-	pxor	D12, C23
-	movups	C23, A(6)
-
-
-	movups	B(10), C01
-	movups	B(11), D12
-	movups	B(13), D34
-
-	pshufd	$0x4e, D34, D43
-	movdqa	D43, T40
-	punpcklqdq	C01, T40	C Get 40
-	movdqa	D12, C23
-	punpckhqdq	D43, C23	C Get 23
-
-	pandn	C01, T40
-	pxor	D34, T40
-	movups	T40, A(13)
-
-	movdqa	D12, T40
-	pandn	C23, T40
-	pxor	C01, T40
-
-	movq	T40, A(10)
-
-	pandn	D34, C23
-	pxor	D12, C23
-	movups	C23, A(11)
-
-
-	movups	B(15), C01
-	movups	B(16), D12
-	movups	B(18), D34
-
-	pshufd	$0x4e, D34, D43
-	movdqa	D43, T40
-	punpcklqdq	C01, T40	C Get 40
-	movdqa	D12, C23
-	punpckhqdq	D43, C23	C Get 23
-
-	pandn	C01, T40
-	pxor	D34, T40
-	movups	T40, A(18)
-
-	movdqa	D12, T40
-	pandn	C23, T40
-	pxor	C01, T40
-
-	movq	T40, A(15)
-
-	pandn	D34, C23
-	pxor	D12, C23
-	movups	C23, A(16)
-
-
-	movups	B(20), C01
-	movups	B(21), D12
-	movups	B(23), D34
-
-	pshufd	$0x4e, D34, D43
-	movdqa	D43, T40
-	punpcklqdq	C01, T40	C Get 40
-	movdqa	D12, C23
-	punpckhqdq	D43, C23	C Get 23
-
-	pandn	C01, T40
-	pxor	D34, T40
-	movups	T40, A(23)
-
-	movdqa	D12, T40
-	pandn	C23, T40
-	pxor	C01, T40
-
-	movq	T40, A(20)
-
-	pandn	D34, C23
-	pxor	D12, C23
-	movups	C23, A(21)
-
-
-	incq	COUNT
+	C The combined permutation + transpose gives the following
+	C cycles (rotation counts in parenthesis)
+	C 0  <- 0(0)
+	C 1  <- 3(28) <- 4(27) <- 2(62) <- 1(1)
+	C 5  <- 6(44) <- 9(20) <- 8(55) <- 5(36)
+	C 7  <- 7(6)
+	C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
+	C 14 <- 14(39)
+	C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
+	C 16 <- 16(45)
+	C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
+	C 23 <- 23(56)
+
+	C Do the 1,2,3,4 row. First rotate, then permute.
+	movdqa	A0102, W0
+	movdqa	A0102, W1
+	movdqa	A0102, W2
+	psllq	$1, A0102
+	psrlq	$63, W0
+	psllq	$62, W1
+	por	A0102, W0	C rotl 1 (A01)
+	psrlq	$2, W2
+	por	W1, W2		C rotl 62 (A02)
+
+	movdqa	A0304, A0102
+	movdqa	A0304, W1
+	psllq	$28, A0102
+	psrlq	$36, W1
+	por	W1, A0102	C rotl 28 (A03)
+	movdqa	A0304, W1
+	psllq	$27, A0304
+	psrlq	$37, W1
+	por	W1, A0304	C rotl 27 (A04)
+
+	punpcklqdq	W0, A0102
+	punpckhqdq	W2, A0304
+
+	C 5  <- 6(44) <- 9(20) <- 8(55) <- 5(36)
+	C 7  <- 7(6)
+	C  __ _______
+	C  _ L' ` L_ __`
+	C  |5| |6|7| |8|9|
+	C  `-_________-^`-^
+
+	rolq	$36, A05
+	movq	A05, W0
+	movq	A0607, A05
+	rolq	$44, A05	C Done A05
+	ROTL64(6, A0607, W1)
+	por	A0607, W1
+	movdqa	A0809, A0607
+	ROTL64(20, A0607, W2)
+	por	W2, A0607
+	punpckhqdq	W1, A0607	C Done A0607
+	ROTL64(55, A0809, W1)
+	por	A0809, W1
+	movdqa	W0, A0809
+	punpcklqdq	W1, A0809	C Done 0809
+
+	C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
+	C 14 <- 14(39)
+	C  _____ ___
+	C  __L' __`_L_ `_____
+	C  |10| |11|12| |13|14|
+	C  `-___-^`-______-^
+	C
+
+	rolq	$42, A10	C 42 + 25 = 3 (mod 64)
+	SWAP64	A1112, W0
+	movq	A10, A1112
+	movq	W0, A10
+	rolq	$43, A10	C Done A10
+
+	punpcklqdq	A1314, A1112
+	ROTL64(25, A1112, W1)
+	por	W1, A1112	C Done A1112
+	ROTL64(39, A1314, W2)
+	por	A1314, W2
+	ROTL64(10, W0, A1314)
+	por	W0, A1314
+	punpckhqdq	W2, A1314	C Done A1314
+
+
+	C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
+	C 16 <- 16(45)
+	C  _____________
+	C  / _______
+	C  _L' ____L' | `_
+	C  |15| |16|17| |18|19|
+	C  \ `_____-^ ^
+	C  \_________________/
+
+	SWAP64	A1819, W0
+	rolq	$41, A15
+	movq	A15, W1
+	movq	A1819, A15
+	rolq	$21, A15	C Done A15
+	SWAP64	A1617, A1819
+	ROTL64(45, A1617, W2)
+	por	W2, A1617
+	ROTL64(8, W0, W3)
+	por	W3, W0
+	punpcklqdq	W0, A1617	C Done A1617
+	ROTL64(15, A1819, W2)
+	por	W2, A1819
+	punpcklqdq	W1, A1819	C Done A1819
+
+	C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
+	C 23 <- 23(56)
+	C  _______________
+	C  / \
+	C  _L' _L'\_ ___`_
+	C  |20| |21|22| |23|24|
+	C  \ `__ ^________-^
+	C  \_______/
+
+	rolq	$18, A20
+	movq	A20, W0
+	SWAP64	A2324, W1
+	movd	W1, A20
+	rolq	$14, A20	C Done A20
+	ROTL64(56, A2324, W1)
+	por	W1, A2324
+
+	movdqa	A2122, W2
+	ROTL64(2, W2, W1)
+	por	W1, W2
+	punpcklqdq	W2, A2324	C Done A2324
+
+	ROTL64(61, A2122, W1)
+	por	W1, A2122
+	psrldq	$8, A2122
+	punpcklqdq	W0, A2122	C Done A2122
+
+	C chi step. With the transposed matrix, applied independently
+	C to each column.
+	movq	A05, T0
+	notq	T0
+	andq	A10, T0
+	movq	A10, T1
+	notq	T1
+	andq	A15, T1
+	movq	A15, T2
+	notq	T2
+	andq	A20, T2
+	xorq	T2, A10
+	movq	A20, T3
+	notq	T3
+	andq	A00, T3
+	xorq	T3, A15
+	movq	A00, T2
+	notq	T2
+	andq	A05, T2
+	xorq	T2, A20
+	xorq	T0, A00
+	xorq	T1, A05
+
+	movdqa	A0607, W0
+	pandn	A1112, W0
+	movdqa	A1112, W1
+	pandn	A1617, W1
+	movdqa	A1617, W2
+	pandn	A2122, W2
+	pxor	W2, A1112
+	movdqa	A2122, W3
+	pandn	A0102, W3
+	pxor	W3, A1617
+	movdqa	A0102, W2
+	pandn	A0607, W2
+	pxor	W2, A2122
+	pxor	W0, A0102
+	pxor	W1, A0607
+
+	movdqa	A0809, W0
+	pandn	A1314, W0
+	movdqa	A1314, W1
+	pandn	A1819, W1
+	movdqa	A1819, W2
+	pandn	A2324, W2
+	pxor	W2, A1314
+	movdqa	A2324, W3
+	pandn	A0304, W3
+	pxor	W3, A1819
+	movdqa	A0304, W2
+	pandn	A0809, W2
+	pxor	W2, A2324
+	pxor	W0, A0304
+	pxor	W1, A0809
+
+	xorq	(RC, COUNT, 8), A00
+
+	C Transpose.
+	C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
+	C and also copy to C12 and C34 while at it.
+
+	movq	A05, C12
+	movq	A15, C34
+	movq	A10, W0
+	movq	A20, W1
+	movq	A00, C0
+	punpcklqdq	W0, C12
+	punpcklqdq	W1, C34
+	movq	A0102, A05
+	movq	A0304, A15
+	psrldq	$8, A0102
+	psrldq	$8, A0304
+	xorq	A05, C0
+	xorq	A15, C0
+	movq	A0102, A10
+	movq	A0304, A20
+
+	movdqa	C12, A0102
+	movdqa	C34, A0304
+
+	C Transpose (A0607, A1112)
+	movdqa	A0607, W0
+	punpcklqdq	A1112, A0607
+	xorq	A10, C0
+	xorq	A20, C0
+	punpckhqdq	W0, A1112
+	SWAP64	A1112, A1112
+
+	C Transpose (A1819, A2324)
+	movdqa	A1819, W0
+	punpcklqdq	A2324, A1819
+	pxor	A0607, C12
+	pxor	A1112, C12
+	punpckhqdq	W0, A2324
+	SWAP64	A2324, A2324
+
+	C Transpose (A0809, A1314) and (A1617, A2122), and swap
+	movdqa	A0809, W0
+	movdqa	A1314, W1
+	movdqa	A1617, A0809
+	movdqa	A2122, A1314
+	pxor	A1819, C34
+	pxor	A2324, C34
+	punpcklqdq	A2122, A0809
+	punpckhqdq	A1617, A1314
+	SWAP64	A1314, A1314
+	movdqa	W0, A1617
+	movdqa	W1, A2122
+	pxor	A0809, C34
+	pxor	A1314, C34
+	punpcklqdq	W1, A1617
+	punpckhqdq	W0, A2122
+	SWAP64	A2122, A2122
+
+	decl	XREG(COUNT)
+	pxor	A1617, C12
+	pxor	A2122, C12
 	jnz	.Loop
-	addq	$FRAME_SIZE, %rsp
-	W64_EXIT(1, 8)
+	movq	A00, STATE(0)
+	movups	A0102, STATE(1)
+	movups	A0304, STATE(3)
+
+	movq	A05, STATE(5)
+	movups	A0607, STATE(6)
+	movups	A0809, STATE(8)
+
+	movq	A10, STATE(10)
+	movups	A1112, STATE(11)
+	movups	A1314, STATE(13)
+
+	movq	A15, STATE(15)
+	movups	A1617, STATE(16)
+	movups	A1819, STATE(18)
+
+	movq	A20, STATE(20)
+	movups	A2122, STATE(21)
+	movups	A2324, STATE(23)
+
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	W64_EXIT(1, 16)
 	ret
 EPILOGUE(nettle_sha3_permute)
 	ALIGN(4)
-	.quad	0x0000000000000001, 0X0000000000008082
-	.quad	0X800000000000808A, 0X8000000080008000
-	.quad	0X000000000000808B, 0X0000000080000001
-	.quad	0X8000000080008081, 0X8000000000008009
-	.quad	0X000000000000008A, 0X0000000000000088
-	.quad	0X0000000080008009, 0X000000008000000A
-	.quad	0X000000008000808B, 0X800000000000008B
-	.quad	0X8000000000008089, 0X8000000000008003
-	.quad	0X8000000000008002, 0X8000000000000080
-	.quad	0X000000000000800A, 0X800000008000000A
-	.quad	0X8000000080008081, 0X8000000000008080
-	.quad	0X0000000080000001, 0X8000000080008008
-.rc_end:
+.rc:	C In reverse order
+	.quad	0x8000000080008008
+	.quad	0x0000000080000001
+	.quad	0x8000000000008080
+	.quad	0x8000000080008081
+	.quad	0x800000008000000A
+	.quad	0x000000000000800A
+	.quad	0x8000000000000080
+	.quad	0x8000000000008002
+	.quad	0x8000000000008003
+	.quad	0x8000000000008089
+	.quad	0x800000000000008B
+	.quad	0x000000008000808B
+	.quad	0x000000008000000A
+	.quad	0x0000000080008009
+	.quad	0x0000000000000088
+	.quad	0x000000000000008A
+	.quad	0x8000000000008009
+	.quad	0x8000000080008081
+	.quad	0x0000000080000001
+	.quad	0x000000000000808B
+	.quad	0x8000000080008000
+	.quad	0x800000000000808A
+	.quad	0x0000000000008082
+	.quad	0x0000000000000001
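
For readers following the theta comments in the new code (D0 = C4 ^ (C1 <<< 1) and so on), here is a minimal C sketch of that step on a flat uint64_t[25] state. The names theta and rotl64 are illustrative only, not nettle's API.

    #include <stdint.h>

    static uint64_t
    rotl64(uint64_t x, unsigned n)
    {
      return n ? (x << n) | (x >> (64 - n)) : x;
    }

    /* Column parities C[0..4], combined into D[0..4], xored onto every row. */
    static void
    theta(uint64_t A[25])
    {
      uint64_t C[5], D[5];
      unsigned x, y;

      for (x = 0; x < 5; x++)
        C[x] = A[x] ^ A[x+5] ^ A[x+10] ^ A[x+15] ^ A[x+20];

      /* D[x] = C[x-1] ^ (C[x+1] <<< 1), matching the D0..D4 comments. */
      for (x = 0; x < 5; x++)
        D[x] = C[(x+4) % 5] ^ rotl64(C[(x+1) % 5], 1);

      for (y = 0; y < 25; y += 5)
        for (x = 0; x < 5; x++)
          A[y + x] ^= D[x];
    }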
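The rho and pi steps are listed lane by lane in the removed code ("rot r, perm p" comments). A C sketch of the same tables, with the flat index i = x + 5*y; the names rho_pi, RHO and PI are made up for illustration, and the rotation counts and destinations are copied from those comments.

    #include <stdint.h>

    static const unsigned RHO[25] = {
       0,  1, 62, 28, 27,
      36, 44,  6, 55, 20,
       3, 10, 43, 25, 39,
      41, 45, 15, 21,  8,
      18,  2, 61, 56, 14,
    };
    static const unsigned PI[25] = {
       0, 10, 20,  5, 15,
      16,  1, 11, 21,  6,
       7, 17,  2, 12, 22,
      23,  8, 18,  3, 13,
      14, 24,  9, 19,  4,
    };

    /* B[PI[i]] = A[i] <<< RHO[i].  The new assembly does the same rotations
       but also transposes the matrix, giving the cycles listed in its
       comments. */
    static void
    rho_pi(const uint64_t A[25], uint64_t B[25])
    {
      unsigned i;
      for (i = 0; i < 25; i++)
        B[PI[i]] = RHO[i] ? (A[i] << RHO[i]) | (A[i] >> (64 - RHO[i])) : A[i];
    }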
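The chi comments in the removed code ("A01 = B01 ^ (~B12 & B23)", etc.) combine each lane with the next two lanes of its row of five; the round constant is then xored into lane 0, which the new code does with xorq (RC, COUNT, 8), A00, reading the reversed .rc table while COUNT counts down from 24. A C sketch, with the hypothetical names chi_iota and rc:

    #include <stdint.h>

    static void
    chi_iota(uint64_t A[25], const uint64_t B[25], uint64_t rc)
    {
      unsigned x, y;

      /* Within each row of five lanes: A[i] = B[i] ^ (~B[i+1] & B[i+2]),
         indices wrapping around inside the row. */
      for (y = 0; y < 25; y += 5)
        for (x = 0; x < 5; x++)
          A[y + x] = B[y + x] ^ (~B[y + (x + 1) % 5] & B[y + (x + 2) % 5]);

      /* iota: round constant into lane 0. */
      A[0] ^= rc;
    }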
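The ROTL64 macro produces the two shifted halves with psllq/psrlq and, as its comment says, leaves the final por to the caller; SWAP64 is a pshufd with immediate 0x4e, swapping the two 64-bit lanes. Roughly the same operations written with SSE2 intrinsics, as an illustration only; the helper names are invented, the rotation count must be 1..63, and here the two halves are or:ed together directly.

    #include <emmintrin.h>

    /* Rotate both 64-bit lanes left by n (1 <= n <= 63). */
    static __m128i
    rotl64x2(__m128i x, int n)
    {
      return _mm_or_si128(_mm_slli_epi64(x, n), _mm_srli_epi64(x, 64 - n));
    }

    /* Swap the two 64-bit lanes, like pshufd $0x4e. */
    static __m128i
    swap64(__m128i x)
    {
      return _mm_shuffle_epi32(x, 0x4e);
    }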