define(<SE>,<%ebp>)
define(<DATA>,<%esp>)
define(<TMP>,<%edi>)
-define(<TMP2>,<%esi>) C Used by SWAP and F3
-
-define(<TMP3>, <64(%esp)>)dnl
+define(<TMP2>,<%esi>) C Used by SWAP
+define(<KVALUE>,<%esi>) C Used by rounds
+
C Constants
define(<K1VALUE>, <<$>0x5A827999>) C Rounds 0-19
define(<K2VALUE>, <<$>0x6ED9EBA1>) C Rounds 20-39
movl $1, TMP
xorl $2, TMP
xorl $3, TMP>)dnl
-C Uses TMP2
-define(<F3>, <
- movl $1, TMP2
- andl $2, TMP2
- movl $1, TMP
- orl $2, TMP
- andl $3, TMP
- orl TMP2, TMP>)dnl
C The form of one sha1 round is
C
C e += a <<< 5 + f( b, c, d ) + k + w;
C b <<<= 30
C
-C ROUND(a,b,c,d,e,f,k,w)
+C ROUND(a,b,c,d,e,f,w)
define(<ROUND>, <
- addl $7, $5
- addl ifelse($8,,TMP,$8), $5
+ addl KVALUE, $5
+ addl ifelse($7,,TMP,$7), $5
$6($2,$3,$4)
addl TMP, $5
movl $1, TMP
roll <$>5, TMP
addl TMP, $5
+ C roll <$>5, $1
+ C addl $1, $5
+ C rorl <$>5, $1
+ roll <$>30, $2>)dnl
+
+C As suggested by George Spelvin, write the F3 function as
+C (x&y) | (y&z) | (x&z) == (x & (y^z)) + (y&z). Then, we can compute
+C and add each term to e, using a single temporary.
+
+C ROUND_F3(a,b,c,d,e,w)
+define(<ROUND_F3>, <
+ addl KVALUE, $5
+ addl TMP, $5
+
+ movl $3, TMP
+ andl $4, TMP
+ addl TMP, $5
+ movl $3, TMP
+ xorl $4, TMP
+ andl $2, TMP
+ addl TMP, $5
+
+C Using the TMP register can be avoided, by rotating $1 in place,
+C adding, and then rotating back.
+ movl $1, TMP
+ roll <$>5, TMP
+ addl TMP, $5
+ C roll <$>5, $1
+ C addl $1, $5
+ C rorl <$>5, $1
roll <$>30, $2>)dnl
+
.file "sha1-compress.asm"
C _nettle_sha1_compress(uint32_t *state, uint8_t *data)
pushl %esi C 72(%esp)
pushl %edi C 68(%esp)
+ C FIXME: Trim to 64
subl $68, %esp C %esp = W
C Load and byteswap data
movl 12(TMP), SD
movl 16(TMP), SE
- movl K1VALUE, TMP2
- ROUND(SA, SB, SC, SD, SE, <F1>, TMP2, NOEXPAND( 0))
- ROUND(SE, SA, SB, SC, SD, <F1>, TMP2, NOEXPAND( 1))
- ROUND(SD, SE, SA, SB, SC, <F1>, TMP2, NOEXPAND( 2))
- ROUND(SC, SD, SE, SA, SB, <F1>, TMP2, NOEXPAND( 3))
- ROUND(SB, SC, SD, SE, SA, <F1>, TMP2, NOEXPAND( 4))
-
- ROUND(SA, SB, SC, SD, SE, <F1>, TMP2, NOEXPAND( 5))
- ROUND(SE, SA, SB, SC, SD, <F1>, TMP2, NOEXPAND( 6))
- ROUND(SD, SE, SA, SB, SC, <F1>, TMP2, NOEXPAND( 7))
- ROUND(SC, SD, SE, SA, SB, <F1>, TMP2, NOEXPAND( 8))
- ROUND(SB, SC, SD, SE, SA, <F1>, TMP2, NOEXPAND( 9))
-
- ROUND(SA, SB, SC, SD, SE, <F1>, TMP2, NOEXPAND(10))
- ROUND(SE, SA, SB, SC, SD, <F1>, TMP2, NOEXPAND(11))
- ROUND(SD, SE, SA, SB, SC, <F1>, TMP2, NOEXPAND(12))
- ROUND(SC, SD, SE, SA, SB, <F1>, TMP2, NOEXPAND(13))
- ROUND(SB, SC, SD, SE, SA, <F1>, TMP2, NOEXPAND(14))
-
- ROUND(SA, SB, SC, SD, SE, <F1>, TMP2, NOEXPAND(15))
- EXPAND(16) ROUND(SE, SA, SB, SC, SD, <F1>, TMP2)
- EXPAND(17) ROUND(SD, SE, SA, SB, SC, <F1>, TMP2)
- EXPAND(18) ROUND(SC, SD, SE, SA, SB, <F1>, TMP2)
- EXPAND(19) ROUND(SB, SC, SD, SE, SA, <F1>, TMP2)
+ movl K1VALUE, KVALUE
+ ROUND(SA, SB, SC, SD, SE, <F1>, NOEXPAND( 0))
+ ROUND(SE, SA, SB, SC, SD, <F1>, NOEXPAND( 1))
+ ROUND(SD, SE, SA, SB, SC, <F1>, NOEXPAND( 2))
+ ROUND(SC, SD, SE, SA, SB, <F1>, NOEXPAND( 3))
+ ROUND(SB, SC, SD, SE, SA, <F1>, NOEXPAND( 4))
+
+ ROUND(SA, SB, SC, SD, SE, <F1>, NOEXPAND( 5))
+ ROUND(SE, SA, SB, SC, SD, <F1>, NOEXPAND( 6))
+ ROUND(SD, SE, SA, SB, SC, <F1>, NOEXPAND( 7))
+ ROUND(SC, SD, SE, SA, SB, <F1>, NOEXPAND( 8))
+ ROUND(SB, SC, SD, SE, SA, <F1>, NOEXPAND( 9))
+
+ ROUND(SA, SB, SC, SD, SE, <F1>, NOEXPAND(10))
+ ROUND(SE, SA, SB, SC, SD, <F1>, NOEXPAND(11))
+ ROUND(SD, SE, SA, SB, SC, <F1>, NOEXPAND(12))
+ ROUND(SC, SD, SE, SA, SB, <F1>, NOEXPAND(13))
+ ROUND(SB, SC, SD, SE, SA, <F1>, NOEXPAND(14))
+
+ ROUND(SA, SB, SC, SD, SE, <F1>, NOEXPAND(15))
+ EXPAND(16) ROUND(SE, SA, SB, SC, SD, <F1>)
+ EXPAND(17) ROUND(SD, SE, SA, SB, SC, <F1>)
+ EXPAND(18) ROUND(SC, SD, SE, SA, SB, <F1>)
+ EXPAND(19) ROUND(SB, SC, SD, SE, SA, <F1>)
C TMP2 is free to use in these rounds
- movl K2VALUE, TMP2
- EXPAND(20) ROUND(SA, SB, SC, SD, SE, <F2>, TMP2)
- EXPAND(21) ROUND(SE, SA, SB, SC, SD, <F2>, TMP2)
- EXPAND(22) ROUND(SD, SE, SA, SB, SC, <F2>, TMP2)
- EXPAND(23) ROUND(SC, SD, SE, SA, SB, <F2>, TMP2)
- EXPAND(24) ROUND(SB, SC, SD, SE, SA, <F2>, TMP2)
-
- EXPAND(25) ROUND(SA, SB, SC, SD, SE, <F2>, TMP2)
- EXPAND(26) ROUND(SE, SA, SB, SC, SD, <F2>, TMP2)
- EXPAND(27) ROUND(SD, SE, SA, SB, SC, <F2>, TMP2)
- EXPAND(28) ROUND(SC, SD, SE, SA, SB, <F2>, TMP2)
- EXPAND(29) ROUND(SB, SC, SD, SE, SA, <F2>, TMP2)
-
- EXPAND(30) ROUND(SA, SB, SC, SD, SE, <F2>, TMP2)
- EXPAND(31) ROUND(SE, SA, SB, SC, SD, <F2>, TMP2)
- EXPAND(32) ROUND(SD, SE, SA, SB, SC, <F2>, TMP2)
- EXPAND(33) ROUND(SC, SD, SE, SA, SB, <F2>, TMP2)
- EXPAND(34) ROUND(SB, SC, SD, SE, SA, <F2>, TMP2)
-
- EXPAND(35) ROUND(SA, SB, SC, SD, SE, <F2>, TMP2)
- EXPAND(36) ROUND(SE, SA, SB, SC, SD, <F2>, TMP2)
- EXPAND(37) ROUND(SD, SE, SA, SB, SC, <F2>, TMP2)
- EXPAND(38) ROUND(SC, SD, SE, SA, SB, <F2>, TMP2)
- EXPAND(39) ROUND(SB, SC, SD, SE, SA, <F2>, TMP2)
+ movl K2VALUE, KVALUE
+ EXPAND(20) ROUND(SA, SB, SC, SD, SE, <F2>)
+ EXPAND(21) ROUND(SE, SA, SB, SC, SD, <F2>)
+ EXPAND(22) ROUND(SD, SE, SA, SB, SC, <F2>)
+ EXPAND(23) ROUND(SC, SD, SE, SA, SB, <F2>)
+ EXPAND(24) ROUND(SB, SC, SD, SE, SA, <F2>)
+
+ EXPAND(25) ROUND(SA, SB, SC, SD, SE, <F2>)
+ EXPAND(26) ROUND(SE, SA, SB, SC, SD, <F2>)
+ EXPAND(27) ROUND(SD, SE, SA, SB, SC, <F2>)
+ EXPAND(28) ROUND(SC, SD, SE, SA, SB, <F2>)
+ EXPAND(29) ROUND(SB, SC, SD, SE, SA, <F2>)
+
+ EXPAND(30) ROUND(SA, SB, SC, SD, SE, <F2>)
+ EXPAND(31) ROUND(SE, SA, SB, SC, SD, <F2>)
+ EXPAND(32) ROUND(SD, SE, SA, SB, SC, <F2>)
+ EXPAND(33) ROUND(SC, SD, SE, SA, SB, <F2>)
+ EXPAND(34) ROUND(SB, SC, SD, SE, SA, <F2>)
+
+ EXPAND(35) ROUND(SA, SB, SC, SD, SE, <F2>)
+ EXPAND(36) ROUND(SE, SA, SB, SC, SD, <F2>)
+ EXPAND(37) ROUND(SD, SE, SA, SB, SC, <F2>)
+ EXPAND(38) ROUND(SC, SD, SE, SA, SB, <F2>)
+ EXPAND(39) ROUND(SB, SC, SD, SE, SA, <F2>)
C We have to put this constant on the stack
- movl K3VALUE, TMP3
- EXPAND(40) ROUND(SA, SB, SC, SD, SE, <F3>, TMP3)
- EXPAND(41) ROUND(SE, SA, SB, SC, SD, <F3>, TMP3)
- EXPAND(42) ROUND(SD, SE, SA, SB, SC, <F3>, TMP3)
- EXPAND(43) ROUND(SC, SD, SE, SA, SB, <F3>, TMP3)
- EXPAND(44) ROUND(SB, SC, SD, SE, SA, <F3>, TMP3)
-
- EXPAND(45) ROUND(SA, SB, SC, SD, SE, <F3>, TMP3)
- EXPAND(46) ROUND(SE, SA, SB, SC, SD, <F3>, TMP3)
- EXPAND(47) ROUND(SD, SE, SA, SB, SC, <F3>, TMP3)
- EXPAND(48) ROUND(SC, SD, SE, SA, SB, <F3>, TMP3)
- EXPAND(49) ROUND(SB, SC, SD, SE, SA, <F3>, TMP3)
-
- EXPAND(50) ROUND(SA, SB, SC, SD, SE, <F3>, TMP3)
- EXPAND(51) ROUND(SE, SA, SB, SC, SD, <F3>, TMP3)
- EXPAND(52) ROUND(SD, SE, SA, SB, SC, <F3>, TMP3)
- EXPAND(53) ROUND(SC, SD, SE, SA, SB, <F3>, TMP3)
- EXPAND(54) ROUND(SB, SC, SD, SE, SA, <F3>, TMP3)
-
- EXPAND(55) ROUND(SA, SB, SC, SD, SE, <F3>, TMP3)
- EXPAND(56) ROUND(SE, SA, SB, SC, SD, <F3>, TMP3)
- EXPAND(57) ROUND(SD, SE, SA, SB, SC, <F3>, TMP3)
- EXPAND(58) ROUND(SC, SD, SE, SA, SB, <F3>, TMP3)
- EXPAND(59) ROUND(SB, SC, SD, SE, SA, <F3>, TMP3)
-
- movl K4VALUE, TMP2
- EXPAND(60) ROUND(SA, SB, SC, SD, SE, <F2>, TMP2)
- EXPAND(61) ROUND(SE, SA, SB, SC, SD, <F2>, TMP2)
- EXPAND(62) ROUND(SD, SE, SA, SB, SC, <F2>, TMP2)
- EXPAND(63) ROUND(SC, SD, SE, SA, SB, <F2>, TMP2)
- EXPAND(64) ROUND(SB, SC, SD, SE, SA, <F2>, TMP2)
-
- EXPAND(65) ROUND(SA, SB, SC, SD, SE, <F2>, TMP2)
- EXPAND(66) ROUND(SE, SA, SB, SC, SD, <F2>, TMP2)
- EXPAND(67) ROUND(SD, SE, SA, SB, SC, <F2>, TMP2)
- EXPAND(68) ROUND(SC, SD, SE, SA, SB, <F2>, TMP2)
- EXPAND(69) ROUND(SB, SC, SD, SE, SA, <F2>, TMP2)
-
- EXPAND(70) ROUND(SA, SB, SC, SD, SE, <F2>, TMP2)
- EXPAND(71) ROUND(SE, SA, SB, SC, SD, <F2>, TMP2)
- EXPAND(72) ROUND(SD, SE, SA, SB, SC, <F2>, TMP2)
- EXPAND(73) ROUND(SC, SD, SE, SA, SB, <F2>, TMP2)
- EXPAND(74) ROUND(SB, SC, SD, SE, SA, <F2>, TMP2)
-
- EXPAND(75) ROUND(SA, SB, SC, SD, SE, <F2>, TMP2)
- EXPAND(76) ROUND(SE, SA, SB, SC, SD, <F2>, TMP2)
- EXPAND(77) ROUND(SD, SE, SA, SB, SC, <F2>, TMP2)
- EXPAND(78) ROUND(SC, SD, SE, SA, SB, <F2>, TMP2)
- EXPAND(79) ROUND(SB, SC, SD, SE, SA, <F2>, TMP2)
+ movl K3VALUE, KVALUE
+ EXPAND(40) ROUND_F3(SA, SB, SC, SD, SE)
+ EXPAND(41) ROUND_F3(SE, SA, SB, SC, SD)
+ EXPAND(42) ROUND_F3(SD, SE, SA, SB, SC)
+ EXPAND(43) ROUND_F3(SC, SD, SE, SA, SB)
+ EXPAND(44) ROUND_F3(SB, SC, SD, SE, SA)
+
+ EXPAND(45) ROUND_F3(SA, SB, SC, SD, SE)
+ EXPAND(46) ROUND_F3(SE, SA, SB, SC, SD)
+ EXPAND(47) ROUND_F3(SD, SE, SA, SB, SC)
+ EXPAND(48) ROUND_F3(SC, SD, SE, SA, SB)
+ EXPAND(49) ROUND_F3(SB, SC, SD, SE, SA)
+
+ EXPAND(50) ROUND_F3(SA, SB, SC, SD, SE)
+ EXPAND(51) ROUND_F3(SE, SA, SB, SC, SD)
+ EXPAND(52) ROUND_F3(SD, SE, SA, SB, SC)
+ EXPAND(53) ROUND_F3(SC, SD, SE, SA, SB)
+ EXPAND(54) ROUND_F3(SB, SC, SD, SE, SA)
+
+ EXPAND(55) ROUND_F3(SA, SB, SC, SD, SE)
+ EXPAND(56) ROUND_F3(SE, SA, SB, SC, SD)
+ EXPAND(57) ROUND_F3(SD, SE, SA, SB, SC)
+ EXPAND(58) ROUND_F3(SC, SD, SE, SA, SB)
+ EXPAND(59) ROUND_F3(SB, SC, SD, SE, SA)
+
+ movl K4VALUE, KVALUE
+ EXPAND(60) ROUND(SA, SB, SC, SD, SE, <F2>)
+ EXPAND(61) ROUND(SE, SA, SB, SC, SD, <F2>)
+ EXPAND(62) ROUND(SD, SE, SA, SB, SC, <F2>)
+ EXPAND(63) ROUND(SC, SD, SE, SA, SB, <F2>)
+ EXPAND(64) ROUND(SB, SC, SD, SE, SA, <F2>)
+
+ EXPAND(65) ROUND(SA, SB, SC, SD, SE, <F2>)
+ EXPAND(66) ROUND(SE, SA, SB, SC, SD, <F2>)
+ EXPAND(67) ROUND(SD, SE, SA, SB, SC, <F2>)
+ EXPAND(68) ROUND(SC, SD, SE, SA, SB, <F2>)
+ EXPAND(69) ROUND(SB, SC, SD, SE, SA, <F2>)
+
+ EXPAND(70) ROUND(SA, SB, SC, SD, SE, <F2>)
+ EXPAND(71) ROUND(SE, SA, SB, SC, SD, <F2>)
+ EXPAND(72) ROUND(SD, SE, SA, SB, SC, <F2>)
+ EXPAND(73) ROUND(SC, SD, SE, SA, SB, <F2>)
+ EXPAND(74) ROUND(SB, SC, SD, SE, SA, <F2>)
+
+ EXPAND(75) ROUND(SA, SB, SC, SD, SE, <F2>)
+ EXPAND(76) ROUND(SE, SA, SB, SC, SD, <F2>)
+ EXPAND(77) ROUND(SD, SE, SA, SB, SC, <F2>)
+ EXPAND(78) ROUND(SC, SD, SE, SA, SB, <F2>)
+ EXPAND(79) ROUND(SB, SC, SD, SE, SA, <F2>)
C Update the state vector
movl 88(%esp),TMP
ret
EPILOGUE(_nettle_sha1_compress)
-C It's possible to shave of half of the stores to tmp in the evaluation of f3,
+C George Spelvin also suggested using lea, with an immediate offset
+C for the magic constants. This frees one register, which can be used
+C for loosen up dependencies and to more operations in parallel. For
+C example, take the rounds involving f2, the simplest round function.
+C Currently, we have
+C
+C movl 16(%esp), TMP
+C xorl 24(%esp), TMP
+C xorl 48(%esp), TMP
+C xorl 4(%esp), TMP
+C roll $1, TMP
+C movl TMP, 16(%esp)
+C addl KVALUE, SE C 0
+C addl TMP, SE C 1
+C movl SB, TMP C 0
+C xorl SC, TMP C 1
+C xorl SD, TMP C 2
+C addl TMP, SE C 3
+C movl SA, TMP C 0
+C roll $5, TMP C 1
+C addl TMP, SE C 4
+C roll $30, SB C 0
+
+C These 16 instructions could be executed in 5.33 cycles if there were
+C no dependencies. The crucial dependencies are from (previous) SE to
+C use SA, and (previous) result SB to use SC. (What does this say
+C about recurrency chain? Ought to unroll 5 times to see it).
+
+C It would be preferable to accumulate the terms in two or more
+C registers, to make dependencies shallower. Something like
+
+C ...expand, put data in W
+C movl SD, TMP C 0
+C leal K1VALUE(SE, W), SE C 0
+C movl SA, TMP2 C 0
+C xorl SC, TMP C 1
+C roll $5, TMP2 C 1
+C xorl SB, TMP C 2
+C addl TMP2, TMP C 3
+C addl TMP, SE C 4
+C a + b + c + d + e = ((((a + b) + c) + d) + e), latency 4
+C a + b + c + d + e = ((a + b) + c) + (d + e)
+C the out-of-order execution. Next iteration
+C
+C ...expand...
+C roll $1, TMP C 4
+C movl TMP, 16(%esp) C 5
+C addl KVALUE, SD C 0
+C addl TMP, SD C 5
+C movl SA, TMP C 0
+C xorl SB, TMP C 1
+C xorl SC, TMP C 2
+C addl TMP, SD C 6
+C movl SE, TMP C 8
+C roll $5, TMP C 9
+C addl TMP, SD C 7
+C roll $30, SA C 0
+C
+C Lets look at the latency. Next iteration will operate on (E, A, B, C, D), so we have recurrencies:
+
+C from result SA to use of SE (none, SA not modified)
+C from result of SB to use of SA, result of SC to use of SB
+
+C It's possible to shave of half of the stores to tmp in the evaluation of f3,
C although it's probably not worth the effort. This is the trick:
C
C round(a,b,c,d,e,f,k) modifies only b,e.