define(<SE>,<%ebp>)
define(<DATA>,<%esp>)
define(<T1>,<%edi>)
-define(<T2>,<%esi>) C Used by SWAP
+define(<T2>,<%esi>)
C Constants
define(<K1VALUE>, <0x5A827999>) C Rounds 0-19
xor $3, T2
and $2, T2
xor $4, T2
- lea K1VALUE (T1, T2), T2
rol <$>30, $2
+ lea K1VALUE (T1, $5), $5
mov $1, T1
rol <$>5, T1
add T1, $5
mov $4, T2
xor $3, T2
xor $2, T2
- lea $7 (T1, T2), T2
rol <$>30, $2
+ lea $7 (T1, $5), $5
mov $1, T1
rol <$>5, T1
add T1, $5
mov T1, OFFSET(eval($6 % 16)) (DATA)
mov $4, T2
and $3, T2
- lea K3VALUE (T1, T2), T1
- mov $4, T2
- xor $3, T2
- and $2, T2
- add T1, $5
+ lea K3VALUE (T1, $5), $5
+ mov $4, T1
+ xor $3, T1
+ and $2, T1
+ add T2, $5
rol <$>30, $2
- mov $1, T1
- rol <$>5, T1
+ mov $1, T2
+ rol <$>5, T2
add T1, $5
add T2, $5
>)
C _nettle_sha1_compress(uint32_t *state, uint8_t *data)
.text
- ALIGN(4)
+
PROLOGUE(_nettle_sha1_compress)
C save all registers that need to be saved
C 88(%esp) data
subl $64, %esp C %esp = W
- C Load and byteswap data
- movl 88(%esp), T2
-
- SWAP( 0, %eax) SWAP( 1, %ebx) SWAP( 2, %ecx) SWAP( 3, %edx)
- SWAP( 4, %eax) SWAP( 5, %ebx) SWAP( 6, %ecx) SWAP( 7, %edx)
- SWAP( 8, %eax) SWAP( 9, %ebx) SWAP(10, %ecx) SWAP(11, %edx)
- SWAP(12, %eax) SWAP(13, %ebx) SWAP(14, %ecx) SWAP(15, %edx)
-
- C load the state vector
- movl 84(%esp),T1
- movl (T1), SA
- movl 4(T1), SB
- movl 8(T1), SC
- movl 12(T1), SD
- movl 16(T1), SE
-
- ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 0)
- ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 1)
- ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 2)
- ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 3)
- ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 4)
-
- ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 5)
- ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 6)
- ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 7)
- ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 8)
- ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 9)
-
- ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 10)
- ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 11)
- ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 12)
- ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 13)
- ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 14)
-
- ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 15)
- ROUND_F1(SE, SA, SB, SC, SD, 16)
- ROUND_F1(SD, SE, SA, SB, SC, 17)
- ROUND_F1(SC, SD, SE, SA, SB, 18)
- ROUND_F1(SB, SC, SD, SE, SA, 19)
-
- ROUND_F2(SA, SB, SC, SD, SE, 20, K2VALUE)
- ROUND_F2(SE, SA, SB, SC, SD, 21, K2VALUE)
- ROUND_F2(SD, SE, SA, SB, SC, 22, K2VALUE)
- ROUND_F2(SC, SD, SE, SA, SB, 23, K2VALUE)
- ROUND_F2(SB, SC, SD, SE, SA, 24, K2VALUE)
-
- ROUND_F2(SA, SB, SC, SD, SE, 25, K2VALUE)
- ROUND_F2(SE, SA, SB, SC, SD, 26, K2VALUE)
- ROUND_F2(SD, SE, SA, SB, SC, 27, K2VALUE)
- ROUND_F2(SC, SD, SE, SA, SB, 28, K2VALUE)
- ROUND_F2(SB, SC, SD, SE, SA, 29, K2VALUE)
-
- ROUND_F2(SA, SB, SC, SD, SE, 30, K2VALUE)
- ROUND_F2(SE, SA, SB, SC, SD, 31, K2VALUE)
- ROUND_F2(SD, SE, SA, SB, SC, 32, K2VALUE)
- ROUND_F2(SC, SD, SE, SA, SB, 33, K2VALUE)
- ROUND_F2(SB, SC, SD, SE, SA, 34, K2VALUE)
-
- ROUND_F2(SA, SB, SC, SD, SE, 35, K2VALUE)
- ROUND_F2(SE, SA, SB, SC, SD, 36, K2VALUE)
- ROUND_F2(SD, SE, SA, SB, SC, 37, K2VALUE)
- ROUND_F2(SC, SD, SE, SA, SB, 38, K2VALUE)
- ROUND_F2(SB, SC, SD, SE, SA, 39, K2VALUE)
-
- ROUND_F3(SA, SB, SC, SD, SE, 40)
- ROUND_F3(SE, SA, SB, SC, SD, 41)
- ROUND_F3(SD, SE, SA, SB, SC, 42)
- ROUND_F3(SC, SD, SE, SA, SB, 43)
- ROUND_F3(SB, SC, SD, SE, SA, 44)
-
- ROUND_F3(SA, SB, SC, SD, SE, 45)
- ROUND_F3(SE, SA, SB, SC, SD, 46)
- ROUND_F3(SD, SE, SA, SB, SC, 47)
- ROUND_F3(SC, SD, SE, SA, SB, 48)
- ROUND_F3(SB, SC, SD, SE, SA, 49)
-
- ROUND_F3(SA, SB, SC, SD, SE, 50)
- ROUND_F3(SE, SA, SB, SC, SD, 51)
- ROUND_F3(SD, SE, SA, SB, SC, 52)
- ROUND_F3(SC, SD, SE, SA, SB, 53)
- ROUND_F3(SB, SC, SD, SE, SA, 54)
-
- ROUND_F3(SA, SB, SC, SD, SE, 55)
- ROUND_F3(SE, SA, SB, SC, SD, 56)
- ROUND_F3(SD, SE, SA, SB, SC, 57)
- ROUND_F3(SC, SD, SE, SA, SB, 58)
- ROUND_F3(SB, SC, SD, SE, SA, 59)
-
- ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
- ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
- ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
- ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
- ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
-
- ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
- ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
- ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
- ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
- ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
-
- ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
- ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
- ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
- ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
- ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
+ C Loop-mixed to 520 cycles (for the complete function call) on
+ C AMD K7.
+ALIGN(5)
+ mov 88(%esp), T2
+ mov OFFSET(2)(T2), %ecx
+ mov OFFSET(0)(T2), %eax
+ bswap %ecx
+ bswap %eax
+ mov %ecx, OFFSET(2) (DATA)
+ mov %eax, OFFSET(0) (DATA)
+ mov OFFSET(3)(T2), %edx
+ mov OFFSET(6)(T2), %ecx
+ mov OFFSET(4)(T2), %eax
+ mov OFFSET(1)(T2), %ebx
+ bswap %ebx
+ bswap %eax
+ bswap %ecx
+ mov %ecx, OFFSET(6) (DATA)
+ mov %eax, OFFSET(4) (DATA)
+ bswap %edx
+ mov %edx, OFFSET(3) (DATA)
+ mov %ebx, OFFSET(1) (DATA)
+ mov OFFSET(10)(T2), %ecx
+ mov OFFSET(8)(T2), %eax
+ mov OFFSET(7)(T2), %edx
+ bswap %eax
+ bswap %edx
+ mov %edx, OFFSET(7) (DATA)
+ mov OFFSET(5)(T2), %ebx
+ mov %eax, OFFSET(8) (DATA)
+ mov OFFSET(11)(T2), %edx
+ bswap %ecx
+ bswap %edx
+ mov OFFSET(12)(T2), %eax
+ bswap %ebx
+ mov %ecx, OFFSET(10) (DATA)
+ mov %ebx, OFFSET(5) (DATA)
+ mov %edx, OFFSET(11) (DATA)
+ mov OFFSET(15)(T2), %edx
+ mov 84(%esp),T1
+ mov OFFSET(9)(T2), %ebx
+ bswap %edx
+ bswap %ebx
+ bswap %eax
+ mov OFFSET(14)(T2), %ecx
+ mov %edx, OFFSET(15) (DATA)
+ bswap %ecx
+ mov %ecx, OFFSET(14) (DATA)
+ mov %ebx, OFFSET(9) (DATA)
+ mov OFFSET(13)(T2), %ebx
+ mov 12(T1), SD
+ bswap %ebx
+ mov %ebx, OFFSET(13) (DATA)
+ mov 8(T1), SC
+ mov 16(T1), SE
+ mov 4(T1), SB
+ mov SD, T2
+ add OFFSET(0) (DATA), SE
+ xor SC, T2
+ mov %eax, OFFSET(12) (DATA)
+ mov (T1), SA
+ and SB, T2
+ xor SD, T2
+ rol $30, SB
+ add T2, SE
+ mov SA, T1
+ mov SC, T2
+ add OFFSET(1) (DATA), SD
+ rol $5, T1
+ xor SB, T2
+ and SA, T2
+ xor SC, T2
+ lea K1VALUE (T1, SE), SE
+ add T2, SD
+ mov SB, T2
+ rol $30, SA
+ xor SA, T2
+ and SE, T2
+ mov SE, T1
+ add OFFSET(2) (DATA), SC
+ rol $30, SE
+ xor SB, T2
+ rol $5, T1
+ lea K1VALUE (T1, SD), SD
+ mov SD, T1
+ rol $5, T1
+ add T2, SC
+ mov SA, T2
+ xor SE, T2
+ lea K1VALUE (T1, SC), SC
+ and SD, T2
+ xor SA, T2
+ add OFFSET(3) (DATA), SB
+ mov SC, T1
+ add T2, SB
+ mov SE, T2
+ rol $30, SD
+ xor SD, T2
+ and SC, T2
+ rol $5, T1
+ xor SE, T2
+ add OFFSET(4) (DATA), SA
+ lea K1VALUE (T1, SB), SB
+ add T2, SA
+ rol $30, SC
+ mov SD, T2
+ xor SC, T2
+ and SB, T2
+ mov SB, T1
+ rol $5, T1
+ add OFFSET(5) (DATA), SE
+ rol $30, SB
+ xor SD, T2
+ add T2, SE
+ mov SC, T2
+ xor SB, T2
+ lea K1VALUE (T1, SA), SA
+ mov SA, T1
+ add OFFSET(6) (DATA), SD
+ and SA, T2
+ rol $5, T1
+ xor SC, T2
+ lea K1VALUE (T1, SE), SE
+ rol $30, SA
+ add T2, SD
+ mov SB, T2
+ mov SE, T1
+ xor SA, T2
+ and SE, T2
+ rol $5, T1
+ lea K1VALUE (T1, SD), SD
+ xor SB, T2
+ add OFFSET(7) (DATA), SC
+ rol $30, SE
+ add OFFSET(8) (DATA), SB
+ mov SD, T1
+ add T2, SC
+ mov SA, T2
+ xor SE, T2
+ rol $5, T1
+ and SD, T2
+ lea K1VALUE (T1, SC), SC
+ xor SA, T2
+ add T2, SB
+ mov SE, T2
+ mov SC, T1
+ rol $30, SD
+ xor SD, T2
+ rol $5, T1
+ lea K1VALUE (T1, SB), SB
+ and SC, T2
+ xor SE, T2
+ add OFFSET(10) (DATA), SE
+ add OFFSET(9) (DATA), SA
+ mov SB, T1
+ add T2, SA
+ rol $5, T1
+ lea K1VALUE (T1, SA), SA
+ mov SD, T2
+ rol $30, SC
+ xor SC, T2
+ and SB, T2
+ xor SD, T2
+ rol $30, SB
+ add T2, SE
+ mov SC, T2
+ mov SA, T1
+ xor SB, T2
+ add OFFSET(11) (DATA), SD
+ and SA, T2
+ rol $30, SA
+ rol $5, T1
+ xor SC, T2
+ lea K1VALUE (T1, SE), SE
+ add T2, SD
+ mov SB, T2
+ xor SA, T2
+ mov SE, T1
+ rol $5, T1
+ and SE, T2
+ lea K1VALUE (T1, SD), SD
+ xor SB, T2
+ add OFFSET(12) (DATA), SC
+ add T2, SC
+ rol $30, SE
+ mov SA, T2
+ xor SE, T2
+ mov SD, T1
+ rol $5, T1
+ and SD, T2
+ add OFFSET(13) (DATA), SB
+ lea K1VALUE (T1, SC), SC
+ xor SA, T2
+ add T2, SB
+ mov SE, T2
+ rol $30, SD
+ xor SD, T2
+ and SC, T2
+ mov SC, T1
+ rol $5, T1
+ rol $30, SC
+ add OFFSET(14) (DATA), SA
+ xor SE, T2
+ add T2, SA
+ mov SD, T2
+ xor SC, T2
+ lea K1VALUE (T1, SB), SB
+ and SB, T2
+ mov SB, T1
+ rol $5, T1
+ lea K1VALUE (T1, SA), SA
+ mov SA, T1
+ xor SD, T2
+ add OFFSET(15) (DATA), SE
+ add T2, SE
+ rol $5, T1
+ lea K1VALUE (T1, SE), SE
+ mov OFFSET(0) (DATA), T1
+ xor OFFSET(2) (DATA), T1
+ mov SC, T2
+ xor OFFSET(8) (DATA), T1
+ xor OFFSET(13) (DATA), T1
+ rol $30, SB
+ xor SB, T2
+ and SA, T2
+ xor SC, T2
+ rol $1, T1
+ lea K1VALUE (T1, T2), T2
+ mov T1, OFFSET(0) (DATA)
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(1) (DATA), T1
+ xor OFFSET(3) (DATA), T1
+ rol $30, SA
+ add T2, SD
+ mov SB, T2
+ xor SA, T2
+ and SE, T2
+ xor OFFSET(9) (DATA), T1
+ xor OFFSET(14) (DATA), T1
+ xor SB, T2
+ rol $1, T1
+ mov T1, OFFSET(1) (DATA)
+ lea K1VALUE (T1, T2), T2
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ mov OFFSET(2) (DATA), T1
+ xor OFFSET(4) (DATA), T1
+ rol $30, SE
+ add T2, SC
+ mov SA, T2
+ xor SE, T2
+ xor OFFSET(10) (DATA), T1
+ xor OFFSET(15) (DATA), T1
+ and SD, T2
+ rol $1, T1
+ xor SA, T2
+ mov T1, OFFSET(2) (DATA)
+ lea K1VALUE (T1, T2), T2
+ mov SC, T1
+ rol $30, SD
+ rol $5, T1
+ add T1, SB
+ add T2, SB
+ mov SE, T2
+ mov OFFSET(3) (DATA), T1
+ xor SD, T2
+ xor OFFSET(5) (DATA), T1
+ and SC, T2
+ xor SE, T2
+ xor OFFSET(11) (DATA), T1
+ xor OFFSET(0) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(3) (DATA)
+ lea K1VALUE (T1, T2), T2
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(4) (DATA), T1
+ xor OFFSET(6) (DATA), T1
+ rol $30, SC
+ xor OFFSET(12) (DATA), T1
+ add T2, SA
+ xor OFFSET(1) (DATA), T1
+ mov SD, T2
+ xor SC, T2
+ rol $1, T1
+ xor SB, T2
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(4) (DATA)
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(5) (DATA), T1
+ add T2, SE
+ mov SC, T2
+ xor OFFSET(7) (DATA), T1
+ rol $30, SB
+ xor OFFSET(13) (DATA), T1
+ xor SB, T2
+ xor OFFSET(2) (DATA), T1
+ xor SA, T2
+ rol $1, T1
+ mov T1, OFFSET(5) (DATA)
+ lea K2VALUE (T1, T2), T2
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(6) (DATA), T1
+ xor OFFSET(8) (DATA), T1
+ add T2, SD
+ rol $30, SA
+ xor OFFSET(14) (DATA), T1
+ mov SB, T2
+ xor OFFSET(3) (DATA), T1
+ xor SA, T2
+ rol $1, T1
+ xor SE, T2
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(6) (DATA)
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ add T2, SC
+ mov SA, T2
+ rol $30, SE
+ mov OFFSET(7) (DATA), T1
+ xor OFFSET(9) (DATA), T1
+ xor SE, T2
+ xor OFFSET(15) (DATA), T1
+ xor OFFSET(4) (DATA), T1
+ xor SD, T2
+ rol $1, T1
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(7) (DATA)
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ mov OFFSET(8) (DATA), T1
+ xor OFFSET(10) (DATA), T1
+ add T2, SB
+ rol $30, SD
+ mov SE, T2
+ xor OFFSET(0) (DATA), T1
+ xor OFFSET(5) (DATA), T1
+ xor SD, T2
+ xor SC, T2
+ rol $1, T1
+ mov T1, OFFSET(8) (DATA)
+ lea K2VALUE (T1, T2), T2
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(9) (DATA), T1
+ xor OFFSET(11) (DATA), T1
+ xor OFFSET(1) (DATA), T1
+ add T2, SA
+ xor OFFSET(6) (DATA), T1
+ mov SD, T2
+ rol $1, T1
+ rol $30, SC
+ xor SC, T2
+ mov T1, OFFSET(9) (DATA)
+ xor SB, T2
+ lea K2VALUE (T1, T2), T2
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(10) (DATA), T1
+ xor OFFSET(12) (DATA), T1
+ xor OFFSET(2) (DATA), T1
+ add T2, SE
+ mov SC, T2
+ rol $30, SB
+ xor OFFSET(7) (DATA), T1
+ xor SB, T2
+ rol $1, T1
+ xor SA, T2
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(10) (DATA)
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(11) (DATA), T1
+ xor OFFSET(13) (DATA), T1
+ rol $30, SA
+ xor OFFSET(3) (DATA), T1
+ add T2, SD
+ xor OFFSET(8) (DATA), T1
+ mov SB, T2
+ xor SA, T2
+ rol $1, T1
+ mov T1, OFFSET(11) (DATA)
+ xor SE, T2
+ lea K2VALUE (T1, T2), T2
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ mov OFFSET(12) (DATA), T1
+ xor OFFSET(14) (DATA), T1
+ rol $30, SE
+ add T2, SC
+ xor OFFSET(4) (DATA), T1
+ mov SA, T2
+ xor OFFSET(9) (DATA), T1
+ xor SE, T2
+ rol $1, T1
+ xor SD, T2
+ mov T1, OFFSET(12) (DATA)
+ lea K2VALUE (T1, T2), T2
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ rol $30, SD
+ mov OFFSET(13) (DATA), T1
+ xor OFFSET(15) (DATA), T1
+ add T2, SB
+ mov SE, T2
+ xor OFFSET(5) (DATA), T1
+ xor SD, T2
+ xor OFFSET(10) (DATA), T1
+ xor SC, T2
+ rol $1, T1
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(13) (DATA)
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ add T2, SA
+ mov SD, T2
+ mov OFFSET(14) (DATA), T1
+ xor OFFSET(0) (DATA), T1
+ rol $30, SC
+ xor OFFSET(6) (DATA), T1
+ xor OFFSET(11) (DATA), T1
+ xor SC, T2
+ xor SB, T2
+ rol $1, T1
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(14) (DATA)
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(15) (DATA), T1
+ xor OFFSET(1) (DATA), T1
+ add T2, SE
+ mov SC, T2
+ rol $30, SB
+ xor SB, T2
+ xor OFFSET(7) (DATA), T1
+ xor OFFSET(12) (DATA), T1
+ xor SA, T2
+ rol $1, T1
+ mov T1, OFFSET(15) (DATA)
+ lea K2VALUE (T1, T2), T2
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(0) (DATA), T1
+ xor OFFSET(2) (DATA), T1
+ xor OFFSET(8) (DATA), T1
+ add T2, SD
+ mov SB, T2
+ rol $30, SA
+ xor SA, T2
+ xor OFFSET(13) (DATA), T1
+ rol $1, T1
+ xor SE, T2
+ mov T1, OFFSET(0) (DATA)
+ lea K2VALUE (T1, T2), T2
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ mov OFFSET(1) (DATA), T1
+ xor OFFSET(3) (DATA), T1
+ add T2, SC
+ mov SA, T2
+ rol $30, SE
+ xor SE, T2
+ xor OFFSET(9) (DATA), T1
+ xor OFFSET(14) (DATA), T1
+ rol $1, T1
+ xor SD, T2
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(1) (DATA)
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ mov OFFSET(2) (DATA), T1
+ rol $30, SD
+ xor OFFSET(4) (DATA), T1
+ add T2, SB
+ mov SE, T2
+ xor OFFSET(10) (DATA), T1
+ xor OFFSET(15) (DATA), T1
+ xor SD, T2
+ xor SC, T2
+ rol $1, T1
+ mov T1, OFFSET(2) (DATA)
+ lea K2VALUE (T1, T2), T2
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(3) (DATA), T1
+ xor OFFSET(5) (DATA), T1
+ xor OFFSET(11) (DATA), T1
+ xor OFFSET(0) (DATA), T1
+ add T2, SA
+ rol $30, SC
+ mov SD, T2
+ xor SC, T2
+ rol $1, T1
+ xor SB, T2
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(3) (DATA)
+ mov SA, T1
+ rol $5, T1
+ rol $30, SB
+ add T1, SE
+ mov OFFSET(4) (DATA), T1
+ add T2, SE
+ xor OFFSET(6) (DATA), T1
+ xor OFFSET(12) (DATA), T1
+ xor OFFSET(1) (DATA), T1
+ mov SC, T2
+ xor SB, T2
+ rol $1, T1
+ xor SA, T2
+ lea K2VALUE (T1, T2), T2
+ mov T1, OFFSET(4) (DATA)
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ add T2, SD
+ mov OFFSET(5) (DATA), T1
+ mov SB, T2
+ rol $30, SA
+ xor SA, T2
+ xor SE, T2
+ xor OFFSET(7) (DATA), T1
+ xor OFFSET(13) (DATA), T1
+ xor OFFSET(2) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(5) (DATA)
+ lea K2VALUE (T1, T2), T2
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ mov OFFSET(6) (DATA), T1
+ xor OFFSET(8) (DATA), T1
+ add T2, SC
+ xor OFFSET(14) (DATA), T1
+ xor OFFSET(3) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(6) (DATA)
+ mov SA, T2
+ rol $30, SE
+ xor SE, T2
+ xor SD, T2
+ lea K2VALUE (T1, T2), T2
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ add T2, SB
+ mov OFFSET(7) (DATA), T1
+ mov SE, T2
+ rol $30, SD
+ xor OFFSET(9) (DATA), T1
+ xor SD, T2
+ xor SC, T2
+ xor OFFSET(15) (DATA), T1
+ xor OFFSET(4) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(7) (DATA)
+ lea K2VALUE (T1, T2), T2
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(8) (DATA), T1
+ xor OFFSET(10) (DATA), T1
+ rol $30, SC
+ xor OFFSET(0) (DATA), T1
+ add T2, SA
+ mov SD, T2
+ xor OFFSET(5) (DATA), T1
+ rol $1, T1
+ and SC, T2
+ mov T1, OFFSET(8) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SE
+ mov SA, T1
+ mov SD, T2
+ xor SC, T2
+ and SB, T2
+ rol $30, SB
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(9) (DATA), T1
+ xor OFFSET(11) (DATA), T1
+ xor OFFSET(1) (DATA), T1
+ add T2, SE
+ mov SC, T2
+ xor OFFSET(6) (DATA), T1
+ rol $1, T1
+ and SB, T2
+ mov T1, OFFSET(9) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SD
+ mov SC, T2
+ xor SB, T2
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(10) (DATA), T1
+ and SA, T2
+ add T2, SD
+ xor OFFSET(12) (DATA), T1
+ xor OFFSET(2) (DATA), T1
+ rol $30, SA
+ mov SB, T2
+ and SA, T2
+ xor OFFSET(7) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(10) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SC
+ mov SD, T1
+ rol $5, T1
+ mov SB, T2
+ add T1, SC
+ mov OFFSET(11) (DATA), T1
+ xor SA, T2
+ xor OFFSET(13) (DATA), T1
+ xor OFFSET(3) (DATA), T1
+ and SE, T2
+ xor OFFSET(8) (DATA), T1
+ add T2, SC
+ rol $1, T1
+ mov SA, T2
+ mov T1, OFFSET(11) (DATA)
+ rol $30, SE
+ and SE, T2
+ lea K3VALUE (T1, T2), T1
+ add T1, SB
+ mov SA, T2
+ mov SC, T1
+ xor SE, T2
+ rol $5, T1
+ add T1, SB
+ mov OFFSET(12) (DATA), T1
+ xor OFFSET(14) (DATA), T1
+ xor OFFSET(4) (DATA), T1
+ xor OFFSET(9) (DATA), T1
+ and SD, T2
+ rol $30, SD
+ add T2, SB
+ rol $1, T1
+ mov T1, OFFSET(12) (DATA)
+ mov SE, T2
+ and SD, T2
+ lea K3VALUE (T1, T2), T1
+ add T1, SA
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(13) (DATA), T1
+ xor OFFSET(15) (DATA), T1
+ mov SE, T2
+ xor OFFSET(5) (DATA), T1
+ xor SD, T2
+ and SC, T2
+ xor OFFSET(10) (DATA), T1
+ add T2, SA
+ rol $1, T1
+ rol $30, SC
+ mov T1, OFFSET(13) (DATA)
+ mov SD, T2
+ and SC, T2
+ lea K3VALUE (T1, T2), T1
+ mov SD, T2
+ add T1, SE
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(14) (DATA), T1
+ xor OFFSET(0) (DATA), T1
+ xor SC, T2
+ and SB, T2
+ xor OFFSET(6) (DATA), T1
+ rol $30, SB
+ xor OFFSET(11) (DATA), T1
+ rol $1, T1
+ add T2, SE
+ mov SC, T2
+ mov T1, OFFSET(14) (DATA)
+ and SB, T2
+ lea K3VALUE (T1, T2), T1
+ mov SC, T2
+ add T1, SD
+ mov SE, T1
+ xor SB, T2
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(15) (DATA), T1
+ xor OFFSET(1) (DATA), T1
+ and SA, T2
+ xor OFFSET(7) (DATA), T1
+ xor OFFSET(12) (DATA), T1
+ add T2, SD
+ rol $30, SA
+ mov SB, T2
+ rol $1, T1
+ mov T1, OFFSET(15) (DATA)
+ and SA, T2
+ lea K3VALUE (T1, T2), T1
+ add T1, SC
+ mov SD, T1
+ mov SB, T2
+ rol $5, T1
+ add T1, SC
+ mov OFFSET(0) (DATA), T1
+ xor SA, T2
+ xor OFFSET(2) (DATA), T1
+ xor OFFSET(8) (DATA), T1
+ xor OFFSET(13) (DATA), T1
+ and SE, T2
+ add T2, SC
+ rol $30, SE
+ rol $1, T1
+ mov T1, OFFSET(0) (DATA)
+ mov SA, T2
+ and SE, T2
+ lea K3VALUE (T1, T2), T1
+ add T1, SB
+ mov SC, T1
+ mov SA, T2
+ xor SE, T2
+ rol $5, T1
+ add T1, SB
+ mov OFFSET(1) (DATA), T1
+ xor OFFSET(3) (DATA), T1
+ xor OFFSET(9) (DATA), T1
+ and SD, T2
+ xor OFFSET(14) (DATA), T1
+ add T2, SB
+ rol $30, SD
+ mov SE, T2
+ rol $1, T1
+ and SD, T2
+ mov T1, OFFSET(1) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SA
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov SE, T2
+ mov OFFSET(2) (DATA), T1
+ xor SD, T2
+ xor OFFSET(4) (DATA), T1
+ xor OFFSET(10) (DATA), T1
+ and SC, T2
+ add T2, SA
+ xor OFFSET(15) (DATA), T1
+ rol $30, SC
+ mov SD, T2
+ rol $1, T1
+ mov T1, OFFSET(2) (DATA)
+ and SC, T2
+ lea K3VALUE (T1, T2), T1
+ add T1, SE
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(3) (DATA), T1
+ xor OFFSET(5) (DATA), T1
+ xor OFFSET(11) (DATA), T1
+ xor OFFSET(0) (DATA), T1
+ mov SD, T2
+ rol $1, T1
+ xor SC, T2
+ and SB, T2
+ mov T1, OFFSET(3) (DATA)
+ rol $30, SB
+ add T2, SE
+ mov SC, T2
+ and SB, T2
+ lea K3VALUE (T1, T2), T1
+ add T1, SD
+ mov SE, T1
+ mov SC, T2
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(4) (DATA), T1
+ xor OFFSET(6) (DATA), T1
+ xor SB, T2
+ and SA, T2
+ add T2, SD
+ mov SB, T2
+ xor OFFSET(12) (DATA), T1
+ rol $30, SA
+ and SA, T2
+ xor OFFSET(1) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(4) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SC
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ mov OFFSET(5) (DATA), T1
+ xor OFFSET(7) (DATA), T1
+ mov SB, T2
+ xor OFFSET(13) (DATA), T1
+ xor SA, T2
+ xor OFFSET(2) (DATA), T1
+ and SE, T2
+ rol $30, SE
+ add T2, SC
+ rol $1, T1
+ mov SA, T2
+ mov T1, OFFSET(5) (DATA)
+ and SE, T2
+ lea K3VALUE (T1, T2), T1
+ add T1, SB
+ mov SA, T2
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ xor SE, T2
+ and SD, T2
+ mov OFFSET(6) (DATA), T1
+ xor OFFSET(8) (DATA), T1
+ xor OFFSET(14) (DATA), T1
+ xor OFFSET(3) (DATA), T1
+ rol $1, T1
+ add T2, SB
+ rol $30, SD
+ mov SE, T2
+ and SD, T2
+ mov T1, OFFSET(6) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SA
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(7) (DATA), T1
+ xor OFFSET(9) (DATA), T1
+ mov SE, T2
+ xor SD, T2
+ xor OFFSET(15) (DATA), T1
+ and SC, T2
+ rol $30, SC
+ add T2, SA
+ mov SD, T2
+ xor OFFSET(4) (DATA), T1
+ rol $1, T1
+ and SC, T2
+ mov T1, OFFSET(7) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SE
+ mov SA, T1
+ rol $5, T1
+ mov SD, T2
+ add T1, SE
+ mov OFFSET(8) (DATA), T1
+ xor OFFSET(10) (DATA), T1
+ xor SC, T2
+ xor OFFSET(0) (DATA), T1
+ and SB, T2
+ add T2, SE
+ xor OFFSET(5) (DATA), T1
+ rol $30, SB
+ mov SC, T2
+ and SB, T2
+ rol $1, T1
+ mov T1, OFFSET(8) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SD
+ mov SE, T1
+ rol $5, T1
+ mov SC, T2
+ xor SB, T2
+ add T1, SD
+ and SA, T2
+ mov OFFSET(9) (DATA), T1
+ rol $30, SA
+ xor OFFSET(11) (DATA), T1
+ xor OFFSET(1) (DATA), T1
+ add T2, SD
+ mov SB, T2
+ and SA, T2
+ xor OFFSET(6) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(9) (DATA)
+ lea K3VALUE (T1, T2), T1
+ add T1, SC
+ mov SD, T1
+ rol $5, T1
+ mov SB, T2
+ xor SA, T2
+ and SE, T2
+ add T1, SC
+ mov OFFSET(10) (DATA), T1
+ xor OFFSET(12) (DATA), T1
+ xor OFFSET(2) (DATA), T1
+ add T2, SC
+ mov SA, T2
+ rol $30, SE
+ xor OFFSET(7) (DATA), T1
+ rol $1, T1
+ and SE, T2
+ mov T1, OFFSET(10) (DATA)
+ lea K3VALUE (T1, T2), T1
+ mov SA, T2
+ xor SE, T2
+ add T1, SB
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ mov OFFSET(11) (DATA), T1
+ xor OFFSET(13) (DATA), T1
+ xor OFFSET(3) (DATA), T1
+ xor OFFSET(8) (DATA), T1
+ and SD, T2
+ add T2, SB
+ mov SE, T2
+ rol $1, T1
+ mov T1, OFFSET(11) (DATA)
+ rol $30, SD
+ and SD, T2
+ lea K3VALUE (T1, T2), T1
+ mov SE, T2
+ add T1, SA
+ xor SD, T2
+ mov SB, T1
+ and SC, T2
+ rol $30, SC
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(12) (DATA), T1
+ xor OFFSET(14) (DATA), T1
+ add T2, SA
+ mov SD, T2
+ xor OFFSET(4) (DATA), T1
+ xor OFFSET(9) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(12) (DATA)
+ xor SC, T2
+ xor SB, T2
+ lea K4VALUE (T1, T2), T2
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(13) (DATA), T1
+ xor OFFSET(15) (DATA), T1
+ add T2, SE
+ rol $30, SB
+ mov SC, T2
+ xor OFFSET(5) (DATA), T1
+ xor SB, T2
+ xor OFFSET(10) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(13) (DATA)
+ xor SA, T2
+ lea K4VALUE (T1, T2), T2
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(14) (DATA), T1
+ xor OFFSET(0) (DATA), T1
+ rol $30, SA
+ add T2, SD
+ mov SB, T2
+ xor SA, T2
+ xor SE, T2
+ xor OFFSET(6) (DATA), T1
+ xor OFFSET(11) (DATA), T1
+ rol $1, T1
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(14) (DATA)
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ add T2, SC
+ mov OFFSET(15) (DATA), T1
+ mov SA, T2
+ rol $30, SE
+ xor OFFSET(1) (DATA), T1
+ xor OFFSET(7) (DATA), T1
+ xor SE, T2
+ xor SD, T2
+ xor OFFSET(12) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(15) (DATA)
+ lea K4VALUE (T1, T2), T2
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ mov OFFSET(0) (DATA), T1
+ add T2, SB
+ xor OFFSET(2) (DATA), T1
+ mov SE, T2
+ rol $30, SD
+ xor OFFSET(8) (DATA), T1
+ xor SD, T2
+ xor OFFSET(13) (DATA), T1
+ xor SC, T2
+ rol $1, T1
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(0) (DATA)
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(1) (DATA), T1
+ rol $30, SC
+ xor OFFSET(3) (DATA), T1
+ xor OFFSET(9) (DATA), T1
+ xor OFFSET(14) (DATA), T1
+ add T2, SA
+ mov SD, T2
+ xor SC, T2
+ rol $1, T1
+ xor SB, T2
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(1) (DATA)
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(2) (DATA), T1
+ rol $30, SB
+ xor OFFSET(4) (DATA), T1
+ add T2, SE
+ mov SC, T2
+ xor SB, T2
+ xor OFFSET(10) (DATA), T1
+ xor OFFSET(15) (DATA), T1
+ xor SA, T2
+ rol $1, T1
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(2) (DATA)
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ mov OFFSET(3) (DATA), T1
+ xor OFFSET(5) (DATA), T1
+ xor OFFSET(11) (DATA), T1
+ xor OFFSET(0) (DATA), T1
+ rol $30, SA
+ add T2, SD
+ mov SB, T2
+ rol $1, T1
+ mov T1, OFFSET(3) (DATA)
+ xor SA, T2
+ xor SE, T2
+ lea K4VALUE (T1, T2), T2
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ mov OFFSET(4) (DATA), T1
+ add T2, SC
+ rol $30, SE
+ xor OFFSET(6) (DATA), T1
+ mov SA, T2
+ xor OFFSET(12) (DATA), T1
+ xor SE, T2
+ xor OFFSET(1) (DATA), T1
+ rol $1, T1
+ xor SD, T2
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(4) (DATA)
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ rol $30, SD
+ mov OFFSET(5) (DATA), T1
+ add T2, SB
+ xor OFFSET(7) (DATA), T1
+ xor OFFSET(13) (DATA), T1
+ mov SE, T2
+ xor SD, T2
+ xor OFFSET(2) (DATA), T1
+ xor SC, T2
+ rol $1, T1
+ mov T1, OFFSET(5) (DATA)
+ lea K4VALUE (T1, T2), T2
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(6) (DATA), T1
+ xor OFFSET(8) (DATA), T1
+ xor OFFSET(14) (DATA), T1
+ add T2, SA
+ xor OFFSET(3) (DATA), T1
+ mov SD, T2
+ rol $30, SC
+ rol $1, T1
+ xor SC, T2
+ mov T1, OFFSET(6) (DATA)
+ xor SB, T2
+ lea K4VALUE (T1, T2), T2
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ add T2, SE
+ mov OFFSET(7) (DATA), T1
+ xor OFFSET(9) (DATA), T1
+ xor OFFSET(15) (DATA), T1
+ rol $30, SB
+ xor OFFSET(4) (DATA), T1
+ mov SC, T2
+ rol $1, T1
+ mov T1, OFFSET(7) (DATA)
+ xor SB, T2
+ xor SA, T2
+ lea K4VALUE (T1, T2), T2
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ rol $30, SA
+ mov OFFSET(8) (DATA), T1
+ xor OFFSET(10) (DATA), T1
+ add T2, SD
+ xor OFFSET(0) (DATA), T1
+ xor OFFSET(5) (DATA), T1
+ rol $1, T1
+ mov SB, T2
+ mov T1, OFFSET(8) (DATA)
+ xor SA, T2
+ xor SE, T2
+ lea K4VALUE (T1, T2), T2
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ add T2, SC
+ mov SA, T2
+ mov OFFSET(9) (DATA), T1
+ rol $30, SE
+ xor OFFSET(11) (DATA), T1
+ xor OFFSET(1) (DATA), T1
+ xor OFFSET(6) (DATA), T1
+ xor SE, T2
+ xor SD, T2
+ rol $1, T1
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(9) (DATA)
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ rol $30, SD
+ mov OFFSET(10) (DATA), T1
+ xor OFFSET(12) (DATA), T1
+ xor OFFSET(2) (DATA), T1
+ add T2, SB
+ mov SE, T2
+ xor SD, T2
+ xor SC, T2
+ xor OFFSET(7) (DATA), T1
+ rol $1, T1
+ mov T1, OFFSET(10) (DATA)
+ lea K4VALUE (T1, T2), T2
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ mov OFFSET(11) (DATA), T1
+ xor OFFSET(13) (DATA), T1
+ xor OFFSET(3) (DATA), T1
+ add T2, SA
+ mov SD, T2
+ rol $30, SC
+ xor SC, T2
+ xor OFFSET(8) (DATA), T1
+ rol $1, T1
+ xor SB, T2
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(11) (DATA)
+ mov SA, T1
+ rol $5, T1
+ add T1, SE
+ mov OFFSET(12) (DATA), T1
+ add T2, SE
+ xor OFFSET(14) (DATA), T1
+ rol $30, SB
+ mov SC, T2
+ xor OFFSET(4) (DATA), T1
+ xor SB, T2
+ xor SA, T2
+ xor OFFSET(9) (DATA), T1
+ rol $1, T1
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(12) (DATA)
+ mov SE, T1
+ rol $5, T1
+ add T1, SD
+ add T2, SD
+ rol $30, SA
+ mov OFFSET(13) (DATA), T1
+ xor OFFSET(15) (DATA), T1
+ mov SB, T2
+ xor OFFSET(5) (DATA), T1
+ xor SA, T2
+ xor OFFSET(10) (DATA), T1
+ xor SE, T2
+ rol $1, T1
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(13) (DATA)
+ mov SD, T1
+ rol $5, T1
+ add T1, SC
+ mov OFFSET(14) (DATA), T1
+ xor OFFSET(0) (DATA), T1
+ xor OFFSET(6) (DATA), T1
+ add T2, SC
+ rol $30, SE
+ mov SA, T2
+ xor SE, T2
+ xor OFFSET(11) (DATA), T1
+ xor SD, T2
+ rol $1, T1
+ lea K4VALUE (T1, T2), T2
+ mov T1, OFFSET(14) (DATA)
+ mov SC, T1
+ rol $5, T1
+ add T1, SB
+ mov OFFSET(15) (DATA), T1
+ xor OFFSET(1) (DATA), T1
+ xor OFFSET(7) (DATA), T1
+ rol $30, SD
+ add T2, SB
+ xor OFFSET(12) (DATA), T1
+ mov SE, T2
+ xor SD, T2
+ rol $1, T1
+ xor SC, T2
+ lea K4VALUE (T1, T2), T2
+ rol $30, SC
+ mov T1, OFFSET(15) (DATA)
+ mov SB, T1
+ rol $5, T1
+ add T1, SA
+ add T2, SA
- ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE)
- ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE)
- ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE)
- ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE)
- ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE)
+C C Load and byteswap data
+C movl 88(%esp), T2
+C
+C SWAP( 0, %eax) SWAP( 1, %ebx) SWAP( 2, %ecx) SWAP( 3, %edx)
+C SWAP( 4, %eax) SWAP( 5, %ebx) SWAP( 6, %ecx) SWAP( 7, %edx)
+C SWAP( 8, %eax) SWAP( 9, %ebx) SWAP(10, %ecx) SWAP(11, %edx)
+C SWAP(12, %eax) SWAP(13, %ebx) SWAP(14, %ecx) SWAP(15, %edx)
+C
+C C load the state vector
+C movl 84(%esp),T1
+C movl (T1), SA
+C movl 4(T1), SB
+C movl 8(T1), SC
+C movl 12(T1), SD
+C movl 16(T1), SE
+C
+C ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 0)
+C ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 1)
+C ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 2)
+C ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 3)
+C ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 4)
+C
+C ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 5)
+C ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 6)
+C ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 7)
+C ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 8)
+C ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 9)
+C
+C ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 10)
+C ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 11)
+C ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 12)
+C ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 13)
+C ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 14)
+C
+C ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 15)
+C ROUND_F1(SE, SA, SB, SC, SD, 16)
+C ROUND_F1(SD, SE, SA, SB, SC, 17)
+C ROUND_F1(SC, SD, SE, SA, SB, 18)
+C ROUND_F1(SB, SC, SD, SE, SA, 19)
+C
+C ROUND_F2(SA, SB, SC, SD, SE, 20, K2VALUE)
+C ROUND_F2(SE, SA, SB, SC, SD, 21, K2VALUE)
+C ROUND_F2(SD, SE, SA, SB, SC, 22, K2VALUE)
+C ROUND_F2(SC, SD, SE, SA, SB, 23, K2VALUE)
+C ROUND_F2(SB, SC, SD, SE, SA, 24, K2VALUE)
+C
+C ROUND_F2(SA, SB, SC, SD, SE, 25, K2VALUE)
+C ROUND_F2(SE, SA, SB, SC, SD, 26, K2VALUE)
+C ROUND_F2(SD, SE, SA, SB, SC, 27, K2VALUE)
+C ROUND_F2(SC, SD, SE, SA, SB, 28, K2VALUE)
+C ROUND_F2(SB, SC, SD, SE, SA, 29, K2VALUE)
+C
+C ROUND_F2(SA, SB, SC, SD, SE, 30, K2VALUE)
+C ROUND_F2(SE, SA, SB, SC, SD, 31, K2VALUE)
+C ROUND_F2(SD, SE, SA, SB, SC, 32, K2VALUE)
+C ROUND_F2(SC, SD, SE, SA, SB, 33, K2VALUE)
+C ROUND_F2(SB, SC, SD, SE, SA, 34, K2VALUE)
+C
+C ROUND_F2(SA, SB, SC, SD, SE, 35, K2VALUE)
+C ROUND_F2(SE, SA, SB, SC, SD, 36, K2VALUE)
+C ROUND_F2(SD, SE, SA, SB, SC, 37, K2VALUE)
+C ROUND_F2(SC, SD, SE, SA, SB, 38, K2VALUE)
+C ROUND_F2(SB, SC, SD, SE, SA, 39, K2VALUE)
+C
+C ROUND_F3(SA, SB, SC, SD, SE, 40)
+C ROUND_F3(SE, SA, SB, SC, SD, 41)
+C ROUND_F3(SD, SE, SA, SB, SC, 42)
+C ROUND_F3(SC, SD, SE, SA, SB, 43)
+C ROUND_F3(SB, SC, SD, SE, SA, 44)
+C
+C ROUND_F3(SA, SB, SC, SD, SE, 45)
+C ROUND_F3(SE, SA, SB, SC, SD, 46)
+C ROUND_F3(SD, SE, SA, SB, SC, 47)
+C ROUND_F3(SC, SD, SE, SA, SB, 48)
+C ROUND_F3(SB, SC, SD, SE, SA, 49)
+C
+C ROUND_F3(SA, SB, SC, SD, SE, 50)
+C ROUND_F3(SE, SA, SB, SC, SD, 51)
+C ROUND_F3(SD, SE, SA, SB, SC, 52)
+C ROUND_F3(SC, SD, SE, SA, SB, 53)
+C ROUND_F3(SB, SC, SD, SE, SA, 54)
+C
+C ROUND_F3(SA, SB, SC, SD, SE, 55)
+C ROUND_F3(SE, SA, SB, SC, SD, 56)
+C ROUND_F3(SD, SE, SA, SB, SC, 57)
+C ROUND_F3(SC, SD, SE, SA, SB, 58)
+C ROUND_F3(SB, SC, SD, SE, SA, 59)
+C
+C ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
+C ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
+C ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
+C ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
+C ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
+C
+C ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
+C ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
+C ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
+C ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
+C ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
+C
+C ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
+C ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
+C ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
+C ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
+C ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
+C
+C ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE)
+C ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE)
+C ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE)
+C ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE)
+C ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE)
C Update the state vector
movl 84(%esp),T1
popl %ebx
ret
EPILOGUE(_nettle_sha1_compress)
+
+C TODO:
+
+C * Extend loopmixer so that it can exploit associativity, and for
+C example reorder
+C
+C add %eax, %ebx
+C add %ecx, %ebx
+
+C * Use mmx instructions for the data expansion, doing two words at a
+C time.