# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and is likely to be
# ~15% faster. It would be free from references to intermediate ring
# buffer, but put more pressure on L1P [both because the code would be
# larger and won't be using SPLOOP buffer]. There are no plans to
# realize fully unrolled variant though...
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and for own well-being
# NOTE(review): this file is a CRYPTOGAMS perlasm script that emits TI
# C64x+ (TMS320C6000) assembly -- the "asm" metadata hint is wrong.  This
# chunk is a lossy extraction: stray original line numbers are fused into
# the text and many lines are missing; verify against the upstream script
# before attempting to run or assemble anything here.
#
# Output file name is taken from the last command-line argument and STDOUT
# is redirected to it (standard perlasm convention).
35 $output = pop and open STDOUT
,">$output";
# Register roles (C6x A-side/B-side register files), per the inline notes:
#   CTX/INP/NUM  -- function arguments (digest context, input, block count)
#   A..E         -- SHA-1 working state; Arot/F/F0/T/K -- round temporaries
#   X*/TX*       -- message-schedule words; XPA/XPB -- X ring-buffer pointers
#   Actx..Ectx   -- saved outer state (reuses A6, hence "zaps $NUM")
37 ($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
39 ($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
40 ($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
41 ($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
42 ($XPA,$XPB) = ("A5","B5"); # X circular buffer
43 ($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
;; Entry point: sha1_block_data_order(CTX=A4, INP=B4, NUM=A6).
;; Prologue (as visible in this garbled chunk): alias the underscore-
;; prefixed name for old assemblers, early-exit when NUM==0, save FP and
;; carve a 64-byte-aligned ring buffer on the stack, load the five state
;; words A-E from *CTX[0..4], program AMR (via MVK/MVKH of 0x050404) so
;; the $XPA/$XPB pointers address the 64-byte X buffer circularly, and
;; pre-fetch the first (possibly unaligned, hence LDNW) input word.
;; NOTE(review): only the high half of K_00_19 (0x5a820000) is loaded
;; here -- the low-half load is presumably on a line missing from this
;; extraction; confirm against upstream.
48 .if .ASSEMBLER_VERSION
<7000000
52 .asg sha1_block_data_order
,_sha1_block_data_order
64 .global _sha1_block_data_order
65 _sha1_block_data_order
:
66 .asmfunc stack_usage
(64)
67 MV
$NUM,A0
; reassign
$NUM
69 [!A0
] BNOP RA
; if ($NUM==0) return;
70 || [A0
] STW FP
,*SP
--[16] ; save frame pointer
and alloca
(64)
72 [A0
] LDW
*${CTX
}[0],$A ; load A
-E
...
73 || [A0
] AND B0
,SP
,SP
; align stack at
64 bytes
74 [A0
] LDW
*${CTX
}[1],$B
75 || [A0
] SUBAW SP
,2,SP
; reserve two words above buffer
76 [A0
] LDW
*${CTX
}[2],$C
77 || [A0
] MVK
0x00404,B0
78 [A0
] LDW
*${CTX
}[3],$D
79 || [A0
] MVKH
0x50000,B0
; 0x050404, 64 bytes
for $XP[AB
]
80 [A0
] LDW
*${CTX
}[4],$E
81 || [A0
] MVC B0
,AMR
; setup circular addressing
82 LDNW
*${INP
}++,$TX1 ; pre
-fetch input
90 MVKH
0x5a820000,$K ; K_00_19
;; Rounds 0-13 (BODY_00_13): software-pipelined with SPLOOPD.  Each
;; iteration computes T = ROL(A,5) + F_00_19(B,C,D) + E + K + Xi, rotates
;; B into C, byte-swaps the fetched input word with SWAP4, and stores it
;; into the stack X ring buffer through the circular $XPB pointer.
94 ;;==================================================
95 SPLOOPD
5 ; BODY_00_13
104 || ADD
$K,$E,$T ; T
=E
+K
106 XOR
$F0,$F,$F ; F_00_19
(B
,C
,D
)
110 || LDNW
*${INP
}++,$TX1
112 ADD
$F,$T,$T ; T
+=F_00_19
(B
,C
,D
)
113 || ROTL
$B,30,$C ; C
=ROL
(B
,30)
114 || SWAP4
$TX2,$TX3 ; byte swap
116 ADD
$Arot,$T,$T ; T
+=ROL
(A
,5)
119 ADD
$TX3,$T,$A ; A
=T
+Xi
120 || STW
$TX3,*${XPB
}++
;; Round 14 (BODY_14): same F_00_19 round function, but this round starts
;; priming the Xupdate pipeline -- per the inline notes, the LDW fetches
;; from the X ring buffer ($XPA/$XPB) run two iterations ahead of use.
148 ;;==================================================
122 ;;==================================================
;; Round 15 (BODY_15): last F_00_19 round; the Xupdate XOR chain
;; ($X0^$X2 -> $TX0, then ^$TX1) now runs one iteration ahead, per the
;; inline notes, so rounds 16+ can consume freshly updated schedule words.
148 ;;==================================================
149 ROTL
$A,5,$Arot ; BODY_15
152 || ADD
$K,$E,$T ; T
=E
+K
154 XOR
$F0,$F,$F ; F_00_19
(B
,C
,D
)
159 ADD
$F,$T,$T ; T
+=F_00_19
(B
,C
,D
)
160 || ROTL
$B,30,$C ; C
=ROL
(B
,30)
161 || SWAP4
$TX2,$TX2 ; byte swap
162 || XOR
$X0,$X2,$TX0 ; Xupdate XORs are
1 iteration ahead
164 || LDW
*${XPB
}[4],$X2
166 ADD
$Arot,$T,$T ; T
+=ROL
(A
,5)
169 || LDW
*${XPA
}[7],$X8
170 || MV
$TX3,$X13 ; || LDW
*${XPB
}[15],$X13
173 ADD
$TX2,$T,$A ; A
=T
+Xi
174 || STW
$TX2,*${XPB
}++
175 || XOR
$TX0,$TX1,$TX1
;; Rounds 16-19 (BODY_16_19): SPLOOP-pipelined; the round now consumes the
;; Xupdate output ROL(TX1,1) instead of raw input words, still with the
;; F_00_19 choice function.  The trailing MVKH loads the high half of
;; K_20_39 (0x6ed9...) for the next phase -- low-half load not visible in
;; this extraction.
177 ;;==================================================
178 SPLOOPD
5 ; BODY_16_19
184 || ADD
$K,$E,$T ; T
=E
+K
185 || ROTL
$TX1,1,$TX2 ; Xupdate output
187 XOR
$F0,$F,$F ; F_00_19
(B
,C
,D
)
191 ADD
$F,$T,$T ; T
+=F_00_19
(B
,C
,D
)
192 || ROTL
$B,30,$C ; C
=ROL
(B
,30)
195 || LDW
*${XPB
}[4],$X2
197 ADD
$Arot,$T,$T ; T
+=ROL
(A
,5)
200 || LDW
*${XPA
}[7],$X8
201 || MV
$TX3,$X13 ; || LDW
*${XPB
}[15],$X13
204 ADD
$TX2,$T,$A ; A
=T
+Xi
205 || STW
$TX2,*${XPB
}++
206 || XOR
$TX0,$TX1,$TX1
211 MVKH
0x6ed90000,$K ; K_20_39
;; Rounds 20-39 (BODY_20_39): F_20_39 is the parity function (completed by
;; the XOR with $D visible here).  The `$code.=<<___ if (!shift);` line
;; shows this phase is generated by a Perl sub whose argument selects a
;; variant -- it is reused later as BODY_60_78 via &BODY_20_39(-1).
;; MVKH then stages the high half of K_40_59 (0x8f1b...).
215 ;;==================================================
216 SPLOOPD
5 ; BODY_20_39
221 || ADD
$K,$E,$T ; T
=E
+K
222 || ROTL
$TX1,1,$TX2 ; Xupdate output
224 XOR
$D,$F,$F ; F_20_39
(B
,C
,D
)
228 ADD
$F,$T,$T ; T
+=F_20_39
(B
,C
,D
)
229 || ROTL
$B,30,$C ; C
=ROL
(B
,30)
232 || LDW
*${XPB
}[4],$X2
234 ADD
$Arot,$T,$T ; T
+=ROL
(A
,5)
237 || LDW
*${XPA
}[7],$X8
238 || MV
$TX3,$X13 ; || LDW
*${XPB
}[15],$X13
241 ADD
$TX2,$T,$A ; A
=T
+Xi
242 || STW
$TX2,*${XPB
}++ ; last one is redundant
243 || XOR
$TX0,$TX1,$TX1
246 $code.=<<___
if (!shift);
248 MVKH
0x8f1b0000,$K ; K_40_59
;; Rounds 40-59 (BODY_40_59): F_40_59 majority round, same pipelined shape
;; as the previous phases.  Afterwards the high half of K_60_79 (0xca62...)
;; is staged and rounds 60-78 are emitted by reusing the Perl generator:
;; &BODY_20_39(-1), per its trailing comment.
252 ;;==================================================
253 SPLOOPD
5 ; BODY_40_59
261 || ADD
$K,$E,$T ; T
=E
+K
262 || ROTL
$TX1,1,$TX2 ; Xupdate output
264 XOR
$F0,$F,$F ; F_40_59
(B
,C
,D
)
268 ADD
$F,$T,$T ; T
+=F_40_59
(B
,C
,D
)
269 || ROTL
$B,30,$C ; C
=ROL
(B
,30)
272 || LDW
*${XPB
}[4],$X2
274 ADD
$Arot,$T,$T ; T
+=ROL
(A
,5)
277 || LDW
*${XPA
}[7],$X8
278 || MV
$TX3,$X13 ; || LDW
*${XPB
}[15],$X13
281 ADD
$TX2,$T,$A ; A
=T
+Xi
282 || STW
$TX2,*${XPB
}++
283 || XOR
$TX0,$TX1,$TX1
290 MVKH
0xca620000,$K ; K_60_79
292 &BODY_20_39
(-1); # BODY_60_78
;; Final round (BODY_79) merged with the outer-state update: the first
;; word of the next block is pre-fetched (predicated on A0, the remaining
;; block count), and the saved context words Actx..Ectx are folded back
;; into A..E in parallel with the last round's adds.  The epilogue
;; restores SP from FP, reloads the caller's FP, stores the updated
;; digest back to *CTX (the "emit A-E" store; the remaining four stores
;; are missing from this extraction) and clears AMR to leave linear
;; addressing for the caller.  The .cstring and `close STDOUT` are the
;; usual CRYPTOGAMS file footer.
294 ;;==================================================
296 || ROTL
$A,5,$Arot ; BODY_79
298 || ROTL
$TX1,1,$TX2 ; Xupdate output
300 [A0
] LDNW
*${INP
}++,$TX1 ; pre
-fetch input
301 || ADD
$K,$E,$T ; T
=E
+K
302 || XOR
$D,$F,$F ; F_20_39
(B
,C
,D
)
304 ADD
$F,$T,$T ; T
+=F_20_39
(B
,C
,D
)
305 || ADD
$Ectx,$D,$E ; E
=D
,E
+=Ectx
306 || ADD
$Dctx,$C,$D ; D
=C
,D
+=Dctx
307 || ROTL
$B,30,$C ; C
=ROL
(B
,30)
309 ADD
$Arot,$T,$T ; T
+=ROL
(A
,5)
310 || ADD
$Bctx,$A,$B ; B
=A
,B
+=Bctx
312 ADD
$TX2,$T,$A ; A
=T
+Xi
314 ADD
$Actx,$A,$A ; A
+=Actx
315 || ADD
$Cctx,$C,$C ; C
+=Cctx
319 || MV FP
,SP
; restore stack pointer
320 || LDW
*FP
[0],FP
; restore frame pointer
321 STW
$A,*${CTX
}[0] ; emit A
-E
...
324 || MVC B0
,AMR
; clear AMR
331 .cstring
"SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
336 close STDOUT
or die "error closing STDOUT: $!";