#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unroll is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and for own well-being
# zero it upon entry.
# Output file is the last command-line argument, as in the other
# CRYPTOGAMS generators; when given, redirect STDOUT to it so the
# generated assembly lands there.  Fail loudly if the file cannot be
# opened instead of silently spilling assembly to the terminal.
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

($CTXA,$INP,$NUM)=("A4","B4","A6");	# arguments

$K256="A3";				# pointer into the K256 table

# Working set: the eight SHA-256 state words a..h, their per-block
# snapshots (*ctx, accumulated at the end of each 64-byte block),
# Sigma/sigma temporaries and message-schedule words.  The set is split
# evenly across the A and B register files (A16-A31 / B16-B31) so that
# paired instructions can issue in parallel on both datapaths.
($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");		# circular/ring buffer

$CTXB=$t2e;				# reuses B29; live ranges don't overlap

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");			# Maj shares A24 with T2
# The generated module: one software-pipelined (SPLOOP) pass per 64-byte
# block.  The right-hand ';' comments track the textbook SHA-256 round
# function; the instruction ORDER, however, is dictated by the C64x+
# pipeline schedule and must not be disturbed.  Note that AMR is
# programmed for circular addressing of the 16-word X[] ring buffer and
# cleared again on exit.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.nocmp
	.asg	sha256_block_data_order,_sha256_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
	.asmfunc stack_usage(64)
	MV	$NUM,A0			; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	ADDKPC	__sha256_block,B2
|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
	.if	__TI_EABI__
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
	.else
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	(K256-__sha256_block),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	(K256-__sha256_block),$K256
	.endif
   [A0]	MVC	B1,AMR			; setup circular addressing
|| [A0]	MV	SP,$Xia
   [A0]	MV	SP,$Xib
|| [A0]	ADD	B2,$K256,$K256
|| [A0]	MV	$CTXA,$CTXB
|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
	LDW	*${CTXA}[0],$A		; load ctx
||	LDW	*${CTXB}[4],$E
	LDW	*${CTXA}[1],$B
||	LDW	*${CTXB}[5],$F
	LDW	*${CTXA}[2],$C
||	LDW	*${CTXB}[6],$G
	LDW	*${CTXA}[3],$D
||	LDW	*${CTXB}[7],$H

	LDNW	*$INP++,$Xn		; pre-fetch input
	LDW	*$K256++,$K		; pre-fetch K256[0]
	MVK	14,B0			; loop counters
	MVK	47,B1
||	ADDAW	$Xia,9,$Xia
outerloop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MV	$E,$Ectx
||	MVD	$B,$Bctx
||	MVD	$F,$Fctx
	MV	$C,$Cctx
||	MV	$G,$Gctx
||	MVD	$D,$Dctx
||	MVD	$H,$Hctx
||	SWAP4	$Xn,$X0

	SPLOOPD	8			; BODY_00_14
||	MVC	B0,ILC
||	SWAP2	$X0,$X0

	LDNW	*$INP++,$Xn
||	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1		; T1 = h + K256[i]
	ADD	$X0,$T1,$T1		; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H			; h = g
||	MV	$F,$G			; g = f
||	MV	$X0,$X14
||	SWAP4	$Xn,$X0
	SWAP2	$X0,$X0
||	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
	MV	$B,$C			; c = b
||	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2
	SPKERNEL

	ROTL	$A,30,$S0		; BODY_15
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	LDW	*${Xib}[1],$Xn		; modulo-scheduled
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
||	LDW	*${Xib}[2],$X1		; modulo-scheduled
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1		; T1 = h + K256[i]
	ADD	$X0,$T1,$T1		; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H			; h = g
||	MV	$F,$G			; g = f
||	MV	$X0,$X15
	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
||	MV	$Xn,$X0			; modulo-scheduled
||	LDW	*$Xia,$X9		; modulo-scheduled
||	ROTL	$X1,25,$t0e		; modulo-scheduled
||	ROTL	$X14,15,$t0a		; modulo-scheduled
	SHRU	$X1,3,$s0		; modulo-scheduled
||	SHRU	$X14,10,$s1		; modulo-scheduled
||	ROTL	$B,0,$C			; c = b
||	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2

	SPLOOPD	10			; BODY_16_63
||	MVC	B1,ILC
||	ROTL	$X1,14,$t1e		; modulo-scheduled
||	ROTL	$X14,13,$t1a		; modulo-scheduled

	XOR	$t0e,$s0,$s0
||	XOR	$t0a,$s1,$s1
||	MV	$X15,$X14
||	MV	$X1,$Xn
	XOR	$t1e,$s0,$s0		; sigma0(X[i+1])
||	XOR	$t1a,$s1,$s1		; sigma1(X[i+14])
||	LDW	*${Xib}[2],$X1		; modulo-scheduled
	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	ADD	$X9,$X0,$X0		; X[i] += X[i+9]
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
||	ADD	$s0,$X0,$X0		; X[i] += sigma0(X[i+1])
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$H,$K,$T1		; T1 = h + K256[i]
||	ADD	$s1,$X0,$X0		; X[i] += sigma1(X[i+14])
	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
||	ADD	$X0,$T1,$T1		; T1 += X[i]
||	STW	$X0,*$Xib++
	XOR	$t1a,$S0,$S0		; Sigma0(a)
||	XOR	$t1e,$S1,$S1		; Sigma1(e)
||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
||	MV	$X0,$X15
||	ROTL	$G,0,$H			; h = g
||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
||	MV	$F,$G			; g = f
||	MV	$Xn,$X0			; modulo-scheduled
||	LDW	*++$Xia,$X9		; modulo-scheduled
||	ROTL	$X1,25,$t0e		; modulo-scheduled
||	ROTL	$X14,15,$t0a		; modulo-scheduled
	ROTL	$X1,14,$t1e		; modulo-scheduled
||	ROTL	$X14,13,$t1a		; modulo-scheduled
||	MV	$E,$F			; f = e
||	ADD	$D,$T1,$E		; e = d + T1
||	MV	$C,$D			; d = c
||	MV	$B,$C			; c = b
	MV	$A,$B			; b = a
||	ADD	$T1,$T2,$A		; a = T1 + T2
||	SHRU	$X1,3,$s0		; modulo-scheduled
||	SHRU	$X14,10,$s1		; modulo-scheduled
	SPKERNEL

   [A0]	B	outerloop?
|| [A0]	LDNW	*$INP++,$Xn		; pre-fetch input
|| [A0]	ADDK	-260,$K256		; rewind K256
||	ADD	$Actx,$A,$A		; accumulate ctx
||	ADD	$Ectx,$E,$E
||	ADD	$Bctx,$B,$B
	ADD	$Fctx,$F,$F
||	ADD	$Cctx,$C,$C
||	ADD	$Gctx,$G,$G
||	ADD	$Dctx,$D,$D
||	ADD	$Hctx,$H,$H
|| [A0]	LDW	*$K256++,$K		; pre-fetch K256[0]

  [!A0]	BNOP	RA
||[!A0]	MV	$CTXA,$CTXB
  [!A0]	MV	FP,SP			; restore stack pointer
||[!A0]	LDW	*FP[0],FP		; restore frame pointer
  [!A0]	STW	$A,*${CTXA}[0]		; save ctx
||[!A0]	STW	$E,*${CTXB}[4]
||[!A0]	MVK	0,B0
  [!A0]	STW	$B,*${CTXA}[1]
||[!A0]	STW	$F,*${CTXB}[5]
||[!A0]	MVC	B0,AMR			; clear AMR
	STW	$C,*${CTXA}[2]
||	STW	$G,*${CTXB}[6]
	STW	$D,*${CTXA}[3]
||	STW	$H,*${CTXB}[7]
	.endasmfunc

	.if	__TI_EABI__
	.sect	".text:sha_asm.const"
	.else
	.sect	".const:sha_asm"
	.endif
	.align	128
K256:
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4

___
317 | ||
318 | print $code; | |
a21314db | 319 | close STDOUT or die "error closing STDOUT: $!"; |