#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unroll is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and for own well-being
# zero it upon entry.

# Skip leading arguments until one looks like an output file name
# (word characters followed by an extension); redirect STDOUT there.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";
# Register assignment: the generated assembly refers to machine
# registers only through these Perl variables.
($CTXA,$INP,$NUM) = ("A4","B4","A6");	# arguments
$K256="A3";				# pointer into K256[] table

# SHA-256 working state and temporaries, split across the two
# register files (A side / B side) to feed both datapaths.
($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");		# circular/ring buffer pointers
$CTXB=$t2e;				# B-side alias for the ctx pointer

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");			# $Maj shares a register with $T2
# The entire assembly module is emitted as one interpolated heredoc.
# \$PCR_OFFSET is escaped so it reaches the assembler literally.
$code.=<<___;
	.text
	.if	__TI_EABI__
	.nocmp
	.asg	sha256_block_data_order,_sha256_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0				; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA				; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	ADDKPC	_sha256_block_data_order,B2
|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
	.if	__TI_EABI__
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	\$PCR_OFFSET(K256,_sha256_block_data_order),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	\$PCR_OFFSET(K256,_sha256_block_data_order),$K256
	.else
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	(K256-_sha256_block_data_order),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	(K256-_sha256_block_data_order),$K256
	.endif
   [A0]	MVC	B1,AMR				; setup circular addressing
|| [A0]	MV	SP,$Xia
   [A0]	MV	SP,$Xib
|| [A0]	ADD	B2,$K256,$K256
|| [A0]	MV	$CTXA,$CTXB
|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
	LDW	*${CTXA}[0],$A			; load ctx
||	LDW	*${CTXB}[4],$E
	LDW	*${CTXA}[1],$B
||	LDW	*${CTXB}[5],$F
	LDW	*${CTXA}[2],$C
||	LDW	*${CTXB}[6],$G
	LDW	*${CTXA}[3],$D
||	LDW	*${CTXB}[7],$H

	LDNW	*$INP++,$Xn			; pre-fetch input
	LDW	*$K256++,$K			; pre-fetch K256[0]
	MVK	14,B0				; loop counters
	MVK	47,B1
||	ADDAW	$Xia,9,$Xia
outerloop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MV	$E,$Ectx
||	MVD	$B,$Bctx
||	MVD	$F,$Fctx
	MV	$C,$Cctx
||	MV	$G,$Gctx
||	MVD	$D,$Dctx
||	MVD	$H,$Hctx
||	SWAP4	$Xn,$X0

	SPLOOPD	8				; BODY_00_14
||	MVC	B0,ILC
||	SWAP2	$X0,$X0

	LDNW	*$INP++,$Xn
||	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X14
||	SWAP4	$Xn,$X0
	SWAP2	$X0,$X0
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
	MV	$B,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
	SPKERNEL

	ROTL	$A,30,$S0			; BODY_15
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X15
	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
||	ROTL	$B,0,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2

	SPLOOPD	10				; BODY_16_63
||	MVC	B1,ILC
||	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled

	XOR	$t0e,$s0,$s0
||	XOR	$t0a,$s1,$s1
||	MV	$X15,$X14
||	MV	$X1,$Xn
	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	ADD	$s0,$X0,$X0			; X[i] += sigma0(X[i+1])
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$H,$K,$T1			; T1 = h + K256[i]
||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
||	ADD	$X0,$T1,$T1			; T1 += X[i]
||	STW	$X0,*$Xib++
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
||	MV	$X0,$X15
||	ROTL	$G,0,$H				; h = g
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	MV	$F,$G				; g = f
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*++$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$B,$C				; c = b
	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
||	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
	SPKERNEL

   [A0]	B	outerloop?
|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
|| [A0]	ADDK	-260,$K256			; rewind K256
||	ADD	$Actx,$A,$A			; accumulate ctx
||	ADD	$Ectx,$E,$E
||	ADD	$Bctx,$B,$B
	ADD	$Fctx,$F,$F
||	ADD	$Cctx,$C,$C
||	ADD	$Gctx,$G,$G
||	ADD	$Dctx,$D,$D
||	ADD	$Hctx,$H,$H
|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]

  [!A0]	BNOP	RA
||[!A0]	MV	$CTXA,$CTXB
  [!A0]	MV	FP,SP				; restore stack pointer
||[!A0]	LDW	*FP[0],FP			; restore frame pointer
  [!A0]	STW	$A,*${CTXA}[0]			; save ctx
||[!A0]	STW	$E,*${CTXB}[4]
||[!A0]	MVK	0,B0
  [!A0]	STW	$B,*${CTXA}[1]
||[!A0]	STW	$F,*${CTXB}[5]
||[!A0]	MVC	B0,AMR				; clear AMR
	STW	$C,*${CTXA}[2]
||	STW	$G,*${CTXB}[6]
	STW	$D,*${CTXA}[3]
||	STW	$H,*${CTXB}[7]
	.endasmfunc

	.if	__TI_EABI__
	.sect	".text:sha_asm.const"
	.else
	.sect	".const:sha_asm"
	.endif
	.align	128
K256:
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___
# Emit the generated assembly; check close() so buffered write
# errors (e.g. full disk) are not silently dropped.
print $code;
close STDOUT or die "error closing STDOUT: $!";