#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2016
#
# Initial support for Fujitsu SPARC64 X/X+ comprises minimally
# required key setup and single-block procedures.
#
# April 2016
#
# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
# that parallelizable nature of CBC decrypt and CTR is not utilized
# yet. CBC encrypt on the other hand is as good as it can possibly
# get processing one byte in 4.1 cycles with 128-bit key on SPARC64 X.
# This is ~6x faster than pure software implementation...
#
# July 2016
#
# Switch from faligndata to fshiftorx, which allows to omit alignaddr
# instructions and improve single-block and short-input performance
# with misaligned data.
35 | |
36 | $output = pop; | |
37 | open STDOUT,">$output"; | |
38 | ||
39 | { | |
40 | my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5)); | |
41 | ||
42 | $code.=<<___; | |
d41de45a AP |
43 | #include "sparc_arch.h" |
44 | ||
45 | #define LOCALS (STACK_BIAS+STACK_FRAME) | |
46 | ||
fb65020b AP |
47 | .text |
48 | ||
49 | .globl aes_fx_encrypt | |
50 | .align 32 | |
51 | aes_fx_encrypt: | |
52 | and $inp, 7, $tmp ! is input aligned? | |
d41de45a | 53 | andn $inp, 7, $inp |
d41de45a | 54 | ldd [$key + 0], %f6 ! round[0] |
fb65020b | 55 | ldd [$key + 8], %f8 |
9515acca AP |
56 | mov %o7, %g1 |
57 | ld [$key + 240], $rounds | |
fb65020b | 58 | |
9515acca AP |
59 | 1: call .+8 |
60 | add %o7, .Linp_align-1b, %o7 | |
61 | ||
62 | sll $tmp, 3, $tmp | |
fb65020b AP |
63 | ldd [$inp + 0], %f0 ! load input |
64 | brz,pt $tmp, .Lenc_inp_aligned | |
65 | ldd [$inp + 8], %f2 | |
66 | ||
9515acca | 67 | ldd [%o7 + $tmp], %f14 ! shift left params |
fb65020b | 68 | ldd [$inp + 16], %f4 |
9515acca AP |
69 | fshiftorx %f0, %f2, %f14, %f0 |
70 | fshiftorx %f2, %f4, %f14, %f2 | |
fb65020b AP |
71 | |
72 | .Lenc_inp_aligned: | |
d41de45a | 73 | ldd [$key + 16], %f10 ! round[1] |
fb65020b | 74 | ldd [$key + 24], %f12 |
fb65020b AP |
75 | |
76 | fxor %f0, %f6, %f0 ! ^=round[0] | |
77 | fxor %f2, %f8, %f2 | |
d41de45a AP |
78 | ldd [$key + 32], %f6 ! round[2] |
79 | ldd [$key + 40], %f8 | |
80 | add $key, 32, $key | |
fb65020b AP |
81 | sub $rounds, 4, $rounds |
82 | ||
83 | .Loop_enc: | |
84 | fmovd %f0, %f4 | |
85 | faesencx %f2, %f10, %f0 | |
86 | faesencx %f4, %f12, %f2 | |
87 | ldd [$key + 16], %f10 | |
88 | ldd [$key + 24], %f12 | |
89 | add $key, 32, $key | |
90 | ||
91 | fmovd %f0, %f4 | |
92 | faesencx %f2, %f6, %f0 | |
93 | faesencx %f4, %f8, %f2 | |
94 | ldd [$key + 0], %f6 | |
95 | ldd [$key + 8], %f8 | |
96 | ||
97 | brnz,a $rounds, .Loop_enc | |
98 | sub $rounds, 2, $rounds | |
99 | ||
100 | andcc $out, 7, $tmp ! is output aligned? | |
9515acca | 101 | andn $out, 7, $out |
fb65020b | 102 | mov 0xff, $mask |
9515acca AP |
103 | srl $mask, $tmp, $mask |
104 | add %o7, 64, %o7 | |
105 | sll $tmp, 3, $tmp | |
fb65020b AP |
106 | |
107 | fmovd %f0, %f4 | |
108 | faesencx %f2, %f10, %f0 | |
109 | faesencx %f4, %f12, %f2 | |
9515acca AP |
110 | ldd [%o7 + $tmp], %f14 ! shift right params |
111 | ||
fb65020b AP |
112 | fmovd %f0, %f4 |
113 | faesenclx %f2, %f6, %f0 | |
114 | faesenclx %f4, %f8, %f2 | |
115 | ||
9515acca AP |
116 | bnz,pn %icc, .Lenc_out_unaligned |
117 | mov %g1, %o7 | |
fb65020b AP |
118 | |
119 | std %f0, [$out + 0] | |
120 | retl | |
121 | std %f2, [$out + 8] | |
122 | ||
d41de45a | 123 | .align 16 |
fb65020b | 124 | .Lenc_out_unaligned: |
9515acca AP |
125 | add $out, 16, $inp |
126 | orn %g0, $mask, $tmp | |
127 | fshiftorx %f0, %f0, %f14, %f4 | |
128 | fshiftorx %f0, %f2, %f14, %f6 | |
129 | fshiftorx %f2, %f2, %f14, %f8 | |
fb65020b AP |
130 | |
131 | stda %f4, [$out + $mask]0xc0 ! partial store | |
132 | std %f6, [$out + 8] | |
9515acca | 133 | stda %f8, [$inp + $tmp]0xc0 ! partial store |
d41de45a AP |
134 | retl |
135 | nop | |
ff823ee8 | 136 | .type aes_fx_encrypt,#function |
fb65020b AP |
137 | .size aes_fx_encrypt,.-aes_fx_encrypt |
138 | ||
139 | .globl aes_fx_decrypt | |
140 | .align 32 | |
141 | aes_fx_decrypt: | |
142 | and $inp, 7, $tmp ! is input aligned? | |
d41de45a | 143 | andn $inp, 7, $inp |
d41de45a | 144 | ldd [$key + 0], %f6 ! round[0] |
fb65020b | 145 | ldd [$key + 8], %f8 |
9515acca AP |
146 | mov %o7, %g1 |
147 | ld [$key + 240], $rounds | |
148 | ||
149 | 1: call .+8 | |
150 | add %o7, .Linp_align-1b, %o7 | |
fb65020b | 151 | |
9515acca | 152 | sll $tmp, 3, $tmp |
fb65020b AP |
153 | ldd [$inp + 0], %f0 ! load input |
154 | brz,pt $tmp, .Ldec_inp_aligned | |
155 | ldd [$inp + 8], %f2 | |
156 | ||
9515acca | 157 | ldd [%o7 + $tmp], %f14 ! shift left params |
fb65020b | 158 | ldd [$inp + 16], %f4 |
9515acca AP |
159 | fshiftorx %f0, %f2, %f14, %f0 |
160 | fshiftorx %f2, %f4, %f14, %f2 | |
fb65020b AP |
161 | |
162 | .Ldec_inp_aligned: | |
d41de45a | 163 | ldd [$key + 16], %f10 ! round[1] |
fb65020b | 164 | ldd [$key + 24], %f12 |
fb65020b AP |
165 | |
166 | fxor %f0, %f6, %f0 ! ^=round[0] | |
167 | fxor %f2, %f8, %f2 | |
d41de45a AP |
168 | ldd [$key + 32], %f6 ! round[2] |
169 | ldd [$key + 40], %f8 | |
170 | add $key, 32, $key | |
fb65020b AP |
171 | sub $rounds, 4, $rounds |
172 | ||
173 | .Loop_dec: | |
174 | fmovd %f0, %f4 | |
175 | faesdecx %f2, %f10, %f0 | |
176 | faesdecx %f4, %f12, %f2 | |
177 | ldd [$key + 16], %f10 | |
178 | ldd [$key + 24], %f12 | |
179 | add $key, 32, $key | |
180 | ||
181 | fmovd %f0, %f4 | |
182 | faesdecx %f2, %f6, %f0 | |
183 | faesdecx %f4, %f8, %f2 | |
184 | ldd [$key + 0], %f6 | |
185 | ldd [$key + 8], %f8 | |
186 | ||
187 | brnz,a $rounds, .Loop_dec | |
188 | sub $rounds, 2, $rounds | |
189 | ||
190 | andcc $out, 7, $tmp ! is output aligned? | |
9515acca | 191 | andn $out, 7, $out |
fb65020b | 192 | mov 0xff, $mask |
9515acca AP |
193 | srl $mask, $tmp, $mask |
194 | add %o7, 64, %o7 | |
195 | sll $tmp, 3, $tmp | |
fb65020b AP |
196 | |
197 | fmovd %f0, %f4 | |
198 | faesdecx %f2, %f10, %f0 | |
199 | faesdecx %f4, %f12, %f2 | |
9515acca AP |
200 | ldd [%o7 + $tmp], %f14 ! shift right params |
201 | ||
fb65020b AP |
202 | fmovd %f0, %f4 |
203 | faesdeclx %f2, %f6, %f0 | |
204 | faesdeclx %f4, %f8, %f2 | |
205 | ||
9515acca AP |
206 | bnz,pn %icc, .Ldec_out_unaligned |
207 | mov %g1, %o7 | |
fb65020b AP |
208 | |
209 | std %f0, [$out + 0] | |
210 | retl | |
211 | std %f2, [$out + 8] | |
212 | ||
d41de45a | 213 | .align 16 |
fb65020b | 214 | .Ldec_out_unaligned: |
9515acca AP |
215 | add $out, 16, $inp |
216 | orn %g0, $mask, $tmp | |
217 | fshiftorx %f0, %f0, %f14, %f4 | |
218 | fshiftorx %f0, %f2, %f14, %f6 | |
219 | fshiftorx %f2, %f2, %f14, %f8 | |
fb65020b AP |
220 | |
221 | stda %f4, [$out + $mask]0xc0 ! partial store | |
222 | std %f6, [$out + 8] | |
9515acca | 223 | stda %f8, [$inp + $tmp]0xc0 ! partial store |
d41de45a AP |
224 | retl |
225 | nop | |
ff823ee8 | 226 | .type aes_fx_decrypt,#function |
fb65020b AP |
227 | .size aes_fx_decrypt,.-aes_fx_decrypt |
228 | ___ | |
229 | } | |
230 | { | |
231 | my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5)); | |
232 | $code.=<<___; | |
233 | .globl aes_fx_set_decrypt_key | |
234 | .align 32 | |
235 | aes_fx_set_decrypt_key: | |
236 | b .Lset_encrypt_key | |
237 | mov -1, $inc | |
238 | retl | |
239 | nop | |
ff823ee8 | 240 | .type aes_fx_set_decrypt_key,#function |
fb65020b AP |
241 | .size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key |
242 | ||
243 | .globl aes_fx_set_encrypt_key | |
244 | .align 32 | |
245 | aes_fx_set_encrypt_key: | |
246 | mov 1, $inc | |
d41de45a | 247 | nop |
fb65020b AP |
248 | .Lset_encrypt_key: |
249 | and $inp, 7, $tmp | |
d41de45a | 250 | andn $inp, 7, $inp |
9515acca AP |
251 | sll $tmp, 3, $tmp |
252 | mov %o7, %g1 | |
253 | ||
254 | 1: call .+8 | |
255 | add %o7, .Linp_align-1b, %o7 | |
256 | ||
257 | ldd [%o7 + $tmp], %f10 ! shift left params | |
258 | mov %g1, %o7 | |
fb65020b AP |
259 | |
260 | cmp $bits, 192 | |
261 | ldd [$inp + 0], %f0 | |
262 | bl,pt %icc, .L128 | |
263 | ldd [$inp + 8], %f2 | |
264 | ||
265 | be,pt %icc, .L192 | |
266 | ldd [$inp + 16], %f4 | |
267 | brz,pt $tmp, .L256aligned | |
268 | ldd [$inp + 24], %f6 | |
269 | ||
270 | ldd [$inp + 32], %f8 | |
9515acca AP |
271 | fshiftorx %f0, %f2, %f10, %f0 |
272 | fshiftorx %f2, %f4, %f10, %f2 | |
273 | fshiftorx %f4, %f6, %f10, %f4 | |
274 | fshiftorx %f6, %f8, %f10, %f6 | |
fb65020b AP |
275 | |
276 | .L256aligned: | |
277 | mov 14, $bits | |
278 | and $inc, `14*16`, $tmp | |
279 | st $bits, [$out + 240] ! store rounds | |
280 | add $out, $tmp, $out ! start or end of key schedule | |
281 | sllx $inc, 4, $inc ! 16 or -16 | |
282 | ___ | |
283 | for ($i=0; $i<6; $i++) { | |
284 | $code.=<<___; | |
285 | std %f0, [$out + 0] | |
286 | faeskeyx %f6, `0x10+$i`, %f0 | |
287 | std %f2, [$out + 8] | |
288 | add $out, $inc, $out | |
289 | faeskeyx %f0, 0x00, %f2 | |
290 | std %f4, [$out + 0] | |
291 | faeskeyx %f2, 0x01, %f4 | |
292 | std %f6, [$out + 8] | |
293 | add $out, $inc, $out | |
294 | faeskeyx %f4, 0x00, %f6 | |
295 | ___ | |
296 | } | |
297 | $code.=<<___; | |
298 | std %f0, [$out + 0] | |
299 | faeskeyx %f6, `0x10+$i`, %f0 | |
300 | std %f2, [$out + 8] | |
301 | add $out, $inc, $out | |
302 | faeskeyx %f0, 0x00, %f2 | |
d41de45a AP |
303 | std %f4,[$out + 0] |
304 | std %f6,[$out + 8] | |
fb65020b | 305 | add $out, $inc, $out |
d41de45a AP |
306 | std %f0,[$out + 0] |
307 | std %f2,[$out + 8] | |
fb65020b AP |
308 | retl |
309 | xor %o0, %o0, %o0 ! return 0 | |
310 | ||
311 | .align 16 | |
312 | .L192: | |
313 | brz,pt $tmp, .L192aligned | |
314 | nop | |
315 | ||
316 | ldd [$inp + 24], %f6 | |
9515acca AP |
317 | fshiftorx %f0, %f2, %f10, %f0 |
318 | fshiftorx %f2, %f4, %f10, %f2 | |
319 | fshiftorx %f4, %f6, %f10, %f4 | |
fb65020b AP |
320 | |
321 | .L192aligned: | |
322 | mov 12, $bits | |
323 | and $inc, `12*16`, $tmp | |
324 | st $bits, [$out + 240] ! store rounds | |
325 | add $out, $tmp, $out ! start or end of key schedule | |
326 | sllx $inc, 4, $inc ! 16 or -16 | |
327 | ___ | |
328 | for ($i=0; $i<8; $i+=2) { | |
329 | $code.=<<___; | |
330 | std %f0, [$out + 0] | |
331 | faeskeyx %f4, `0x10+$i`, %f0 | |
332 | std %f2, [$out + 8] | |
333 | add $out, $inc, $out | |
334 | faeskeyx %f0, 0x00, %f2 | |
335 | std %f4, [$out + 0] | |
336 | faeskeyx %f2, 0x00, %f4 | |
337 | std %f0, [$out + 8] | |
338 | add $out, $inc, $out | |
339 | faeskeyx %f4, `0x10+$i+1`, %f0 | |
340 | std %f2, [$out + 0] | |
341 | faeskeyx %f0, 0x00, %f2 | |
342 | std %f4, [$out + 8] | |
343 | add $out, $inc, $out | |
344 | ___ | |
345 | $code.=<<___ if ($i<6); | |
346 | faeskeyx %f2, 0x00, %f4 | |
347 | ___ | |
348 | } | |
349 | $code.=<<___; | |
350 | std %f0, [$out + 0] | |
351 | std %f2, [$out + 8] | |
352 | retl | |
353 | xor %o0, %o0, %o0 ! return 0 | |
354 | ||
355 | .align 16 | |
356 | .L128: | |
357 | brz,pt $tmp, .L128aligned | |
358 | nop | |
359 | ||
360 | ldd [$inp + 16], %f4 | |
9515acca AP |
361 | fshiftorx %f0, %f2, %f10, %f0 |
362 | fshiftorx %f2, %f4, %f10, %f2 | |
fb65020b AP |
363 | |
364 | .L128aligned: | |
365 | mov 10, $bits | |
366 | and $inc, `10*16`, $tmp | |
367 | st $bits, [$out + 240] ! store rounds | |
368 | add $out, $tmp, $out ! start or end of key schedule | |
369 | sllx $inc, 4, $inc ! 16 or -16 | |
370 | ___ | |
371 | for ($i=0; $i<10; $i++) { | |
372 | $code.=<<___; | |
373 | std %f0, [$out + 0] | |
374 | faeskeyx %f2, `0x10+$i`, %f0 | |
375 | std %f2, [$out + 8] | |
376 | add $out, $inc, $out | |
377 | faeskeyx %f0, 0x00, %f2 | |
378 | ___ | |
379 | } | |
380 | $code.=<<___; | |
381 | std %f0, [$out + 0] | |
382 | std %f2, [$out + 8] | |
383 | retl | |
384 | xor %o0, %o0, %o0 ! return 0 | |
ff823ee8 | 385 | .type aes_fx_set_encrypt_key,#function |
fb65020b AP |
386 | .size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key |
387 | ___ | |
388 | } | |
d41de45a AP |
389 | { |
390 | my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5)); | |
391 | my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7)); | |
9515acca | 392 | my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift) |
d41de45a AP |
393 | = map("%f$_",grep { !($_ & 1) } (16 .. 62)); |
394 | my ($ileft,$iright) = ($ialign,$oalign); | |
395 | ||
396 | $code.=<<___; | |
397 | .globl aes_fx_cbc_encrypt | |
398 | .align 32 | |
399 | aes_fx_cbc_encrypt: | |
400 | save %sp, -STACK_FRAME-16, %sp | |
8604a6e0 | 401 | srln $len, 4, $len |
d41de45a | 402 | and $inp, 7, $ialign |
d41de45a | 403 | andn $inp, 7, $inp |
9515acca AP |
404 | brz,pn $len, .Lcbc_no_data |
405 | sll $ialign, 3, $ileft | |
406 | ||
407 | 1: call .+8 | |
408 | add %o7, .Linp_align-1b, %o7 | |
409 | ||
d41de45a AP |
410 | ld [$key + 240], $rounds |
411 | and $out, 7, $oalign | |
412 | ld [$ivp + 0], %f0 ! load ivec | |
9515acca | 413 | andn $out, 7, $out |
d41de45a | 414 | ld [$ivp + 4], %f1 |
9515acca | 415 | sll $oalign, 3, $mask |
d41de45a AP |
416 | ld [$ivp + 8], %f2 |
417 | ld [$ivp + 12], %f3 | |
418 | ||
419 | sll $rounds, 4, $rounds | |
420 | add $rounds, $key, $end | |
421 | ldd [$key + 0], $r0hi ! round[0] | |
422 | ldd [$key + 8], $r0lo | |
423 | ||
424 | add $inp, 16, $inp | |
8604a6e0 | 425 | sub $len, 1, $len |
d41de45a AP |
426 | ldd [$end + 0], $rlhi ! round[last] |
427 | ldd [$end + 8], $rllo | |
428 | ||
429 | mov 16, $inc | |
430 | movrz $len, 0, $inc | |
431 | ldd [$key + 16], %f10 ! round[1] | |
432 | ldd [$key + 24], %f12 | |
433 | ||
9515acca AP |
434 | ldd [%o7 + $ileft], $fshift ! shift left params |
435 | add %o7, 64, %o7 | |
d41de45a AP |
436 | ldd [$inp - 16], $in0 ! load input |
437 | ldd [$inp - 8], $in1 | |
438 | ldda [$inp]0x82, $intail ! non-faulting load | |
439 | brz $dir, .Lcbc_decrypt | |
440 | add $inp, $inc, $inp ! inp+=16 | |
441 | ||
442 | fxor $r0hi, %f0, %f0 ! ivec^=round[0] | |
443 | fxor $r0lo, %f2, %f2 | |
9515acca AP |
444 | fshiftorx $in0, $in1, $fshift, $in0 |
445 | fshiftorx $in1, $intail, $fshift, $in1 | |
446 | nop | |
d41de45a AP |
447 | |
448 | .Loop_cbc_enc: | |
449 | fxor $in0, %f0, %f0 ! inp^ivec^round[0] | |
450 | fxor $in1, %f2, %f2 | |
451 | ldd [$key + 32], %f6 ! round[2] | |
452 | ldd [$key + 40], %f8 | |
453 | add $key, 32, $end | |
454 | sub $rounds, 16*6, $inner | |
455 | ||
456 | .Lcbc_enc: | |
457 | fmovd %f0, %f4 | |
458 | faesencx %f2, %f10, %f0 | |
459 | faesencx %f4, %f12, %f2 | |
460 | ldd [$end + 16], %f10 | |
461 | ldd [$end + 24], %f12 | |
462 | add $end, 32, $end | |
463 | ||
464 | fmovd %f0, %f4 | |
465 | faesencx %f2, %f6, %f0 | |
466 | faesencx %f4, %f8, %f2 | |
467 | ldd [$end + 0], %f6 | |
468 | ldd [$end + 8], %f8 | |
469 | ||
470 | brnz,a $inner, .Lcbc_enc | |
471 | sub $inner, 16*2, $inner | |
472 | ||
473 | fmovd %f0, %f4 | |
474 | faesencx %f2, %f10, %f0 | |
475 | faesencx %f4, %f12, %f2 | |
476 | ldd [$end + 16], %f10 ! round[last-1] | |
477 | ldd [$end + 24], %f12 | |
478 | ||
d41de45a AP |
479 | movrz $len, 0, $inc |
480 | fmovd $intail, $in0 | |
481 | ldd [$inp - 8], $in1 ! load next input block | |
482 | ldda [$inp]0x82, $intail ! non-faulting load | |
483 | add $inp, $inc, $inp ! inp+=16 | |
484 | ||
9515acca AP |
485 | fmovd %f0, %f4 |
486 | faesencx %f2, %f6, %f0 | |
487 | faesencx %f4, %f8, %f2 | |
488 | ||
489 | fshiftorx $in0, $in1, $fshift, $in0 | |
490 | fshiftorx $in1, $intail, $fshift, $in1 | |
491 | ||
d41de45a AP |
492 | fmovd %f0, %f4 |
493 | faesencx %f2, %f10, %f0 | |
494 | faesencx %f4, %f12, %f2 | |
495 | ldd [$key + 16], %f10 ! round[1] | |
496 | ldd [$key + 24], %f12 | |
497 | ||
9515acca AP |
498 | fxor $r0hi, $in0, $in0 ! inp^=round[0] |
499 | fxor $r0lo, $in1, $in1 | |
d41de45a AP |
500 | |
501 | fmovd %f0, %f4 | |
9515acca | 502 | faesenclx %f2, $rlhi, %f0 |
d41de45a AP |
503 | faesenclx %f4, $rllo, %f2 |
504 | ||
d41de45a | 505 | brnz,pn $oalign, .Lcbc_enc_unaligned_out |
9515acca | 506 | nop |
d41de45a | 507 | |
9515acca AP |
508 | std %f0, [$out + 0] |
509 | std %f2, [$out + 8] | |
d41de45a AP |
510 | add $out, 16, $out |
511 | ||
512 | brnz,a $len, .Loop_cbc_enc | |
8604a6e0 | 513 | sub $len, 1, $len |
d41de45a | 514 | |
9515acca AP |
515 | st %f0, [$ivp + 0] ! output ivec |
516 | st %f1, [$ivp + 4] | |
517 | st %f2, [$ivp + 8] | |
518 | st %f3, [$ivp + 12] | |
d41de45a AP |
519 | |
520 | .Lcbc_no_data: | |
521 | ret | |
522 | restore | |
523 | ||
524 | .align 32 | |
525 | .Lcbc_enc_unaligned_out: | |
9515acca | 526 | ldd [%o7 + $mask], $fshift ! shift right params |
d41de45a | 527 | mov 0xff, $mask |
d41de45a AP |
528 | srl $mask, $oalign, $mask |
529 | sub %g0, $ileft, $iright | |
530 | ||
9515acca AP |
531 | fshiftorx %f0, %f0, $fshift, %f6 |
532 | fshiftorx %f0, %f2, $fshift, %f8 | |
d41de45a AP |
533 | |
534 | stda %f6, [$out + $mask]0xc0 ! partial store | |
9515acca | 535 | orn %g0, $mask, $mask |
d41de45a AP |
536 | std %f8, [$out + 8] |
537 | add $out, 16, $out | |
538 | brz $len, .Lcbc_enc_unaligned_out_done | |
9515acca AP |
539 | sub $len, 1, $len |
540 | b .Loop_cbc_enc_unaligned_out | |
541 | nop | |
d41de45a | 542 | |
9515acca | 543 | .align 32 |
d41de45a | 544 | .Loop_cbc_enc_unaligned_out: |
9515acca | 545 | fmovd %f2, $outhead |
d41de45a AP |
546 | fxor $in0, %f0, %f0 ! inp^ivec^round[0] |
547 | fxor $in1, %f2, %f2 | |
548 | ldd [$key + 32], %f6 ! round[2] | |
549 | ldd [$key + 40], %f8 | |
550 | ||
551 | fmovd %f0, %f4 | |
552 | faesencx %f2, %f10, %f0 | |
553 | faesencx %f4, %f12, %f2 | |
554 | ldd [$key + 48], %f10 ! round[3] | |
555 | ldd [$key + 56], %f12 | |
556 | ||
557 | ldx [$inp - 16], %o0 | |
558 | ldx [$inp - 8], %o1 | |
9515acca | 559 | brz $ileft, .Lcbc_enc_aligned_inp |
d41de45a AP |
560 | movrz $len, 0, $inc |
561 | ||
562 | ldx [$inp], %o2 | |
563 | sllx %o0, $ileft, %o0 | |
564 | srlx %o1, $iright, %g1 | |
565 | sllx %o1, $ileft, %o1 | |
566 | or %g1, %o0, %o0 | |
567 | srlx %o2, $iright, %o2 | |
568 | or %o2, %o1, %o1 | |
569 | ||
570 | .Lcbc_enc_aligned_inp: | |
571 | fmovd %f0, %f4 | |
572 | faesencx %f2, %f6, %f0 | |
573 | faesencx %f4, %f8, %f2 | |
574 | ldd [$key + 64], %f6 ! round[4] | |
575 | ldd [$key + 72], %f8 | |
576 | add $key, 64, $end | |
577 | sub $rounds, 16*8, $inner | |
578 | ||
579 | stx %o0, [%sp + LOCALS + 0] | |
580 | stx %o1, [%sp + LOCALS + 8] | |
581 | add $inp, $inc, $inp ! inp+=16 | |
9515acca | 582 | nop |
d41de45a AP |
583 | |
584 | .Lcbc_enc_unaligned: | |
585 | fmovd %f0, %f4 | |
586 | faesencx %f2, %f10, %f0 | |
587 | faesencx %f4, %f12, %f2 | |
588 | ldd [$end + 16], %f10 | |
589 | ldd [$end + 24], %f12 | |
590 | add $end, 32, $end | |
591 | ||
592 | fmovd %f0, %f4 | |
593 | faesencx %f2, %f6, %f0 | |
594 | faesencx %f4, %f8, %f2 | |
595 | ldd [$end + 0], %f6 | |
596 | ldd [$end + 8], %f8 | |
597 | ||
598 | brnz,a $inner, .Lcbc_enc_unaligned | |
599 | sub $inner, 16*2, $inner | |
600 | ||
601 | fmovd %f0, %f4 | |
602 | faesencx %f2, %f10, %f0 | |
603 | faesencx %f4, %f12, %f2 | |
604 | ldd [$end + 16], %f10 ! round[last-1] | |
605 | ldd [$end + 24], %f12 | |
606 | ||
607 | fmovd %f0, %f4 | |
608 | faesencx %f2, %f6, %f0 | |
609 | faesencx %f4, %f8, %f2 | |
9515acca | 610 | |
d41de45a AP |
611 | ldd [%sp + LOCALS + 0], $in0 |
612 | ldd [%sp + LOCALS + 8], $in1 | |
613 | ||
614 | fmovd %f0, %f4 | |
615 | faesencx %f2, %f10, %f0 | |
616 | faesencx %f4, %f12, %f2 | |
617 | ldd [$key + 16], %f10 ! round[1] | |
618 | ldd [$key + 24], %f12 | |
619 | ||
9515acca AP |
620 | fxor $r0hi, $in0, $in0 ! inp^=round[0] |
621 | fxor $r0lo, $in1, $in1 | |
622 | ||
d41de45a | 623 | fmovd %f0, %f4 |
9515acca | 624 | faesenclx %f2, $rlhi, %f0 |
d41de45a AP |
625 | faesenclx %f4, $rllo, %f2 |
626 | ||
9515acca AP |
627 | fshiftorx $outhead, %f0, $fshift, %f6 |
628 | fshiftorx %f0, %f2, $fshift, %f8 | |
d41de45a AP |
629 | std %f6, [$out + 0] |
630 | std %f8, [$out + 8] | |
631 | add $out, 16, $out | |
632 | ||
633 | brnz,a $len, .Loop_cbc_enc_unaligned_out | |
8604a6e0 | 634 | sub $len, 1, $len |
d41de45a AP |
635 | |
636 | .Lcbc_enc_unaligned_out_done: | |
9515acca | 637 | fshiftorx %f2, %f2, $fshift, %f8 |
d41de45a AP |
638 | stda %f8, [$out + $mask]0xc0 ! partial store |
639 | ||
9515acca AP |
640 | st %f0, [$ivp + 0] ! output ivec |
641 | st %f1, [$ivp + 4] | |
642 | st %f2, [$ivp + 8] | |
643 | st %f3, [$ivp + 12] | |
d41de45a AP |
644 | |
645 | ret | |
646 | restore | |
647 | ||
648 | .align 32 | |
649 | .Lcbc_decrypt: | |
9515acca AP |
650 | fshiftorx $in0, $in1, $fshift, $in0 |
651 | fshiftorx $in1, $intail, $fshift, $in1 | |
d41de45a AP |
652 | fmovd %f0, $iv0 |
653 | fmovd %f2, $iv1 | |
654 | ||
655 | .Loop_cbc_dec: | |
656 | fxor $in0, $r0hi, %f0 ! inp^round[0] | |
657 | fxor $in1, $r0lo, %f2 | |
658 | ldd [$key + 32], %f6 ! round[2] | |
659 | ldd [$key + 40], %f8 | |
660 | add $key, 32, $end | |
661 | sub $rounds, 16*6, $inner | |
662 | ||
663 | .Lcbc_dec: | |
664 | fmovd %f0, %f4 | |
665 | faesdecx %f2, %f10, %f0 | |
666 | faesdecx %f4, %f12, %f2 | |
667 | ldd [$end + 16], %f10 | |
668 | ldd [$end + 24], %f12 | |
669 | add $end, 32, $end | |
670 | ||
671 | fmovd %f0, %f4 | |
672 | faesdecx %f2, %f6, %f0 | |
673 | faesdecx %f4, %f8, %f2 | |
674 | ldd [$end + 0], %f6 | |
675 | ldd [$end + 8], %f8 | |
676 | ||
677 | brnz,a $inner, .Lcbc_dec | |
678 | sub $inner, 16*2, $inner | |
679 | ||
680 | fmovd %f0, %f4 | |
681 | faesdecx %f2, %f10, %f0 | |
682 | faesdecx %f4, %f12, %f2 | |
683 | ldd [$end + 16], %f10 ! round[last-1] | |
684 | ldd [$end + 24], %f12 | |
685 | ||
686 | fmovd %f0, %f4 | |
687 | faesdecx %f2, %f6, %f0 | |
688 | faesdecx %f4, %f8, %f2 | |
689 | fxor $iv0, $rlhi, %f6 ! ivec^round[last] | |
690 | fxor $iv1, $rllo, %f8 | |
691 | fmovd $in0, $iv0 | |
692 | fmovd $in1, $iv1 | |
693 | ||
694 | movrz $len, 0, $inc | |
695 | fmovd $intail, $in0 | |
696 | ldd [$inp - 8], $in1 ! load next input block | |
697 | ldda [$inp]0x82, $intail ! non-faulting load | |
698 | add $inp, $inc, $inp ! inp+=16 | |
699 | ||
700 | fmovd %f0, %f4 | |
701 | faesdecx %f2, %f10, %f0 | |
702 | faesdecx %f4, %f12, %f2 | |
703 | ldd [$key + 16], %f10 ! round[1] | |
704 | ldd [$key + 24], %f12 | |
705 | ||
9515acca AP |
706 | fshiftorx $in0, $in1, $fshift, $in0 |
707 | fshiftorx $in1, $intail, $fshift, $in1 | |
d41de45a AP |
708 | |
709 | fmovd %f0, %f4 | |
710 | faesdeclx %f2, %f6, %f0 | |
711 | faesdeclx %f4, %f8, %f2 | |
712 | ||
713 | brnz,pn $oalign, .Lcbc_dec_unaligned_out | |
714 | nop | |
715 | ||
716 | std %f0, [$out + 0] | |
717 | std %f2, [$out + 8] | |
718 | add $out, 16, $out | |
719 | ||
720 | brnz,a $len, .Loop_cbc_dec | |
8604a6e0 | 721 | sub $len, 1, $len |
d41de45a AP |
722 | |
723 | st $iv0, [$ivp + 0] ! output ivec | |
724 | st $iv0#lo, [$ivp + 4] | |
725 | st $iv1, [$ivp + 8] | |
726 | st $iv1#lo, [$ivp + 12] | |
727 | ||
728 | ret | |
729 | restore | |
730 | ||
731 | .align 32 | |
732 | .Lcbc_dec_unaligned_out: | |
9515acca | 733 | ldd [%o7 + $mask], $fshift ! shift right params |
d41de45a | 734 | mov 0xff, $mask |
d41de45a AP |
735 | srl $mask, $oalign, $mask |
736 | sub %g0, $ileft, $iright | |
737 | ||
9515acca AP |
738 | fshiftorx %f0, %f0, $fshift, %f6 |
739 | fshiftorx %f0, %f2, $fshift, %f8 | |
d41de45a | 740 | |
9515acca AP |
741 | stda %f6, [$out + $mask]0xc0 ! partial store |
742 | orn %g0, $mask, $mask | |
743 | std %f8, [$out + 8] | |
d41de45a AP |
744 | add $out, 16, $out |
745 | brz $len, .Lcbc_dec_unaligned_out_done | |
9515acca AP |
746 | sub $len, 1, $len |
747 | b .Loop_cbc_dec_unaligned_out | |
748 | nop | |
d41de45a | 749 | |
9515acca | 750 | .align 32 |
d41de45a AP |
751 | .Loop_cbc_dec_unaligned_out: |
752 | fmovd %f2, $outhead | |
753 | fxor $in0, $r0hi, %f0 ! inp^round[0] | |
754 | fxor $in1, $r0lo, %f2 | |
755 | ldd [$key + 32], %f6 ! round[2] | |
756 | ldd [$key + 40], %f8 | |
757 | ||
758 | fmovd %f0, %f4 | |
759 | faesdecx %f2, %f10, %f0 | |
760 | faesdecx %f4, %f12, %f2 | |
761 | ldd [$key + 48], %f10 ! round[3] | |
762 | ldd [$key + 56], %f12 | |
763 | ||
764 | ldx [$inp - 16], %o0 | |
765 | ldx [$inp - 8], %o1 | |
9515acca | 766 | brz $ileft, .Lcbc_dec_aligned_inp |
d41de45a AP |
767 | movrz $len, 0, $inc |
768 | ||
769 | ldx [$inp], %o2 | |
770 | sllx %o0, $ileft, %o0 | |
771 | srlx %o1, $iright, %g1 | |
772 | sllx %o1, $ileft, %o1 | |
773 | or %g1, %o0, %o0 | |
774 | srlx %o2, $iright, %o2 | |
775 | or %o2, %o1, %o1 | |
776 | ||
777 | .Lcbc_dec_aligned_inp: | |
778 | fmovd %f0, %f4 | |
779 | faesdecx %f2, %f6, %f0 | |
780 | faesdecx %f4, %f8, %f2 | |
781 | ldd [$key + 64], %f6 ! round[4] | |
782 | ldd [$key + 72], %f8 | |
783 | add $key, 64, $end | |
784 | sub $rounds, 16*8, $inner | |
785 | ||
786 | stx %o0, [%sp + LOCALS + 0] | |
787 | stx %o1, [%sp + LOCALS + 8] | |
788 | add $inp, $inc, $inp ! inp+=16 | |
9515acca | 789 | nop |
d41de45a AP |
790 | |
791 | .Lcbc_dec_unaligned: | |
792 | fmovd %f0, %f4 | |
793 | faesdecx %f2, %f10, %f0 | |
794 | faesdecx %f4, %f12, %f2 | |
795 | ldd [$end + 16], %f10 | |
796 | ldd [$end + 24], %f12 | |
797 | add $end, 32, $end | |
798 | ||
799 | fmovd %f0, %f4 | |
800 | faesdecx %f2, %f6, %f0 | |
801 | faesdecx %f4, %f8, %f2 | |
802 | ldd [$end + 0], %f6 | |
803 | ldd [$end + 8], %f8 | |
804 | ||
805 | brnz,a $inner, .Lcbc_dec_unaligned | |
806 | sub $inner, 16*2, $inner | |
807 | ||
808 | fmovd %f0, %f4 | |
809 | faesdecx %f2, %f10, %f0 | |
810 | faesdecx %f4, %f12, %f2 | |
811 | ldd [$end + 16], %f10 ! round[last-1] | |
812 | ldd [$end + 24], %f12 | |
813 | ||
814 | fmovd %f0, %f4 | |
815 | faesdecx %f2, %f6, %f0 | |
816 | faesdecx %f4, %f8, %f2 | |
9515acca | 817 | |
d41de45a AP |
818 | fxor $iv0, $rlhi, %f6 ! ivec^round[last] |
819 | fxor $iv1, $rllo, %f8 | |
820 | fmovd $in0, $iv0 | |
821 | fmovd $in1, $iv1 | |
9515acca AP |
822 | ldd [%sp + LOCALS + 0], $in0 |
823 | ldd [%sp + LOCALS + 8], $in1 | |
d41de45a AP |
824 | |
825 | fmovd %f0, %f4 | |
826 | faesdecx %f2, %f10, %f0 | |
827 | faesdecx %f4, %f12, %f2 | |
828 | ldd [$key + 16], %f10 ! round[1] | |
829 | ldd [$key + 24], %f12 | |
830 | ||
831 | fmovd %f0, %f4 | |
832 | faesdeclx %f2, %f6, %f0 | |
833 | faesdeclx %f4, %f8, %f2 | |
d41de45a | 834 | |
9515acca AP |
835 | fshiftorx $outhead, %f0, $fshift, %f6 |
836 | fshiftorx %f0, %f2, $fshift, %f8 | |
837 | std %f6, [$out + 0] | |
838 | std %f8, [$out + 8] | |
d41de45a AP |
839 | add $out, 16, $out |
840 | ||
841 | brnz,a $len, .Loop_cbc_dec_unaligned_out | |
8604a6e0 | 842 | sub $len, 1, $len |
d41de45a AP |
843 | |
844 | .Lcbc_dec_unaligned_out_done: | |
9515acca | 845 | fshiftorx %f2, %f2, $fshift, %f8 |
d41de45a AP |
846 | stda %f8, [$out + $mask]0xc0 ! partial store |
847 | ||
848 | st $iv0, [$ivp + 0] ! output ivec | |
849 | st $iv0#lo, [$ivp + 4] | |
850 | st $iv1, [$ivp + 8] | |
851 | st $iv1#lo, [$ivp + 12] | |
852 | ||
853 | ret | |
854 | restore | |
855 | .type aes_fx_cbc_encrypt,#function | |
856 | .size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt | |
857 | ___ | |
858 | } | |
859 | { | |
860 | my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5)); | |
861 | my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7)); | |
9515acca | 862 | my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift) |
d41de45a AP |
863 | = map("%f$_",grep { !($_ & 1) } (16 .. 62)); |
864 | my ($ileft,$iright) = ($ialign, $oalign); | |
865 | my $one = "%f14"; | |
866 | ||
867 | $code.=<<___; | |
868 | .globl aes_fx_ctr32_encrypt_blocks | |
869 | .align 32 | |
870 | aes_fx_ctr32_encrypt_blocks: | |
871 | save %sp, -STACK_FRAME-16, %sp | |
8604a6e0 | 872 | srln $len, 0, $len |
8604a6e0 | 873 | and $inp, 7, $ialign |
d41de45a | 874 | andn $inp, 7, $inp |
9515acca AP |
875 | brz,pn $len, .Lctr32_no_data |
876 | sll $ialign, 3, $ileft | |
d41de45a AP |
877 | |
878 | .Lpic: call .+8 | |
9515acca | 879 | add %o7, .Linp_align - .Lpic, %o7 |
d41de45a AP |
880 | |
881 | ld [$key + 240], $rounds | |
882 | and $out, 7, $oalign | |
883 | ld [$ivp + 0], $ctr0 ! load counter | |
9515acca | 884 | andn $out, 7, $out |
d41de45a | 885 | ld [$ivp + 4], $ctr0#lo |
9515acca | 886 | sll $oalign, 3, $mask |
d41de45a AP |
887 | ld [$ivp + 8], $ctr1 |
888 | ld [$ivp + 12], $ctr1#lo | |
9515acca | 889 | ldd [%o7 + 128], $one |
d41de45a AP |
890 | |
891 | sll $rounds, 4, $rounds | |
892 | add $rounds, $key, $end | |
893 | ldd [$key + 0], $r0hi ! round[0] | |
894 | ldd [$key + 8], $r0lo | |
895 | ||
896 | add $inp, 16, $inp | |
897 | sub $len, 1, $len | |
898 | ldd [$key + 16], %f10 ! round[1] | |
899 | ldd [$key + 24], %f12 | |
900 | ||
901 | mov 16, $inc | |
902 | movrz $len, 0, $inc | |
903 | ldd [$end + 0], $rlhi ! round[last] | |
904 | ldd [$end + 8], $rllo | |
905 | ||
9515acca AP |
906 | ldd [%o7 + $ileft], $fshift ! shiftleft params |
907 | add %o7, 64, %o7 | |
d41de45a AP |
908 | ldd [$inp - 16], $in0 ! load input |
909 | ldd [$inp - 8], $in1 | |
910 | ldda [$inp]0x82, $intail ! non-faulting load | |
911 | add $inp, $inc, $inp ! inp+=16 | |
912 | ||
9515acca AP |
913 | fshiftorx $in0, $in1, $fshift, $in0 |
914 | fshiftorx $in1, $intail, $fshift, $in1 | |
d41de45a AP |
915 | |
916 | .Loop_ctr32: | |
917 | fxor $ctr0, $r0hi, %f0 ! counter^round[0] | |
918 | fxor $ctr1, $r0lo, %f2 | |
919 | ldd [$key + 32], %f6 ! round[2] | |
920 | ldd [$key + 40], %f8 | |
921 | add $key, 32, $end | |
922 | sub $rounds, 16*6, $inner | |
923 | ||
924 | .Lctr32_enc: | |
925 | fmovd %f0, %f4 | |
926 | faesencx %f2, %f10, %f0 | |
927 | faesencx %f4, %f12, %f2 | |
928 | ldd [$end + 16], %f10 | |
929 | ldd [$end + 24], %f12 | |
930 | add $end, 32, $end | |
931 | ||
932 | fmovd %f0, %f4 | |
933 | faesencx %f2, %f6, %f0 | |
934 | faesencx %f4, %f8, %f2 | |
935 | ldd [$end + 0], %f6 | |
936 | ldd [$end + 8], %f8 | |
937 | ||
938 | brnz,a $inner, .Lctr32_enc | |
939 | sub $inner, 16*2, $inner | |
940 | ||
941 | fmovd %f0, %f4 | |
942 | faesencx %f2, %f10, %f0 | |
943 | faesencx %f4, %f12, %f2 | |
944 | ldd [$end + 16], %f10 ! round[last-1] | |
945 | ldd [$end + 24], %f12 | |
946 | ||
947 | fmovd %f0, %f4 | |
948 | faesencx %f2, %f6, %f0 | |
949 | faesencx %f4, %f8, %f2 | |
950 | fxor $in0, $rlhi, %f6 ! inp^round[last] | |
951 | fxor $in1, $rllo, %f8 | |
952 | ||
953 | movrz $len, 0, $inc | |
954 | fmovd $intail, $in0 | |
955 | ldd [$inp - 8], $in1 ! load next input block | |
956 | ldda [$inp]0x82, $intail ! non-faulting load | |
957 | add $inp, $inc, $inp ! inp+=16 | |
958 | ||
959 | fmovd %f0, %f4 | |
960 | faesencx %f2, %f10, %f0 | |
961 | faesencx %f4, %f12, %f2 | |
962 | ldd [$key + 16], %f10 ! round[1] | |
963 | ldd [$key + 24], %f12 | |
964 | ||
9515acca AP |
965 | fshiftorx $in0, $in1, $fshift, $in0 |
966 | fshiftorx $in1, $intail, $fshift, $in1 | |
d41de45a AP |
967 | fpadd32 $ctr1, $one, $ctr1 ! increment counter |
968 | ||
969 | fmovd %f0, %f4 | |
970 | faesenclx %f2, %f6, %f0 | |
971 | faesenclx %f4, %f8, %f2 | |
972 | ||
973 | brnz,pn $oalign, .Lctr32_unaligned_out | |
974 | nop | |
975 | ||
976 | std %f0, [$out + 0] | |
977 | std %f2, [$out + 8] | |
978 | add $out, 16, $out | |
979 | ||
980 | brnz,a $len, .Loop_ctr32 | |
981 | sub $len, 1, $len | |
982 | ||
983 | .Lctr32_no_data: | |
984 | ret | |
985 | restore | |
986 | ||
987 | .align 32 | |
988 | .Lctr32_unaligned_out: | |
9515acca | 989 | ldd [%o7 + $mask], $fshift ! shift right params |
d41de45a | 990 | mov 0xff, $mask |
d41de45a AP |
991 | srl $mask, $oalign, $mask |
992 | sub %g0, $ileft, $iright | |
993 | ||
9515acca AP |
994 | fshiftorx %f0, %f0, $fshift, %f6 |
995 | fshiftorx %f0, %f2, $fshift, %f8 | |
d41de45a | 996 | |
9515acca AP |
997 | stda %f6, [$out + $mask]0xc0 ! partial store |
998 | orn %g0, $mask, $mask | |
999 | std %f8, [$out + 8] | |
d41de45a AP |
1000 | add $out, 16, $out |
1001 | brz $len, .Lctr32_unaligned_out_done | |
9515acca AP |
1002 | sub $len, 1, $len |
1003 | b .Loop_ctr32_unaligned_out | |
1004 | nop | |
d41de45a | 1005 | |
9515acca | 1006 | .align 32 |
d41de45a AP |
1007 | .Loop_ctr32_unaligned_out: |
1008 | fmovd %f2, $outhead | |
1009 | fxor $ctr0, $r0hi, %f0 ! counter^round[0] | |
1010 | fxor $ctr1, $r0lo, %f2 | |
1011 | ldd [$key + 32], %f6 ! round[2] | |
1012 | ldd [$key + 40], %f8 | |
1013 | ||
1014 | fmovd %f0, %f4 | |
1015 | faesencx %f2, %f10, %f0 | |
1016 | faesencx %f4, %f12, %f2 | |
1017 | ldd [$key + 48], %f10 ! round[3] | |
1018 | ldd [$key + 56], %f12 | |
1019 | ||
1020 | ldx [$inp - 16], %o0 | |
1021 | ldx [$inp - 8], %o1 | |
9515acca | 1022 | brz $ileft, .Lctr32_aligned_inp |
d41de45a AP |
1023 | movrz $len, 0, $inc |
1024 | ||
1025 | ldx [$inp], %o2 | |
1026 | sllx %o0, $ileft, %o0 | |
1027 | srlx %o1, $iright, %g1 | |
1028 | sllx %o1, $ileft, %o1 | |
1029 | or %g1, %o0, %o0 | |
1030 | srlx %o2, $iright, %o2 | |
1031 | or %o2, %o1, %o1 | |
1032 | ||
1033 | .Lctr32_aligned_inp: | |
1034 | fmovd %f0, %f4 | |
1035 | faesencx %f2, %f6, %f0 | |
1036 | faesencx %f4, %f8, %f2 | |
1037 | ldd [$key + 64], %f6 ! round[4] | |
1038 | ldd [$key + 72], %f8 | |
1039 | add $key, 64, $end | |
1040 | sub $rounds, 16*8, $inner | |
1041 | ||
1042 | stx %o0, [%sp + LOCALS + 0] | |
1043 | stx %o1, [%sp + LOCALS + 8] | |
1044 | add $inp, $inc, $inp ! inp+=16 | |
9515acca | 1045 | nop |
d41de45a AP |
1046 | |
1047 | .Lctr32_enc_unaligned: | |
1048 | fmovd %f0, %f4 | |
1049 | faesencx %f2, %f10, %f0 | |
1050 | faesencx %f4, %f12, %f2 | |
1051 | ldd [$end + 16], %f10 | |
1052 | ldd [$end + 24], %f12 | |
1053 | add $end, 32, $end | |
1054 | ||
1055 | fmovd %f0, %f4 | |
1056 | faesencx %f2, %f6, %f0 | |
1057 | faesencx %f4, %f8, %f2 | |
1058 | ldd [$end + 0], %f6 | |
1059 | ldd [$end + 8], %f8 | |
1060 | ||
1061 | brnz,a $inner, .Lctr32_enc_unaligned | |
1062 | sub $inner, 16*2, $inner | |
1063 | ||
1064 | fmovd %f0, %f4 | |
1065 | faesencx %f2, %f10, %f0 | |
1066 | faesencx %f4, %f12, %f2 | |
1067 | ldd [$end + 16], %f10 ! round[last-1] | |
1068 | ldd [$end + 24], %f12 | |
1069 | fpadd32 $ctr1, $one, $ctr1 ! increment counter | |
1070 | ||
1071 | fmovd %f0, %f4 | |
1072 | faesencx %f2, %f6, %f0 | |
1073 | faesencx %f4, %f8, %f2 | |
1074 | fxor $in0, $rlhi, %f6 ! inp^round[last] | |
1075 | fxor $in1, $rllo, %f8 | |
1076 | ldd [%sp + LOCALS + 0], $in0 | |
1077 | ldd [%sp + LOCALS + 8], $in1 | |
1078 | ||
1079 | fmovd %f0, %f4 | |
1080 | faesencx %f2, %f10, %f0 | |
1081 | faesencx %f4, %f12, %f2 | |
1082 | ldd [$key + 16], %f10 ! round[1] | |
1083 | ldd [$key + 24], %f12 | |
1084 | ||
1085 | fmovd %f0, %f4 | |
1086 | faesenclx %f2, %f6, %f0 | |
1087 | faesenclx %f4, %f8, %f2 | |
1088 | ||
9515acca AP |
1089 | fshiftorx $outhead, %f0, $fshift, %f6 |
1090 | fshiftorx %f0, %f2, $fshift, %f8 | |
1091 | std %f6, [$out + 0] | |
1092 | std %f8, [$out + 8] | |
d41de45a AP |
1093 | add $out, 16, $out |
1094 | ||
1095 | brnz,a $len, .Loop_ctr32_unaligned_out | |
1096 | sub $len, 1, $len | |
1097 | ||
1098 | .Lctr32_unaligned_out_done: | |
9515acca | 1099 | fshiftorx %f2, %f2, $fshift, %f8 |
d41de45a AP |
1100 | stda %f8, [$out + $mask]0xc0 ! partial store |
1101 | ||
1102 | ret | |
1103 | restore | |
1104 | .type aes_fx_ctr32_encrypt_blocks,#function | |
1105 | .size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks | |
9515acca | 1106 | |
d41de45a | 1107 | .align 32 |
9515acca AP |
1108 | .Linp_align: ! fshiftorx parameters for left shift toward %rs1 |
1109 | .byte 0, 0, 64, 0, 0, 64, 0, -64 | |
1110 | .byte 0, 0, 56, 8, 0, 56, 8, -56 | |
1111 | .byte 0, 0, 48, 16, 0, 48, 16, -48 | |
1112 | .byte 0, 0, 40, 24, 0, 40, 24, -40 | |
1113 | .byte 0, 0, 32, 32, 0, 32, 32, -32 | |
1114 | .byte 0, 0, 24, 40, 0, 24, 40, -24 | |
1115 | .byte 0, 0, 16, 48, 0, 16, 48, -16 | |
1116 | .byte 0, 0, 8, 56, 0, 8, 56, -8 | |
1117 | .Lout_align: ! fshiftorx parameters for right shift toward %rs2 | |
1118 | .byte 0, 0, 0, 64, 0, 0, 64, 0 | |
1119 | .byte 0, 0, 8, 56, 0, 8, 56, -8 | |
1120 | .byte 0, 0, 16, 48, 0, 16, 48, -16 | |
1121 | .byte 0, 0, 24, 40, 0, 24, 40, -24 | |
1122 | .byte 0, 0, 32, 32, 0, 32, 32, -32 | |
1123 | .byte 0, 0, 40, 24, 0, 40, 24, -40 | |
1124 | .byte 0, 0, 48, 16, 0, 48, 16, -48 | |
1125 | .byte 0, 0, 56, 8, 0, 56, 8, -56 | |
d41de45a AP |
1126 | .Lone: |
1127 | .word 0, 1 | |
1128 | .asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>" | |
1129 | .align 4 | |
1130 | ___ | |
1131 | } | |
# The purpose of the following subroutines is to explicitly encode VIS
# and AES extension instructions as raw words, so that the module can be
# compiled without having to specify the extensions on the compiler
# command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to retain the
# option of producing a "universal" binary and letting the programmer
# detect at run-time whether the current CPU is VIS-capable.
# Encode a three-operand VIS instruction (%f double registers) as a raw
# ".word" directive so no VIS-aware assembler is required.  Returns the
# textual instruction unchanged when the mnemonic is unknown or an
# operand cannot be encoded (non-%f operand, odd upper register).
sub unvis {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my %visopf = (
        "faligndata" => 0x048,
        "bshuffle"   => 0x04c,
        "fpadd32"    => 0x052,
        "fxor"       => 0x06c,
        "fsrc2"      => 0x078,
    );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";
    my $opf = $visopf{$mnemonic} or return $ref;

    # Reduce each %fN operand to its 5-bit register field.  Double
    # registers >= 32 must be even; they fold bit 5 into bit 0 for the
    # upper-half encoding.
    for my $reg ($rs1, $rs2, $rd) {
        return $ref unless $reg =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            return $ref if $num & 1;            # odd upper reg: not encodable
            $num = ($num | $num >> 5) & 31;     # upper double register form
        }
        $reg = $num;                            # aliased write-back
    }

    return sprintf(".word\t0x%08x !%s",
                   0x81b00000 | $rd << 25 | $rs1 << 14 | $opf << 5 | $rs2,
                   $ref);
}
1166 | ||
# Encode a three-operand VIS address-manipulation instruction operating
# on integer registers (%g/%o/%l/%i) as a raw ".word" directive.
# Returns the textual form unchanged when the mnemonic is unknown or an
# operand is not an integer register.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my %bias = (g => 0, o => 8, l => 16, i => 24);
    my %visopf = (
        "alignaddr"  => 0x018,
        "bmask"      => 0x019,
        "alignaddrl" => 0x01a,
    );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";
    my $opf = $visopf{$mnemonic} or return $ref;

    # Map %gN/%oN/%lN/%iN to the flat 0-31 register number.
    for my $reg ($rs1, $rs2, $rd) {
        return $ref unless $reg =~ /%([goli])([0-9])/;
        $reg = $bias{$1} + $2;                  # aliased write-back
    }

    return sprintf(".word\t0x%08x !%s",
                   0x81b00000 | $rd << 25 | $rs1 << 14 | $opf << 5 | $rs2,
                   $ref);
}
1190 | ||
# Encode a Fujitsu SPARC64 X AES extension instruction as a raw ".word"
# directive (FPop group, op3 = 0x36).  rs2 may be either an even %f
# double register or an immediate (e.g. a faeskeyx round constant given
# as 0x..).  Returns the textual form unchanged when the mnemonic is
# unknown or an operand cannot be encoded.
sub unfx {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my %aesopf = (
        "faesencx"  => 0x90,
        "faesdecx"  => 0x91,
        "faesenclx" => 0x92,
        "faesdeclx" => 0x93,
        "faeskeyx"  => 0x94,
    );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";
    my $opf = $aesopf{$mnemonic};
    return $ref unless defined $opf;

    # rs2: even double register -> 5-bit field, or 0-prefixed literal
    # -> numeric immediate.
    if ($rs2 =~ /%f([0-6]*[02468])/) {
        $rs2 = ($1 | $1 >> 5) & 31;
    } elsif ($rs2 =~ /^0/) {
        $rs2 = oct($rs2);
    }

    # rs1 and rd must be %f double registers.
    for my $reg ($rs1, $rd) {
        return $ref unless $reg =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            return $ref if $num & 1;            # odd upper reg: not encodable
            $num = ($num | $num >> 5) & 31;     # upper double register form
        }
        $reg = $num;                            # aliased write-back
    }

    return sprintf(".word\t0x%08x !%s",
                   2 << 30 | $rd << 25 | 0x36 << 19 | $rs1 << 14 | $opf << 5 | $rs2,
                   $ref);
}
1223 | ||
9515acca AP |
# Encode a four-operand (three source) Fujitsu SPARC64 X extension
# instruction as a raw ".word" directive (op3 = 0x37).  Currently only
# fshiftorx is supported.  Returns the textual form unchanged when the
# mnemonic is unknown or an operand cannot be encoded.
sub unfx3src {
    my ($mnemonic, $rs1, $rs2, $rs3, $rd) = @_;
    my %aesopf = ("fshiftorx" => 0x0b);

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
    my $opf = $aesopf{$mnemonic};
    return $ref unless defined $opf;

    # All four operands are %f double registers.
    for my $reg ($rs1, $rs2, $rs3, $rd) {
        return $ref unless $reg =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            return $ref if $num & 1;            # odd upper reg: not encodable
            $num = ($num | $num >> 5) & 31;     # upper double register form
        }
        $reg = $num;                            # aliased write-back
    }

    return sprintf(".word\t0x%08x !%s",
                   2 << 30 | $rd << 25 | 0x37 << 19 | $rs1 << 14 | $rs3 << 9 | $opf << 5 | $rs2,
                   $ref);
}
1249 | ||
fb65020b AP |
# Post-process the generated assembly line by line: resolve `...`
# expressions, rewrite the %fN#lo shorthand, and translate extension
# mnemonics into raw .word encodings via the helpers above.
foreach (split("\n", $code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    # %fN#lo names the odd half of a double register pair.
    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    # Try each instruction class in turn; the first substitution that
    # fires wins (s///ge returns the substitution count).
    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
     &unfx($1,$2,$3,$4)
    /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
     &unfx3src($1,$2,$3,$4,$5)
    /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
     &unvis($1,$2,$3,$4)
    /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
     &unvis3($1,$2,$3,$4)
    /ge;

    print $_, "\n";
}

close STDOUT;