#!/usr/bin/env perl

# Specific modes implementations for SPARC Architecture 2011. There
# is a T4 dependency though: an ASI value that is not specified in
# the Architecture Manual. But as the SPARC universe is rather
# monocultural, we assume that a processor capable of executing the
# crypto instructions can handle the ASI in question as well. This
# means that we ought to keep our eyes open when new processors
# emerge...
#
# As for the above-mentioned ASI: it's the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, as it reduces
# overall pressure on the memory interface. The benefit can't be
# observed/quantified with the usual benchmarks; on the contrary, you
# can notice that single-thread performance for parallelizable modes
# is ~1.5% worse for the largest block sizes [though a few percent
# better for shorter ones]. All this is based on suggestions from
# David Miller.

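# For reference, the pattern emitted by the block-store paths below
# looks as follows (0xe2 is the T4 ASI value this module relies on;
# the generated code annotates it as ASI_BLK_INIT):
#
#	stda	%f0, [$out]0xe2		! initializing store, skips the read
#	...
#	membar	#StoreLoad|#StoreStore	! re-order against ordinary stores
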
sub asm_init {		# to be called with @ARGV as argument
	for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
	if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
	else			{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}

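# asm_init() is expected to be invoked by the algorithm-specific
# script that requires this module, before any of the alg_*_implement
# generators below are used, e.g. (a hypothetical driver sketch):
#
#	require "sparcv9_modes.pl";
#	&asm_init(@ARGV);	# selects 64-bit ABI on -m64/-xarch=v9
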
# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		!	if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!		$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!		$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}

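# A wrapper script is expected to instantiate these generators once
# per algorithm and key length, e.g. (a hypothetical sketch; the real
# drivers live in the algorithm-specific perlasm files):
#
#	for my $bits (128, 192, 256) {
#		&alg_cbc_encrypt_implement("aes", $bits);
#		&alg_cbc_decrypt_implement("aes", $bits);
#	}
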
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		!	if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!		$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!		$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 32, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}

sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		!	if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!		$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!		$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}

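# Note on the counter arithmetic above: only the least significant
# 32-bit word of the counter (kept in %l7) is incremented, and the
# "srl %l7, 0, %l7" ("clruw") after each increment lets that word wrap
# around without carrying into the upper 96 bits, which is what a
# *_ctr32_* interface calls for. Roughly, in Perl terms (a sketch,
# not emitted code):
#
#	$ctr_lo = ($ctr_lo + 1) & 0xffffffff;
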
sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save		%sp, -$::frame-16, %sp

	mov		$ivec, %o0
	add		%fp, $::bias-16, %o1
	call		${alg}_t4_encrypt
	mov		$key2, %o2

	add		%fp, $::bias-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, $::bias-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	bmask		%l7, %g0, %g0		! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_${dir}ckey
	and		$len, 15, $rem
	and		$len, -16, $len
___
$::code.=<<___ if ($dir eq "de");
	mov		0, %l7
	movrnz		$rem, 16, %l7
	sub		$len, %l7, $len
___
$::code.=<<___;

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		!	if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!		$len<256 ||
	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !		$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
___
$::code.=<<___ if ($dir eq "de");
	brz,pn		$len, .L${bits}_xts_${dir}steal
___
$::code.=<<___;
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_xts_${dir}loop2x
	srlx		$len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 16, $out

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 32, $out

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_xts_${dir}loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$::code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std		%f0, [%fp + $::bias-16]	! copy of output
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 8f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %o3
	xor		%l7, %o2, %o2

	movxtod		%o2, %f12
	movxtod		%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	std		%f0, [%fp + $::bias-16]
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	add		$out, 16, $out
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

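# The "next tweak value" sequences above (srax/addcc/and 0x87/addxc/
# xor) multiply the 128-bit tweak held in %g3:%g2 by x in GF(2^128)
# with the standard XTS reduction polynomial x^128+x^7+x^2+x+1: shift
# the whole tweak left by one bit and, if the top bit fell off, fold
# 0x87 back into the low word. A rough Perl model of one such step,
# with $hi standing for %g3 and $lo for %g2 (a sketch only):
#
#	my $poly = ($hi >> 63) ? 0x87 : 0;	# 0x87 <=> x^7+x^2+x+1
#	$hi = (($hi << 1) | ($lo >> 63)) & (2**64-1);
#	$lo = (($lo << 1) & (2**64-1)) ^ $poly;
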
# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to reserve the option of producing a
# "universal" binary, and to let the programmer detect at run-time
# whether the current CPU is VIS capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fnot2"		=> 0x066,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

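# Example of what unvis() produces: for "faligndata %f0,%f2,%f4"
# (opf 0x048, rs1=0, rs2=2, rd=4) the sprintf above should yield
#
#	.word	0x89b00902 !faligndata	%f0,%f2,%f4
#
# i.e. 0x81b00000|4<<25|0<<14|0x48<<5|2, computed here from the
# encoder's own formula and shown for illustration only.
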
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016,
		"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_round {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia_f {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia3 {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = (	"camellia_fl"	=> 0x13c,
		"camellia_fli"	=> 0x13d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unmovxtox {		# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movxtod"	=> 0x118,
		"movwtos"	=> 0x119	);

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
	foreach ($rs,$rd) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    $_=$bias{$1}+$2;
	    if ($2>=32) {
		return $ref if ($2&1);
		# re-encode for upper double register addressing
		$_=($2|$2>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
			$ref;
    } else {
	return $ref;
    }
}

sub emit_assembler {
    foreach (split("\n",$::code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	 /geo or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	 /geo or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	 /geo or
	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	 /geo or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /geo or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /geo;

	print $_,"\n";
    }
}
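# Typical flow in a consumer script (a hypothetical sketch): let the
# alg_*_implement generators above accumulate source text in $::code,
# then post-process and print it in one go:
#
#	&asm_init(@ARGV);
#	&alg_ctr32_implement("aes", 256);
#	...
#	&emit_assembler();
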
1;