3 # Mode-specific implementations for SPARC Architecture 2011. There
4 # is a T4 dependency though: an ASI value that is not specified in the
5 # Architecture Manual. But as the SPARC universe is rather monocultural,
6 # we assume that a processor capable of executing crypto instructions
7 # can handle the ASI in question as well. This means that we ought to
8 # keep our eyes open when new processors emerge...
10 # As for the above-mentioned ASI: it is the so-called "block initializing
11 # store", which cancels the "read" in "read-update-write" on cache lines.
12 # This is a "cooperative" optimization, as it reduces overall pressure
13 # on the memory interface. Benefits can't be observed/quantified with
14 # usual benchmarks; on the contrary, you can notice that single-thread
15 # performance for parallelizable modes is ~1.5% worse for the largest
16 # block sizes [though a few percent better for not so long ones]. All
17 # this is based on suggestions from David Miller.
# asm_init(@flags) -- derive ABI-dependent constants from compiler flags.
#
# To be called with @ARGV as argument. If any flag contains "-m64" or
# "-xarch=v9", $::abibits is set to 64. It is never reset here, so a value
# assigned by the caller beforehand is honoured as well. Afterwards:
#
#   64-bit ABI:  $::bias=2047, $::frame=192, $::size_t_cc="%xcc"
#   otherwise:   $::bias=0,    $::frame=112, $::size_t_cc="%icc"
#
# These globals are interpolated into the assembly templates in this file
# (e.g. "save %sp, -$::frame, %sp" and the movleu/bgu condition codes).
sub asm_init
{
    for my $flag (@_) {
        $::abibits = 64 if ($flag =~ /\-m64/ || $flag =~ /\-xarch\=v9/);
    }

    # $::abibits is still undefined when no 64-bit flag was seen and the
    # caller did not preset it; treat that as the 32-bit ABI without
    # comparing an undefined value numerically.
    if (defined($::abibits) && $::abibits == 64) {
        $::bias      = 2047;
        $::frame     = 192;
        $::size_t_cc = "%xcc";
    }
    else {
        $::bias      = 0;
        $::frame     = 112;
        $::size_t_cc = "%icc";
    }
}
# Register names shared by the generators below and interpolated into the
# assembly templates. map() yields "%i0".."%i5"; only the first five are
# bound, so $inp..$ivec name %i0..%i4.
26 my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# Working registers: map() yields "%l0".."%l7"; the six variables name
# %l0..%l5. %l6 and %l7 get no variable here (%l7 appears literally in the
# templates).
28 my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
# alg_cbc_encrypt_implement
#
# Appends to $::code the assembly text of ${alg}${bits}_t4_cbc_encrypt
# (.globl/.type/.size are emitted for that symbol). The generated routine
# calls _${alg}${bits}_load_enckey and, per block,
# _${alg}${bits}_encrypt_1x; neither is defined in this part of the file.
# Template fragments guarded by $::evp / !$::evp select how the IV is
# loaded and written back; the !$::evp variant copes with an unaligned IV
# pointer (alignaddr/faligndata on load, partial stores on write-out).
# Besides the generic loop there is a .L${bits}cbc_enc_blk path that stores
# output through ASI 0xe2 (ASI_BLK_INIT, T4-specific); per the inline
# comments $blk_init is zeroed, and that path skipped, when $out&7,
# $len<128 or $inp==$out.
# NOTE(review): $alg and $bits are presumably taken from @_ as in the
# sibling generators; that assignment is not visible in this listing.
30 sub alg_cbc_encrypt_implement
{
34 .globl
${alg
}${bits
}_t4_cbc_encrypt
36 ${alg
}${bits
}_t4_cbc_encrypt
:
37 save
%sp, -$::frame
, %sp
38 sub $inp, $out, $blk_init ! $inp!=$out
40 $::code
.=<<___
if (!$::evp
);
41 andcc
$ivec, 7, $ivoff
42 alignaddr
$ivec, %g0, $ivec
44 ldd
[$ivec + 0], %f0 ! load ivec
48 faligndata
%f0, %f2, %f0
49 faligndata
%f2, %f4, %f2
52 $::code
.=<<___
if ($::evp
);
60 prefetch
[$inp + 63], 20
61 call _
${alg
}${bits
}_load_enckey
67 sub $iright, $ileft, $iright
70 movrnz
$ooff, 0, $blk_init ! if ( $out&7 ||
71 movleu
$::size_t_cc
, 0, $blk_init ! $len<128 ||
72 brnz
,pn
$blk_init, .L
${bits
}cbc_enc_blk
! $inp==$out)
73 srl
$omask, $ooff, $omask
75 alignaddrl
$out, %g0, $out
79 .L
${bits
}_cbc_enc_loop
:
86 srlx
%o1, $iright, %g1
89 srlx
%o2, $iright, %o2
92 xor %g4, %o0, %o0 ! ^= rk
[0]
97 fxor
%f12, %f0, %f0 ! ^= ivec
99 prefetch
[$out + 63], 22
100 prefetch
[$inp + 16+63], 20
101 call _
${alg
}${bits
}_encrypt_1x
109 brnz
,pt
$len, .L
${bits
}_cbc_enc_loop
112 $::code
.=<<___
if ($::evp
);
118 $::code
.=<<___
if (!$::evp
);
122 std
%f0, [$ivec + 0] ! write out ivec
130 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
131 ! and ~3x deterioration
133 faligndata
%f0, %f0, %f4 ! handle unaligned output
134 faligndata
%f0, %f2, %f6
135 faligndata
%f2, %f2, %f8
137 stda
%f4, [$out + $omask]0xc0 ! partial store
140 orn
%g0, $omask, $omask
141 stda
%f8, [$out + $omask]0xc0 ! partial store
143 brnz
,pt
$len, .L
${bits
}_cbc_enc_loop
+4
144 orn
%g0, $omask, $omask
146 $::code
.=<<___
if ($::evp
);
152 $::code
.=<<___
if (!$::evp
);
156 std
%f0, [$ivec + 0] ! write out ivec
162 3: alignaddrl
$ivec, $ivoff, %g0 ! handle unaligned ivec
164 srl
$omask, $ivoff, $omask
165 faligndata
%f0, %f0, %f4
166 faligndata
%f0, %f2, %f6
167 faligndata
%f2, %f2, %f8
168 stda
%f4, [$ivec + $omask]0xc0
171 orn
%g0, $omask, $omask
172 stda
%f8, [$ivec + $omask]0xc0
178 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
180 .L
${bits
}cbc_enc_blk
:
181 add
$out, $len, $blk_init
182 and $blk_init, 63, $blk_init ! tail
183 sub $len, $blk_init, $len
184 add
$blk_init, 15, $blk_init ! round up to
16n
186 srl
$blk_init, 4, $blk_init
188 .L
${bits
}_cbc_enc_blk_loop
:
194 sllx
%o0, $ileft, %o0
195 srlx
%o1, $iright, %g1
196 sllx
%o1, $ileft, %o1
198 srlx
%o2, $iright, %o2
201 xor %g4, %o0, %o0 ! ^= rk
[0]
206 fxor
%f12, %f0, %f0 ! ^= ivec
208 prefetch
[$inp + 16+63], 20
209 call _
${alg
}${bits
}_encrypt_1x
213 stda
%f0, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
215 stda
%f2, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
216 brnz
,pt
$len, .L
${bits
}_cbc_enc_blk_loop
219 membar
#StoreLoad|#StoreStore
220 brnz
,pt
$blk_init, .L
${bits
}_cbc_enc_loop
223 $::code
.=<<___
if ($::evp
);
229 $::code
.=<<___
if (!$::evp
);
233 std
%f0, [$ivec + 0] ! write out ivec
239 .type
${alg
}${bits
}_t4_cbc_encrypt
,#function
240 .size
${alg
}${bits
}_t4_cbc_encrypt
,.-${alg
}${bits
}_t4_cbc_encrypt
# alg_cbc_decrypt_implement($alg, $bits)
#
# Appends to $::code the assembly text of ${alg}${bits}_t4_cbc_decrypt.
# The generated routine calls _${alg}${bits}_load_deckey, then processes
# input in a one-block loop (.L${bits}_cbc_dec_loop, calling
# _${alg}${bits}_decrypt_1x) and a two-block loop
# (.L${bits}_cbc_dec_loop2x, calling _${alg}${bits}_decrypt_2x); those
# helpers are not defined in this part of the file. $::evp / !$::evp
# fragments select how the IV is loaded and written back; the !$::evp
# variant branches to .L${bits}_cbc_dec_unaligned_ivec when the IV pointer
# is not 8-byte aligned. The .L${bits}cbc_dec_blk path stores through ASI
# 0xe2 (ASI_BLK_INIT, T4-specific); per the inline comments it is skipped
# when $out&7, $len<256 or $inp==$out.
244 sub alg_cbc_decrypt_implement
{
245 my ($alg,$bits) = @_;
248 .globl
${alg
}${bits
}_t4_cbc_decrypt
250 ${alg
}${bits
}_t4_cbc_decrypt
:
251 save
%sp, -$::frame
, %sp
252 sub $inp, $out, $blk_init ! $inp!=$out
254 $::code
.=<<___
if (!$::evp
);
255 andcc
$ivec, 7, $ivoff
256 alignaddr
$ivec, %g0, $ivec
258 ldd
[$ivec + 0], %f12 ! load ivec
260 ldd
[$ivec + 8], %f14
261 ldd
[$ivec + 16], %f0
262 faligndata
%f12, %f14, %f12
263 faligndata
%f14, %f0, %f14
266 $::code
.=<<___
if ($::evp
);
267 ld
[$ivec + 0], %f12 ! load ivec
270 ld
[$ivec + 12], %f15
274 prefetch
[$inp + 63], 20
275 call _
${alg
}${bits
}_load_deckey
278 sll
$ileft, 3, $ileft
281 sub $iright, $ileft, $iright
284 movrnz
$ooff, 0, $blk_init ! if ( $out&7 ||
285 movleu
$::size_t_cc
, 0, $blk_init ! $len<256 ||
286 brnz
,pn
$blk_init, .L
${bits
}cbc_dec_blk
! $inp==$out)
287 srl
$omask, $ooff, $omask
289 andcc
$len, 16, %g0 ! is number of blocks even?
291 alignaddrl
$out, %g0, $out
292 bz
%icc, .L
${bits
}_cbc_dec_loop2x
294 .L
${bits
}_cbc_dec_loop
:
300 sllx
%o0, $ileft, %o0
301 srlx
%o1, $iright, %g1
302 sllx
%o1, $ileft, %o1
304 srlx
%o2, $iright, %o2
307 xor %g4, %o0, %o2 ! ^= rk
[0]
312 prefetch
[$out + 63], 22
313 prefetch
[$inp + 16+63], 20
314 call _
${alg
}${bits
}_decrypt_1x
317 fxor
%f12, %f0, %f0 ! ^= ivec
327 brnz
,pt
$len, .L
${bits
}_cbc_dec_loop2x
330 $::code
.=<<___
if ($::evp
);
334 st
%f15, [$ivec + 12]
336 $::code
.=<<___
if (!$::evp
);
337 brnz
,pn
$ivoff, .L
${bits
}_cbc_dec_unaligned_ivec
340 std
%f12, [$ivec + 0] ! write out ivec
341 std
%f14, [$ivec + 8]
348 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
349 ! and ~3x deterioration
351 faligndata
%f0, %f0, %f4 ! handle unaligned output
352 faligndata
%f0, %f2, %f6
353 faligndata
%f2, %f2, %f8
355 stda
%f4, [$out + $omask]0xc0 ! partial store
358 orn
%g0, $omask, $omask
359 stda
%f8, [$out + $omask]0xc0 ! partial store
361 brnz
,pt
$len, .L
${bits
}_cbc_dec_loop2x
+4
362 orn
%g0, $omask, $omask
364 $::code
.=<<___
if ($::evp
);
368 st
%f15, [$ivec + 12]
370 $::code
.=<<___
if (!$::evp
);
371 brnz
,pn
$ivoff, .L
${bits
}_cbc_dec_unaligned_ivec
374 std
%f12, [$ivec + 0] ! write out ivec
375 std
%f14, [$ivec + 8]
381 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
383 .L
${bits
}_cbc_dec_loop2x
:
391 sllx
%o0, $ileft, %o0
392 srlx
%o1, $iright, %g1
394 sllx
%o1, $ileft, %o1
395 srlx
%o2, $iright, %g1
397 sllx
%o2, $ileft, %o2
398 srlx
%o3, $iright, %g1
400 sllx
%o3, $ileft, %o3
401 srlx
%o4, $iright, %o4
404 xor %g4, %o0, %o4 ! ^= rk
[0]
413 prefetch
[$out + 63], 22
414 prefetch
[$inp + 32+63], 20
415 call _
${alg
}${bits
}_decrypt_2x
420 fxor
%f12, %f0, %f0 ! ^= ivec
434 brnz
,pt
$len, .L
${bits
}_cbc_dec_loop2x
437 $::code
.=<<___
if ($::evp
);
441 st
%f15, [$ivec + 12]
443 $::code
.=<<___
if (!$::evp
);
444 brnz
,pn
$ivoff, .L
${bits
}_cbc_dec_unaligned_ivec
447 std
%f12, [$ivec + 0] ! write out ivec
448 std
%f14, [$ivec + 8]
455 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
456 ! and ~3x deterioration
458 faligndata
%f0, %f0, %f8 ! handle unaligned output
459 faligndata
%f0, %f2, %f0
460 faligndata
%f2, %f4, %f2
461 faligndata
%f4, %f6, %f4
462 faligndata
%f6, %f6, %f6
463 stda
%f8, [$out + $omask]0xc0 ! partial store
468 orn
%g0, $omask, $omask
469 stda
%f6, [$out + $omask]0xc0 ! partial store
471 brnz
,pt
$len, .L
${bits
}_cbc_dec_loop2x
+4
472 orn
%g0, $omask, $omask
474 $::code
.=<<___
if ($::evp
);
478 st
%f15, [$ivec + 12]
480 $::code
.=<<___
if (!$::evp
);
481 brnz
,pn
$ivoff, .L
${bits
}_cbc_dec_unaligned_ivec
484 std
%f12, [$ivec + 0] ! write out ivec
485 std
%f14, [$ivec + 8]
490 .L
${bits
}_cbc_dec_unaligned_ivec
:
491 alignaddrl
$ivec, $ivoff, %g0 ! handle unaligned ivec
493 srl
$omask, $ivoff, $omask
494 faligndata
%f12, %f12, %f0
495 faligndata
%f12, %f14, %f2
496 faligndata
%f14, %f14, %f4
497 stda
%f0, [$ivec + $omask]0xc0
500 orn
%g0, $omask, $omask
501 stda
%f4, [$ivec + $omask]0xc0
507 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
509 .L
${bits
}cbc_dec_blk
:
510 add
$out, $len, $blk_init
511 and $blk_init, 63, $blk_init ! tail
512 sub $len, $blk_init, $len
513 add
$blk_init, 15, $blk_init ! round up to
16n
515 srl
$blk_init, 4, $blk_init
517 add
$blk_init, 1, $blk_init
519 .L
${bits
}_cbc_dec_blk_loop2x
:
527 sllx
%o0, $ileft, %o0
528 srlx
%o1, $iright, %g1
530 sllx
%o1, $ileft, %o1
531 srlx
%o2, $iright, %g1
533 sllx
%o2, $ileft, %o2
534 srlx
%o3, $iright, %g1
536 sllx
%o3, $ileft, %o3
537 srlx
%o4, $iright, %o4
540 xor %g4, %o0, %o4 ! ^= rk
[0]
549 prefetch
[$inp + 32+63], 20
550 call _
${alg
}${bits
}_decrypt_2x
556 fxor
%f12, %f0, %f0 ! ^= ivec
563 stda
%f0, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
565 stda
%f2, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
567 stda
%f4, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
569 stda
%f6, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
570 bgu
,pt
$::size_t_cc
, .L
${bits
}_cbc_dec_blk_loop2x
573 add
$blk_init, $len, $len
574 andcc
$len, 1, %g0 ! is number of blocks even?
575 membar
#StoreLoad|#StoreStore
576 bnz
,pt
%icc, .L
${bits
}_cbc_dec_loop
578 brnz
,pn
$len, .L
${bits
}_cbc_dec_loop2x
581 $::code
.=<<___
if ($::evp
);
582 st
%f12, [$ivec + 0] ! write out ivec
585 st
%f15, [$ivec + 12]
587 $::code
.=<<___
if (!$::evp
);
591 std
%f12, [$ivec + 0] ! write out ivec
592 std
%f14, [$ivec + 8]
597 .type
${alg
}${bits
}_t4_cbc_decrypt
,#function
598 .size
${alg
}${bits
}_t4_cbc_decrypt
,.-${alg
}${bits
}_t4_cbc_decrypt
# alg_ctr32_implement($alg, $bits)
#
# Appends to $::code the assembly text of ${alg}${bits}_t4_ctr32_encrypt.
# The generated routine calls _${alg}${bits}_load_enckey, reads the counter
# block from [$ivec] and keeps the 32-bit counter in %l7 (re-truncated with
# "srl %l7, 0, %l7"). For $alg eq "aes" or "cmll" the first round
# instructions are emitted inline (aes_eround01/23 or camellia_f) and the
# shared helpers are entered at _${alg}${bits}_encrypt_1x+8 and
# _${alg}${bits}_encrypt_2x+16.
# NOTE(review): the +8/+16 offsets presumably skip the helper instructions
# that the inline rounds replace -- confirm against the helper definitions.
# A block-store path using ASI 0xe2 (ASI_BLK_INIT, T4-specific) is skipped,
# per the inline comments, when $out&7, $len<256 or $inp==$out.
602 sub alg_ctr32_implement
{
603 my ($alg,$bits) = @_;
606 .globl
${alg
}${bits
}_t4_ctr32_encrypt
608 ${alg
}${bits
}_t4_ctr32_encrypt
:
609 save
%sp, -$::frame
, %sp
612 prefetch
[$inp + 63], 20
613 call _
${alg
}${bits
}_load_enckey
616 ld
[$ivec + 0], %l4 ! counter
624 xor %o5, %g4, %g4 ! ^= rk
[0]
626 movxtod
%g4, %f14 ! most significant
64 bits
628 sub $inp, $out, $blk_init ! $inp!=$out
631 sll
$ileft, 3, $ileft
634 sub $iright, $ileft, $iright
637 movrnz
$ooff, 0, $blk_init ! if ( $out&7 ||
638 movleu
$::size_t_cc
, 0, $blk_init ! $len<256 ||
639 brnz
,pn
$blk_init, .L
${bits
}_ctr32_blk
! $inp==$out)
640 srl
$omask, $ooff, $omask
642 andcc
$len, 16, %g0 ! is number of blocks even?
643 alignaddrl
$out, %g0, $out
644 bz
%icc, .L
${bits
}_ctr32_loop2x
646 .L
${bits
}_ctr32_loop
:
652 sllx
%o0, $ileft, %o0
653 srlx
%o1, $iright, %g1
654 sllx
%o1, $ileft, %o1
656 srlx
%o2, $iright, %o2
659 xor %g5, %l7, %g1 ! ^= rk
[0]
662 srl
%l7, 0, %l7 ! clruw
663 prefetch
[$out + 63], 22
664 prefetch
[$inp + 16+63], 20
666 $::code
.=<<___
if ($alg eq "aes");
667 aes_eround01
%f16, %f14, %f2, %f4
668 aes_eround23
%f18, %f14, %f2, %f2
670 $::code
.=<<___
if ($alg eq "cmll");
671 camellia_f
%f16, %f2, %f14, %f2
672 camellia_f
%f18, %f14, %f2, %f0
675 call _
${alg
}${bits
}_encrypt_1x
+8
680 fxor
%f10, %f0, %f0 ! ^= inp
688 brnz
,pt
$len, .L
${bits
}_ctr32_loop2x
695 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
696 ! and ~3x deterioration
698 faligndata
%f0, %f0, %f4 ! handle unaligned output
699 faligndata
%f0, %f2, %f6
700 faligndata
%f2, %f2, %f8
701 stda
%f4, [$out + $omask]0xc0 ! partial store
704 orn
%g0, $omask, $omask
705 stda
%f8, [$out + $omask]0xc0 ! partial store
707 brnz
,pt
$len, .L
${bits
}_ctr32_loop2x
+4
708 orn
%g0, $omask, $omask
713 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
715 .L
${bits
}_ctr32_loop2x
:
723 sllx
%o0, $ileft, %o0
724 srlx
%o1, $iright, %g1
726 sllx
%o1, $ileft, %o1
727 srlx
%o2, $iright, %g1
729 sllx
%o2, $ileft, %o2
730 srlx
%o3, $iright, %g1
732 sllx
%o3, $ileft, %o3
733 srlx
%o4, $iright, %o4
736 xor %g5, %l7, %g1 ! ^= rk
[0]
739 srl
%l7, 0, %l7 ! clruw
743 srl
%l7, 0, %l7 ! clruw
744 prefetch
[$out + 63], 22
745 prefetch
[$inp + 32+63], 20
747 $::code
.=<<___
if ($alg eq "aes");
748 aes_eround01
%f16, %f14, %f2, %f8
749 aes_eround23
%f18, %f14, %f2, %f2
750 aes_eround01
%f16, %f14, %f6, %f10
751 aes_eround23
%f18, %f14, %f6, %f6
753 $::code
.=<<___
if ($alg eq "cmll");
754 camellia_f
%f16, %f2, %f14, %f2
755 camellia_f
%f16, %f6, %f14, %f6
756 camellia_f
%f18, %f14, %f2, %f0
757 camellia_f
%f18, %f14, %f6, %f4
760 call _
${alg
}${bits
}_encrypt_2x
+16
766 fxor
%f8, %f0, %f0 ! ^= inp
779 brnz
,pt
$len, .L
${bits
}_ctr32_loop2x
786 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
787 ! and ~3x deterioration
789 faligndata
%f0, %f0, %f8 ! handle unaligned output
790 faligndata
%f0, %f2, %f0
791 faligndata
%f2, %f4, %f2
792 faligndata
%f4, %f6, %f4
793 faligndata
%f6, %f6, %f6
795 stda
%f8, [$out + $omask]0xc0 ! partial store
800 orn
%g0, $omask, $omask
801 stda
%f6, [$out + $omask]0xc0 ! partial store
803 brnz
,pt
$len, .L
${bits
}_ctr32_loop2x
+4
804 orn
%g0, $omask, $omask
809 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
812 add
$out, $len, $blk_init
813 and $blk_init, 63, $blk_init ! tail
814 sub $len, $blk_init, $len
815 add
$blk_init, 15, $blk_init ! round up to
16n
817 srl
$blk_init, 4, $blk_init
819 add
$blk_init, 1, $blk_init
821 .L
${bits
}_ctr32_blk_loop2x
:
829 sllx
%o0, $ileft, %o0
830 srlx
%o1, $iright, %g1
832 sllx
%o1, $ileft, %o1
833 srlx
%o2, $iright, %g1
835 sllx
%o2, $ileft, %o2
836 srlx
%o3, $iright, %g1
838 sllx
%o3, $ileft, %o3
839 srlx
%o4, $iright, %o4
842 xor %g5, %l7, %g1 ! ^= rk
[0]
845 srl
%l7, 0, %l7 ! clruw
849 srl
%l7, 0, %l7 ! clruw
850 prefetch
[$inp + 32+63], 20
852 $::code
.=<<___
if ($alg eq "aes");
853 aes_eround01
%f16, %f14, %f2, %f8
854 aes_eround23
%f18, %f14, %f2, %f2
855 aes_eround01
%f16, %f14, %f6, %f10
856 aes_eround23
%f18, %f14, %f6, %f6
858 $::code
.=<<___
if ($alg eq "cmll");
859 camellia_f
%f16, %f2, %f14, %f2
860 camellia_f
%f16, %f6, %f14, %f6
861 camellia_f
%f18, %f14, %f2, %f0
862 camellia_f
%f18, %f14, %f6, %f4
865 call _
${alg
}${bits
}_encrypt_2x
+16
872 fxor
%f8, %f0, %f0 ! ^= inp
878 stda
%f0, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
880 stda
%f2, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
882 stda
%f4, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
884 stda
%f6, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
885 bgu
,pt
$::size_t_cc
, .L
${bits
}_ctr32_blk_loop2x
888 add
$blk_init, $len, $len
889 andcc
$len, 1, %g0 ! is number of blocks even?
890 membar
#StoreLoad|#StoreStore
891 bnz
,pt
%icc, .L
${bits
}_ctr32_loop
893 brnz
,pn
$len, .L
${bits
}_ctr32_loop2x
898 .type
${alg
}${bits
}_t4_ctr32_encrypt
,#function
899 .size
${alg
}${bits
}_t4_ctr32_encrypt
,.-${alg
}${bits
}_t4_ctr32_encrypt
# alg_xts_implement($alg, $bits, $dir)
#
# Appends the assembly text of ${alg}${bits}_t4_xts_${dir}crypt; $dir is
# compared against "en" and "de" to select direction-specific fragments.
# The generated routine reserves 16 extra stack bytes
# ("save %sp, -$::frame-16, %sp"), calls ${alg}_t4_encrypt with %o1 pointing
# at that area and then loads the tweak from it into %g3:%g2.
# NOTE(review): presumably this encrypts the IV under $key2 to form the
# initial tweak -- the argument set-up lines are not visible here.
# It then calls _${alg}${bits}_load_${dir}ckey and runs one-block and
# two-block loops (_${alg}${bits}_${dir}crypt_1x / _2x), a block-store path
# using ASI 0xe2 (ASI_BLK_INIT, T4-specific), and a
# .L${bits}_xts_${dir}steal path that swaps bytes between input, a stack
# copy of the last output block and the output, for a trailing partial
# block ($rem).
# NOTE(review): some fragments append to $code rather than $::code; these
# are the same variable only when this code runs in package main -- confirm.
903 sub alg_xts_implement
{
904 my ($alg,$bits,$dir) = @_;
# Same register scheme as the file-level names, but with two key pointers.
905 my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
909 .globl
${alg
}${bits
}_t4_xts_
${dir
}crypt
911 ${alg
}${bits
}_t4_xts_
${dir
}crypt:
912 save
%sp, -$::frame
-16, %sp
915 add
%fp, $::bias
-16, %o1
916 call
${alg
}_t4_encrypt
919 add
%fp, $::bias
-16, %l7
921 add
%fp, $::bias
-8, %l7
922 ldxa
[%l7]0x88, %g3 ! %g3:%g2 is tweak
924 sethi
%hi(0x76543210), %l7
925 or %l7, %lo(0x76543210), %l7
926 bmask
%l7, %g0, %g0 ! byte swap mask
929 prefetch
[$inp + 63], 20
930 call _
${alg
}${bits
}_load_
${dir
}ckey
934 $code.=<<___
if ($dir eq "de");
941 sub $inp, $out, $blk_init ! $inp!=$out
944 sll
$ileft, 3, $ileft
947 sub $iright, $ileft, $iright
950 movrnz
$ooff, 0, $blk_init ! if ( $out&7 ||
951 movleu
$::size_t_cc
, 0, $blk_init ! $len<256 ||
952 brnz
,pn
$blk_init, .L
${bits
}_xts_
${dir
}blk
! $inp==$out)
953 srl
$omask, $ooff, $omask
955 andcc
$len, 16, %g0 ! is number of blocks even?
957 $code.=<<___
if ($dir eq "de");
958 brz
,pn
$len, .L
${bits
}_xts_
${dir
}steal
961 alignaddrl
$out, %g0, $out
962 bz
%icc, .L
${bits
}_xts_
${dir
}loop2x
964 .L
${bits
}_xts_
${dir
}loop:
970 sllx
%o0, $ileft, %o0
971 srlx
%o1, $iright, %g1
972 sllx
%o1, $ileft, %o1
974 srlx
%o2, $iright, %o2
979 bshuffle
%f12, %f12, %f12
980 bshuffle
%f14, %f14, %f14
982 xor %g4, %o0, %o0 ! ^= rk
[0]
987 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
990 prefetch
[$out + 63], 22
991 prefetch
[$inp + 16+63], 20
992 call _
${alg
}${bits
}_
${dir
}crypt_1x
995 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
998 srax
%g3, 63, %l7 ! next tweak value
1009 brnz
,pt
$len, .L
${bits
}_xts_
${dir
}loop2x
1012 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1019 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
1020 ! and ~3x deterioration
1022 faligndata
%f0, %f0, %f4 ! handle unaligned output
1023 faligndata
%f0, %f2, %f6
1024 faligndata
%f2, %f2, %f8
1025 stda
%f4, [$out + $omask]0xc0 ! partial store
1028 orn
%g0, $omask, $omask
1029 stda
%f8, [$out + $omask]0xc0 ! partial store
1031 brnz
,pt
$len, .L
${bits
}_xts_
${dir
}loop2x
+4
1032 orn
%g0, $omask, $omask
1034 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1040 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1042 .L
${bits
}_xts_
${dir
}loop2x
:
1045 ldx
[$inp + 16], %o2
1047 ldx
[$inp + 24], %o3
1049 ldx
[$inp + 32], %o4
1050 sllx
%o0, $ileft, %o0
1051 srlx
%o1, $iright, %g1
1053 sllx
%o1, $ileft, %o1
1054 srlx
%o2, $iright, %g1
1056 sllx
%o2, $ileft, %o2
1057 srlx
%o3, $iright, %g1
1059 sllx
%o3, $ileft, %o3
1060 srlx
%o4, $iright, %o4
1065 bshuffle
%f12, %f12, %f12
1066 bshuffle
%f14, %f14, %f14
1068 srax
%g3, 63, %l7 ! next tweak value
1076 bshuffle
%f8, %f8, %f8
1077 bshuffle
%f10, %f10, %f10
1079 xor %g4, %o0, %o0 ! ^= rk
[0]
1081 xor %g4, %o2, %o2 ! ^= rk
[0]
1088 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1090 fxor
%f8, %f4, %f4 ! ^= tweak
[0]
1093 prefetch
[$out + 63], 22
1094 prefetch
[$inp + 32+63], 20
1095 call _
${alg
}${bits
}_
${dir
}crypt_2x
1101 srax
%g3, 63, %l7 ! next tweak value
1107 bshuffle
%f8, %f8, %f8
1108 bshuffle
%f10, %f10, %f10
1110 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1120 std
%f4, [$out + 16]
1121 std
%f6, [$out + 24]
1122 brnz
,pt
$len, .L
${bits
}_xts_
${dir
}loop2x
1127 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1134 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
1135 ! and ~3x deterioration
1137 faligndata
%f0, %f0, %f8 ! handle unaligned output
1138 faligndata
%f0, %f2, %f10
1139 faligndata
%f2, %f4, %f12
1140 faligndata
%f4, %f6, %f14
1141 faligndata
%f6, %f6, %f0
1143 stda
%f8, [$out + $omask]0xc0 ! partial store
1144 std
%f10, [$out + 8]
1145 std
%f12, [$out + 16]
1146 std
%f14, [$out + 24]
1148 orn
%g0, $omask, $omask
1149 stda
%f0, [$out + $omask]0xc0 ! partial store
1151 brnz
,pt
$len, .L
${bits
}_xts_
${dir
}loop2x
+4
1152 orn
%g0, $omask, $omask
1156 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1162 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1164 .L
${bits
}_xts_
${dir
}blk
:
1165 add
$out, $len, $blk_init
1166 and $blk_init, 63, $blk_init ! tail
1167 sub $len, $blk_init, $len
1168 add
$blk_init, 15, $blk_init ! round up to
16n
1170 srl
$blk_init, 4, $blk_init
1172 add
$blk_init, 1, $blk_init
1174 .L
${bits
}_xts_
${dir
}blk2x
:
1177 ldx
[$inp + 16], %o2
1179 ldx
[$inp + 24], %o3
1181 ldx
[$inp + 32], %o4
1182 sllx
%o0, $ileft, %o0
1183 srlx
%o1, $iright, %g1
1185 sllx
%o1, $ileft, %o1
1186 srlx
%o2, $iright, %g1
1188 sllx
%o2, $ileft, %o2
1189 srlx
%o3, $iright, %g1
1191 sllx
%o3, $ileft, %o3
1192 srlx
%o4, $iright, %o4
1197 bshuffle
%f12, %f12, %f12
1198 bshuffle
%f14, %f14, %f14
1200 srax
%g3, 63, %l7 ! next tweak value
1208 bshuffle
%f8, %f8, %f8
1209 bshuffle
%f10, %f10, %f10
1211 xor %g4, %o0, %o0 ! ^= rk
[0]
1213 xor %g4, %o2, %o2 ! ^= rk
[0]
1220 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1222 fxor
%f8, %f4, %f4 ! ^= tweak
[0]
1225 prefetch
[$inp + 32+63], 20
1226 call _
${alg
}${bits
}_
${dir
}crypt_2x
1232 srax
%g3, 63, %l7 ! next tweak value
1238 bshuffle
%f8, %f8, %f8
1239 bshuffle
%f10, %f10, %f10
1241 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1246 stda
%f0, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
1248 stda
%f2, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
1250 stda
%f4, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
1252 stda
%f6, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
1253 bgu
,pt
$::size_t_cc
, .L
${bits
}_xts_
${dir
}blk2x
1256 add
$blk_init, $len, $len
1257 andcc
$len, 1, %g0 ! is number of blocks even?
1258 membar
#StoreLoad|#StoreStore
1259 bnz
,pt
%icc, .L
${bits
}_xts_
${dir
}loop
1261 brnz
,pn
$len, .L
${bits
}_xts_
${dir
}loop2x
1266 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1271 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1273 $code.=<<___
if ($dir eq "en");
1275 .L
${bits
}_xts_
${dir
}steal
:
1276 std
%f0, [%fp + $::bias
-16] ! copy of output
1277 std
%f2, [%fp + $::bias
-8]
1279 srl
$ileft, 3, $ileft
1280 add
%fp, $::bias
-16, %l7
1281 add
$inp, $ileft, $inp ! original
$inp+$len&-15
1282 add
$out, $ooff, $out ! original
$out+$len&-15
1286 .L
${bits
}_xts_
${dir
}stealing
:
1287 ldub
[$inp + $ileft], %o0
1288 ldub
[%l7 + $ileft], %o1
1290 stb
%o0, [%l7 + $ileft]
1291 stb
%o1, [$out + $ileft]
1292 brnz
$rem, .L
${bits
}_xts_
${dir
}stealing
1298 sub $out, $ooff, $out
1299 ba
.L
${bits
}_xts_
${dir
}loop ! one more
time
1300 mov
1, $len ! $rem is
0
1302 $code.=<<___
if ($dir eq "de");
1304 .L
${bits
}_xts_
${dir
}steal
:
1309 ldx
[$inp + 16], %o2
1310 sllx
%o0, $ileft, %o0
1311 srlx
%o1, $iright, %g1
1312 sllx
%o1, $ileft, %o1
1314 srlx
%o2, $iright, %o2
1317 srax
%g3, 63, %l7 ! next tweak value
1325 bshuffle
%f12, %f12, %f12
1326 bshuffle
%f14, %f14, %f14
1328 xor %g4, %o0, %o0 ! ^= rk
[0]
1333 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1336 call _
${alg
}${bits
}_
${dir
}crypt_1x
1339 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1342 std
%f0, [%fp + $::bias
-16]
1343 std
%f2, [%fp + $::bias
-8]
1345 srl
$ileft, 3, $ileft
1346 add
%fp, $::bias
-16, %l7
1347 add
$inp, $ileft, $inp ! original
$inp+$len&-15
1348 add
$out, $ooff, $out ! original
$out+$len&-15
1353 .L
${bits
}_xts_
${dir
}stealing
:
1354 ldub
[$inp + $ileft], %o0
1355 ldub
[%l7 + $ileft], %o1
1357 stb
%o0, [%l7 + $ileft]
1358 stb
%o1, [$out + $ileft]
1359 brnz
$rem, .L
${bits
}_xts_
${dir
}stealing
1365 sub $out, $ooff, $out
1366 ba
.L
${bits
}_xts_
${dir
}loop ! one more
time
1367 mov
1, $len ! $rem is
0
1372 .type
${alg
}${bits
}_t4_xts_
${dir
}crypt,#function
1373 .size
${alg
}${bits
}_t4_xts_
${dir
}crypt,.-${alg
}${bits
}_t4_xts_
${dir
}crypt
1377 # The purpose of these subroutines is to explicitly encode VIS instructions,
1378 # so that one can compile the module without having to specify VIS
1379 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1380 # The idea is to reserve the option to produce a "universal" binary and let
1381 # the programmer detect at run-time whether the current CPU is VIS-capable.
# NOTE(review): the `sub` line of this encoder is not visible in this
# listing; it is presumably the helper emit_assembler applies to
# three-operand %f instructions (likely `unvis`) -- confirm.
# Takes (mnemonic, rs1, rs2, rd). For mnemonics found in %visopf the
# operands must look like %fN; otherwise, or in the odd-register case
# guarded by "$1&1", the textual form $ref is returned unchanged. On
# success the result is a ".word" directive built on base 0x81b00000 with
# rd at bit 25, rs1 at bit 14, the opf code at bit 5 and rs2 in the low
# bits, followed by "!" and the original text.
1383 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1385 my %visopf = ( "faligndata" => 0x048,
1386 "bshuffle" => 0x04c,
1391 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1393 if ($opf=$visopf{$mnemonic}) {
1394 foreach ($rs1,$rs2,$rd) {
1395 return $ref if (!/%f([0-9]{1,2})/);
1398 return $ref if ($1&1);
1399 # re-encode for upper double register addressing
1404 return sprintf ".word\t0x%08x !%s",
1405 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# NOTE(review): the `sub` line is not visible in this listing; this is
# presumably the body of unvis3, which emit_assembler calls with the same
# four operands for umulxhi/bmask/addxc*/alignaddr* -- confirm.
# Takes (mnemonic, rs1, rs2, rd) with integer registers. %bias gives the
# base number of each register class letter (g=0, o=8, l=16, i=24).
# Operands that do not look like %[goli]N make the sub return the textual
# form $ref unchanged. On success the result is a ".word" directive built
# on base 0x81b00000 with rd at bit 25, rs1 at bit 14, the opf code at
# bit 5 and rs2 in the low bits, followed by "!" and the original text.
1413 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1414 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1416 my %visopf = ( "addxc" => 0x011,
1419 "alignaddr" => 0x018,
1421 "alignaddrl" => 0x01a );
1423 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1425 if ($opf=$visopf{$mnemonic}) {
1426 foreach ($rs1,$rs2,$rd) {
1427 return $ref if (!/%([goli])([0-9])/);
1431 return sprintf ".word\t0x%08x !%s",
1432 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# unaes_round($mnemonic, $rs1, $rs2, $rs3, $rd)
#
# Hand-encodes a four-operand AES instruction as a ".word" directive so the
# assembler does not need to know the mnemonic. %aesopf maps each supported
# mnemonic to its opf code (0..8). $rs3 is converted to a register number
# when it looks like an even-numbered %fN and is otherwise used as given
# (emit_assembler's pattern also admits a plain number there). $rs1, $rs2
# and $rd must look like %fN; if one does not, or in the odd-register case
# guarded by "$1&1", the textual form $ref is returned unchanged. Unknown
# mnemonics skip the encoding branch.
# Word layout: 2 at bit 30, rd at bit 25, 0x19 at bit 19, rs1 at bit 14,
# rs3 at bit 9, opf at bit 5, rs2 in the low bits; the original text
# follows after "!".
1439 sub unaes_round
{ # 4-argument instructions
1440 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1442 my %aesopf = ( "aes_eround01" => 0,
1443 "aes_eround23" => 1,
1444 "aes_dround01" => 2,
1445 "aes_dround23" => 3,
1446 "aes_eround01_l"=> 4,
1447 "aes_eround23_l"=> 5,
1448 "aes_dround01_l"=> 6,
1449 "aes_dround23_l"=> 7,
1450 "aes_kexpand1" => 8 );
1452 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1454 if (defined($opf=$aesopf{$mnemonic})) {
1455 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ?
(($1|$1>>5)&31) : $rs3;
1456 foreach ($rs1,$rs2,$rd) {
1457 return $ref if (!/%f([0-9]{1,2})/);
1460 return $ref if ($1&1);
1461 # re-encode for upper double register addressing
1466 return sprintf ".word\t0x%08x !%s",
1467 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# unaes_kexpand($mnemonic, $rs1, $rs2, $rd)
#
# Hand-encodes the three-operand AES key-expansion instructions
# aes_kexpand0 (opf 0x130) and aes_kexpand2 (opf 0x131) as a ".word"
# directive. All three operands must look like %fN; if one does not, or in
# the odd-register case guarded by "$1&1", the textual form $ref is
# returned unchanged.
# Word layout: 2 at bit 30, rd at bit 25, 0x36 at bit 19, rs1 at bit 14,
# opf at bit 5, rs2 in the low bits; the original text follows after "!".
1474 sub unaes_kexpand
{ # 3-argument instructions
1475 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1477 my %aesopf = ( "aes_kexpand0" => 0x130,
1478 "aes_kexpand2" => 0x131 );
1480 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1482 if (defined($opf=$aesopf{$mnemonic})) {
1483 foreach ($rs1,$rs2,$rd) {
1484 return $ref if (!/%f([0-9]{1,2})/);
1487 return $ref if ($1&1);
1488 # re-encode for upper double register addressing
1493 return sprintf ".word\t0x%08x !%s",
1494 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# uncamellia_f($mnemonic, $rs1, $rs2, $rs3, $rd)
#
# Hand-encodes the four-operand camellia_f instruction as a ".word"
# directive. $rs3 is converted to a register number when it looks like an
# even-numbered %fN and is otherwise used as given. $rs1, $rs2 and $rd must
# look like %fN; if one does not, or in the odd-register case guarded by
# "$1&1", the textual form $ref is returned unchanged.
# Word layout: 2 at bit 30, rd at bit 25, 0x19 at bit 19, rs1 at bit 14,
# rs3 at bit 9, the fixed code 0xc at bit 5, rs2 in the low bits; the
# original text follows after "!".
1501 sub uncamellia_f
{ # 4-argument instructions
1502 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1505 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1508 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ?
(($1|$1>>5)&31) : $rs3;
1509 foreach ($rs1,$rs2,$rd) {
1510 return $ref if (!/%f([0-9]{1,2})/);
1513 return $ref if ($1&1);
1514 # re-encode for upper double register addressing
1519 return sprintf ".word\t0x%08x !%s",
1520 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
# uncamellia3($mnemonic, $rs1, $rs2, $rd)
#
# Hand-encodes the three-operand Camellia instructions camellia_fl
# (opf 0x13c) and camellia_fli (opf 0x13d) as a ".word" directive. All
# three operands must look like %fN; if one does not, or in the
# odd-register case guarded by "$1&1", the textual form $ref is returned
# unchanged.
# Word layout: 2 at bit 30, rd at bit 25, 0x36 at bit 19, rs1 at bit 14,
# opf at bit 5, rs2 in the low bits; the original text follows after "!".
1527 sub uncamellia3
{ # 3-argument instructions
1528 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1530 my %cmllopf = ( "camellia_fl" => 0x13c,
1531 "camellia_fli" => 0x13d );
1533 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1535 if (defined($opf=$cmllopf{$mnemonic})) {
1536 foreach ($rs1,$rs2,$rd) {
1537 return $ref if (!/%f([0-9]{1,2})/);
1540 return $ref if ($1&1);
1541 # re-encode for upper double register addressing
1546 return sprintf ".word\t0x%08x !%s",
1547 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# unmovxtox($mnemonic, $rs, $rd)
#
# Hand-encodes two-operand register-move instructions between the integer
# and floating-point files as a ".word" directive. %bias gives the base
# number of each register class letter (g=0, o=8, l=16, i=24, f=0), so an
# operand may be either an integer or an %f register. An operand that does
# not match %[fgoli]N, or the odd-register case guarded by "$2&1", makes
# the sub return the textual form $ref unchanged.
# Word layout: 2 at bit 30, rd at bit 25, 0x36 at bit 19, opf at bit 5,
# rs in the low bits; the original text follows after "!".
# NOTE(review): the embedded numbering skips a line inside %movxopf, and
# emit_assembler's "mov[xw]to[ds]" pattern routes movxtod here (the ctr32
# template uses it) -- confirm the table really has a movxtod entry.
1554 sub unmovxtox
{ # 2-argument instructions
1555 my ($mnemonic,$rs,$rd)=@_;
1556 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
1558 my %movxopf = ( "movdtox" => 0x110,
1559 "movstouw" => 0x111,
1560 "movstosw" => 0x113,
1562 "movwtos" => 0x119 );
1564 $ref = "$mnemonic\t$rs,$rd";
1566 if (defined($opf=$movxopf{$mnemonic})) {
1568 return $ref if (!/%([fgoli])([0-9]{1,2})/);
1571 return $ref if ($2&1);
1572 # re-encode for upper double register addressing
1577 return sprintf ".word\t0x%08x !%s",
1578 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
# undes($mnemonic, @operands) -- hand-encode a DES instruction as ".word".
#
# Supported mnemonics and operand shapes:
#   des_round    rs1, rs2, rs3, rd   (four %f registers)
#   des_kexpand  rs1, imm,  rd       (second operand may be a bare number)
#   des_ip       rs1, rd
#   des_iip      rs1, rd
#
# Returns ".word\t0x%08x !<original text>" on success. Returns the plain
# textual form "<mnemonic>\t<operands joined by ','>" when the mnemonic is
# unknown, when an operand does not have the expected shape, or when an
# odd-numbered register >= 32 is given (upper registers exist only as
# even-numbered doubles and cannot be encoded otherwise).
#
# Fix: the two-operand branch used to test "$2&1" although its pattern has
# a single capture group, so $2 was always undef and an odd upper register
# was silently encoded as a different register. It now tests $1.
#
# NOTE(review): the "$_ = N; if (N >= 32) { ... }" steps and the closing
# lines were restored where the listing had gaps, following the re-encoding
# formula ($n|$n>>5)&31 used for rs3 elsewhere in this file -- confirm
# against the upstream copy.
sub undes
{
    my ($mnemonic, @args) = @_;
    my ($ref, $opf);
    my %desopf = (
        "des_round"   => 0b1001,
        "des_ip"      => 0b100110100,
        "des_iip"     => 0b100110101,
        "des_kexpand" => 0b100110110,
    );

    # Textual fallback, built before the operands are rewritten below.
    $ref = "$mnemonic\t" . join(",", @args);

    return $ref unless defined($opf = $desopf{$mnemonic});

    if ($mnemonic eq "des_round") {                     # 4-arg
        # foreach aliases the slice, so assigning $_ rewrites @args.
        foreach (@args[0..3]) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_ = $1;
            if ($1 >= 32) {
                return $ref if ($1 & 1);
                # re-encode for upper double register addressing
                $_ = ($1 | $1 >> 5) & 31;
            }
        }
        return sprintf ".word\t0x%08x !%s",
            2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
            $ref;
    }
    elsif ($mnemonic eq "des_kexpand") {                # 3-arg
        # The "%f" prefix is optional here because the middle operand is
        # an immediate; the number is therefore in $2.
        foreach (@args[0..2]) {
            return $ref if (!/(%f)?([0-9]{1,2})/);
            $_ = $2;
            if ($2 >= 32) {
                return $ref if ($2 & 1);
                # re-encode for upper double register addressing
                $_ = ($2 | $2 >> 5) & 31;
            }
        }
        return sprintf ".word\t0x%08x !%s",
            2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
            $ref;
    }
    else {                                              # 2-arg
        foreach (@args[0..1]) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_ = $1;
            if ($1 >= 32) {
                # Single capture group: the register number is $1.
                return $ref if ($1 & 1);
                # re-encode for upper double register addressing
                $_ = ($1 | $1 >> 5) & 31;
            }
        }
        return sprintf ".word\t0x%08x !%s",
            2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
            $ref;
    }
}
# emit_assembler
#
# Post-processes the accumulated assembly text in $::code one line at a
# time (split on "\n"). For each line it
#   1. replaces every `...` span with the result of string-eval'ing its
#      contents;
#   2. rewrites two-operand "f<name>2[sd]" instructions into three-operand
#      form by inserting %f0 as the first operand;
#   3. matches crypto and VIS mnemonics (aes_*, camellia_*, des_*, mov*to*,
#      f*/b* with three %f operands, umulxhi/bmask/addxc*/alignaddr*) and
#      hands their operands to the corresponding un* encoder in this file
#      (unaes_round, unaes_kexpand, uncamellia_f, uncamellia3, undes,
#      unmovxtox, unvis3).
# NOTE(review): the substitution flags, the call for the f*/b* pattern and
# whatever is done with each processed line are not visible in this
# listing -- confirm before relying on details.
1642 sub emit_assembler
{
1643 foreach (split("\n",$::code
)) {
1644 s/\`([^\`]*)\`/eval $1/ge;
1646 s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
1648 s
/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1649 &unaes_round
($1,$2,$3,$4,$5)
1651 s
/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1652 &unaes_kexpand
($1,$2,$3,$4)
1654 s
/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1655 &uncamellia_f
($1,$2,$3,$4,$5)
1657 s
/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1658 &uncamellia3
($1,$2,$3,$4)
1660 s
/\b(des_\w+)\s+(?<rs1>%f[0-9]{1,2}),\s*(?<rs2>[%fx0-9]+)(,\s*(?<rs3>%f[0-9]{1,2})(,\s*(?<rs4>%f[0-9]{1,2}))?)?/
1661 &undes
($1,$+{rs1
},$+{rs2
},$+{rs3
},$+{rs4
})
1663 s
/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
1664 &unmovxtox
($1,$2,$3)
1666 s
/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
1667 &unmovxtox
($1,$2,$3)
1669 s
/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1672 s
/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1673 &unvis3
($1,$2,$3,$4)