2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # Implementations of specific modes for SPARC Architecture 2011. There
11 # is a T4 dependency though: an ASI value that is not specified in the
12 # Architecture Manual. But as the SPARC universe is rather monocultural,
13 # we assume that a processor capable of executing crypto instructions
14 # can handle the ASI in question as well. This means that we ought to
15 # keep our eyes open when new processors emerge...
17 # As for the above-mentioned ASI: it is the so-called "block initializing
18 # store", which cancels the "read" in "read-update-write" on cache lines.
19 # This is a "cooperative" optimization, as it reduces overall pressure
20 # on the memory interface. The benefits can't be observed/quantified with
21 # usual benchmarks; on the contrary, you can notice that single-thread
22 # performance for parallelizable modes is ~1.5% worse for the largest
23 # block sizes [though a few percent better for not-so-long ones]. All
24 # of this is based on suggestions from David Miller.
# Placeholder values for the frame size and the condition-code register
# name. asm_init overwrites both once it has determined the target ABI.
27 $::frame
="STACK_FRAME";
28 $::size_t_cc
="SIZE_T_CC";
# asm_init: select ABI-dependent settings used by the code templates.
#
# Arguments: the list of command-line flags (the caller passes @ARGV).
# Effects:   sets $::abibits to 64 when any flag contains -m64 or
#            -xarch=v9, then sets $::bias, $::frame and $::size_t_cc.
#   64-bit:    $::bias=2047, $::frame=192, $::size_t_cc="%xcc"
#   otherwise: $::bias=0,    $::frame=112, $::size_t_cc="%icc"
# $::frame is the amount subtracted from %sp by "save", $::bias is added to
# %fp when addressing the scratch area, and $::size_t_cc names the
# condition-code register tested by branches on size_t values.
30 sub asm_init
{ # to be called with @ARGV as argument
# Any flag mentioning -m64 or -xarch=v9 selects the 64-bit ABI.
31 for (@_) { $::abibits
=64 if (/\-m64/ || /\-xarch\=v9/); }
32 if ($::abibits
==64) { $::bias
=2047; $::frame
=192; $::size_t_cc
="%xcc"; }
33 else { $::bias
=0; $::frame
=112; $::size_t_cc
="%icc"; }
# Symbolic names for the registers referenced by the code templates.
# The five argument names map to %i0..%i4; the map produces %i0..%i5 and
# the unused sixth value is discarded by the list assignment.
37 my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# Working registers map to %l0..%l5; %l6 and %l7 are likewise left unnamed
# here (the templates refer to %l7 directly).
39 my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
41 sub alg_cbc_encrypt_implement
{
45 .globl
${alg
}${bits
}_t4_cbc_encrypt
47 ${alg
}${bits
}_t4_cbc_encrypt
:
48 save
%sp, -$::frame
, %sp
50 be
,pn
$::size_t_cc
, .L
${bits
}_cbc_enc_abort
51 sub $inp, $out, $blk_init ! $inp!=$out
53 $::code
.=<<___
if (!$::evp
);
54 andcc
$ivec, 7, $ivoff
55 alignaddr
$ivec, %g0, $ivec
57 ldd
[$ivec + 0], %f0 ! load ivec
61 faligndata
%f0, %f2, %f0
62 faligndata
%f2, %f4, %f2
65 $::code
.=<<___
if ($::evp
);
73 prefetch
[$inp + 63], 20
74 call _
${alg
}${bits
}_load_enckey
80 sub $iright, $ileft, $iright
83 movrnz
$ooff, 0, $blk_init ! if ( $out&7 ||
84 movleu
$::size_t_cc
, 0, $blk_init ! $len<128 ||
85 brnz
,pn
$blk_init, .L
${bits
}cbc_enc_blk
! $inp==$out)
86 srl
$omask, $ooff, $omask
88 alignaddrl
$out, %g0, $out
92 .L
${bits
}_cbc_enc_loop
:
99 srlx
%o1, $iright, %g1
100 sllx
%o1, $ileft, %o1
102 srlx
%o2, $iright, %o2
105 xor %g4, %o0, %o0 ! ^= rk
[0]
110 fxor
%f12, %f0, %f0 ! ^= ivec
112 prefetch
[$out + 63], 22
113 prefetch
[$inp + 16+63], 20
114 call _
${alg
}${bits
}_encrypt_1x
122 brnz
,pt
$len, .L
${bits
}_cbc_enc_loop
125 $::code
.=<<___
if ($::evp
);
131 $::code
.=<<___
if (!$::evp
);
135 std
%f0, [$ivec + 0] ! write out ivec
139 .L
${bits
}_cbc_enc_abort
:
144 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
145 ! and ~3x deterioration
147 faligndata
%f0, %f0, %f4 ! handle unaligned output
148 faligndata
%f0, %f2, %f6
149 faligndata
%f2, %f2, %f8
151 stda
%f4, [$out + $omask]0xc0 ! partial store
154 orn
%g0, $omask, $omask
155 stda
%f8, [$out + $omask]0xc0 ! partial store
157 brnz
,pt
$len, .L
${bits
}_cbc_enc_loop
+4
158 orn
%g0, $omask, $omask
160 $::code
.=<<___
if ($::evp
);
166 $::code
.=<<___
if (!$::evp
);
170 std
%f0, [$ivec + 0] ! write out ivec
176 3: alignaddrl
$ivec, $ivoff, %g0 ! handle unaligned ivec
178 srl
$omask, $ivoff, $omask
179 faligndata
%f0, %f0, %f4
180 faligndata
%f0, %f2, %f6
181 faligndata
%f2, %f2, %f8
182 stda
%f4, [$ivec + $omask]0xc0
185 orn
%g0, $omask, $omask
186 stda
%f8, [$ivec + $omask]0xc0
192 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
194 .L
${bits
}cbc_enc_blk
:
195 add
$out, $len, $blk_init
196 and $blk_init, 63, $blk_init ! tail
197 sub $len, $blk_init, $len
198 add
$blk_init, 15, $blk_init ! round up to
16n
200 srl
$blk_init, 4, $blk_init
202 .L
${bits
}_cbc_enc_blk_loop
:
208 sllx
%o0, $ileft, %o0
209 srlx
%o1, $iright, %g1
210 sllx
%o1, $ileft, %o1
212 srlx
%o2, $iright, %o2
215 xor %g4, %o0, %o0 ! ^= rk
[0]
220 fxor
%f12, %f0, %f0 ! ^= ivec
222 prefetch
[$inp + 16+63], 20
223 call _
${alg
}${bits
}_encrypt_1x
227 stda
%f0, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
229 stda
%f2, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
230 brnz
,pt
$len, .L
${bits
}_cbc_enc_blk_loop
233 membar
#StoreLoad|#StoreStore
234 brnz
,pt
$blk_init, .L
${bits
}_cbc_enc_loop
237 $::code
.=<<___
if ($::evp
);
243 $::code
.=<<___
if (!$::evp
);
247 std
%f0, [$ivec + 0] ! write out ivec
253 .type
${alg
}${bits
}_t4_cbc_encrypt
,#function
254 .size
${alg
}${bits
}_t4_cbc_encrypt
,.-${alg
}${bits
}_t4_cbc_encrypt
258 sub alg_cbc_decrypt_implement
{
259 my ($alg,$bits) = @_;
262 .globl
${alg
}${bits
}_t4_cbc_decrypt
264 ${alg
}${bits
}_t4_cbc_decrypt
:
265 save
%sp, -$::frame
, %sp
267 be
,pn
$::size_t_cc
, .L
${bits
}_cbc_dec_abort
268 sub $inp, $out, $blk_init ! $inp!=$out
270 $::code
.=<<___
if (!$::evp
);
271 andcc
$ivec, 7, $ivoff
272 alignaddr
$ivec, %g0, $ivec
274 ldd
[$ivec + 0], %f12 ! load ivec
276 ldd
[$ivec + 8], %f14
277 ldd
[$ivec + 16], %f0
278 faligndata
%f12, %f14, %f12
279 faligndata
%f14, %f0, %f14
282 $::code
.=<<___
if ($::evp
);
283 ld
[$ivec + 0], %f12 ! load ivec
286 ld
[$ivec + 12], %f15
290 prefetch
[$inp + 63], 20
291 call _
${alg
}${bits
}_load_deckey
294 sll
$ileft, 3, $ileft
297 sub $iright, $ileft, $iright
300 movrnz
$ooff, 0, $blk_init ! if ( $out&7 ||
301 movleu
$::size_t_cc
, 0, $blk_init ! $len<256 ||
302 brnz
,pn
$blk_init, .L
${bits
}cbc_dec_blk
! $inp==$out)
303 srl
$omask, $ooff, $omask
305 andcc
$len, 16, %g0 ! is number of blocks even?
307 alignaddrl
$out, %g0, $out
308 bz
%icc, .L
${bits
}_cbc_dec_loop2x
310 .L
${bits
}_cbc_dec_loop
:
316 sllx
%o0, $ileft, %o0
317 srlx
%o1, $iright, %g1
318 sllx
%o1, $ileft, %o1
320 srlx
%o2, $iright, %o2
323 xor %g4, %o0, %o2 ! ^= rk
[0]
328 prefetch
[$out + 63], 22
329 prefetch
[$inp + 16+63], 20
330 call _
${alg
}${bits
}_decrypt_1x
333 fxor
%f12, %f0, %f0 ! ^= ivec
343 brnz
,pt
$len, .L
${bits
}_cbc_dec_loop2x
346 $::code
.=<<___
if ($::evp
);
350 st
%f15, [$ivec + 12]
352 $::code
.=<<___
if (!$::evp
);
353 brnz
,pn
$ivoff, .L
${bits
}_cbc_dec_unaligned_ivec
356 std
%f12, [$ivec + 0] ! write out ivec
357 std
%f14, [$ivec + 8]
360 .L
${bits
}_cbc_dec_abort
:
365 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
366 ! and ~3x deterioration
368 faligndata
%f0, %f0, %f4 ! handle unaligned output
369 faligndata
%f0, %f2, %f6
370 faligndata
%f2, %f2, %f8
372 stda
%f4, [$out + $omask]0xc0 ! partial store
375 orn
%g0, $omask, $omask
376 stda
%f8, [$out + $omask]0xc0 ! partial store
378 brnz
,pt
$len, .L
${bits
}_cbc_dec_loop2x
+4
379 orn
%g0, $omask, $omask
381 $::code
.=<<___
if ($::evp
);
385 st
%f15, [$ivec + 12]
387 $::code
.=<<___
if (!$::evp
);
388 brnz
,pn
$ivoff, .L
${bits
}_cbc_dec_unaligned_ivec
391 std
%f12, [$ivec + 0] ! write out ivec
392 std
%f14, [$ivec + 8]
398 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
400 .L
${bits
}_cbc_dec_loop2x
:
408 sllx
%o0, $ileft, %o0
409 srlx
%o1, $iright, %g1
411 sllx
%o1, $ileft, %o1
412 srlx
%o2, $iright, %g1
414 sllx
%o2, $ileft, %o2
415 srlx
%o3, $iright, %g1
417 sllx
%o3, $ileft, %o3
418 srlx
%o4, $iright, %o4
421 xor %g4, %o0, %o4 ! ^= rk
[0]
430 prefetch
[$out + 63], 22
431 prefetch
[$inp + 32+63], 20
432 call _
${alg
}${bits
}_decrypt_2x
437 fxor
%f12, %f0, %f0 ! ^= ivec
451 brnz
,pt
$len, .L
${bits
}_cbc_dec_loop2x
454 $::code
.=<<___
if ($::evp
);
458 st
%f15, [$ivec + 12]
460 $::code
.=<<___
if (!$::evp
);
461 brnz
,pn
$ivoff, .L
${bits
}_cbc_dec_unaligned_ivec
464 std
%f12, [$ivec + 0] ! write out ivec
465 std
%f14, [$ivec + 8]
472 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
473 ! and ~3x deterioration
475 faligndata
%f0, %f0, %f8 ! handle unaligned output
476 faligndata
%f0, %f2, %f0
477 faligndata
%f2, %f4, %f2
478 faligndata
%f4, %f6, %f4
479 faligndata
%f6, %f6, %f6
480 stda
%f8, [$out + $omask]0xc0 ! partial store
485 orn
%g0, $omask, $omask
486 stda
%f6, [$out + $omask]0xc0 ! partial store
488 brnz
,pt
$len, .L
${bits
}_cbc_dec_loop2x
+4
489 orn
%g0, $omask, $omask
491 $::code
.=<<___
if ($::evp
);
495 st
%f15, [$ivec + 12]
497 $::code
.=<<___
if (!$::evp
);
498 brnz
,pn
$ivoff, .L
${bits
}_cbc_dec_unaligned_ivec
501 std
%f12, [$ivec + 0] ! write out ivec
502 std
%f14, [$ivec + 8]
507 .L
${bits
}_cbc_dec_unaligned_ivec
:
508 alignaddrl
$ivec, $ivoff, %g0 ! handle unaligned ivec
510 srl
$omask, $ivoff, $omask
511 faligndata
%f12, %f12, %f0
512 faligndata
%f12, %f14, %f2
513 faligndata
%f14, %f14, %f4
514 stda
%f0, [$ivec + $omask]0xc0
517 orn
%g0, $omask, $omask
518 stda
%f4, [$ivec + $omask]0xc0
524 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
526 .L
${bits
}cbc_dec_blk
:
527 add
$out, $len, $blk_init
528 and $blk_init, 63, $blk_init ! tail
529 sub $len, $blk_init, $len
530 add
$blk_init, 15, $blk_init ! round up to
16n
532 srl
$blk_init, 4, $blk_init
534 add
$blk_init, 1, $blk_init
536 .L
${bits
}_cbc_dec_blk_loop2x
:
544 sllx
%o0, $ileft, %o0
545 srlx
%o1, $iright, %g1
547 sllx
%o1, $ileft, %o1
548 srlx
%o2, $iright, %g1
550 sllx
%o2, $ileft, %o2
551 srlx
%o3, $iright, %g1
553 sllx
%o3, $ileft, %o3
554 srlx
%o4, $iright, %o4
557 xor %g4, %o0, %o4 ! ^= rk
[0]
566 prefetch
[$inp + 32+63], 20
567 call _
${alg
}${bits
}_decrypt_2x
573 fxor
%f12, %f0, %f0 ! ^= ivec
580 stda
%f0, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
582 stda
%f2, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
584 stda
%f4, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
586 stda
%f6, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
587 bgu
,pt
$::size_t_cc
, .L
${bits
}_cbc_dec_blk_loop2x
590 add
$blk_init, $len, $len
591 andcc
$len, 1, %g0 ! is number of blocks even?
592 membar
#StoreLoad|#StoreStore
593 bnz
,pt
%icc, .L
${bits
}_cbc_dec_loop
595 brnz
,pn
$len, .L
${bits
}_cbc_dec_loop2x
598 $::code
.=<<___
if ($::evp
);
599 st
%f12, [$ivec + 0] ! write out ivec
602 st
%f15, [$ivec + 12]
604 $::code
.=<<___
if (!$::evp
);
608 std
%f12, [$ivec + 0] ! write out ivec
609 std
%f14, [$ivec + 8]
614 .type
${alg
}${bits
}_t4_cbc_decrypt
,#function
615 .size
${alg
}${bits
}_t4_cbc_decrypt
,.-${alg
}${bits
}_t4_cbc_decrypt
619 sub alg_ctr32_implement
{
620 my ($alg,$bits) = @_;
623 .globl
${alg
}${bits
}_t4_ctr32_encrypt
625 ${alg
}${bits
}_t4_ctr32_encrypt
:
626 save
%sp, -$::frame
, %sp
629 prefetch
[$inp + 63], 20
630 call _
${alg
}${bits
}_load_enckey
633 ld
[$ivec + 0], %l4 ! counter
641 xor %o5, %g4, %g4 ! ^= rk
[0]
643 movxtod
%g4, %f14 ! most significant
64 bits
645 sub $inp, $out, $blk_init ! $inp!=$out
648 sll
$ileft, 3, $ileft
651 sub $iright, $ileft, $iright
654 movrnz
$ooff, 0, $blk_init ! if ( $out&7 ||
655 movleu
$::size_t_cc
, 0, $blk_init ! $len<256 ||
656 brnz
,pn
$blk_init, .L
${bits
}_ctr32_blk
! $inp==$out)
657 srl
$omask, $ooff, $omask
659 andcc
$len, 16, %g0 ! is number of blocks even?
660 alignaddrl
$out, %g0, $out
661 bz
%icc, .L
${bits
}_ctr32_loop2x
663 .L
${bits
}_ctr32_loop
:
669 sllx
%o0, $ileft, %o0
670 srlx
%o1, $iright, %g1
671 sllx
%o1, $ileft, %o1
673 srlx
%o2, $iright, %o2
676 xor %g5, %l7, %g1 ! ^= rk
[0]
679 srl
%l7, 0, %l7 ! clruw
680 prefetch
[$out + 63], 22
681 prefetch
[$inp + 16+63], 20
683 $::code
.=<<___
if ($alg eq "aes");
684 aes_eround01
%f16, %f14, %f2, %f4
685 aes_eround23
%f18, %f14, %f2, %f2
687 $::code
.=<<___
if ($alg eq "cmll");
688 camellia_f
%f16, %f2, %f14, %f2
689 camellia_f
%f18, %f14, %f2, %f0
692 call _
${alg
}${bits
}_encrypt_1x
+8
697 fxor
%f10, %f0, %f0 ! ^= inp
705 brnz
,pt
$len, .L
${bits
}_ctr32_loop2x
712 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
713 ! and ~3x deterioration
715 faligndata
%f0, %f0, %f4 ! handle unaligned output
716 faligndata
%f0, %f2, %f6
717 faligndata
%f2, %f2, %f8
718 stda
%f4, [$out + $omask]0xc0 ! partial store
721 orn
%g0, $omask, $omask
722 stda
%f8, [$out + $omask]0xc0 ! partial store
724 brnz
,pt
$len, .L
${bits
}_ctr32_loop2x
+4
725 orn
%g0, $omask, $omask
730 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
732 .L
${bits
}_ctr32_loop2x
:
740 sllx
%o0, $ileft, %o0
741 srlx
%o1, $iright, %g1
743 sllx
%o1, $ileft, %o1
744 srlx
%o2, $iright, %g1
746 sllx
%o2, $ileft, %o2
747 srlx
%o3, $iright, %g1
749 sllx
%o3, $ileft, %o3
750 srlx
%o4, $iright, %o4
753 xor %g5, %l7, %g1 ! ^= rk
[0]
756 srl
%l7, 0, %l7 ! clruw
760 srl
%l7, 0, %l7 ! clruw
761 prefetch
[$out + 63], 22
762 prefetch
[$inp + 32+63], 20
764 $::code
.=<<___
if ($alg eq "aes");
765 aes_eround01
%f16, %f14, %f2, %f8
766 aes_eround23
%f18, %f14, %f2, %f2
767 aes_eround01
%f16, %f14, %f6, %f10
768 aes_eround23
%f18, %f14, %f6, %f6
770 $::code
.=<<___
if ($alg eq "cmll");
771 camellia_f
%f16, %f2, %f14, %f2
772 camellia_f
%f16, %f6, %f14, %f6
773 camellia_f
%f18, %f14, %f2, %f0
774 camellia_f
%f18, %f14, %f6, %f4
777 call _
${alg
}${bits
}_encrypt_2x
+16
783 fxor
%f8, %f0, %f0 ! ^= inp
796 brnz
,pt
$len, .L
${bits
}_ctr32_loop2x
803 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
804 ! and ~3x deterioration
806 faligndata
%f0, %f0, %f8 ! handle unaligned output
807 faligndata
%f0, %f2, %f0
808 faligndata
%f2, %f4, %f2
809 faligndata
%f4, %f6, %f4
810 faligndata
%f6, %f6, %f6
812 stda
%f8, [$out + $omask]0xc0 ! partial store
817 orn
%g0, $omask, $omask
818 stda
%f6, [$out + $omask]0xc0 ! partial store
820 brnz
,pt
$len, .L
${bits
}_ctr32_loop2x
+4
821 orn
%g0, $omask, $omask
826 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
829 add
$out, $len, $blk_init
830 and $blk_init, 63, $blk_init ! tail
831 sub $len, $blk_init, $len
832 add
$blk_init, 15, $blk_init ! round up to
16n
834 srl
$blk_init, 4, $blk_init
836 add
$blk_init, 1, $blk_init
838 .L
${bits
}_ctr32_blk_loop2x
:
846 sllx
%o0, $ileft, %o0
847 srlx
%o1, $iright, %g1
849 sllx
%o1, $ileft, %o1
850 srlx
%o2, $iright, %g1
852 sllx
%o2, $ileft, %o2
853 srlx
%o3, $iright, %g1
855 sllx
%o3, $ileft, %o3
856 srlx
%o4, $iright, %o4
859 xor %g5, %l7, %g1 ! ^= rk
[0]
862 srl
%l7, 0, %l7 ! clruw
866 srl
%l7, 0, %l7 ! clruw
867 prefetch
[$inp + 32+63], 20
869 $::code
.=<<___
if ($alg eq "aes");
870 aes_eround01
%f16, %f14, %f2, %f8
871 aes_eround23
%f18, %f14, %f2, %f2
872 aes_eround01
%f16, %f14, %f6, %f10
873 aes_eround23
%f18, %f14, %f6, %f6
875 $::code
.=<<___
if ($alg eq "cmll");
876 camellia_f
%f16, %f2, %f14, %f2
877 camellia_f
%f16, %f6, %f14, %f6
878 camellia_f
%f18, %f14, %f2, %f0
879 camellia_f
%f18, %f14, %f6, %f4
882 call _
${alg
}${bits
}_encrypt_2x
+16
889 fxor
%f8, %f0, %f0 ! ^= inp
895 stda
%f0, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
897 stda
%f2, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
899 stda
%f4, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
901 stda
%f6, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
902 bgu
,pt
$::size_t_cc
, .L
${bits
}_ctr32_blk_loop2x
905 add
$blk_init, $len, $len
906 andcc
$len, 1, %g0 ! is number of blocks even?
907 membar
#StoreLoad|#StoreStore
908 bnz
,pt
%icc, .L
${bits
}_ctr32_loop
910 brnz
,pn
$len, .L
${bits
}_ctr32_loop2x
915 .type
${alg
}${bits
}_t4_ctr32_encrypt
,#function
916 .size
${alg
}${bits
}_t4_ctr32_encrypt
,.-${alg
}${bits
}_t4_ctr32_encrypt
920 sub alg_xts_implement
{
921 my ($alg,$bits,$dir) = @_;
922 my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
926 .globl
${alg
}${bits
}_t4_xts_
${dir
}crypt
928 ${alg
}${bits
}_t4_xts_
${dir
}crypt:
929 save
%sp, -$::frame
-16, %sp
932 add
%fp, $::bias
-16, %o1
933 call
${alg
}_t4_encrypt
936 add
%fp, $::bias
-16, %l7
938 add
%fp, $::bias
-8, %l7
939 ldxa
[%l7]0x88, %g3 ! %g3:%g2 is tweak
941 sethi
%hi(0x76543210), %l7
942 or %l7, %lo(0x76543210), %l7
943 bmask
%l7, %g0, %g0 ! byte swap mask
946 prefetch
[$inp + 63], 20
947 call _
${alg
}${bits
}_load_
${dir
}ckey
951 $code.=<<___
if ($dir eq "de");
958 sub $inp, $out, $blk_init ! $inp!=$out
961 sll
$ileft, 3, $ileft
964 sub $iright, $ileft, $iright
967 movrnz
$ooff, 0, $blk_init ! if ( $out&7 ||
968 movleu
$::size_t_cc
, 0, $blk_init ! $len<256 ||
969 brnz
,pn
$blk_init, .L
${bits
}_xts_
${dir
}blk
! $inp==$out)
970 srl
$omask, $ooff, $omask
972 andcc
$len, 16, %g0 ! is number of blocks even?
974 $code.=<<___
if ($dir eq "de");
975 brz
,pn
$len, .L
${bits
}_xts_
${dir
}steal
978 alignaddrl
$out, %g0, $out
979 bz
%icc, .L
${bits
}_xts_
${dir
}loop2x
981 .L
${bits
}_xts_
${dir
}loop:
987 sllx
%o0, $ileft, %o0
988 srlx
%o1, $iright, %g1
989 sllx
%o1, $ileft, %o1
991 srlx
%o2, $iright, %o2
996 bshuffle
%f12, %f12, %f12
997 bshuffle
%f14, %f14, %f14
999 xor %g4, %o0, %o0 ! ^= rk
[0]
1004 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1007 prefetch
[$out + 63], 22
1008 prefetch
[$inp + 16+63], 20
1009 call _
${alg
}${bits
}_
${dir
}crypt_1x
1012 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1015 srax
%g3, 63, %l7 ! next tweak value
1026 brnz
,pt
$len, .L
${bits
}_xts_
${dir
}loop2x
1029 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1036 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
1037 ! and ~3x deterioration
1039 faligndata
%f0, %f0, %f4 ! handle unaligned output
1040 faligndata
%f0, %f2, %f6
1041 faligndata
%f2, %f2, %f8
1042 stda
%f4, [$out + $omask]0xc0 ! partial store
1045 orn
%g0, $omask, $omask
1046 stda
%f8, [$out + $omask]0xc0 ! partial store
1048 brnz
,pt
$len, .L
${bits
}_xts_
${dir
}loop2x
+4
1049 orn
%g0, $omask, $omask
1051 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1057 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1059 .L
${bits
}_xts_
${dir
}loop2x
:
1062 ldx
[$inp + 16], %o2
1064 ldx
[$inp + 24], %o3
1066 ldx
[$inp + 32], %o4
1067 sllx
%o0, $ileft, %o0
1068 srlx
%o1, $iright, %g1
1070 sllx
%o1, $ileft, %o1
1071 srlx
%o2, $iright, %g1
1073 sllx
%o2, $ileft, %o2
1074 srlx
%o3, $iright, %g1
1076 sllx
%o3, $ileft, %o3
1077 srlx
%o4, $iright, %o4
1082 bshuffle
%f12, %f12, %f12
1083 bshuffle
%f14, %f14, %f14
1085 srax
%g3, 63, %l7 ! next tweak value
1093 bshuffle
%f8, %f8, %f8
1094 bshuffle
%f10, %f10, %f10
1096 xor %g4, %o0, %o0 ! ^= rk
[0]
1098 xor %g4, %o2, %o2 ! ^= rk
[0]
1105 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1107 fxor
%f8, %f4, %f4 ! ^= tweak
[0]
1110 prefetch
[$out + 63], 22
1111 prefetch
[$inp + 32+63], 20
1112 call _
${alg
}${bits
}_
${dir
}crypt_2x
1118 srax
%g3, 63, %l7 ! next tweak value
1124 bshuffle
%f8, %f8, %f8
1125 bshuffle
%f10, %f10, %f10
1127 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1137 std
%f4, [$out + 16]
1138 std
%f6, [$out + 24]
1139 brnz
,pt
$len, .L
${bits
}_xts_
${dir
}loop2x
1144 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1151 2: ldxa
[$inp]0x82, %o0 ! avoid
read-after
-write hazard
1152 ! and ~3x deterioration
1154 faligndata
%f0, %f0, %f8 ! handle unaligned output
1155 faligndata
%f0, %f2, %f10
1156 faligndata
%f2, %f4, %f12
1157 faligndata
%f4, %f6, %f14
1158 faligndata
%f6, %f6, %f0
1160 stda
%f8, [$out + $omask]0xc0 ! partial store
1161 std
%f10, [$out + 8]
1162 std
%f12, [$out + 16]
1163 std
%f14, [$out + 24]
1165 orn
%g0, $omask, $omask
1166 stda
%f0, [$out + $omask]0xc0 ! partial store
1168 brnz
,pt
$len, .L
${bits
}_xts_
${dir
}loop2x
+4
1169 orn
%g0, $omask, $omask
1173 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1179 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1181 .L
${bits
}_xts_
${dir
}blk
:
1182 add
$out, $len, $blk_init
1183 and $blk_init, 63, $blk_init ! tail
1184 sub $len, $blk_init, $len
1185 add
$blk_init, 15, $blk_init ! round up to
16n
1187 srl
$blk_init, 4, $blk_init
1189 add
$blk_init, 1, $blk_init
1191 .L
${bits
}_xts_
${dir
}blk2x
:
1194 ldx
[$inp + 16], %o2
1196 ldx
[$inp + 24], %o3
1198 ldx
[$inp + 32], %o4
1199 sllx
%o0, $ileft, %o0
1200 srlx
%o1, $iright, %g1
1202 sllx
%o1, $ileft, %o1
1203 srlx
%o2, $iright, %g1
1205 sllx
%o2, $ileft, %o2
1206 srlx
%o3, $iright, %g1
1208 sllx
%o3, $ileft, %o3
1209 srlx
%o4, $iright, %o4
1214 bshuffle
%f12, %f12, %f12
1215 bshuffle
%f14, %f14, %f14
1217 srax
%g3, 63, %l7 ! next tweak value
1225 bshuffle
%f8, %f8, %f8
1226 bshuffle
%f10, %f10, %f10
1228 xor %g4, %o0, %o0 ! ^= rk
[0]
1230 xor %g4, %o2, %o2 ! ^= rk
[0]
1237 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1239 fxor
%f8, %f4, %f4 ! ^= tweak
[0]
1242 prefetch
[$inp + 32+63], 20
1243 call _
${alg
}${bits
}_
${dir
}crypt_2x
1249 srax
%g3, 63, %l7 ! next tweak value
1255 bshuffle
%f8, %f8, %f8
1256 bshuffle
%f10, %f10, %f10
1258 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1264 stda
%f0, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
1266 stda
%f2, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
1268 stda
%f4, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
1270 stda
%f6, [$out]0xe2 ! ASI_BLK_INIT
, T4
-specific
1271 bgu
,pt
$::size_t_cc
, .L
${bits
}_xts_
${dir
}blk2x
1274 add
$blk_init, $len, $len
1275 andcc
$len, 1, %g0 ! is number of blocks even?
1276 membar
#StoreLoad|#StoreStore
1277 bnz
,pt
%icc, .L
${bits
}_xts_
${dir
}loop
1279 brnz
,pn
$len, .L
${bits
}_xts_
${dir
}loop2x
1284 brnz
,pn
$rem, .L
${bits
}_xts_
${dir
}steal
1289 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1291 $code.=<<___
if ($dir eq "en");
1293 .L
${bits
}_xts_
${dir
}steal
:
1294 std
%f0, [%fp + $::bias
-16] ! copy of output
1295 std
%f2, [%fp + $::bias
-8]
1297 srl
$ileft, 3, $ileft
1298 add
%fp, $::bias
-16, %l7
1299 add
$inp, $ileft, $inp ! original
$inp+$len&-15
1300 add
$out, $ooff, $out ! original
$out+$len&-15
1304 .L
${bits
}_xts_
${dir
}stealing
:
1305 ldub
[$inp + $ileft], %o0
1306 ldub
[%l7 + $ileft], %o1
1308 stb
%o0, [%l7 + $ileft]
1309 stb
%o1, [$out + $ileft]
1310 brnz
$rem, .L
${bits
}_xts_
${dir
}stealing
1316 sub $out, $ooff, $out
1317 ba
.L
${bits
}_xts_
${dir
}loop ! one more
time
1318 mov
1, $len ! $rem is
0
1320 $code.=<<___
if ($dir eq "de");
1322 .L
${bits
}_xts_
${dir
}steal
:
1327 ldx
[$inp + 16], %o2
1328 sllx
%o0, $ileft, %o0
1329 srlx
%o1, $iright, %g1
1330 sllx
%o1, $ileft, %o1
1332 srlx
%o2, $iright, %o2
1335 srax
%g3, 63, %l7 ! next tweak value
1343 bshuffle
%f12, %f12, %f12
1344 bshuffle
%f14, %f14, %f14
1346 xor %g4, %o0, %o0 ! ^= rk
[0]
1351 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1354 call _
${alg
}${bits
}_
${dir
}crypt_1x
1357 fxor
%f12, %f0, %f0 ! ^= tweak
[0]
1360 std
%f0, [%fp + $::bias
-16]
1361 std
%f2, [%fp + $::bias
-8]
1363 srl
$ileft, 3, $ileft
1364 add
%fp, $::bias
-16, %l7
1365 add
$inp, $ileft, $inp ! original
$inp+$len&-15
1366 add
$out, $ooff, $out ! original
$out+$len&-15
1371 .L
${bits
}_xts_
${dir
}stealing
:
1372 ldub
[$inp + $ileft], %o0
1373 ldub
[%l7 + $ileft], %o1
1375 stb
%o0, [%l7 + $ileft]
1376 stb
%o1, [$out + $ileft]
1377 brnz
$rem, .L
${bits
}_xts_
${dir
}stealing
1383 sub $out, $ooff, $out
1384 ba
.L
${bits
}_xts_
${dir
}loop ! one more
time
1385 mov
1, $len ! $rem is
0
1390 .type
${alg
}${bits
}_t4_xts_
${dir
}crypt,#function
1391 .size
${alg
}${bits
}_t4_xts_
${dir
}crypt,.-${alg
}${bits
}_t4_xts_
${dir
}crypt
1395 # The purpose of these subroutines is to explicitly encode VIS instructions,
1396 # so that one can compile the module without having to specify VIS
1397 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1398 # The idea is to preserve the option of producing a "universal" binary and
1399 # let the programmer detect at run-time whether the current CPU is VIS capable.
1401 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1403 my %visopf = ( "faligndata" => 0x048,
1404 "bshuffle" => 0x04c,
1409 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1411 if ($opf=$visopf{$mnemonic}) {
1412 foreach ($rs1,$rs2,$rd) {
1413 return $ref if (!/%f([0-9]{1,2})/);
1416 return $ref if ($1&1);
1417 # re-encode for upper double register addressing
1422 return sprintf ".word\t0x%08x !%s",
1423 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1431 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1432 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1434 my %visopf = ( "addxc" => 0x011,
1437 "alignaddr" => 0x018,
1439 "alignaddrl" => 0x01a );
1441 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1443 if ($opf=$visopf{$mnemonic}) {
1444 foreach ($rs1,$rs2,$rd) {
1445 return $ref if (!/%([goli])([0-9])/);
1449 return sprintf ".word\t0x%08x !%s",
1450 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# unaes_round: hand-encode one 4-operand AES instruction.
#
# Arguments: the mnemonic followed by the rs1, rs2, rs3 and rd operands as
#            text (emit_assembler passes the pieces captured by its regex).
# Returns:   for a mnemonic listed in %aesopf, a ".word" directive holding
#            the encoded instruction; the plain instruction text ($ref) is
#            returned instead when rs1, rs2 or rd is not an %f register or
#            fails the odd-register check below.
# NOTE(review): the handling of mnemonics that are not in %aesopf is not
# visible in this excerpt - confirm against the full file.
1457 sub unaes_round
{ # 4-argument instructions
1458 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
# Mnemonic -> opcode-field value placed at bit 5 of the encoded word.
1460 my %aesopf = ( "aes_eround01" => 0,
1461 "aes_eround23" => 1,
1462 "aes_dround01" => 2,
1463 "aes_dround23" => 3,
1464 "aes_eround01_l"=> 4,
1465 "aes_eround23_l"=> 5,
1466 "aes_dround01_l"=> 6,
1467 "aes_dround23_l"=> 7,
1468 "aes_kexpand1" => 8 );
# Textual form of the instruction, used as the fallback return value.
1470 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1472 if (defined($opf=$aesopf{$mnemonic})) {
# An even-numbered %f register given as rs3 is folded into 5 bits (bit 5 of
# the register number moves to bit 0); any other rs3 text is used as is.
1473 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ?
(($1|$1>>5)&31) : $rs3;
1474 foreach ($rs1,$rs2,$rd) {
1475 return $ref if (!/%f([0-9]{1,2})/);
# NOTE(review): the condition guarding this odd-register bail-out is not
# visible in this excerpt.
1478 return $ref if ($1&1);
1479 # re-encode for upper double register addressing
1484 return sprintf ".word\t0x%08x !%s",
1485 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# unaes_kexpand: hand-encode aes_kexpand0 / aes_kexpand2.
#
# Arguments: the mnemonic followed by the rs1, rs2 and rd operands as text.
# Returns:   for a mnemonic listed in %aesopf, a ".word" directive holding
#            the encoded instruction; the plain instruction text ($ref) is
#            returned instead when an operand is not an %f register or
#            fails the odd-register check below.
# NOTE(review): the handling of other mnemonics is not visible in this
# excerpt - confirm against the full file.
1492 sub unaes_kexpand
{ # 3-argument instructions
1493 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Mnemonic -> opcode-field value placed at bit 5 of the encoded word.
1495 my %aesopf = ( "aes_kexpand0" => 0x130,
1496 "aes_kexpand2" => 0x131 );
# Textual form of the instruction, used as the fallback return value.
1498 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1500 if (defined($opf=$aesopf{$mnemonic})) {
1501 foreach ($rs1,$rs2,$rd) {
1502 return $ref if (!/%f([0-9]{1,2})/);
# NOTE(review): the condition guarding this odd-register bail-out is not
# visible in this excerpt.
1505 return $ref if ($1&1);
1506 # re-encode for upper double register addressing
1511 return sprintf ".word\t0x%08x !%s",
1512 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# uncamellia_f: hand-encode the 4-operand camellia_f instruction.
#
# Arguments: the mnemonic followed by the rs1, rs2, rs3 and rd operands as
#            text (emit_assembler passes the pieces captured by its regex).
# Returns:   a ".word" directive holding the encoded instruction, or the
#            plain instruction text ($ref) when rs1, rs2 or rd is not an %f
#            register or fails the odd-register check below.
1519 sub uncamellia_f
{ # 4-argument instructions
1520 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
# Textual form of the instruction, used as the fallback return value.
1523 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
# An even-numbered %f register given as rs3 is folded into 5 bits (bit 5 of
# the register number moves to bit 0); any other rs3 text is used as is.
1526 $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ?
(($1|$1>>5)&31) : $rs3;
1527 foreach ($rs1,$rs2,$rd) {
1528 return $ref if (!/%f([0-9]{1,2})/);
# NOTE(review): the condition guarding this odd-register bail-out is not
# visible in this excerpt.
1531 return $ref if ($1&1);
1532 # re-encode for upper double register addressing
1537 return sprintf ".word\t0x%08x !%s",
1538 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
# uncamellia3: hand-encode camellia_fl / camellia_fli.
#
# Arguments: the mnemonic followed by the rs1, rs2 and rd operands as text.
# Returns:   for a mnemonic listed in %cmllopf, a ".word" directive holding
#            the encoded instruction; the plain instruction text ($ref) is
#            returned instead when an operand is not an %f register or
#            fails the odd-register check below.
# NOTE(review): the handling of other mnemonics is not visible in this
# excerpt - confirm against the full file.
1545 sub uncamellia3
{ # 3-argument instructions
1546 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Mnemonic -> opcode-field value placed at bit 5 of the encoded word.
1548 my %cmllopf = ( "camellia_fl" => 0x13c,
1549 "camellia_fli" => 0x13d );
# Textual form of the instruction, used as the fallback return value.
1551 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1553 if (defined($opf=$cmllopf{$mnemonic})) {
1554 foreach ($rs1,$rs2,$rd) {
1555 return $ref if (!/%f([0-9]{1,2})/);
# NOTE(review): the condition guarding this odd-register bail-out is not
# visible in this excerpt.
1558 return $ref if ($1&1);
1559 # re-encode for upper double register addressing
1564 return sprintf ".word\t0x%08x !%s",
1565 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# unmovxtox: hand-encode a 2-operand register-move instruction between the
# floating-point and integer register files.
#
# Arguments: the mnemonic followed by the source and destination operands
#            as text (emit_assembler passes the pieces captured by its regex).
# Returns:   for a mnemonic listed in %movxopf, a ".word" directive holding
#            the encoded instruction; the plain instruction text ($ref) is
#            returned instead when an operand is not a %f/%g/%o/%l/%i
#            register or fails the odd-register check below.
# NOTE(review): the handling of other mnemonics is not visible in this
# excerpt - confirm against the full file.
1572 sub unmovxtox
{ # 2-argument instructions
1573 my ($mnemonic,$rs,$rd)=@_;
# Per register-class offsets; presumably added to the register index to form
# the 5-bit register number - the use is not visible here, TODO confirm.
1574 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
# Mnemonic -> opcode-field value placed at bit 5 of the encoded word.
1576 my %movxopf = ( "movdtox" => 0x110,
1577 "movstouw" => 0x111,
1578 "movstosw" => 0x113,
1580 "movwtos" => 0x119 );
# Textual form of the instruction, used as the fallback return value.
1582 $ref = "$mnemonic\t$rs,$rd";
1584 if (defined($opf=$movxopf{$mnemonic})) {
# $1 is the register-class letter, $2 the register index.
1586 return $ref if (!/%([fgoli])([0-9]{1,2})/);
# NOTE(review): the condition guarding this odd-register bail-out is not
# visible in this excerpt.
1589 return $ref if ($2&1);
1590 # re-encode for upper double register addressing
1595 return sprintf ".word\t0x%08x !%s",
1596 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
1604 my ($mnemonic)=shift;
1607 my %desopf = ( "des_round" => 0b1001
,
1608 "des_ip" => 0b100110100
,
1609 "des_iip" => 0b100110101
,
1610 "des_kexpand" => 0b100110110
);
1612 $ref = "$mnemonic\t".join(",",@_);
1614 if (defined($opf=$desopf{$mnemonic})) { # 4-arg
1615 if ($mnemonic eq "des_round") {
1616 foreach (@args[0..3]) {
1617 return $ref if (!/%f([0-9]{1,2})/);
1620 return $ref if ($1&1);
1621 # re-encode for upper double register addressing
1625 return sprintf ".word\t0x%08x !%s",
1626 2<<30|0b011001
<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
1628 } elsif ($mnemonic eq "des_kexpand") { # 3-arg
1629 foreach (@args[0..2]) {
1630 return $ref if (!/(%f)?([0-9]{1,2})/);
1633 return $ref if ($2&1);
1634 # re-encode for upper double register addressing
1638 return sprintf ".word\t0x%08x !%s",
1639 2<<30|0b110110
<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
1642 foreach (@args[0..1]) {
1643 return $ref if (!/%f([0-9]{1,2})/);
1646 return $ref if ($2&1);
1647 # re-encode for upper double register addressing
1651 return sprintf ".word\t0x%08x !%s",
1652 2<<30|0b110110
<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
1660 sub emit_assembler
{
1661 foreach (split("\n",$::code
)) {
1662 s/\`([^\`]*)\`/eval $1/ge;
1664 s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
1666 s
/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1667 &unaes_round
($1,$2,$3,$4,$5)
1669 s
/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1670 &unaes_kexpand
($1,$2,$3,$4)
1672 s
/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1673 &uncamellia_f
($1,$2,$3,$4,$5)
1675 s
/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1676 &uncamellia3
($1,$2,$3,$4)
1678 s
/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
1679 &undes
($1,$2,$3,$4,$5)
1681 s
/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
1682 &unmovxtox
($1,$2,$3)
1684 s
/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
1685 &unmovxtox
($1,$2,$3)
1687 s
/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1690 s
/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1691 &unvis3
($1,$2,$3,$4)