2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # ECP_NISTZ256 module for PPC64.
21 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22 # http://eprint.iacr.org/2013/816.
24 # with/without -DECP_NISTZ256_ASM
# Pick the first remaining argument that looks like a filename (i.e. has
# a dotted extension) as the output file; preceding arguments are skipped.
29 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Derive this script's directory so the ppc-xlate.pl assembler translator
# can be located either alongside the script or in the shared perlasm dir.
31 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
32 ( $xlate="${dir}ppc-xlate.pl" and -f
$xlate ) or
33 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f
$xlate) or
34 die "can't locate ppc-xlate.pl";
# Pipe the generated assembly through the translator into the output file.
# NOTE(review): $flavour is used here but is not assigned anywhere in the
# visible excerpt — presumably taken from @ARGV earlier; confirm against
# the full file.
36 open OUT
,"| \"$^X\" $xlate $flavour $output";
# Register assignments (PPC64 GPRs): the pointer/index and first batch of
# accumulator variables map onto the volatile argument registers r3..r12,
# the remaining accumulators, operand words and temporaries onto the
# callee-saved registers r22..r31.
42 my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
43 $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
44 map("r$_",(3..12,22..31));
# $bp/$bi are not needed while squaring, so their registers are reused as
# two extra accumulator words there.
46 my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont
52 ########################################################################
53 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
55 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
56 open TABLE
,"<ecp_nistz256_table.c" or
57 open TABLE
,"<${dir}../ecp_nistz256_table.c" or
58 die "failed to open ecp_nistz256_table.c:",$!;
63 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
67 # See ecp_nistz256_table.c for explanation for why it's 64*16*37.
68 # 64*16*37-1 is because $#arr returns last valid index of @arr, not
70 die "insane number of elements" if ($#arr != 64*16*37-1);
73 .type ecp_nistz256_precomputed
,\
@object
74 .globl ecp_nistz256_precomputed
76 ecp_nistz256_precomputed
:
78 ########################################################################
79 # this conversion smashes P256_POINT_AFFINE by individual bytes with
80 # 64 byte interval, similar to
84 @tbl = splice(@arr,0,64*16);
85 for($i=0;$i<64;$i++) {
87 for($j=0;$j<64;$j++) {
88 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
91 $code.=join(',',map { sprintf "0x%02x",$_} @line);
97 .size ecp_nistz256_precomputed
,.-ecp_nistz256_precomputed
98 .asciz
"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
100 # void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
101 # const BN_ULONG x2[4]);
102 .globl ecp_nistz256_mul_mont
104 ecp_nistz256_mul_mont
:
125 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
127 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
129 bl __ecp_nistz256_mul_mont
145 .byte
0,12,4,0,0x80,10,3,0
147 .size ecp_nistz256_mul_mont
,.-ecp_nistz256_mul_mont
149 # void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
150 .globl ecp_nistz256_sqr_mont
152 ecp_nistz256_sqr_mont
:
172 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
174 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
176 bl __ecp_nistz256_sqr_mont
192 .byte
0,12,4,0,0x80,10,2,0
194 .size ecp_nistz256_sqr_mont
,.-ecp_nistz256_sqr_mont
196 # void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
197 # const BN_ULONG x2[4]);
198 .globl ecp_nistz256_add
218 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
220 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
222 bl __ecp_nistz256_add
232 .byte
0,12,4,0,0x80,4,3,0
234 .size ecp_nistz256_add
,.-ecp_nistz256_add
236 # void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
237 .globl ecp_nistz256_div_by_2
239 ecp_nistz256_div_by_2
:
253 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
255 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
257 bl __ecp_nistz256_div_by_2
267 .byte
0,12,4,0,0x80,4,2,0
269 .size ecp_nistz256_div_by_2
,.-ecp_nistz256_div_by_2
271 # void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
272 .globl ecp_nistz256_mul_by_2
274 ecp_nistz256_mul_by_2
:
293 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
295 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
297 bl __ecp_nistz256_add
# ret = a+a // 2*a
307 .byte
0,12,4,0,0x80,4,3,0
309 .size ecp_nistz256_mul_by_2
,.-ecp_nistz256_mul_by_2
311 # void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
312 .globl ecp_nistz256_mul_by_3
314 ecp_nistz256_mul_by_3
:
337 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
339 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
341 bl __ecp_nistz256_add
# ret = a+a // 2*a
348 bl __ecp_nistz256_add
# ret += a // 2*a+a=3*a
358 .byte
0,12,4,0,0x80,4,2,0
360 .size ecp_nistz256_mul_by_3
,.-ecp_nistz256_mul_by_3
362 # void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
363 # const BN_ULONG x2[4]);
364 .globl ecp_nistz256_sub
380 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
382 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
384 bl __ecp_nistz256_sub_from
394 .byte
0,12,4,0,0x80,4,3,0
396 .size ecp_nistz256_sub
,.-ecp_nistz256_sub
398 # void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
399 .globl ecp_nistz256_neg
416 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
418 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
420 bl __ecp_nistz256_sub_from
430 .byte
0,12,4,0,0x80,4,2,0
432 .size ecp_nistz256_neg
,.-ecp_nistz256_neg
434 # note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
435 # to $a0-$a3 and b[0] - to $bi
436 .type __ecp_nistz256_mul_mont
,\
@function
438 __ecp_nistz256_mul_mont
:
439 mulld
$acc0,$a0,$bi # a[0]*b[0]
442 mulld
$acc1,$a1,$bi # a[1]*b[0]
445 mulld
$acc2,$a2,$bi # a[2]*b[0]
448 mulld
$acc3,$a3,$bi # a[3]*b[0]
452 addc
$acc1,$acc1,$t0 # accumulate high parts of multiplication
460 for($i=1;$i<4;$i++) {
461 ################################################################
462 # Reduction iteration is normally performed by accumulating
463 # result of multiplication of modulus by "magic" digit [and
464 # omitting least significant word, which is guaranteed to
465 # be 0], but thanks to special form of modulus and "magic"
466 # digit being equal to least significant word, it can be
467 # performed with additions and subtractions alone. Indeed:
469 # ffff0001.00000000.0000ffff.ffffffff
471 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
473 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
476 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
477 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
478 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
480 # or marking redundant operations:
482 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
483 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
484 # - 0000abcd.efgh0000.--------.--------.--------
487 subfc
$t2,$t0,$acc0 # "*0xffff0001"
489 addc
$acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
491 adde
$acc2,$acc3,$t2 # +=acc[0]*0xffff0001
495 mulld
$t0,$a0,$bi # lo(a[0]*b[i])
496 mulld
$t1,$a1,$bi # lo(a[1]*b[i])
497 mulld
$t2,$a2,$bi # lo(a[2]*b[i])
498 mulld
$t3,$a3,$bi # lo(a[3]*b[i])
499 addc
$acc0,$acc0,$t0 # accumulate low parts of multiplication
500 mulhdu
$t0,$a0,$bi # hi(a[0]*b[i])
502 mulhdu
$t1,$a1,$bi # hi(a[1]*b[i])
504 mulhdu
$t2,$a2,$bi # hi(a[2]*b[i])
506 mulhdu
$t3,$a3,$bi # hi(a[3]*b[i])
509 $code.=<<___
if ($i<3);
510 ld
$bi,8*($i+1)($bp) # b[$i+1]
513 addc
$acc1,$acc1,$t0 # accumulate high parts of multiplication
525 subfc
$t2,$t0,$acc0 # "*0xffff0001"
527 addc
$acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
529 adde
$acc2,$acc3,$t2 # +=acc[0]*0xffff0001
534 addic
$acc0,$acc0,1 # ret -= modulus
535 subfe
$acc1,$poly1,$acc1
536 subfe
$acc2,$t2,$acc2
537 subfe
$acc3,$poly3,$acc3
538 subfe
$acc4,$t2,$acc4
540 addc
$acc0,$acc0,$acc4 # ret += modulus if borrow
554 .byte
0,12,0x14,0,0,0,1,0
556 .size __ecp_nistz256_mul_mont
,.-__ecp_nistz256_mul_mont
558 # note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
560 .type __ecp_nistz256_sqr_mont
,\
@function
562 __ecp_nistz256_sqr_mont
:
563 ################################################################
564 # | | | | | |a1*a0| |
565 # | | | | |a2*a0| | |
566 # | |a3*a2|a3*a0| | | |
567 # | | | |a2*a1| | | |
568 # | | |a3*a1| | | | |
569 # *| | | | | | | | 2|
570 # +|a3*a3|a2*a2|a1*a1|a0*a0|
571 # |--+--+--+--+--+--+--+--|
572 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
574 # "can't overflow" below mark carrying into high part of
575 # multiplication result, which can't overflow, because it
576 # can never be all ones.
578 mulld
$acc1,$a1,$a0 # a[1]*a[0]
580 mulld
$acc2,$a2,$a0 # a[2]*a[0]
582 mulld
$acc3,$a3,$a0 # a[3]*a[0]
585 addc
$acc2,$acc2,$t1 # accumulate high parts of multiplication
586 mulld
$t0,$a2,$a1 # a[2]*a[1]
589 mulld
$t2,$a3,$a1 # a[3]*a[1]
591 addze
$acc4,$acc4 # can't overflow
593 mulld
$acc5,$a3,$a2 # a[3]*a[2]
596 addc
$t1,$t1,$t2 # accumulate high parts of multiplication
597 addze
$t2,$t3 # can't overflow
599 addc
$acc3,$acc3,$t0 # accumulate low parts of multiplication
602 addze
$acc6,$acc6 # can't overflow
604 addc
$acc1,$acc1,$acc1 # acc[1-6]*=2
605 adde
$acc2,$acc2,$acc2
606 adde
$acc3,$acc3,$acc3
607 adde
$acc4,$acc4,$acc4
608 adde
$acc5,$acc5,$acc5
609 adde
$acc6,$acc6,$acc6
613 mulld
$acc0,$a0,$a0 # a[0]*a[0]
615 mulld
$t1,$a1,$a1 # a[1]*a[1]
617 mulld
$t2,$a2,$a2 # a[2]*a[2]
619 mulld
$t3,$a3,$a3 # a[3]*a[3]
621 addc
$acc1,$acc1,$a0 # +a[i]*a[i]
631 for($i=0;$i<3;$i++) { # reductions, see commentary in
632 # multiplication for details
634 subfc
$t2,$t0,$acc0 # "*0xffff0001"
636 addc
$acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
640 adde
$acc2,$acc3,$t2 # +=acc[0]*0xffff0001
641 addze
$acc3,$t3 # can't overflow
645 subfc
$t2,$t0,$acc0 # "*0xffff0001"
647 addc
$acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
649 adde
$acc2,$acc3,$t2 # +=acc[0]*0xffff0001
650 addze
$acc3,$t3 # can't overflow
652 addc
$acc0,$acc0,$acc4 # accumulate upper half
653 adde
$acc1,$acc1,$acc5
654 adde
$acc2,$acc2,$acc6
655 adde
$acc3,$acc3,$acc7
659 addic
$acc0,$acc0,1 # ret -= modulus
660 subfe
$acc1,$poly1,$acc1
661 subfe
$acc2,$t2,$acc2
662 subfe
$acc3,$poly3,$acc3
663 subfe
$acc4,$t2,$acc4
665 addc
$acc0,$acc0,$acc4 # ret += modulus if borrow
679 .byte
0,12,0x14,0,0,0,1,0
681 .size __ecp_nistz256_sqr_mont
,.-__ecp_nistz256_sqr_mont
683 # Note that __ecp_nistz256_add expects both input vectors pre-loaded to
684 # $a0-$a3 and $t0-$t3. This is done because it's used in multiple
685 # contexts, e.g. in multiplication by 2 and 3...
686 .type __ecp_nistz256_add
,\
@function
689 addc
$acc0,$acc0,$t0 # ret = a+b
696 # if a+b >= modulus, subtract modulus
698 # But since comparison implies subtraction, we subtract
699 # modulus and then add it back if subtraction borrowed.
702 subfe
$acc1,$poly1,$acc1
703 subfe
$acc2,$t2,$acc2
704 subfe
$acc3,$poly3,$acc3
721 .byte
0,12,0x14,0,0,0,3,0
723 .size __ecp_nistz256_add
,.-__ecp_nistz256_add
725 .type __ecp_nistz256_sub_from
,\
@function
727 __ecp_nistz256_sub_from
:
732 subfc
$acc0,$t0,$acc0 # ret = a-b
733 subfe
$acc1,$t1,$acc1
734 subfe
$acc2,$t2,$acc2
735 subfe
$acc3,$t3,$acc3
736 subfe
$t0,$t0,$t0 # t0 = borrow ? -1 : 0
738 # if a-b borrowed, add modulus
740 addc
$acc0,$acc0,$t0 # ret -= modulus & t0
754 .byte
0,12,0x14,0,0,0,3,0
756 .size __ecp_nistz256_sub_from
,.-__ecp_nistz256_sub_from
758 .type __ecp_nistz256_sub_morf
,\
@function
760 __ecp_nistz256_sub_morf
:
765 subfc
$acc0,$acc0,$t0 # ret = b-a
766 subfe
$acc1,$acc1,$t1
767 subfe
$acc2,$acc2,$t2
768 subfe
$acc3,$acc3,$t3
769 subfe
$t0,$t0,$t0 # t0 = borrow ? -1 : 0
771 # if b-a borrowed, add modulus
773 addc
$acc0,$acc0,$t0 # ret -= modulus & t0
787 .byte
0,12,0x14,0,0,0,3,0
789 .size __ecp_nistz256_sub_morf
,.-__ecp_nistz256_sub_morf
791 .type __ecp_nistz256_div_by_2
,\
@function
793 __ecp_nistz256_div_by_2
:
795 addic
$acc0,$acc0,-1 # a += modulus
797 adde
$acc1,$acc1,$poly1
801 adde
$acc3,$acc3,$poly3
803 addze
$ap,$t2 # ap = carry
806 subfc
$acc0,$t0,$acc0 # a -= modulus if a was even
807 subfe
$acc1,$t1,$acc1
808 subfe
$acc2,$t2,$acc2
809 subfe
$acc3,$t3,$acc3
832 .byte
0,12,0x14,0,0,0,1,0
834 .size __ecp_nistz256_div_by_2
,.-__ecp_nistz256_div_by_2
836 ########################################################################
837 # following subroutines are "literal" implementation of those found in
840 ########################################################################
841 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
# Stack frame for ecp_nistz256_point_double: 64-byte linkage/red-zone area,
# four 32-byte (256-bit) scratch vectors, plus a save area for 12
# callee-saved GPRs (r20-r31).
844 my $FRAME=64+32*4+12*8;
# Byte offsets (relative to $sp) of the scratch vectors S, M, Zsqr, tmp0.
845 my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
846 # above map() describes stack layout with 4 temporary
847 # 256-bit vectors on top.
# Non-volatile copies of the out/in pointers, preserved across helper calls.
848 my ($rp_real,$ap_real) = map("r$_",(20,21));
851 .globl ecp_nistz256_point_double
853 ecp_nistz256_point_double
:
854 stdu
$sp,-$FRAME($sp)
856 std r20
,$FRAME-8*12($sp)
857 std r21
,$FRAME-8*11($sp)
858 std r22
,$FRAME-8*10($sp)
859 std r23
,$FRAME-8*9($sp)
860 std r24
,$FRAME-8*8($sp)
861 std r25
,$FRAME-8*7($sp)
862 std r26
,$FRAME-8*6($sp)
863 std r27
,$FRAME-8*5($sp)
864 std r28
,$FRAME-8*4($sp)
865 std r29
,$FRAME-8*3($sp)
866 std r30
,$FRAME-8*2($sp)
867 std r31
,$FRAME-8*1($sp)
870 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
872 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
882 ld
$a0,64($ap) # forward load for p256_sqr_mont
889 bl __ecp_nistz256_add
# p256_mul_by_2(S, in_y);
892 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(Zsqr, in_z);
898 mr
$a0,$acc0 # put Zsqr aside for p256_sub
903 bl __ecp_nistz256_add
# p256_add(M, Zsqr, in_x);
906 mr
$acc0,$a0 # restore Zsqr
910 ld
$a0,$S+0($sp) # forward load for p256_sqr_mont
915 bl __ecp_nistz256_sub_morf
# p256_sub(Zsqr, in_x, Zsqr);
918 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(S, S);
927 bl __ecp_nistz256_mul_mont
# p256_mul_mont(tmp0, in_z, in_y);
933 ld
$a0,$S+0($sp) # forward load for p256_sqr_mont
938 bl __ecp_nistz256_add
# p256_mul_by_2(res_z, tmp0);
941 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(tmp0, S);
943 ld
$bi,$Zsqr($sp) # forward load for p256_mul_mont
949 bl __ecp_nistz256_div_by_2
# p256_div_by_2(res_y, tmp0);
953 bl __ecp_nistz256_mul_mont
# p256_mul_mont(M, M, Zsqr);
955 mr
$t0,$acc0 # duplicate M
959 mr
$a0,$acc0 # put M aside
964 bl __ecp_nistz256_add
965 mr
$t0,$a0 # restore M
969 ld
$bi,0($ap_real) # forward load for p256_mul_mont
974 bl __ecp_nistz256_add
# p256_mul_by_3(M, M);
978 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S, S, in_x);
984 ld
$a0,$M+0($sp) # forward load for p256_sqr_mont
989 bl __ecp_nistz256_add
# p256_mul_by_2(tmp0, S);
992 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(res_x, M);
995 bl __ecp_nistz256_sub_from
# p256_sub(res_x, res_x, tmp0);
999 bl __ecp_nistz256_sub_morf
# p256_sub(S, S, res_x);
1002 mr
$a0,$acc0 # copy S
1007 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S, S, M);
1009 addi
$bp,$rp_real,32
1010 addi
$rp,$rp_real,32
1011 bl __ecp_nistz256_sub_from
# p256_sub(res_y, S, res_y);
1014 ld r20
,$FRAME-8*12($sp)
1015 ld r21
,$FRAME-8*11($sp)
1016 ld r22
,$FRAME-8*10($sp)
1017 ld r23
,$FRAME-8*9($sp)
1018 ld r24
,$FRAME-8*8($sp)
1019 ld r25
,$FRAME-8*7($sp)
1020 ld r26
,$FRAME-8*6($sp)
1021 ld r27
,$FRAME-8*5($sp)
1022 ld r28
,$FRAME-8*4($sp)
1023 ld r29
,$FRAME-8*3($sp)
1024 ld r30
,$FRAME-8*2($sp)
1025 ld r31
,$FRAME-8*1($sp)
1029 .byte
0,12,4,0,0x80,12,2,0
1031 .size ecp_nistz256_point_double
,.-ecp_nistz256_point_double
1035 ########################################################################
1036 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1037 # const P256_POINT *in2);
# Stack frame for ecp_nistz256_point_add: 64-byte linkage area, twelve
# 32-byte (256-bit) scratch vectors, plus a save area for 16 callee-saved
# GPRs (r16-r31).
1039 my $FRAME = 64 + 32*12 + 16*8;
# Byte offsets (relative to $sp) of the twelve scratch vectors.
1040 my ($res_x,$res_y,$res_z,
1041 $H,$Hsqr,$R,$Rsqr,$Hcub,
1042 $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
# Z1sqr/Z2sqr are only live before Hsqr/Rsqr, so they share the same slots.
1043 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1044 # above map() describes stack layout with 12 temporary
1045 # 256-bit vectors on top.
# Non-volatile registers: real out/in1/in2 pointers, the two
# infinity-at-input masks, and a temporary, preserved across helper calls.
1046 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1049 .globl ecp_nistz256_point_add
1051 ecp_nistz256_point_add
:
1052 stdu
$sp,-$FRAME($sp)
1054 std r16
,$FRAME-8*16($sp)
1055 std r17
,$FRAME-8*15($sp)
1056 std r18
,$FRAME-8*14($sp)
1057 std r19
,$FRAME-8*13($sp)
1058 std r20
,$FRAME-8*12($sp)
1059 std r21
,$FRAME-8*11($sp)
1060 std r22
,$FRAME-8*10($sp)
1061 std r23
,$FRAME-8*9($sp)
1062 std r24
,$FRAME-8*8($sp)
1063 std r25
,$FRAME-8*7($sp)
1064 std r26
,$FRAME-8*6($sp)
1065 std r27
,$FRAME-8*5($sp)
1066 std r28
,$FRAME-8*4($sp)
1067 std r29
,$FRAME-8*3($sp)
1068 std r30
,$FRAME-8*2($sp)
1069 std r31
,$FRAME-8*1($sp)
1072 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
1074 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
1076 ld
$a0,64($bp) # in2_z
1085 or $in2infty,$t0,$t2
1087 or $in2infty,$in2infty,$t0
1088 sradi
$in2infty,$in2infty,63 # !in2infty
1090 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(Z2sqr, in2_z);
1092 ld
$a0,64($ap_real) # in1_z
1098 or $in1infty,$t0,$t2
1100 or $in1infty,$in1infty,$t0
1101 sradi
$in1infty,$in1infty,63 # !in1infty
1103 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(Z1sqr, in1_z);
1106 ld
$a0,$Z2sqr+0($sp)
1107 ld
$a1,$Z2sqr+8($sp)
1108 ld
$a2,$Z2sqr+16($sp)
1109 ld
$a3,$Z2sqr+24($sp)
1110 addi
$bp,$bp_real,64
1112 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S1, Z2sqr, in2_z);
1115 ld
$a0,$Z1sqr+0($sp)
1116 ld
$a1,$Z1sqr+8($sp)
1117 ld
$a2,$Z1sqr+16($sp)
1118 ld
$a3,$Z1sqr+24($sp)
1119 addi
$bp,$ap_real,64
1121 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S2, Z1sqr, in1_z);
1128 addi
$bp,$ap_real,32
1130 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S1, S1, in1_y);
1137 addi
$bp,$bp_real,32
1139 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S2, S2, in2_y);
1142 ld
$bi,$Z2sqr($sp) # forward load for p256_mul_mont
1148 bl __ecp_nistz256_sub_from
# p256_sub(R, S2, S1);
1150 or $acc0,$acc0,$acc1 # see if result is zero
1151 or $acc2,$acc2,$acc3
1152 or $temp,$acc0,$acc2
1156 bl __ecp_nistz256_mul_mont
# p256_mul_mont(U1, in1_x, Z2sqr);
1165 bl __ecp_nistz256_mul_mont
# p256_mul_mont(U2, in2_x, Z1sqr);
1168 ld
$a0,$R+0($sp) # forward load for p256_sqr_mont
1173 bl __ecp_nistz256_sub_from
# p256_sub(H, U2, U1);
1175 or $acc0,$acc0,$acc1 # see if result is zero
1176 or $acc2,$acc2,$acc3
1177 or. $acc0,$acc0,$acc2
1178 bne
.Ladd_proceed
# is_equal(U1,U2)?
1180 and. $t0,$in1infty,$in2infty
1181 beq
.Ladd_proceed
# (in1infty || in2infty)?
1184 beq
.Ladd_double
# is_equal(S1,S2)?
1189 std
$a0,16($rp_real)
1190 std
$a0,24($rp_real)
1191 std
$a0,32($rp_real)
1192 std
$a0,40($rp_real)
1193 std
$a0,48($rp_real)
1194 std
$a0,56($rp_real)
1195 std
$a0,64($rp_real)
1196 std
$a0,72($rp_real)
1197 std
$a0,80($rp_real)
1198 std
$a0,88($rp_real)
1203 ld
$bp,0($sp) # back-link
1206 ld r16
,$FRAME-8*16($sp)
1207 ld r17
,$FRAME-8*15($sp)
1208 ld r18
,$FRAME-8*14($sp)
1209 ld r19
,$FRAME-8*13($sp)
1210 stdu
$bp,$FRAME-288($sp) # difference in stack frame sizes
1216 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(Rsqr, R);
1223 addi
$bp,$ap_real,64
1225 bl __ecp_nistz256_mul_mont
# p256_mul_mont(res_z, H, in1_z);
1232 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(Hsqr, H);
1235 ld
$a0,$res_z+0($sp)
1236 ld
$a1,$res_z+8($sp)
1237 ld
$a2,$res_z+16($sp)
1238 ld
$a3,$res_z+24($sp)
1239 addi
$bp,$bp_real,64
1241 bl __ecp_nistz256_mul_mont
# p256_mul_mont(res_z, res_z, in2_z);
1246 ld
$a2,$Hsqr+16($sp)
1247 ld
$a3,$Hsqr+24($sp)
1250 bl __ecp_nistz256_mul_mont
# p256_mul_mont(Hcub, Hsqr, H);
1259 bl __ecp_nistz256_mul_mont
# p256_mul_mont(U2, U1, Hsqr);
1266 bl __ecp_nistz256_add
# p256_mul_by_2(Hsqr, U2);
1270 bl __ecp_nistz256_sub_morf
# p256_sub(res_x, Rsqr, Hsqr);
1273 bl __ecp_nistz256_sub_from
# p256_sub(res_x, res_x, Hcub);
1276 ld
$bi,$Hcub($sp) # forward load for p256_mul_mont
1282 bl __ecp_nistz256_sub_morf
# p256_sub(res_y, U2, res_x);
1286 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S2, S1, Hcub);
1289 ld
$a0,$res_y+0($sp)
1290 ld
$a1,$res_y+8($sp)
1291 ld
$a2,$res_y+16($sp)
1292 ld
$a3,$res_y+24($sp)
1295 bl __ecp_nistz256_mul_mont
# p256_mul_mont(res_y, res_y, R);
1298 bl __ecp_nistz256_sub_from
# p256_sub(res_y, res_y, S2);
1300 ld
$t0,0($bp_real) # in2
1304 ld
$a0,$res_x+0($sp) # res
1305 ld
$a1,$res_x+8($sp)
1306 ld
$a2,$res_x+16($sp)
1307 ld
$a3,$res_x+24($sp)
1309 for($i=0;$i<64;$i+=32) { # conditional moves
1311 ld
$acc0,$i+0($ap_real) # in1
1312 ld
$acc1,$i+8($ap_real)
1313 ld
$acc2,$i+16($ap_real)
1314 ld
$acc3,$i+24($ap_real)
1315 andc
$t0,$t0,$in1infty
1316 andc
$t1,$t1,$in1infty
1317 andc
$t2,$t2,$in1infty
1318 andc
$t3,$t3,$in1infty
1319 and $a0,$a0,$in1infty
1320 and $a1,$a1,$in1infty
1321 and $a2,$a2,$in1infty
1322 and $a3,$a3,$in1infty
1327 andc
$acc0,$acc0,$in2infty
1328 andc
$acc1,$acc1,$in2infty
1329 andc
$acc2,$acc2,$in2infty
1330 andc
$acc3,$acc3,$in2infty
1331 and $t0,$t0,$in2infty
1332 and $t1,$t1,$in2infty
1333 and $t2,$t2,$in2infty
1334 and $t3,$t3,$in2infty
1340 ld
$t0,$i+32($bp_real) # in2
1341 ld
$t1,$i+40($bp_real)
1342 ld
$t2,$i+48($bp_real)
1343 ld
$t3,$i+56($bp_real)
1344 ld
$a0,$res_x+$i+32($sp)
1345 ld
$a1,$res_x+$i+40($sp)
1346 ld
$a2,$res_x+$i+48($sp)
1347 ld
$a3,$res_x+$i+56($sp)
1348 std
$acc0,$i+0($rp_real)
1349 std
$acc1,$i+8($rp_real)
1350 std
$acc2,$i+16($rp_real)
1351 std
$acc3,$i+24($rp_real)
1355 ld
$acc0,$i+0($ap_real) # in1
1356 ld
$acc1,$i+8($ap_real)
1357 ld
$acc2,$i+16($ap_real)
1358 ld
$acc3,$i+24($ap_real)
1359 andc
$t0,$t0,$in1infty
1360 andc
$t1,$t1,$in1infty
1361 andc
$t2,$t2,$in1infty
1362 andc
$t3,$t3,$in1infty
1363 and $a0,$a0,$in1infty
1364 and $a1,$a1,$in1infty
1365 and $a2,$a2,$in1infty
1366 and $a3,$a3,$in1infty
1371 andc
$acc0,$acc0,$in2infty
1372 andc
$acc1,$acc1,$in2infty
1373 andc
$acc2,$acc2,$in2infty
1374 andc
$acc3,$acc3,$in2infty
1375 and $t0,$t0,$in2infty
1376 and $t1,$t1,$in2infty
1377 and $t2,$t2,$in2infty
1378 and $t3,$t3,$in2infty
1383 std
$acc0,$i+0($rp_real)
1384 std
$acc1,$i+8($rp_real)
1385 std
$acc2,$i+16($rp_real)
1386 std
$acc3,$i+24($rp_real)
1390 ld r16
,$FRAME-8*16($sp)
1391 ld r17
,$FRAME-8*15($sp)
1392 ld r18
,$FRAME-8*14($sp)
1393 ld r19
,$FRAME-8*13($sp)
1394 ld r20
,$FRAME-8*12($sp)
1395 ld r21
,$FRAME-8*11($sp)
1396 ld r22
,$FRAME-8*10($sp)
1397 ld r23
,$FRAME-8*9($sp)
1398 ld r24
,$FRAME-8*8($sp)
1399 ld r25
,$FRAME-8*7($sp)
1400 ld r26
,$FRAME-8*6($sp)
1401 ld r27
,$FRAME-8*5($sp)
1402 ld r28
,$FRAME-8*4($sp)
1403 ld r29
,$FRAME-8*3($sp)
1404 ld r30
,$FRAME-8*2($sp)
1405 ld r31
,$FRAME-8*1($sp)
1409 .byte
0,12,4,0,0x80,16,3,0
1411 .size ecp_nistz256_point_add
,.-ecp_nistz256_point_add
1415 ########################################################################
1416 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1417 # const P256_POINT_AFFINE *in2);
# Stack frame for ecp_nistz256_point_add_affine: 64-byte linkage area, ten
# 32-byte (256-bit) scratch vectors, plus a save area for 16 callee-saved
# GPRs (r16-r31).
1419 my $FRAME = 64 + 32*10 + 16*8;
# Byte offsets (relative to $sp) of the ten scratch vectors.
1420 my ($res_x,$res_y,$res_z,
1421 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
1423 # above map() describes stack layout with 10 temporary
1424 # 256-bit vectors on top.
# Non-volatile registers: real out/in1/in2 pointers, the two
# infinity-at-input masks, and a temporary, preserved across helper calls.
1425 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1428 .globl ecp_nistz256_point_add_affine
1430 ecp_nistz256_point_add_affine
:
1431 stdu
$sp,-$FRAME($sp)
1433 std r16
,$FRAME-8*16($sp)
1434 std r17
,$FRAME-8*15($sp)
1435 std r18
,$FRAME-8*14($sp)
1436 std r19
,$FRAME-8*13($sp)
1437 std r20
,$FRAME-8*12($sp)
1438 std r21
,$FRAME-8*11($sp)
1439 std r22
,$FRAME-8*10($sp)
1440 std r23
,$FRAME-8*9($sp)
1441 std r24
,$FRAME-8*8($sp)
1442 std r25
,$FRAME-8*7($sp)
1443 std r26
,$FRAME-8*6($sp)
1444 std r27
,$FRAME-8*5($sp)
1445 std r28
,$FRAME-8*4($sp)
1446 std r29
,$FRAME-8*3($sp)
1447 std r30
,$FRAME-8*2($sp)
1448 std r31
,$FRAME-8*1($sp)
1451 srdi
$poly1,$poly1,32 # 0x00000000ffffffff
1453 orc
$poly3,$poly3,$poly1 # 0xffffffff00000001
1459 ld
$a0,64($ap) # in1_z
1465 or $in1infty,$t0,$t2
1467 or $in1infty,$in1infty,$t0
1468 sradi
$in1infty,$in1infty,63 # !in1infty
1470 ld
$acc0,0($bp) # in2_x
1474 ld
$t0,32($bp) # in2_y
1478 or $acc0,$acc0,$acc1
1479 or $acc2,$acc2,$acc3
1480 or $acc0,$acc0,$acc2
1484 or $in2infty,$acc0,$t0
1486 or $in2infty,$in2infty,$t0
1487 sradi
$in2infty,$in2infty,63 # !in2infty
1490 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(Z1sqr, in1_z);
1499 bl __ecp_nistz256_mul_mont
# p256_mul_mont(U2, Z1sqr, in2_x);
1502 ld
$bi,64($ap_real) # forward load for p256_mul_mont
1503 ld
$a0,$Z1sqr+0($sp)
1504 ld
$a1,$Z1sqr+8($sp)
1505 ld
$a2,$Z1sqr+16($sp)
1506 ld
$a3,$Z1sqr+24($sp)
1508 bl __ecp_nistz256_sub_from
# p256_sub(H, U2, in1_x);
1510 addi
$bp,$ap_real,64
1512 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S2, Z1sqr, in1_z);
1519 addi
$bp,$ap_real,64
1521 bl __ecp_nistz256_mul_mont
# p256_mul_mont(res_z, H, in1_z);
1528 addi
$bp,$bp_real,32
1530 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S2, S2, in2_y);
1532 addi
$bp,$ap_real,32
1533 ld
$a0,$H+0($sp) # forward load for p256_sqr_mont
1538 bl __ecp_nistz256_sub_from
# p256_sub(R, S2, in1_y);
1541 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(Hsqr, H);
1548 bl __ecp_nistz256_sqr_mont
# p256_sqr_mont(Rsqr, R);
1553 ld
$a2,$Hsqr+16($sp)
1554 ld
$a3,$Hsqr+24($sp)
1557 bl __ecp_nistz256_mul_mont
# p256_mul_mont(Hcub, Hsqr, H);
1562 ld
$a2,$Hsqr+16($sp)
1563 ld
$a3,$Hsqr+24($sp)
1566 bl __ecp_nistz256_mul_mont
# p256_mul_mont(U2, in1_x, Hsqr);
1573 bl __ecp_nistz256_add
# p256_mul_by_2(Hsqr, U2);
1577 bl __ecp_nistz256_sub_morf
# p256_sub(res_x, Rsqr, Hsqr);
1580 bl __ecp_nistz256_sub_from
# p256_sub(res_x, res_x, Hcub);
1583 ld
$bi,32($ap_real) # forward load for p256_mul_mont
1586 ld
$a2,$Hcub+16($sp)
1587 ld
$a3,$Hcub+24($sp)
1589 bl __ecp_nistz256_sub_morf
# p256_sub(res_y, U2, res_x);
1591 addi
$bp,$ap_real,32
1593 bl __ecp_nistz256_mul_mont
# p256_mul_mont(S2, in1_y, Hcub);
1596 ld
$a0,$res_y+0($sp)
1597 ld
$a1,$res_y+8($sp)
1598 ld
$a2,$res_y+16($sp)
1599 ld
$a3,$res_y+24($sp)
1602 bl __ecp_nistz256_mul_mont
# p256_mul_mont(res_y, res_y, R);
1605 bl __ecp_nistz256_sub_from
# p256_sub(res_y, res_y, S2);
1607 ld
$t0,0($bp_real) # in2
1611 ld
$a0,$res_x+0($sp) # res
1612 ld
$a1,$res_x+8($sp)
1613 ld
$a2,$res_x+16($sp)
1614 ld
$a3,$res_x+24($sp)
1616 for($i=0;$i<64;$i+=32) { # conditional moves
1618 ld
$acc0,$i+0($ap_real) # in1
1619 ld
$acc1,$i+8($ap_real)
1620 ld
$acc2,$i+16($ap_real)
1621 ld
$acc3,$i+24($ap_real)
1622 andc
$t0,$t0,$in1infty
1623 andc
$t1,$t1,$in1infty
1624 andc
$t2,$t2,$in1infty
1625 andc
$t3,$t3,$in1infty
1626 and $a0,$a0,$in1infty
1627 and $a1,$a1,$in1infty
1628 and $a2,$a2,$in1infty
1629 and $a3,$a3,$in1infty
1634 andc
$acc0,$acc0,$in2infty
1635 andc
$acc1,$acc1,$in2infty
1636 andc
$acc2,$acc2,$in2infty
1637 andc
$acc3,$acc3,$in2infty
1638 and $t0,$t0,$in2infty
1639 and $t1,$t1,$in2infty
1640 and $t2,$t2,$in2infty
1641 and $t3,$t3,$in2infty
1647 $code.=<<___
if ($i==0);
1648 ld
$t0,32($bp_real) # in2
1653 $code.=<<___
if ($i==32);
1654 li
$t0,1 # Lone_mont
1660 ld
$a0,$res_x+$i+32($sp)
1661 ld
$a1,$res_x+$i+40($sp)
1662 ld
$a2,$res_x+$i+48($sp)
1663 ld
$a3,$res_x+$i+56($sp)
1664 std
$acc0,$i+0($rp_real)
1665 std
$acc1,$i+8($rp_real)
1666 std
$acc2,$i+16($rp_real)
1667 std
$acc3,$i+24($rp_real)
1671 ld
$acc0,$i+0($ap_real) # in1
1672 ld
$acc1,$i+8($ap_real)
1673 ld
$acc2,$i+16($ap_real)
1674 ld
$acc3,$i+24($ap_real)
1675 andc
$t0,$t0,$in1infty
1676 andc
$t1,$t1,$in1infty
1677 andc
$t2,$t2,$in1infty
1678 andc
$t3,$t3,$in1infty
1679 and $a0,$a0,$in1infty
1680 and $a1,$a1,$in1infty
1681 and $a2,$a2,$in1infty
1682 and $a3,$a3,$in1infty
1687 andc
$acc0,$acc0,$in2infty
1688 andc
$acc1,$acc1,$in2infty
1689 andc
$acc2,$acc2,$in2infty
1690 andc
$acc3,$acc3,$in2infty
1691 and $t0,$t0,$in2infty
1692 and $t1,$t1,$in2infty
1693 and $t2,$t2,$in2infty
1694 and $t3,$t3,$in2infty
1699 std
$acc0,$i+0($rp_real)
1700 std
$acc1,$i+8($rp_real)
1701 std
$acc2,$i+16($rp_real)
1702 std
$acc3,$i+24($rp_real)
1705 ld r16
,$FRAME-8*16($sp)
1706 ld r17
,$FRAME-8*15($sp)
1707 ld r18
,$FRAME-8*14($sp)
1708 ld r19
,$FRAME-8*13($sp)
1709 ld r20
,$FRAME-8*12($sp)
1710 ld r21
,$FRAME-8*11($sp)
1711 ld r22
,$FRAME-8*10($sp)
1712 ld r23
,$FRAME-8*9($sp)
1713 ld r24
,$FRAME-8*8($sp)
1714 ld r25
,$FRAME-8*7($sp)
1715 ld r26
,$FRAME-8*6($sp)
1716 ld r27
,$FRAME-8*5($sp)
1717 ld r28
,$FRAME-8*4($sp)
1718 ld r29
,$FRAME-8*3($sp)
1719 ld r30
,$FRAME-8*2($sp)
1720 ld r31
,$FRAME-8*1($sp)
1724 .byte
0,12,4,0,0x80,16,3,0
1726 .size ecp_nistz256_point_add_affine
,.-ecp_nistz256_point_add_affine
# Registers for the ecp_nistz256_ord_* (group-order arithmetic) routines:
# $ordk holds the Montgomery constant, $ord0/$ord1 the low words of the
# group order, $t4 an extra temporary.
1730 my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
# The high order words are all-ones patterns, so the modulus registers
# $poly1/$poly3 are reused for them; r0 serves as a handy zero.
1731 my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
1734 ########################################################################
1735 # void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1737 .globl ecp_nistz256_ord_mul_mont
1739 ecp_nistz256_ord_mul_mont
:
1765 ori
$ordk,$ordk,0xc8aa
1766 ori
$ord0,$ord0,0xcac2
1767 ori
$ord1,$ord1,0xfaad
1771 oris
$ordk,$ordk,0xee00
1772 oris
$ord0,$ord0,0xfc63
1773 oris
$ord1,$ord1,0xa717
1774 ori
$ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
1775 ori
$ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
1776 ori
$ord1,$ord1,0x9e84 # 0xbce6faada7179e84
1777 li
$ord2,-1 # 0xffffffffffffffff
1778 sldi
$ord3,$ord2,32 # 0xffffffff00000000
1781 mulld
$acc0,$a0,$bi # a[0]*b[0]
1784 mulld
$acc1,$a1,$bi # a[1]*b[0]
1787 mulld
$acc2,$a2,$bi # a[2]*b[0]
1790 mulld
$acc3,$a3,$bi # a[3]*b[0]
1791 mulhdu
$acc4,$a3,$bi
1793 mulld
$t4,$acc0,$ordk
1795 addc
$acc1,$acc1,$t0 # accumulate high parts of multiplication
1796 adde
$acc2,$acc2,$t1
1797 adde
$acc3,$acc3,$t2
1801 for ($i=1;$i<4;$i++) {
1802 ################################################################
1803 # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1805 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1807 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1810 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1811 # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1812 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1814 ld
$bi,8*$i($bp) # b[i]
1817 subfc
$acc2,$t4,$acc2
1819 subfe
$acc3,$t0,$acc3
1820 subfe
$acc4,$t1,$acc4
1821 subfe
$acc5,$zr,$acc5
1823 addic
$t0,$acc0,-1 # discarded
1824 mulhdu
$t1,$ord0,$t4
1826 mulhdu
$t3,$ord1,$t4
1833 addc
$acc0,$acc1,$t2
1835 adde
$acc1,$acc2,$t3
1837 adde
$acc2,$acc3,$t4
1838 adde
$acc3,$acc4,$t4
1841 addc
$acc0,$acc0,$t0 # accumulate low parts
1843 adde
$acc1,$acc1,$t1
1845 adde
$acc2,$acc2,$t2
1847 adde
$acc3,$acc3,$t3
1850 mulld
$t4,$acc0,$ordk
1851 addc
$acc1,$acc1,$t0 # accumulate high parts
1852 adde
$acc2,$acc2,$t1
1853 adde
$acc3,$acc3,$t2
1854 adde
$acc4,$acc4,$t3
1859 sldi
$t0,$t4,32 # last reduction
1860 subfc
$acc2,$t4,$acc2
1862 subfe
$acc3,$t0,$acc3
1863 subfe
$acc4,$t1,$acc4
1864 subfe
$acc5,$zr,$acc5
1866 addic
$t0,$acc0,-1 # discarded
1867 mulhdu
$t1,$ord0,$t4
1869 mulhdu
$t3,$ord1,$t4
1874 addc
$acc0,$acc1,$t2
1875 adde
$acc1,$acc2,$t3
1876 adde
$acc2,$acc3,$t4
1877 adde
$acc3,$acc4,$t4
1880 subfc
$acc0,$ord0,$acc0 # ret -= modulus
1881 subfe
$acc1,$ord1,$acc1
1882 subfe
$acc2,$ord2,$acc2
1883 subfe
$acc3,$ord3,$acc3
1884 subfe
$acc4,$zr,$acc4
1888 addc
$acc0,$acc0,$t0 # ret += modulus if borrow
1890 adde
$acc1,$acc1,$t1
1891 adde
$acc2,$acc2,$acc4
1892 adde
$acc3,$acc3,$t3
1916 .byte
0,12,4,0,0x80,14,3,0
1918 .size ecp_nistz256_ord_mul_mont
,.-ecp_nistz256_ord_mul_mont
1920 ################################################################################
1921 # void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1923 .globl ecp_nistz256_ord_sqr_mont
1925 ecp_nistz256_ord_sqr_mont
:
# Materialize the Montgomery constant ordk = 0xccd1c8aaee00bc4f and the
# group order ord0..ord3 = 0xf3b9cac2fc632551 / 0xbce6faada7179e84 /
# 0xffffffffffffffff / 0xffffffff00000000 piecewise with ori/oris
# (PPC immediates are 16-bit).  NOTE(review): the initial lis/prologue
# lines are elided from this excerpt.
1952 ori
$ordk,$ordk,0xc8aa
1953 ori
$ord0,$ord0,0xcac2
1954 ori
$ord1,$ord1,0xfaad
1958 oris
$ordk,$ordk,0xee00
1959 oris
$ord0,$ord0,0xfc63
1960 oris
$ord1,$ord1,0xa717
1961 ori
$ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
1962 ori
$ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
1963 ori
$ord1,$ord1,0x9e84 # 0xbce6faada7179e84
1964 li
$ord2,-1 # 0xffffffffffffffff
1965 sldi
$ord3,$ord2,32 # 0xffffffff00000000
1971 ################################################################
1972 # | | | | | |a1*a0| |
1973 # | | | | |a2*a0| | |
1974 # | |a3*a2|a3*a0| | | |
1975 # | | | |a2*a1| | | |
1976 # | | |a3*a1| | | | |
1977 # *| | | | | | | | 2|
1978 # +|a3*a3|a2*a2|a1*a1|a0*a0|
1979 # |--+--+--+--+--+--+--+--|
1980 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1982 # "can't overflow" below mark carrying into high part of
1983 # multiplication result, which can't overflow, because it
1984 # can never be all ones.
# Cross products a[i]*a[j] for i!=j; low halves go straight into the
# accumulators, high halves (mulhdu, elided in places) are folded in via
# the carry chain.
1986 mulld
$acc1,$a1,$a0 # a[1]*a[0]
1988 mulld
$acc2,$a2,$a0 # a[2]*a[0]
1990 mulld
$acc3,$a3,$a0 # a[3]*a[0]
1991 mulhdu
$acc4,$a3,$a0
1993 addc
$acc2,$acc2,$t1 # accumulate high parts of multiplication
1994 mulld
$t0,$a2,$a1 # a[2]*a[1]
1996 adde
$acc3,$acc3,$t2
1997 mulld
$t2,$a3,$a1 # a[3]*a[1]
1999 addze
$acc4,$acc4 # can't overflow
2001 mulld
$acc5,$a3,$a2 # a[3]*a[2]
2002 mulhdu
$acc6,$a3,$a2
2004 addc
$t1,$t1,$t2 # accumulate high parts of multiplication
2005 mulld
$acc0,$a0,$a0 # a[0]*a[0]
2006 addze
$t2,$t3 # can't overflow
2008 addc
$acc3,$acc3,$t0 # accumulate low parts of multiplication
2010 adde
$acc4,$acc4,$t1
2011 mulld
$t1,$a1,$a1 # a[1]*a[1]
2012 adde
$acc5,$acc5,$t2
2014 addze
$acc6,$acc6 # can't overflow
# Double the cross products (the *2 row in the diagram above).
2016 addc
$acc1,$acc1,$acc1 # acc[1-6]*=2
2017 mulld
$t2,$a2,$a2 # a[2]*a[2]
2018 adde
$acc2,$acc2,$acc2
2020 adde
$acc3,$acc3,$acc3
2021 mulld
$t3,$a3,$a3 # a[3]*a[3]
2022 adde
$acc4,$acc4,$acc4
2024 adde
$acc5,$acc5,$acc5
2025 adde
$acc6,$acc6,$acc6
# Add the squares a[i]*a[i] on the diagonal; t4 starts the first
# Montgomery reduction word in parallel.
2028 addc
$acc1,$acc1,$a0 # +a[i]*a[i]
2029 mulld
$t4,$acc0,$ordk
2030 adde
$acc2,$acc2,$t1
2031 adde
$acc3,$acc3,$a1
2032 adde
$acc4,$acc4,$t2
2033 adde
$acc5,$acc5,$a2
2034 adde
$acc6,$acc6,$t3
2035 adde
$acc7,$acc7,$a3
# Perl-side loop: emit four reduction rounds; the heredoc below is the
# round body, specialized on the last iteration (i == 3) which skips the
# next-round mulld.
2037 for($i=0; $i<4; $i++) { # reductions
2039 addic
$t0,$acc0,-1 # discarded, sets carry only
2040 mulhdu
$t1,$ord0,$t4
2042 mulhdu
$t3,$ord1,$t4
2047 addc
$acc0,$acc1,$t2
2048 adde
$acc1,$acc2,$t3
2049 adde
$acc2,$acc3,$t4
2050 adde
$acc3,$zr,$t4 # can't overflow
2052 $code.=<<___
if ($i<3);
2053 mulld
$t3,$acc0,$ordk
2057 subfc
$acc1,$t4,$acc1
2059 subfe
$acc2,$t0,$acc2
2060 subfe
$acc3,$t1,$acc3 # can't borrow
# Rotate the two scratch registers so the reduction word computed above
# becomes the input of the next round.
2062 ($t3,$t4) = ($t4,$t3);
2065 addc
$acc0,$acc0,$acc4 # accumulate upper half
2066 adde
$acc1,$acc1,$acc5
2067 adde
$acc2,$acc2,$acc6
2068 adde
$acc3,$acc3,$acc7
# Constant-time final subtraction, same pattern as ord_mul_mont above:
# subtract the order, record the borrow, then add back masked by it.
2071 subfc
$acc0,$ord0,$acc0 # ret -= modulus
2072 subfe
$acc1,$ord1,$acc1
2073 subfe
$acc2,$ord2,$acc2
2074 subfe
$acc3,$ord3,$acc3
2075 subfe
$acc4,$zr,$acc4
2079 addc
$a0,$acc0,$t0 # ret += modulus if borrow
2082 adde
$a2,$acc2,$acc4
# NOTE(review): the store-back of the result and the repeat-count loop
# (lines 2084-2108 of the original) are elided from this excerpt.
2109 .byte
0,12,4,0,0x80,14,3,0
2111 .size ecp_nistz256_ord_sqr_mont
,.-ecp_nistz256_ord_sqr_mont
2115 ########################################################################
2116 # scatter-gather subroutines
# Register aliases for the scatter/gather routines below; these follow
# the ELF argument registers (r3 = out, r4 = inp, r5 = index, ...).
2118 my ($out,$inp,$index,$mask)=map("r$_",(3..7));
2120 ########################################################################
2121 # void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
2123 .globl ecp_nistz256_scatter_w5
2125 ecp_nistz256_scatter_w5
:
# out += index*4: each table entry is interleaved in 32-bit columns with
# a stride of 64 bytes, so consecutive words of one point land 64 bytes
# apart (see the 64*n-4 offsets below).
2126 slwi
$index,$index,2
2127 add
$out,$out,$index
# Scatter the X coordinate (8 x 32-bit words).  NOTE(review): the loads
# and the word-splitting arithmetic between the stores are elided from
# this excerpt; only the store skeleton is visible.
2134 stw r8
, 64*0-4($out)
2136 stw r9
, 64*1-4($out)
2138 stw r10
,64*2-4($out)
2140 stw r11
,64*3-4($out)
2142 stw r8
, 64*4-4($out)
2143 stw r9
, 64*5-4($out)
2144 stw r10
,64*6-4($out)
2145 stw r11
,64*7-4($out)
# Same store pattern for the Y coordinate.
2153 stw r8
, 64*0-4($out)
2155 stw r9
, 64*1-4($out)
2157 stw r10
,64*2-4($out)
2159 stw r11
,64*3-4($out)
2161 stw r8
, 64*4-4($out)
2162 stw r9
, 64*5-4($out)
2163 stw r10
,64*6-4($out)
2164 stw r11
,64*7-4($out)
# Same store pattern for the Z coordinate.
2172 stw r8
, 64*0-4($out)
2174 stw r9
, 64*1-4($out)
2176 stw r10
,64*2-4($out)
2178 stw r11
,64*3-4($out)
2180 stw r8
, 64*4-4($out)
2181 stw r9
, 64*5-4($out)
2182 stw r10
,64*6-4($out)
2183 stw r11
,64*7-4($out)
# Traceback tag, then symbol size.
2187 .byte
0,12,0x14,0,0,0,3,0
2189 .size ecp_nistz256_scatter_w5
,.-ecp_nistz256_scatter_w5
2191 ########################################################################
2192 # void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
2194 .globl ecp_nistz256_gather_w5
2196 ecp_nistz256_gather_w5
:
# Compute the source offset inp += (index+adjust)*4.  NOTE(review): the
# preceding code that derives r0 (and the all-or-nothing mask used to
# make the gather constant-time, lines 2197-2199 and 2203-2282 of the
# original) is elided from this excerpt.
2200 add
$index,$index,r0
2201 slwi
$index,$index,2
2202 add
$inp,$inp,$index
# Traceback tag, then symbol size.
2283 .byte
0,12,0x14,0,0,0,3,0
2285 .size ecp_nistz256_gather_w5
,.-ecp_nistz256_gather_w5
2287 ########################################################################
2288 # void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
2290 .globl ecp_nistz256_scatter_w7
2292 ecp_nistz256_scatter_w7
:
# NOTE(review): the index scaling and the loop setup/body (lines
# 2293-2315 of the original) are elided; only the base-pointer add and
# the loop close are visible here.
2295 add
$out,$out,$index
2316 bdnz
.Loop_scatter_w7
# Traceback tag, then symbol size.
2320 .byte
0,12,0x14,0,0,0,3,0
2322 .size ecp_nistz256_scatter_w7
,.-ecp_nistz256_scatter_w7
2324 ########################################################################
2325 # void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
2327 .globl ecp_nistz256_gather_w7
2329 ecp_nistz256_gather_w7
:
# inp += index + r0 adjustment.  NOTE(review): the mask derivation that
# keeps this gather constant-time and the copy loop body (lines
# 2330-2334 and 2337-2366 of the original) are elided from this excerpt.
2335 add
$index,$index,r0
2336 add
$inp,$inp,$index
2367 bdnz
.Loop_gather_w7
# Traceback tag, then symbol size.
2371 .byte
0,12,0x14,0,0,0,3,0
2373 .size ecp_nistz256_gather_w7
,.-ecp_nistz256_gather_w7
# Post-process the accumulated assembly: expand backtick-quoted Perl
# expressions (compile-time constant folding, per perlasm convention)
# before emitting each line to the xlate pipe opened at the top of the
# file.  NOTE(review): the print statement and the closing brace of this
# loop (lines 2379-2381 of the original) are elided from this excerpt.
2377 foreach (split("\n",$code)) {
2378 s/\`([^\`]*)\`/eval $1/ge;
# Closing STDOUT (the pipe to ppc-xlate.pl) forces the buffered output
# to be flushed before the interpreter exits.
2382 close STDOUT
; # enforce flush