2 # Copyright 2015-2019 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # ECP_NISTZ256 module for ARMv8.
21 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22 # http://eprint.iacr.org/2013/816.
24 # with/without -DECP_NISTZ256_ASM
26 # Cortex-A53 +190-400%
27 # Cortex-A57 +190-350%
30 # Ranges denote minimum and maximum improvement coefficients depending
31 # on benchmark. Lower coefficients are for ECDSA sign, server-side
32 # operation. Keep in mind that +400% means 5x improvement.
# Shift command-line arguments into $output until one looks like a file
# name with an extension, or until the argument list is exhausted.
35 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Capture the directory portion of this script's own path ($0) in $dir.
37 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
# Locate the arm-xlate.pl translator: first beside this script, then in
# ../../perlasm/ relative to it. Abort if neither candidate is a file.
38 ( $xlate="${dir}arm-xlate.pl" and -f
$xlate ) or
39 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f
$xlate) or
40 die "can't locate arm-xlate.pl";
# Open OUT as a pipe into arm-xlate.pl, run under the current perl ($^X)
# with the flavour and output file name as its arguments.
# NOTE(review): $flavour is not assigned in the lines visible here, and the
# result of this open is not checked in the visible lines -- confirm both.
42 open OUT
,"| \"$^X\" $xlate $flavour $output";
# Symbolic names for the general-purpose registers used by the generated
# code. The map() below yields x0..x17, x19, x20, assigned in order:
#   $rp=x0  $ap=x1  $bp=x2  $bi=x3
#   $a0..$a3=x4..x7      $t0..$t3=x8..x11
#   $poly1=x12  $poly3=x13
#   $acc0..$acc3=x14..x17  $acc4=x19  $acc5=x20
# x18 is not in the list (presumably because some ABIs reserve it as the
# platform register -- confirm).
46 my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
47 $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
48 map("x$_",(0..17,19,20));
# $acc6/$acc7 are not new registers: they reuse $ap/$bp (x1/x2), so code
# that writes them overwrites those pointer registers.
50 my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont
57 ########################################################################
58 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz256_gather_w7
# Capture the directory portion of this script's own path ($0) in $dir.
60 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
# Open ecp_nistz256_table.c from the current directory, falling back to the
# parent of the script's directory; die with the system error if both fail.
61 open TABLE
,"<ecp_nistz256_table.c" or
62 open TABLE
,"<${dir}../ecp_nistz256_table.c" or
63 die "failed to open ecp_nistz256_table.c:",$!;
# For every TOBN(hi,lo) occurrence in $_, push the numeric value of the
# second argument and then of the first onto @arr (the /e modifier runs the
# replacement as code). NOTE(review): the loop that supplies $_ is not in
# the lines visible here -- presumably it iterates over TABLE; confirm.
68 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
72 # See ecp_nistz256_table.c for explanation for why it's 64*16*37.
73 # 64*16*37-1 is because $#arr returns last valid index of @arr, not
# Sanity check: refuse to continue unless exactly 64*16*37 values were read.
75 die "insane number of elements" if ($#arr != 64*16*37-1);
78 .globl ecp_nistz256_precomputed
79 .type ecp_nistz256_precomputed
,%object
81 ecp_nistz256_precomputed
:
83 ########################################################################
84 # this conversion smashes P256_POINT_AFFINE by individual bytes with
85 # 64 byte interval, similar to
# Detach the next 64*16 values from the front of @arr into @tbl.
89 @tbl = splice(@arr,0,64*16);
# $i selects one byte position: value number $i/4 within a 16-value group,
# and byte ($i%4) of that value, counted from the least significant byte.
90 for($i=0;$i<64;$i++) {
# $j walks the 64 groups of 16 values held in @tbl.
92 for($j=0;$j<64;$j++) {
# Collect the selected byte of each group. The array subscript is used as
# an integer, so a fractional $i/4 is truncated.
93 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
# Append the collected bytes to the generated output as comma-separated
# 0x%02x literals. NOTE(review): the reset of @line and the closing braces
# are not in the lines visible here.
96 $code.=join(',',map { sprintf "0x%02x",$_} @line);
101 .size ecp_nistz256_precomputed
,.-ecp_nistz256_precomputed
104 .quad
0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
105 .LRR
: // 2^512 mod P precomputed
for NIST P256 polynomial
106 .quad
0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
108 .quad
0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
112 .quad
0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
114 .quad
0xccd1c8aaee00bc4f
115 .asciz
"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
117 // void ecp_nistz256_to_mont
(BN_ULONG x0
[4],const BN_ULONG x1
[4]);
118 .globl ecp_nistz256_to_mont
119 .type ecp_nistz256_to_mont
,%function
121 ecp_nistz256_to_mont
:
122 .inst
0xd503233f // paciasp
123 stp x29
,x30
,[sp
,#-32]!
127 ldr
$bi,.LRR
// bp
[0]
129 ldp
$a2,$a3,[$ap,#16]
132 adr
$bp,.LRR
// &bp
[0]
134 bl __ecp_nistz256_mul_mont
138 .inst
0xd50323bf // autiasp
140 .size ecp_nistz256_to_mont
,.-ecp_nistz256_to_mont
142 // void ecp_nistz256_from_mont
(BN_ULONG x0
[4],const BN_ULONG x1
[4]);
143 .globl ecp_nistz256_from_mont
144 .type ecp_nistz256_from_mont
,%function
146 ecp_nistz256_from_mont
:
147 .inst
0xd503233f // paciasp
148 stp x29
,x30
,[sp
,#-32]!
154 ldp
$a2,$a3,[$ap,#16]
157 adr
$bp,.Lone
// &bp
[0]
159 bl __ecp_nistz256_mul_mont
163 .inst
0xd50323bf // autiasp
165 .size ecp_nistz256_from_mont
,.-ecp_nistz256_from_mont
167 // void ecp_nistz256_mul_mont
(BN_ULONG x0
[4],const BN_ULONG x1
[4],
168 // const BN_ULONG x2
[4]);
169 .globl ecp_nistz256_mul_mont
170 .type ecp_nistz256_mul_mont
,%function
172 ecp_nistz256_mul_mont
:
173 .inst
0xd503233f // paciasp
174 stp x29
,x30
,[sp
,#-32]!
178 ldr
$bi,[$bp] // bp
[0]
180 ldp
$a2,$a3,[$ap,#16]
184 bl __ecp_nistz256_mul_mont
188 .inst
0xd50323bf // autiasp
190 .size ecp_nistz256_mul_mont
,.-ecp_nistz256_mul_mont
192 // void ecp_nistz256_sqr_mont
(BN_ULONG x0
[4],const BN_ULONG x1
[4]);
193 .globl ecp_nistz256_sqr_mont
194 .type ecp_nistz256_sqr_mont
,%function
196 ecp_nistz256_sqr_mont
:
197 .inst
0xd503233f // paciasp
198 stp x29
,x30
,[sp
,#-32]!
203 ldp
$a2,$a3,[$ap,#16]
207 bl __ecp_nistz256_sqr_mont
211 .inst
0xd50323bf // autiasp
213 .size ecp_nistz256_sqr_mont
,.-ecp_nistz256_sqr_mont
215 // void ecp_nistz256_add
(BN_ULONG x0
[4],const BN_ULONG x1
[4],
216 // const BN_ULONG x2
[4]);
217 .globl ecp_nistz256_add
218 .type ecp_nistz256_add
,%function
221 .inst
0xd503233f // paciasp
222 stp x29
,x30
,[sp
,#-16]!
225 ldp
$acc0,$acc1,[$ap]
227 ldp
$acc2,$acc3,[$ap,#16]
228 ldp
$t2,$t3,[$bp,#16]
232 bl __ecp_nistz256_add
235 .inst
0xd50323bf // autiasp
237 .size ecp_nistz256_add
,.-ecp_nistz256_add
239 // void ecp_nistz256_div_by_2
(BN_ULONG x0
[4],const BN_ULONG x1
[4]);
240 .globl ecp_nistz256_div_by_2
241 .type ecp_nistz256_div_by_2
,%function
243 ecp_nistz256_div_by_2
:
244 .inst
0xd503233f // paciasp
245 stp x29
,x30
,[sp
,#-16]!
248 ldp
$acc0,$acc1,[$ap]
249 ldp
$acc2,$acc3,[$ap,#16]
253 bl __ecp_nistz256_div_by_2
256 .inst
0xd50323bf // autiasp
258 .size ecp_nistz256_div_by_2
,.-ecp_nistz256_div_by_2
260 // void ecp_nistz256_mul_by_2
(BN_ULONG x0
[4],const BN_ULONG x1
[4]);
261 .globl ecp_nistz256_mul_by_2
262 .type ecp_nistz256_mul_by_2
,%function
264 ecp_nistz256_mul_by_2
:
265 .inst
0xd503233f // paciasp
266 stp x29
,x30
,[sp
,#-16]!
269 ldp
$acc0,$acc1,[$ap]
270 ldp
$acc2,$acc3,[$ap,#16]
278 bl __ecp_nistz256_add
// ret
= a
+a
// 2*a
281 .inst
0xd50323bf // autiasp
283 .size ecp_nistz256_mul_by_2
,.-ecp_nistz256_mul_by_2
285 // void ecp_nistz256_mul_by_3
(BN_ULONG x0
[4],const BN_ULONG x1
[4]);
286 .globl ecp_nistz256_mul_by_3
287 .type ecp_nistz256_mul_by_3
,%function
289 ecp_nistz256_mul_by_3
:
290 .inst
0xd503233f // paciasp
291 stp x29
,x30
,[sp
,#-16]!
294 ldp
$acc0,$acc1,[$ap]
295 ldp
$acc2,$acc3,[$ap,#16]
307 bl __ecp_nistz256_add
// ret
= a
+a
// 2*a
314 bl __ecp_nistz256_add
// ret
+= a
// 2*a
+a
=3*a
317 .inst
0xd50323bf // autiasp
319 .size ecp_nistz256_mul_by_3
,.-ecp_nistz256_mul_by_3
321 // void ecp_nistz256_sub
(BN_ULONG x0
[4],const BN_ULONG x1
[4],
322 // const BN_ULONG x2
[4]);
323 .globl ecp_nistz256_sub
324 .type ecp_nistz256_sub
,%function
327 .inst
0xd503233f // paciasp
328 stp x29
,x30
,[sp
,#-16]!
331 ldp
$acc0,$acc1,[$ap]
332 ldp
$acc2,$acc3,[$ap,#16]
336 bl __ecp_nistz256_sub_from
339 .inst
0xd50323bf // autiasp
341 .size ecp_nistz256_sub
,.-ecp_nistz256_sub
343 // void ecp_nistz256_neg
(BN_ULONG x0
[4],const BN_ULONG x1
[4]);
344 .globl ecp_nistz256_neg
345 .type ecp_nistz256_neg
,%function
348 .inst
0xd503233f // paciasp
349 stp x29
,x30
,[sp
,#-16]!
353 mov
$acc0,xzr
// a
= 0
360 bl __ecp_nistz256_sub_from
363 .inst
0xd50323bf // autiasp
365 .size ecp_nistz256_neg
,.-ecp_nistz256_neg
367 // note that __ecp_nistz256_mul_mont expects a
[0-3] input pre
-loaded
368 // to
$a0-$a3 and b
[0] - to
$bi
369 .type __ecp_nistz256_mul_mont
,%function
371 __ecp_nistz256_mul_mont
:
372 mul
$acc0,$a0,$bi // a
[0]*b
[0]
375 mul
$acc1,$a1,$bi // a
[1]*b
[0]
378 mul
$acc2,$a2,$bi // a
[2]*b
[0]
381 mul
$acc3,$a3,$bi // a
[3]*b
[0]
383 ldr
$bi,[$bp,#8] // b[1]
385 adds
$acc1,$acc1,$t0 // accumulate high parts of multiplication
393 for($i=1;$i<4;$i++) {
394 # Reduction iteration is normally performed by accumulating
395 # result of multiplication of modulus by "magic" digit [and
396 # omitting least significant word, which is guaranteed to
397 # be 0], but thanks to special form of modulus and "magic"
398 # digit being equal to least significant word, it can be
399 # performed with additions and subtractions alone. Indeed:
401 # ffff0001.00000000.0000ffff.ffffffff
403 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
405 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
408 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
409 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
410 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
412 # or marking redundant operations:
414 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
415 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
416 # - 0000abcd.efgh0000.--------.--------.--------
419 subs
$t2,$acc0,$t0 // "*0xffff0001"
421 adds
$acc0,$acc1,$t0 // +=acc
[0]<<96 and omit acc
[0]
422 mul
$t0,$a0,$bi // lo
(a
[0]*b
[i
])
424 mul
$t1,$a1,$bi // lo
(a
[1]*b
[i
])
425 adcs
$acc2,$acc3,$t2 // +=acc
[0]*0xffff0001
426 mul
$t2,$a2,$bi // lo
(a
[2]*b
[i
])
428 mul
$t3,$a3,$bi // lo
(a
[3]*b
[i
])
431 adds
$acc0,$acc0,$t0 // accumulate low parts of multiplication
432 umulh
$t0,$a0,$bi // hi
(a
[0]*b
[i
])
434 umulh
$t1,$a1,$bi // hi
(a
[1]*b
[i
])
436 umulh
$t2,$a2,$bi // hi
(a
[2]*b
[i
])
438 umulh
$t3,$a3,$bi // hi
(a
[3]*b
[i
])
441 $code.=<<___
if ($i<3);
442 ldr
$bi,[$bp,#8*($i+1)] // b[$i+1]
445 adds
$acc1,$acc1,$t0 // accumulate high parts of multiplication
456 subs
$t2,$acc0,$t0 // "*0xffff0001"
458 adds
$acc0,$acc1,$t0 // +=acc
[0]<<96 and omit acc
[0]
460 adcs
$acc2,$acc3,$t2 // +=acc
[0]*0xffff0001
464 adds
$t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
465 sbcs
$t1,$acc1,$poly1
467 sbcs
$t3,$acc3,$poly3
468 sbcs xzr
,$acc4,xzr
// did it borrow?
470 csel
$acc0,$acc0,$t0,lo
// ret
= borrow ? ret
: ret
-modulus
471 csel
$acc1,$acc1,$t1,lo
472 csel
$acc2,$acc2,$t2,lo
473 stp
$acc0,$acc1,[$rp]
474 csel
$acc3,$acc3,$t3,lo
475 stp
$acc2,$acc3,[$rp,#16]
478 .size __ecp_nistz256_mul_mont
,.-__ecp_nistz256_mul_mont
480 // note that __ecp_nistz256_sqr_mont expects a
[0-3] input pre
-loaded
482 .type __ecp_nistz256_sqr_mont
,%function
484 __ecp_nistz256_sqr_mont
:
485 // | | | | | |a1
*a0
| |
486 // | | | | |a2
*a0
| | |
487 // | |a3
*a2
|a3
*a0
| | | |
488 // | | | |a2
*a1
| | | |
489 // | | |a3
*a1
| | | | |
490 // *| | | | | | | | 2|
491 // +|a3
*a3
|a2
*a2
|a1
*a1
|a0
*a0
|
492 // |--+--+--+--+--+--+--+--|
493 // |A7
|A6
|A5
|A4
|A3
|A2
|A1
|A0
|, where Ax is
$accx, i
.e
. follow
$accx
495 // "can't overflow" below mark carrying into high part of
496 // multiplication result
, which can
't overflow, because it
497 // can never be all ones.
499 mul $acc1,$a1,$a0 // a[1]*a[0]
501 mul $acc2,$a2,$a0 // a[2]*a[0]
503 mul $acc3,$a3,$a0 // a[3]*a[0]
506 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
507 mul $t0,$a2,$a1 // a[2]*a[1]
510 mul $t2,$a3,$a1 // a[3]*a[1]
512 adc $acc4,$acc4,xzr // can't overflow
514 mul
$acc5,$a3,$a2 // a
[3]*a
[2]
517 adds
$t1,$t1,$t2 // accumulate high parts of multiplication
518 mul
$acc0,$a0,$a0 // a
[0]*a
[0]
519 adc
$t2,$t3,xzr
// can
't overflow
521 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
524 mul $t1,$a1,$a1 // a[1]*a[1]
527 adc $acc6,$acc6,xzr // can't overflow
529 adds
$acc1,$acc1,$acc1 // acc
[1-6]*=2
530 mul
$t2,$a2,$a2 // a
[2]*a
[2]
531 adcs
$acc2,$acc2,$acc2
533 adcs
$acc3,$acc3,$acc3
534 mul
$t3,$a3,$a3 // a
[3]*a
[3]
535 adcs
$acc4,$acc4,$acc4
537 adcs
$acc5,$acc5,$acc5
538 adcs
$acc6,$acc6,$acc6
541 adds
$acc1,$acc1,$a0 // +a
[i
]*a
[i
]
551 for($i=0;$i<3;$i++) { # reductions, see commentary in
552 # multiplication for details
554 subs
$t2,$acc0,$t0 // "*0xffff0001"
556 adds
$acc0,$acc1,$t0 // +=acc
[0]<<96 and omit acc
[0]
559 adcs
$acc2,$acc3,$t2 // +=acc
[0]*0xffff0001
561 adc
$acc3,$t3,xzr
// can
't overflow
565 subs $t2,$acc0,$t0 // "*0xffff0001"
567 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
569 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
570 adc $acc3,$t3,xzr // can't overflow
572 adds
$acc0,$acc0,$acc4 // accumulate upper half
573 adcs
$acc1,$acc1,$acc5
574 adcs
$acc2,$acc2,$acc6
575 adcs
$acc3,$acc3,$acc7
578 adds
$t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
579 sbcs
$t1,$acc1,$poly1
581 sbcs
$t3,$acc3,$poly3
582 sbcs xzr
,$acc4,xzr
// did it borrow?
584 csel
$acc0,$acc0,$t0,lo
// ret
= borrow ? ret
: ret
-modulus
585 csel
$acc1,$acc1,$t1,lo
586 csel
$acc2,$acc2,$t2,lo
587 stp
$acc0,$acc1,[$rp]
588 csel
$acc3,$acc3,$t3,lo
589 stp
$acc2,$acc3,[$rp,#16]
592 .size __ecp_nistz256_sqr_mont
,.-__ecp_nistz256_sqr_mont
594 // Note that __ecp_nistz256_add expects both input vectors pre
-loaded to
595 // $a0-$a3 and $t0-$t3. This is done because it
's used in multiple
596 // contexts, e.g. in multiplication by 2 and 3...
597 .type __ecp_nistz256_add,%function
600 adds $acc0,$acc0,$t0 // ret = a+b
604 adc $ap,xzr,xzr // zap $ap
606 adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
607 sbcs $t1,$acc1,$poly1
609 sbcs $t3,$acc3,$poly3
610 sbcs xzr,$ap,xzr // did subtraction borrow?
612 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
613 csel $acc1,$acc1,$t1,lo
614 csel $acc2,$acc2,$t2,lo
615 stp $acc0,$acc1,[$rp]
616 csel $acc3,$acc3,$t3,lo
617 stp $acc2,$acc3,[$rp,#16]
620 .size __ecp_nistz256_add,.-__ecp_nistz256_add
622 .type __ecp_nistz256_sub_from,%function
624 __ecp_nistz256_sub_from:
626 ldp $t2,$t3,[$bp,#16]
627 subs $acc0,$acc0,$t0 // ret = a-b
631 sbc $ap,xzr,xzr // zap $ap
633 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
634 adcs $t1,$acc1,$poly1
637 cmp $ap,xzr // did subtraction borrow?
639 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
640 csel $acc1,$acc1,$t1,eq
641 csel $acc2,$acc2,$t2,eq
642 stp $acc0,$acc1,[$rp]
643 csel $acc3,$acc3,$t3,eq
644 stp $acc2,$acc3,[$rp,#16]
647 .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
649 .type __ecp_nistz256_sub_morf,%function
651 __ecp_nistz256_sub_morf:
653 ldp $t2,$t3,[$bp,#16]
654 subs $acc0,$t0,$acc0 // ret = b-a
658 sbc $ap,xzr,xzr // zap $ap
660 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
661 adcs $t1,$acc1,$poly1
664 cmp $ap,xzr // did subtraction borrow?
666 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
667 csel $acc1,$acc1,$t1,eq
668 csel $acc2,$acc2,$t2,eq
669 stp $acc0,$acc1,[$rp]
670 csel $acc3,$acc3,$t3,eq
671 stp $acc2,$acc3,[$rp,#16]
674 .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
676 .type __ecp_nistz256_div_by_2,%function
678 __ecp_nistz256_div_by_2:
679 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus
680 adcs $t1,$acc1,$poly1
682 adcs $t3,$acc3,$poly3
683 adc $ap,xzr,xzr // zap $ap
684 tst $acc0,#1 // is a even?
686 csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
687 csel $acc1,$acc1,$t1,eq
688 csel $acc2,$acc2,$t2,eq
689 csel $acc3,$acc3,$t3,eq
692 lsr $acc0,$acc0,#1 // ret >>= 1
693 orr $acc0,$acc0,$acc1,lsl#63
695 orr $acc1,$acc1,$acc2,lsl#63
697 orr $acc2,$acc2,$acc3,lsl#63
699 stp $acc0,$acc1,[$rp]
700 orr $acc3,$acc3,$ap,lsl#63
701 stp $acc2,$acc3,[$rp,#16]
704 .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
706 ########################################################################
707 # following subroutines are "literal" implementation of those found in
710 ########################################################################
711 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
# Byte offsets of the temporaries, used as [sp,#$S] etc. in the generated
# code: $S=0, $M=32, $Zsqr=64, $tmp0=96.
714 my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
715 # above map() describes stack layout with 4 temporary
716 # 256-bit vectors on top.
# $rp_real=x21, $ap_real=x22. $ap_real is used as the base for loads of the
# input point; presumably both hold copies of the out/inp pointers because
# the helper routines take $rp/$ap/$bp as their own arguments -- confirm.
717 my ($rp_real,$ap_real) = map("x$_",(21,22));
720 .globl ecp_nistz256_point_double
721 .type ecp_nistz256_point_double,%function
723 ecp_nistz256_point_double:
724 .inst 0xd503233f // paciasp
725 stp x29,x30,[sp,#-96]!
732 ldp $acc0,$acc1,[$ap,#32]
734 ldp $acc2,$acc3,[$ap,#48]
740 ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont
743 ldp $a2,$a3,[$ap_real,#64+16]
745 bl __ecp_nistz256_add // p256_mul_by_2(S, in_y);
748 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
750 ldp $t0,$t1,[$ap_real]
751 ldp $t2,$t3,[$ap_real,#16]
752 mov $a0,$acc0 // put Zsqr aside for p256_sub
757 bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x);
760 mov $acc0,$a0 // restore Zsqr
762 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
765 ldp $a2,$a3,[sp,#$S+16]
767 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
770 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
772 ldr $bi,[$ap_real,#32]
773 ldp $a0,$a1,[$ap_real,#64]
774 ldp $a2,$a3,[$ap_real,#64+16]
777 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
781 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
784 ldp $a2,$a3,[sp,#$S+16]
786 bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0);
789 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
791 ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont
793 ldp $a2,$a3,[sp,#$M+16]
795 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
799 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
801 mov $t0,$acc0 // duplicate M
805 mov $a0,$acc0 // put M aside
810 bl __ecp_nistz256_add
811 mov $t0,$a0 // restore M
813 ldr $bi,[$ap_real] // forward load for p256_mul_mont
817 ldp $a2,$a3,[sp,#$S+16]
818 bl __ecp_nistz256_add // p256_mul_by_3(M, M);
822 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
826 ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont
829 ldp $a2,$a3,[sp,#$M+16]
831 bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S);
834 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
837 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
841 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
844 mov $a0,$acc0 // copy S
849 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
853 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
855 add sp,x29,#0 // destroy frame
856 ldp x19,x20,[x29,#16]
857 ldp x21,x22,[x29,#32]
859 .inst 0xd50323bf // autiasp
861 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
865 ########################################################################
866 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
867 # const P256_POINT *in2);
# Byte offsets of twelve 32-byte stack temporaries, in declaration order:
# $res_x=0, $res_y=32, $res_z=64, $H=96, ... $S2=352.
869 my ($res_x,$res_y,$res_z,
870 $H,$Hsqr,$R,$Rsqr,$Hcub,
871 $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
# $Z1sqr and $Z2sqr do not get slots of their own: they share the offsets of
# $Hsqr and $Rsqr respectively.
872 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
873 # above map() describes stack layout with 12 temporary
874 # 256-bit vectors on top.
# Register names x21..x28, in order: $rp_real, $ap_real, $bp_real,
# $in1infty, $in2infty, $temp0, $temp1, $temp2.
875 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));
878 .globl ecp_nistz256_point_add
879 .type ecp_nistz256_point_add,%function
881 ecp_nistz256_point_add:
882 .inst 0xd503233f // paciasp
883 stp x29,x30,[sp,#-96]!
892 ldp $a0,$a1,[$bp,#64] // in2_z
893 ldp $a2,$a3,[$bp,#64+16]
901 orr $in2infty,$t0,$t2
903 csetm $in2infty,ne // ~in2infty
905 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
907 ldp $a0,$a1,[$ap_real,#64] // in1_z
908 ldp $a2,$a3,[$ap_real,#64+16]
911 orr $in1infty,$t0,$t2
913 csetm $in1infty,ne // ~in1infty
915 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
917 ldr $bi,[$bp_real,#64]
918 ldp $a0,$a1,[sp,#$Z2sqr]
919 ldp $a2,$a3,[sp,#$Z2sqr+16]
922 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);
924 ldr $bi,[$ap_real,#64]
925 ldp $a0,$a1,[sp,#$Z1sqr]
926 ldp $a2,$a3,[sp,#$Z1sqr+16]
929 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
931 ldr $bi,[$ap_real,#32]
932 ldp $a0,$a1,[sp,#$S1]
933 ldp $a2,$a3,[sp,#$S1+16]
936 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);
938 ldr $bi,[$bp_real,#32]
939 ldp $a0,$a1,[sp,#$S2]
940 ldp $a2,$a3,[sp,#$S2+16]
943 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
946 ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont
947 ldp $a0,$a1,[$ap_real]
948 ldp $a2,$a3,[$ap_real,#16]
950 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);
952 orr $acc0,$acc0,$acc1 // see if result is zero
953 orr $acc2,$acc2,$acc3
954 orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2)
958 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);
961 ldp $a0,$a1,[$bp_real]
962 ldp $a2,$a3,[$bp_real,#16]
965 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);
968 ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont
969 ldp $a2,$a3,[sp,#$R+16]
971 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);
973 orr $acc0,$acc0,$acc1 // see if result is zero
974 orr $acc2,$acc2,$acc3
975 orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2)
977 mvn $temp1,$in1infty // -1/0 -> 0/-1
978 mvn $temp2,$in2infty // -1/0 -> 0/-1
979 orr $acc0,$acc0,$temp1
980 orr $acc0,$acc0,$temp2
981 orr $acc0,$acc0,$temp0
982 cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
987 ldp x23,x24,[x29,#48]
988 ldp x25,x26,[x29,#64]
989 ldp x27,x28,[x29,#80]
990 add sp,sp,#32*(12-4) // difference in stack frames
996 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
998 ldr $bi,[$ap_real,#64]
1000 ldp $a2,$a3,[sp,#$H+16]
1001 add $bp,$ap_real,#64
1003 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
1005 ldp $a0,$a1,[sp,#$H]
1006 ldp $a2,$a3,[sp,#$H+16]
1008 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
1010 ldr $bi,[$bp_real,#64]
1011 ldp $a0,$a1,[sp,#$res_z]
1012 ldp $a2,$a3,[sp,#$res_z+16]
1013 add $bp,$bp_real,#64
1015 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
1018 ldp $a0,$a1,[sp,#$Hsqr]
1019 ldp $a2,$a3,[sp,#$Hsqr+16]
1022 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
1025 ldp $a0,$a1,[sp,#$U1]
1026 ldp $a2,$a3,[sp,#$U1+16]
1029 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
1036 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);
1040 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
1043 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
1046 ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont
1047 ldp $a0,$a1,[sp,#$S1]
1048 ldp $a2,$a3,[sp,#$S1+16]
1050 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
1054 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
1057 ldp $a0,$a1,[sp,#$res_y]
1058 ldp $a2,$a3,[sp,#$res_y+16]
1061 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
1064 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
1066 ldp $a0,$a1,[sp,#$res_x] // res
1067 ldp $a2,$a3,[sp,#$res_x+16]
1068 ldp $t0,$t1,[$bp_real] // in2
1069 ldp $t2,$t3,[$bp_real,#16]
1071 for($i=0;$i<64;$i+=32) { # conditional moves
1073 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1074 cmp $in1infty,#0 // ~$in1intfy, remember?
1075 ldp $acc2,$acc3,[$ap_real,#$i+16]
1078 ldp $a0,$a1,[sp,#$res_x+$i+32] // res
1081 cmp $in2infty,#0 // ~$in2intfy, remember?
1082 ldp $a2,$a3,[sp,#$res_x+$i+48]
1083 csel $acc0,$t0,$acc0,ne
1084 csel $acc1,$t1,$acc1,ne
1085 ldp $t0,$t1,[$bp_real,#$i+32] // in2
1086 csel $acc2,$t2,$acc2,ne
1087 csel $acc3,$t3,$acc3,ne
1088 ldp $t2,$t3,[$bp_real,#$i+48]
1089 stp $acc0,$acc1,[$rp_real,#$i]
1090 stp $acc2,$acc3,[$rp_real,#$i+16]
1094 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1095 cmp $in1infty,#0 // ~$in1intfy, remember?
1096 ldp $acc2,$acc3,[$ap_real,#$i+16]
1101 cmp $in2infty,#0 // ~$in2intfy, remember?
1102 csel $acc0,$t0,$acc0,ne
1103 csel $acc1,$t1,$acc1,ne
1104 csel $acc2,$t2,$acc2,ne
1105 csel $acc3,$t3,$acc3,ne
1106 stp $acc0,$acc1,[$rp_real,#$i]
1107 stp $acc2,$acc3,[$rp_real,#$i+16]
1110 add sp,x29,#0 // destroy frame
1111 ldp x19,x20,[x29,#16]
1112 ldp x21,x22,[x29,#32]
1113 ldp x23,x24,[x29,#48]
1114 ldp x25,x26,[x29,#64]
1115 ldp x27,x28,[x29,#80]
1116 ldp x29,x30,[sp],#96
1117 .inst 0xd50323bf // autiasp
1119 .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1123 ########################################################################
1124 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1125 # const P256_POINT_AFFINE *in2);
# Byte offsets of ten 32-byte stack temporaries, in declaration order:
# $res_x=0, $res_y=32, $res_z=64, $U2=96, ... $Rsqr=288.
# NOTE(review): the generated code also refers to $Z1sqr, whose declaration
# is not in the lines visible here -- confirm which slot it shares.
1127 my ($res_x,$res_y,$res_z,
1128 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1130 # above map() describes stack layout with 10 temporary
1131 # 256-bit vectors on top.
# Register names x21..x26, in order: $rp_real, $ap_real, $bp_real,
# $in1infty, $in2infty, $temp.
1132 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
1135 .globl ecp_nistz256_point_add_affine
1136 .type ecp_nistz256_point_add_affine,%function
1138 ecp_nistz256_point_add_affine:
1139 .inst 0xd503233f // paciasp
1140 stp x29,x30,[sp,#-80]!
1142 stp x19,x20,[sp,#16]
1143 stp x21,x22,[sp,#32]
1144 stp x23,x24,[sp,#48]
1145 stp x25,x26,[sp,#64]
1152 ldr $poly3,.Lpoly+24
1154 ldp $a0,$a1,[$ap,#64] // in1_z
1155 ldp $a2,$a3,[$ap,#64+16]
1158 orr $in1infty,$t0,$t2
1160 csetm $in1infty,ne // ~in1infty
1162 ldp $acc0,$acc1,[$bp] // in2_x
1163 ldp $acc2,$acc3,[$bp,#16]
1164 ldp $t0,$t1,[$bp,#32] // in2_y
1165 ldp $t2,$t3,[$bp,#48]
1166 orr $acc0,$acc0,$acc1
1167 orr $acc2,$acc2,$acc3
1170 orr $acc0,$acc0,$acc2
1172 orr $in2infty,$acc0,$t0
1174 csetm $in2infty,ne // ~in2infty
1177 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
1186 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
1189 ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont
1190 ldp $a0,$a1,[sp,#$Z1sqr]
1191 ldp $a2,$a3,[sp,#$Z1sqr+16]
1193 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
1195 add $bp,$ap_real,#64
1197 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
1199 ldr $bi,[$ap_real,#64]
1200 ldp $a0,$a1,[sp,#$H]
1201 ldp $a2,$a3,[sp,#$H+16]
1202 add $bp,$ap_real,#64
1204 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
1206 ldr $bi,[$bp_real,#32]
1207 ldp $a0,$a1,[sp,#$S2]
1208 ldp $a2,$a3,[sp,#$S2+16]
1209 add $bp,$bp_real,#32
1211 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
1213 add $bp,$ap_real,#32
1214 ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont
1215 ldp $a2,$a3,[sp,#$H+16]
1217 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
1220 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
1222 ldp $a0,$a1,[sp,#$R]
1223 ldp $a2,$a3,[sp,#$R+16]
1225 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
1228 ldp $a0,$a1,[sp,#$Hsqr]
1229 ldp $a2,$a3,[sp,#$Hsqr+16]
1232 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
1235 ldp $a0,$a1,[sp,#$Hsqr]
1236 ldp $a2,$a3,[sp,#$Hsqr+16]
1239 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
1246 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);
1250 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
1253 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
1256 ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont
1257 ldp $a0,$a1,[sp,#$Hcub]
1258 ldp $a2,$a3,[sp,#$Hcub+16]
1260 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
1262 add $bp,$ap_real,#32
1264 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
1267 ldp $a0,$a1,[sp,#$res_y]
1268 ldp $a2,$a3,[sp,#$res_y+16]
1271 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
1274 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
1276 ldp $a0,$a1,[sp,#$res_x] // res
1277 ldp $a2,$a3,[sp,#$res_x+16]
1278 ldp $t0,$t1,[$bp_real] // in2
1279 ldp $t2,$t3,[$bp_real,#16]
1281 for($i=0;$i<64;$i+=32) { # conditional moves
1283 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1284 cmp $in1infty,#0 // ~$in1intfy, remember?
1285 ldp $acc2,$acc3,[$ap_real,#$i+16]
1288 ldp $a0,$a1,[sp,#$res_x+$i+32] // res
1291 cmp $in2infty,#0 // ~$in2intfy, remember?
1292 ldp $a2,$a3,[sp,#$res_x+$i+48]
1293 csel $acc0,$t0,$acc0,ne
1294 csel $acc1,$t1,$acc1,ne
1295 ldp $t0,$t1,[$bp_real,#$i+32] // in2
1296 csel $acc2,$t2,$acc2,ne
1297 csel $acc3,$t3,$acc3,ne
1298 ldp $t2,$t3,[$bp_real,#$i+48]
1299 stp $acc0,$acc1,[$rp_real,#$i]
1300 stp $acc2,$acc3,[$rp_real,#$i+16]
1302 $code.=<<___ if ($i == 0);
1303 adr $bp_real,.Lone_mont-64
1307 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1308 cmp $in1infty,#0 // ~$in1intfy, remember?
1309 ldp $acc2,$acc3,[$ap_real,#$i+16]
1314 cmp $in2infty,#0 // ~$in2intfy, remember?
1315 csel $acc0,$t0,$acc0,ne
1316 csel $acc1,$t1,$acc1,ne
1317 csel $acc2,$t2,$acc2,ne
1318 csel $acc3,$t3,$acc3,ne
1319 stp $acc0,$acc1,[$rp_real,#$i]
1320 stp $acc2,$acc3,[$rp_real,#$i+16]
1322 add sp,x29,#0 // destroy frame
1323 ldp x19,x20,[x29,#16]
1324 ldp x21,x22,[x29,#32]
1325 ldp x23,x24,[x29,#48]
1326 ldp x25,x26,[x29,#64]
1327 ldp x29,x30,[sp],#80
1328 .inst 0xd50323bf // autiasp
1330 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
# Register names for the ecp_nistz256_ord_* routines. $ord0/$ord1 reuse the
# registers named $poly1/$poly3; $ord2, $ord3, $ordk and $t4 are x21..x24.
# The generated code loads $ord0..$ord3 from [$ordk] and then replaces
# $ordk itself with the value at [$ordk,#32].
1334 my ($ord0,$ord1) = ($poly1,$poly3);
1335 my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
1339 ////////////////////////////////////////////////////////////////////////
1340 // void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1342 .globl ecp_nistz256_ord_mul_mont
1343 .type ecp_nistz256_ord_mul_mont,%function
1345 ecp_nistz256_ord_mul_mont:
1346 stp x29,x30,[sp,#-64]!
1348 stp x19,x20,[sp,#16]
1349 stp x21,x22,[sp,#32]
1350 stp x23,x24,[sp,#48]
1353 ldr $bi,[$bp] // bp[0]
1355 ldp $a2,$a3,[$ap,#16]
1357 ldp $ord0,$ord1,[$ordk,#0]
1358 ldp $ord2,$ord3,[$ordk,#16]
1359 ldr $ordk,[$ordk,#32]
1361 mul $acc0,$a0,$bi // a[0]*b[0]
1364 mul $acc1,$a1,$bi // a[1]*b[0]
1367 mul $acc2,$a2,$bi // a[2]*b[0]
1370 mul $acc3,$a3,$bi // a[3]*b[0]
1375 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
1376 adcs $acc2,$acc2,$t1
1377 adcs $acc3,$acc3,$t2
1381 for ($i=1;$i<4;$i++) {
1382 ################################################################
1383 # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1385 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1387 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1390 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1391 # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1392 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1394 ldr $bi,[$bp,#8*$i] // b[i]
1397 subs $acc2,$acc2,$t4
1399 sbcs $acc3,$acc3,$t0
1400 sbcs $acc4,$acc4,$t1
1413 adds $acc0,$acc1,$t2
1415 adcs $acc1,$acc2,$t3
1417 adcs $acc2,$acc3,$t4
1418 adcs $acc3,$acc4,$t4
1421 adds $acc0,$acc0,$t0 // accumulate low parts
1423 adcs $acc1,$acc1,$t1
1425 adcs $acc2,$acc2,$t2
1427 adcs $acc3,$acc3,$t3
1431 adds $acc1,$acc1,$t0 // accumulate high parts
1432 adcs $acc2,$acc2,$t1
1433 adcs $acc3,$acc3,$t2
1434 adcs $acc4,$acc4,$t3
1439 lsl $t0,$t4,#32 // last reduction
1440 subs $acc2,$acc2,$t4
1442 sbcs $acc3,$acc3,$t0
1443 sbcs $acc4,$acc4,$t1
1454 adds $acc0,$acc1,$t2
1455 adcs $acc1,$acc2,$t3
1456 adcs $acc2,$acc3,$t4
1457 adcs $acc3,$acc4,$t4
1460 subs $t0,$acc0,$ord0 // ret -= modulus
1461 sbcs $t1,$acc1,$ord1
1462 sbcs $t2,$acc2,$ord2
1463 sbcs $t3,$acc3,$ord3
1466 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
1467 csel $acc1,$acc1,$t1,lo
1468 csel $acc2,$acc2,$t2,lo
1469 stp $acc0,$acc1,[$rp]
1470 csel $acc3,$acc3,$t3,lo
1471 stp $acc2,$acc3,[$rp,#16]
1473 ldp x19,x20,[sp,#16]
1474 ldp x21,x22,[sp,#32]
1475 ldp x23,x24,[sp,#48]
1478 .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1480 ////////////////////////////////////////////////////////////////////////
1481 // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1483 .globl ecp_nistz256_ord_sqr_mont
1484 .type ecp_nistz256_ord_sqr_mont,%function
1486 ecp_nistz256_ord_sqr_mont:
1487 stp x29,x30,[sp,#-64]!
1489 stp x19,x20,[sp,#16]
1490 stp x21,x22,[sp,#32]
1491 stp x23,x24,[sp,#48]
1495 ldp $a2,$a3,[$ap,#16]
1497 ldp $ord0,$ord1,[$ordk,#0]
1498 ldp $ord2,$ord3,[$ordk,#16]
1499 ldr $ordk,[$ordk,#32]
1505 ////////////////////////////////////////////////////////////////
1506 // | | | | | |a1*a0| |
1507 // | | | | |a2*a0| | |
1508 // | |a3*a2|a3*a0| | | |
1509 // | | | |a2*a1| | | |
1510 // | | |a3*a1| | | | |
1511 // *| | | | | | | | 2|
1512 // +|a3*a3|a2*a2|a1*a1|a0*a0|
1513 // |--+--+--+--+--+--+--+--|
1514 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1516 // "can't overflow
" below mark carrying into high part of
1517 // multiplication result, which can't overflow, because it
1518 // can never be all ones.
// Off-diagonal products a[i]*a[j] (i<j). Each mul keeps only the low 64 bits
// of its product.
// NOTE(review): several operands consumed below (for example $t1 at the first
// adds, or $acc4 and $acc6 in the adc lines) are produced by instructions that
// are not visible in this excerpt; the comments describe the visible lines only.
1520 mul $acc1,$a1,$a0 // a[1]*a[0]
1522 mul $acc2,$a2,$a0 // a[2]*a[0]
1524 mul $acc3,$a3,$a0 // a[3]*a[0]
1527 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
1528 mul $t0,$a2,$a1 // a[2]*a[1]
1530 adcs $acc3,$acc3,$t2 // continues the carry chain started by the adds above (mul leaves the flags alone)
1531 mul $t2,$a3,$a1 // a[3]*a[1]
1533 adc $acc4,$acc4,xzr // can't overflow
1535 mul $acc5,$a3,$a2 // a[3]*a[2]
1538 adds $t1,$t1,$t2 // accumulate high parts of multiplication
1539 mul $acc0,$a0,$a0 // a[0]*a[0]
1540 adc $t2,$t3,xzr // can't overflow
1542 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
1544 adcs $acc4,$acc4,$t1
1545 mul $t1,$a1,$a1 // a[1]*a[1]
1546 adcs $acc5,$acc5,$t2
1548 adc $acc6,$acc6,xzr // can't overflow
// Double the accumulated cross products limb by limb: the adds/adcs chain
// carries each limb's top bit into the next limb. The low halves of the
// squares a[i]*a[i] are computed by the interleaved mul instructions and are
// added in afterwards.
1550 adds $acc1,$acc1,$acc1 // acc[1-6]*=2
1551 mul $t2,$a2,$a2 // a[2]*a[2]
1552 adcs $acc2,$acc2,$acc2
1554 adcs $acc3,$acc3,$acc3
1555 mul $t3,$a3,$a3 // a[3]*a[3]
1556 adcs $acc4,$acc4,$acc4
1558 adcs $acc5,$acc5,$acc5
1559 adcs $acc6,$acc6,$acc6
// NOTE(review): $a0-$a2 are added as product limbs below; presumably they
// were overwritten with the high halves of the squares by instructions that
// are not visible in this excerpt - confirm against the full source.
1562 adds $acc1,$acc1,$a0 // +a[i]*a[i]
1564 adcs $acc2,$acc2,$t1 // low half of a[1]*a[1]
1565 adcs $acc3,$acc3,$a1
1566 adcs $acc4,$acc4,$t2 // low half of a[2]*a[2]
1567 adcs $acc5,$acc5,$a2
1568 adcs $acc6,$acc6,$t3 // low half of a[3]*a[3]
1571 for($i=0; $i<4; $i++) { # reductions: this Perl loop emits the reduction sequence once per pass, four passes in total
1581 adds $acc0,$acc1,$t2 // acc0..acc3 = (acc1,acc2,acc3,0) + ($t2,$t3,$t4,$t4) with carry:
1582 adcs $acc1,$acc2,$t3 // the running value moves down one limb while the
1583 adcs $acc2,$acc3,$t4 // reduction terms are added
1584 adc $acc3,xzr,$t4 // can't overflow
1586 $code.=<<___ if ($i<3); # the heredoc that follows is appended only for $i = 0..2
1591 subs $acc1,$acc1,$t4 // subtract ($t4,$t0,$t1) from acc1..acc3 with borrow
1593 sbcs $acc2,$acc2,$t0
1594 sbc $acc3,$acc3,$t1 // can't borrow
1596 ($t3,$t4) = ($t4,$t3); # swap the Perl-side register aliases so the next pass uses the two registers in exchanged roles
1599 adds $acc0,$acc0,$acc4 // accumulate upper half
1600 adcs $acc1,$acc1,$acc5
1601 adcs $acc2,$acc2,$acc6
1602 adcs $acc3,$acc3,$acc7
1605 subs $t0,$acc0,$ord0 // ret -= modulus
1606 sbcs $t1,$acc1,$ord1
1607 sbcs $t2,$acc2,$ord2
1608 sbcs $t3,$acc3,$ord3
1611 csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
1612 csel $a1,$acc1,$t1,lo // the selected limbs land in $a0-$a3, the registers
1613 csel $a2,$acc2,$t2,lo // read by the multiplications at the top of this body
1614 csel $a3,$acc3,$t3,lo
1616 cbnz $bp,.Loop_ord_sqr // branch while $bp is non-zero; NOTE(review): the .Loop_ord_sqr label and the update of $bp are not visible in this excerpt
1619 stp $a2,$a3,[$rp,#16] // upper two result limbs; the store of $a0,$a1 is not visible in this excerpt
1621 ldp x19,x20,[sp,#16] // reload x19-x24 from the frame slots filled in the prologue
1622 ldp x21,x22,[sp,#32]
1623 ldp x23,x24,[sp,#48]
// NOTE(review): the frame release and the return instruction are not visible
// in this excerpt.
1626 .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1630 ########################################################################
1631 # scatter-gather subroutines
# Register aliases used by the scatter/gather routines that follow:
# $out = x0, $inp = x1, $index = x2, $mask = x3.
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
1635 // void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
// NOTE(review): the rest of this prototype is not visible in this excerpt.
//
// Writes the X, Y and Z coordinates read from $inp (byte offsets 0, 32 and
// 64, four 64-bit words each) into the table at $out as 32-bit pieces placed
// 64 bytes apart.
1637 .globl ecp_nistz256_scatter_w5
1638 .type ecp_nistz256_scatter_w5,%function
1640 ecp_nistz256_scatter_w5:
1641 stp x29,x30,[sp,#-16]! // push a 16-byte frame holding x29/x30
1644 add $out,$out,$index,lsl#2 // out += index*4; with the -4 in every store offset the pieces land at (index-1)*4 + 64*k
1646 ldp x4,x5,[$inp] // X
1647 ldp x6,x7,[$inp,#16]
// NOTE(review): w4-w7 are stored twice per coordinate (slots 0-3, then slots
// 4-7). The instructions that would bring the upper 32 bits of x4-x7 into
// place between the two groups, and any advance of $out between coordinates,
// are not visible in this excerpt.
1648 str w4,[$out,#64*0-4]
1650 str w5,[$out,#64*1-4]
1652 str w6,[$out,#64*2-4]
1654 str w7,[$out,#64*3-4]
1656 str w4,[$out,#64*4-4]
1657 str w5,[$out,#64*5-4]
1658 str w6,[$out,#64*6-4]
1659 str w7,[$out,#64*7-4]
1662 ldp x4,x5,[$inp,#32] // Y
1663 ldp x6,x7,[$inp,#48]
1664 str w4,[$out,#64*0-4] // same eight-slot store pattern as for X
1666 str w5,[$out,#64*1-4]
1668 str w6,[$out,#64*2-4]
1670 str w7,[$out,#64*3-4]
1672 str w4,[$out,#64*4-4]
1673 str w5,[$out,#64*5-4]
1674 str w6,[$out,#64*6-4]
1675 str w7,[$out,#64*7-4]
1678 ldp x4,x5,[$inp,#64] // Z
1679 ldp x6,x7,[$inp,#80]
1680 str w4,[$out,#64*0-4] // same eight-slot store pattern as for X
1682 str w5,[$out,#64*1-4]
1684 str w6,[$out,#64*2-4]
1686 str w7,[$out,#64*3-4]
1688 str w4,[$out,#64*4-4]
1689 str w5,[$out,#64*5-4]
1690 str w6,[$out,#64*6-4]
1691 str w7,[$out,#64*7-4]
// NOTE(review): the frame release and the return instruction are not visible
// in this excerpt.
1695 .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1697 // void ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
// NOTE(review): the rest of this prototype is not visible in this excerpt.
//
// Reassembles X, Y and Z (four 64-bit words each) from 32-bit table pieces
// read through $inp and stores them at byte offsets 0, 32 and 64 of $out.
// NOTE(review): per coordinate only the loads of pieces 6 and 7 and their
// merge into the upper halves of x6/x7 are visible; the remaining loads and
// merges, and any advance of $inp between coordinates, are not visible in
// this excerpt.
1699 .globl ecp_nistz256_gather_w5
1700 .type ecp_nistz256_gather_w5,%function
1702 ecp_nistz256_gather_w5:
1703 stp x29,x30,[sp,#-16]! // push a 16-byte frame holding x29/x30
1708 add $index,$index,x3 // index += x3 (x3 is set by an instruction not visible in this excerpt)
1709 add $inp,$inp,$index,lsl#2 // inp += index*4
1717 ldr w10,[$inp,#64*6] // 32-bit pieces 6 and 7, 64 bytes apart
1718 ldr w11,[$inp,#64*7]
1722 orr x6,x6,x10,lsl#32 // merge them in as bits 63:32 of x6 and x7
1723 orr x7,x7,x11,lsl#32
1728 stp x4,x5,[$out] // X
1729 stp x6,x7,[$out,#16]
1737 ldr w10,[$inp,#64*6] // same load/merge pattern as for X
1738 ldr w11,[$inp,#64*7]
1742 orr x6,x6,x10,lsl#32
1743 orr x7,x7,x11,lsl#32
1748 stp x4,x5,[$out,#32] // Y
1749 stp x6,x7,[$out,#48]
1757 ldr w10,[$inp,#64*6] // same load/merge pattern as for X
1758 ldr w11,[$inp,#64*7]
1761 orr x6,x6,x10,lsl#32
1762 orr x7,x7,x11,lsl#32
1767 stp x4,x5,[$out,#64] // Z
1768 stp x6,x7,[$out,#80]
// NOTE(review): the frame release and the return instruction are not visible
// in this excerpt.
1772 .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1774 // void ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
// NOTE(review): the rest of this prototype is not visible in this excerpt.
//
// Stores single bytes into the table at $out, one byte every 64 bytes,
// eight slots per loop pass.
1776 .globl ecp_nistz256_scatter_w7
1777 .type ecp_nistz256_scatter_w7,%function
1779 ecp_nistz256_scatter_w7:
1780 stp x29,x30,[sp,#-16]! // push a 16-byte frame holding x29/x30
1783 add $out,$out,$index // out += index (byte granularity)
1787 subs $index,$index,#1 // sets the flags tested by b.ne below; NOTE(review): $index is presumably re-initialised as a loop counter by an instruction not visible in this excerpt
1788 prfm pstl1strm,[$out,#4096+64*0] // prefetch-for-store hints, 4096 bytes ahead of
1789 prfm pstl1strm,[$out,#4096+64*1] // the eight slots written below
1790 prfm pstl1strm,[$out,#4096+64*2]
1791 prfm pstl1strm,[$out,#4096+64*3]
1792 prfm pstl1strm,[$out,#4096+64*4]
1793 prfm pstl1strm,[$out,#4096+64*5]
1794 prfm pstl1strm,[$out,#4096+64*6]
1795 prfm pstl1strm,[$out,#4096+64*7]
// NOTE(review): every store below writes the low byte of w3; the load of x3
// and the shifts that would expose its other bytes between stores, as well as
// any advance of $out, are not visible in this excerpt.
1796 strb w3,[$out,#64*0]
1798 strb w3,[$out,#64*1]
1800 strb w3,[$out,#64*2]
1802 strb w3,[$out,#64*3]
1804 strb w3,[$out,#64*4]
1806 strb w3,[$out,#64*5]
1808 strb w3,[$out,#64*6]
1810 strb w3,[$out,#64*7]
1812 b.ne .Loop_scatter_w7 // repeat until the subs above reaches zero; the .Loop_scatter_w7 label is not visible in this excerpt
1816 .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1818 // void ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
// NOTE(review): the rest of this prototype is not visible in this excerpt.
//
// Reads bytes placed 64 bytes apart in the table at $inp, eight per loop
// pass, and recombines them in x4.
// NOTE(review): only the merge of bytes 6 and 7 is visible; the merges of the
// other bytes, any masking, the store through $out and any advance of $inp
// are not visible in this excerpt.
1820 .globl ecp_nistz256_gather_w7
1821 .type ecp_nistz256_gather_w7,%function
1823 ecp_nistz256_gather_w7:
1824 stp x29,x30,[sp,#-16]! // push a 16-byte frame holding x29/x30
1829 add $index,$index,x3 // index += x3 (x3 is set by an instruction not visible in this excerpt)
1830 add $inp,$inp,$index // inp += index (byte granularity)
1834 ldrb w4,[$inp,#64*0] // eight byte loads, 64 bytes apart, into w4-w11 ...
1835 prfm pldl1strm,[$inp,#4096+64*0] // ... interleaved with prefetch-for-load hints 4096 bytes ahead
1836 subs $index,$index,#1 // sets the flags tested by b.ne below; NOTE(review): $index is presumably re-initialised as a loop counter by an instruction not visible in this excerpt
1837 ldrb w5,[$inp,#64*1]
1838 prfm pldl1strm,[$inp,#4096+64*1]
1839 ldrb w6,[$inp,#64*2]
1840 prfm pldl1strm,[$inp,#4096+64*2]
1841 ldrb w7,[$inp,#64*3]
1842 prfm pldl1strm,[$inp,#4096+64*3]
1843 ldrb w8,[$inp,#64*4]
1844 prfm pldl1strm,[$inp,#4096+64*4]
1845 ldrb w9,[$inp,#64*5]
1846 prfm pldl1strm,[$inp,#4096+64*5]
1847 ldrb w10,[$inp,#64*6]
1848 prfm pldl1strm,[$inp,#4096+64*6]
1849 ldrb w11,[$inp,#64*7]
1850 prfm pldl1strm,[$inp,#4096+64*7]
1856 orr x10,x10,x11,lsl#8 // byte 7 goes above byte 6 in x10
1858 orr x4,x4,x10,lsl#48 // that pair becomes bits 63:48 of x4
1861 b.ne .Loop_gather_w7 // repeat until the subs above reaches zero; the .Loop_gather_w7 label is not visible in this excerpt
1865 .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
# Walk the accumulated $code one line at a time.
# NOTE(review): the rest of the loop body and its closing brace are not
# visible in this excerpt.
1869 foreach (split("\n",$code)) {
1870 s/\`([^\`]*)\`/eval $1/ge; # replace every backtick-quoted span with the result of eval'ing its contents
# Closing STDOUT flushes whatever is still buffered. A failed close means the
# generated output may be incomplete, so stop with an error instead of
# silently leaving a truncated file behind.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush