#! /usr/bin/env perl
# Copyright 2015-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;
# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# The expected value is 64*16*37-1 because $#arr returns the last valid
# index of @arr, not the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl ecp_nistz256_precomputed
.type ecp_nistz256_precomputed,%object
.align 12
ecp_nistz256_precomputed:
___
########################################################################
# This conversion scatters each P256_POINT_AFFINE into individual bytes
# at 64-byte intervals, e.g.
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
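
# Minimal self-check: the 37 splices above must consume @arr exactly,
# so nothing may be left over once the table has been emitted.
die "leftover table elements" if (@arr);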
$code.=<<___;
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align 5
.Lpoly:
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR: // 2^512 mod P precomputed for NIST P256 polynomial
.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad 1,0,0,0
.Lord:
.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad 0xccd1c8aaee00bc4f
.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_to_mont
.type ecp_nistz256_to_mont,%function
.align 6
ecp_nistz256_to_mont:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-32]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]

	ldr $bi,.LRR // bp[0]
	ldp $a0,$a1,[$ap]
	ldp $a2,$a3,[$ap,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24
	adr $bp,.LRR // &bp[0]

	bl __ecp_nistz256_mul_mont

	ldp x19,x20,[sp,#16]
	ldp x29,x30,[sp],#32
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

// void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_from_mont
.type ecp_nistz256_from_mont,%function
.align 4
ecp_nistz256_from_mont:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-32]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]

	mov $bi,#1 // bp[0]
	ldp $a0,$a1,[$ap]
	ldp $a2,$a3,[$ap,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24
	adr $bp,.Lone // &bp[0]

	bl __ecp_nistz256_mul_mont

	ldp x19,x20,[sp,#16]
	ldp x29,x30,[sp],#32
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//                            const BN_ULONG x2[4]);
.globl ecp_nistz256_mul_mont
.type ecp_nistz256_mul_mont,%function
.align 4
ecp_nistz256_mul_mont:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-32]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]

	ldr $bi,[$bp] // bp[0]
	ldp $a0,$a1,[$ap]
	ldp $a2,$a3,[$ap,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24

	bl __ecp_nistz256_mul_mont

	ldp x19,x20,[sp,#16]
	ldp x29,x30,[sp],#32
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_sqr_mont
.type ecp_nistz256_sqr_mont,%function
.align 4
ecp_nistz256_sqr_mont:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-32]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]

	ldp $a0,$a1,[$ap]
	ldp $a2,$a3,[$ap,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24

	bl __ecp_nistz256_sqr_mont

	ldp x19,x20,[sp,#16]
	ldp x29,x30,[sp],#32
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//                       const BN_ULONG x2[4]);
.globl ecp_nistz256_add
.type ecp_nistz256_add,%function
.align 4
ecp_nistz256_add:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp $acc0,$acc1,[$ap]
	ldp $t0,$t1,[$bp]
	ldp $acc2,$acc3,[$ap,#16]
	ldp $t2,$t3,[$bp,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24

	bl __ecp_nistz256_add

	ldp x29,x30,[sp],#16
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_add,.-ecp_nistz256_add

// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_div_by_2
.type ecp_nistz256_div_by_2,%function
.align 4
ecp_nistz256_div_by_2:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp $acc0,$acc1,[$ap]
	ldp $acc2,$acc3,[$ap,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24

	bl __ecp_nistz256_div_by_2

	ldp x29,x30,[sp],#16
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_mul_by_2
.type ecp_nistz256_mul_by_2,%function
.align 4
ecp_nistz256_mul_by_2:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp $acc0,$acc1,[$ap]
	ldp $acc2,$acc3,[$ap,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24
	mov $t0,$acc0
	mov $t1,$acc1
	mov $t2,$acc2
	mov $t3,$acc3

	bl __ecp_nistz256_add // ret = a+a // 2*a

	ldp x29,x30,[sp],#16
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_mul_by_3
.type ecp_nistz256_mul_by_3,%function
.align 4
ecp_nistz256_mul_by_3:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp $acc0,$acc1,[$ap]
	ldp $acc2,$acc3,[$ap,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24
	mov $t0,$acc0
	mov $t1,$acc1
	mov $t2,$acc2
	mov $t3,$acc3
	mov $a0,$acc0
	mov $a1,$acc1
	mov $a2,$acc2
	mov $a3,$acc3

	bl __ecp_nistz256_add // ret = a+a // 2*a

	mov $t0,$a0
	mov $t1,$a1
	mov $t2,$a2
	mov $t3,$a3

	bl __ecp_nistz256_add // ret += a // 2*a+a=3*a

	ldp x29,x30,[sp],#16
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//                       const BN_ULONG x2[4]);
.globl ecp_nistz256_sub
.type ecp_nistz256_sub,%function
.align 4
ecp_nistz256_sub:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp $acc0,$acc1,[$ap]
	ldp $acc2,$acc3,[$ap,#16]
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24

	bl __ecp_nistz256_sub_from

	ldp x29,x30,[sp],#16
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_sub,.-ecp_nistz256_sub

// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_neg
.type ecp_nistz256_neg,%function
.align 4
ecp_nistz256_neg:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	mov $bp,$ap
	mov $acc0,xzr // a = 0
	mov $acc1,xzr
	mov $acc2,xzr
	mov $acc3,xzr
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24

	bl __ecp_nistz256_sub_from

	ldp x29,x30,[sp],#16
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] to $bi
.type __ecp_nistz256_mul_mont,%function
.align 4
__ecp_nistz256_mul_mont:
	mul $acc0,$a0,$bi // a[0]*b[0]
	umulh $t0,$a0,$bi

	mul $acc1,$a1,$bi // a[1]*b[0]
	umulh $t1,$a1,$bi

	mul $acc2,$a2,$bi // a[2]*b[0]
	umulh $t2,$a2,$bi

	mul $acc3,$a3,$bi // a[3]*b[0]
	umulh $t3,$a3,$bi
	ldr $bi,[$bp,#8] // b[1]

	adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
	lsl $t0,$acc0,#32
	adcs $acc2,$acc2,$t1
	lsr $t1,$acc0,#32
	adcs $acc3,$acc3,$t2
	adc $acc4,xzr,$t3
	mov $acc5,xzr
___
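
# Self-check of the Montgomery constants emitted above: .LRR must equal
# 2^512 mod p and .Lone_mont must equal 2^256 mod p, p being .Lpoly
# (a sanity sketch, assuming the core Math::BigInt module; not used by
# the generated code).
{
	use Math::BigInt;
	my $p   = Math::BigInt->from_hex("ffffffff00000001"."0000000000000000".
					 "00000000ffffffff"."ffffffffffffffff");
	my $rr  = Math::BigInt->from_hex("00000004fffffffd"."fffffffffffffffe".
					 "fffffffbffffffff"."0000000000000003");
	my $one = Math::BigInt->from_hex("00000000fffffffe"."ffffffffffffffff".
					 "ffffffff00000000"."0000000000000001");
	die "bad .LRR"       if Math::BigInt->bone->blsft(512)->bmod($p)->bcmp($rr);
	die "bad .Lone_mont" if Math::BigInt->bone->blsft(256)->bmod($p)->bcmp($one);
}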
for($i=1;$i<4;$i++) {
	# A reduction iteration is normally performed by accumulating the
	# result of multiplying the modulus by a "magic" digit [and
	# omitting the least significant word, which is guaranteed to
	# be 0], but thanks to the special form of the modulus and the
	# "magic" digit being equal to the least significant word, it
	# can be performed with additions and subtractions alone. Indeed:
	#
	# ffff0001.00000000.0000ffff.ffffffff
	# * abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or, marking redundant operations:
	#
	# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------
	#
	# (The identity behind this is self-checked right after this loop.)

$code.=<<___;
	subs $t2,$acc0,$t0 // "*0xffff0001"
	sbc $t3,$acc0,$t1
	adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
	mul $t0,$a0,$bi // lo(a[0]*b[i])
	adcs $acc1,$acc2,$t1
	mul $t1,$a1,$bi // lo(a[1]*b[i])
	adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
	mul $t2,$a2,$bi // lo(a[2]*b[i])
	adcs $acc3,$acc4,$t3
	mul $t3,$a3,$bi // lo(a[3]*b[i])
	adc $acc4,$acc5,xzr

	adds $acc0,$acc0,$t0 // accumulate low parts of multiplication
	umulh $t0,$a0,$bi // hi(a[0]*b[i])
	adcs $acc1,$acc1,$t1
	umulh $t1,$a1,$bi // hi(a[1]*b[i])
	adcs $acc2,$acc2,$t2
	umulh $t2,$a2,$bi // hi(a[2]*b[i])
	adcs $acc3,$acc3,$t3
	umulh $t3,$a3,$bi // hi(a[3]*b[i])
	adc $acc4,$acc4,xzr
___
$code.=<<___ if ($i<3);
	ldr $bi,[$bp,#8*($i+1)] // b[$i+1]
___
$code.=<<___;
	adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
	lsl $t0,$acc0,#32
	adcs $acc2,$acc2,$t1
	lsr $t1,$acc0,#32
	adcs $acc3,$acc3,$t2
	adcs $acc4,$acc4,$t3
	adc $acc5,xzr,xzr
___
}
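
# Self-check of the reduction identity described in the loop above: for any
# 64-bit digit d, d*p == (d<<256) - (d<<224) + (d<<192) + (d<<96) - d, which
# is why "multiplication" by the modulus reduces to shifts, additions and
# subtractions (a sanity sketch, assuming the core Math::BigInt module).
{
	use Math::BigInt;
	my $p = Math::BigInt->from_hex("ffffffff00000001"."0000000000000000".
				       "00000000ffffffff"."ffffffffffffffff");
	my $d = Math::BigInt->from_hex("0123456789abcdef");	# arbitrary sample digit
	my $rhs = $d->copy->blsft(256)->bsub($d->copy->blsft(224))
		     ->badd($d->copy->blsft(192))->badd($d->copy->blsft(96))
		     ->bsub($d);
	die "reduction identity broken" if $p->copy->bmul($d)->bcmp($rhs);
}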
$code.=<<___;
	// last reduction
	subs $t2,$acc0,$t0 // "*0xffff0001"
	sbc $t3,$acc0,$t1
	adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
	adcs $acc1,$acc2,$t1
	adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
	adcs $acc3,$acc4,$t3
	adc $acc4,$acc5,xzr

	adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
	sbcs $t1,$acc1,$poly1
	sbcs $t2,$acc2,xzr
	sbcs $t3,$acc3,$poly3
	sbcs xzr,$acc4,xzr // did it borrow?

	csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
	csel $acc1,$acc1,$t1,lo
	csel $acc2,$acc2,$t2,lo
	stp $acc0,$acc1,[$rp]
	csel $acc3,$acc3,$t3,lo
	stp $acc2,$acc3,[$rp,#16]

	ret
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type __ecp_nistz256_sqr_mont,%function
.align 4
__ecp_nistz256_sqr_mont:
	// | | | | | |a1*a0| |
	// | | | | |a2*a0| | |
	// | |a3*a2|a3*a0| | | |
	// | | | |a2*a1| | | |
	// | | |a3*a1| | | | |
	// *| | | | | | | | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	// The "can't overflow" remarks below mark carries into the high
	// part of a multiplication result; the addition can't overflow
	// because that high part can never be all ones.

	mul $acc1,$a1,$a0 // a[1]*a[0]
	umulh $t1,$a1,$a0
	mul $acc2,$a2,$a0 // a[2]*a[0]
	umulh $t2,$a2,$a0
	mul $acc3,$a3,$a0 // a[3]*a[0]
	umulh $acc4,$a3,$a0

	adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
	mul $t0,$a2,$a1 // a[2]*a[1]
	umulh $t1,$a2,$a1
	adcs $acc3,$acc3,$t2
	mul $t2,$a3,$a1 // a[3]*a[1]
	umulh $t3,$a3,$a1
	adc $acc4,$acc4,xzr // can't overflow

	mul $acc5,$a3,$a2 // a[3]*a[2]
	umulh $acc6,$a3,$a2

	adds $t1,$t1,$t2 // accumulate high parts of multiplication
	mul $acc0,$a0,$a0 // a[0]*a[0]
	adc $t2,$t3,xzr // can't overflow

	adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
	umulh $a0,$a0,$a0
	adcs $acc4,$acc4,$t1
	mul $t1,$a1,$a1 // a[1]*a[1]
	adcs $acc5,$acc5,$t2
	umulh $a1,$a1,$a1
	adc $acc6,$acc6,xzr // can't overflow

	adds $acc1,$acc1,$acc1 // acc[1-6]*=2
	mul $t2,$a2,$a2 // a[2]*a[2]
	adcs $acc2,$acc2,$acc2
	umulh $a2,$a2,$a2
	adcs $acc3,$acc3,$acc3
	mul $t3,$a3,$a3 // a[3]*a[3]
	adcs $acc4,$acc4,$acc4
	umulh $a3,$a3,$a3
	adcs $acc5,$acc5,$acc5
	adcs $acc6,$acc6,$acc6
	adc $acc7,xzr,xzr

	adds $acc1,$acc1,$a0 // +a[i]*a[i]
	adcs $acc2,$acc2,$t1
	adcs $acc3,$acc3,$a1
	adcs $acc4,$acc4,$t2
	adcs $acc5,$acc5,$a2
	lsl $t0,$acc0,#32
	adcs $acc6,$acc6,$t3
	lsr $t1,$acc0,#32
	adc $acc7,$acc7,$a3
___
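
# Self-check of the schoolbook squaring strategy laid out above: doubling
# the cross products and adding the four limb squares must reproduce n^2
# (a sanity sketch with arbitrary sample limbs, assuming the core
# Math::BigInt module).
{
	use Math::BigInt;
	my @l = map { Math::BigInt->from_hex($_) }
		("0123456789abcdef","fedcba9876543210",
		 "00ff00ff00ff00ff","123456789abcdef0");
	my ($n,$cross,$sq) = map { Math::BigInt->bzero } (0..2);
	for my $i (0..3) {
		$n->badd($l[$i]->copy->blsft(64*$i));
		$sq->badd($l[$i]->copy->bmul($l[$i])->blsft(128*$i));
		for my $j ($i+1..3) {
			$cross->badd($l[$i]->copy->bmul($l[$j])->blsft(64*($i+$j)));
		}
	}
	die "squaring decomposition broken"
		if $n->copy->bmul($n)->bcmp($cross->blsft(1)->badd($sq));
}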
for($i=0;$i<3;$i++) {	# reductions, see commentary in
			# multiplication for details
$code.=<<___;
	subs $t2,$acc0,$t0 // "*0xffff0001"
	sbc $t3,$acc0,$t1
	adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
	adcs $acc1,$acc2,$t1
	lsl $t0,$acc0,#32
	adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
	lsr $t1,$acc0,#32
	adc $acc3,$t3,xzr // can't overflow
___
}
$code.=<<___;
	subs $t2,$acc0,$t0 // "*0xffff0001"
	sbc $t3,$acc0,$t1
	adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
	adcs $acc1,$acc2,$t1
	adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
	adc $acc3,$t3,xzr // can't overflow

	adds $acc0,$acc0,$acc4 // accumulate upper half
	adcs $acc1,$acc1,$acc5
	adcs $acc2,$acc2,$acc6
	adcs $acc3,$acc3,$acc7
	adc $acc4,xzr,xzr

	adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
	sbcs $t1,$acc1,$poly1
	sbcs $t2,$acc2,xzr
	sbcs $t3,$acc3,$poly3
	sbcs xzr,$acc4,xzr // did it borrow?

	csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
	csel $acc1,$acc1,$t1,lo
	csel $acc2,$acc2,$t2,lo
	stp $acc0,$acc1,[$rp]
	csel $acc3,$acc3,$t3,lo
	stp $acc2,$acc3,[$rp,#16]

	ret
.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type __ecp_nistz256_add,%function
.align 4
__ecp_nistz256_add:
	adds $acc0,$acc0,$t0 // ret = a+b
	adcs $acc1,$acc1,$t1
	adcs $acc2,$acc2,$t2
	adcs $acc3,$acc3,$t3
	adc $ap,xzr,xzr // zap $ap

	adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
	sbcs $t1,$acc1,$poly1
	sbcs $t2,$acc2,xzr
	sbcs $t3,$acc3,$poly3
	sbcs xzr,$ap,xzr // did subtraction borrow?

	csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
	csel $acc1,$acc1,$t1,lo
	csel $acc2,$acc2,$t2,lo
	stp $acc0,$acc1,[$rp]
	csel $acc3,$acc3,$t3,lo
	stp $acc2,$acc3,[$rp,#16]

	ret
.size __ecp_nistz256_add,.-__ecp_nistz256_add

.type __ecp_nistz256_sub_from,%function
.align 4
__ecp_nistz256_sub_from:
	ldp $t0,$t1,[$bp]
	ldp $t2,$t3,[$bp,#16]
	subs $acc0,$acc0,$t0 // ret = a-b
	sbcs $acc1,$acc1,$t1
	sbcs $acc2,$acc2,$t2
	sbcs $acc3,$acc3,$t3
	sbc $ap,xzr,xzr // zap $ap

	subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
	adcs $t1,$acc1,$poly1
	adcs $t2,$acc2,xzr
	adc $t3,$acc3,$poly3
	cmp $ap,xzr // did subtraction borrow?

	csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
	csel $acc1,$acc1,$t1,eq
	csel $acc2,$acc2,$t2,eq
	stp $acc0,$acc1,[$rp]
	csel $acc3,$acc3,$t3,eq
	stp $acc2,$acc3,[$rp,#16]

	ret
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type __ecp_nistz256_sub_morf,%function
.align 4
__ecp_nistz256_sub_morf:
	ldp $t0,$t1,[$bp]
	ldp $t2,$t3,[$bp,#16]
	subs $acc0,$t0,$acc0 // ret = b-a
	sbcs $acc1,$t1,$acc1
	sbcs $acc2,$t2,$acc2
	sbcs $acc3,$t3,$acc3
	sbc $ap,xzr,xzr // zap $ap

	subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
	adcs $t1,$acc1,$poly1
	adcs $t2,$acc2,xzr
	adc $t3,$acc3,$poly3
	cmp $ap,xzr // did subtraction borrow?

	csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
	csel $acc1,$acc1,$t1,eq
	csel $acc2,$acc2,$t2,eq
	stp $acc0,$acc1,[$rp]
	csel $acc3,$acc3,$t3,eq
	stp $acc2,$acc3,[$rp,#16]

	ret
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type __ecp_nistz256_div_by_2,%function
.align 4
__ecp_nistz256_div_by_2:
	subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus
	adcs $t1,$acc1,$poly1
	adcs $t2,$acc2,xzr
	adcs $t3,$acc3,$poly3
	adc $ap,xzr,xzr // zap $ap
	tst $acc0,#1 // is a even?

	csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
	csel $acc1,$acc1,$t1,eq
	csel $acc2,$acc2,$t2,eq
	csel $acc3,$acc3,$t3,eq
	csel $ap,xzr,$ap,eq

	lsr $acc0,$acc0,#1 // ret >>= 1
	orr $acc0,$acc0,$acc1,lsl#63
	lsr $acc1,$acc1,#1
	orr $acc1,$acc1,$acc2,lsl#63
	lsr $acc2,$acc2,#1
	orr $acc2,$acc2,$acc3,lsl#63
	lsr $acc3,$acc3,#1
	stp $acc0,$acc1,[$rp]
	orr $acc3,$acc3,$ap,lsl#63
	stp $acc2,$acc3,[$rp,#16]

	ret
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
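
# Self-check of the halving trick above: adding the modulus to an odd value
# makes it even, so the shifted result satisfies 2*ret == a (mod p) in both
# branches (a sanity sketch, assuming the core Math::BigInt module).
{
	use Math::BigInt;
	my $p = Math::BigInt->from_hex("ffffffff00000001"."0000000000000000".
				       "00000000ffffffff"."ffffffffffffffff");
	for my $v (Math::BigInt->new(2), Math::BigInt->new(3)) {	# one even, one odd input
		my $ret = $v->is_odd ? $v->copy->badd($p)->brsft(1)
				     : $v->copy->brsft(1);
		die "halving broken" if $ret->bmul(2)->bmod($p)->bcmp($v);
	}
}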
########################################################################
# The following subroutines are "literal" implementations of those found
# in ecp_nistz256.c.
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# The map() above describes the stack layout, with four temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl ecp_nistz256_point_double
.type ecp_nistz256_point_double,%function
.align 5
ecp_nistz256_point_double:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	sub sp,sp,#32*4

.Ldouble_shortcut:
	ldp $acc0,$acc1,[$ap,#32]
	mov $rp_real,$rp
	ldp $acc2,$acc3,[$ap,#48]
	mov $ap_real,$ap
	ldr $poly1,.Lpoly+8
	mov $t0,$acc0
	ldr $poly3,.Lpoly+24
	mov $t1,$acc1
	ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont
	mov $t2,$acc2
	mov $t3,$acc3
	ldp $a2,$a3,[$ap_real,#64+16]
	add $rp,sp,#$S
	bl __ecp_nistz256_add // p256_mul_by_2(S, in_y);

	add $rp,sp,#$Zsqr
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);

	ldp $t0,$t1,[$ap_real]
	ldp $t2,$t3,[$ap_real,#16]
	mov $a0,$acc0 // put Zsqr aside for p256_sub
	mov $a1,$acc1
	mov $a2,$acc2
	mov $a3,$acc3
	add $rp,sp,#$M
	bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x);

	add $bp,$ap_real,#0
	mov $acc0,$a0 // restore Zsqr
	mov $acc1,$a1
	ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
	mov $acc2,$a2
	mov $acc3,$a3
	ldp $a2,$a3,[sp,#$S+16]
	add $rp,sp,#$Zsqr
	bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);

	add $rp,sp,#$S
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);

	ldr $bi,[$ap_real,#32]
	ldp $a0,$a1,[$ap_real,#64]
	ldp $a2,$a3,[$ap_real,#64+16]
	add $bp,$ap_real,#32
	add $rp,sp,#$tmp0
	bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);

	mov $t0,$acc0
	mov $t1,$acc1
	ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
	mov $t2,$acc2
	mov $t3,$acc3
	ldp $a2,$a3,[sp,#$S+16]
	add $rp,$rp_real,#64
	bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0);

	add $rp,sp,#$tmp0
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);

	ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont
	ldp $a0,$a1,[sp,#$M]
	ldp $a2,$a3,[sp,#$M+16]
	add $rp,$rp_real,#32
	bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);

	add $bp,sp,#$Zsqr
	add $rp,sp,#$M
	bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);

	mov $t0,$acc0 // duplicate M
	mov $t1,$acc1
	mov $t2,$acc2
	mov $t3,$acc3
	mov $a0,$acc0 // put M aside
	mov $a1,$acc1
	mov $a2,$acc2
	mov $a3,$acc3
	add $rp,sp,#$M
	bl __ecp_nistz256_add
	mov $t0,$a0 // restore M
	mov $t1,$a1
	ldr $bi,[$ap_real] // forward load for p256_mul_mont
	mov $t2,$a2
	ldp $a0,$a1,[sp,#$S]
	mov $t3,$a3
	ldp $a2,$a3,[sp,#$S+16]
	bl __ecp_nistz256_add // p256_mul_by_3(M, M);

	add $bp,$ap_real,#0
	add $rp,sp,#$S
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);

	mov $t0,$acc0
	mov $t1,$acc1
	ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont
	mov $t2,$acc2
	mov $t3,$acc3
	ldp $a2,$a3,[sp,#$M+16]
	add $rp,sp,#$tmp0
	bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S);

	add $rp,$rp_real,#0
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);

	add $bp,sp,#$tmp0
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);

	add $bp,sp,#$S
	add $rp,sp,#$S
	bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);

	ldr $bi,[sp,#$M]
	mov $a0,$acc0 // copy S
	mov $a1,$acc1
	mov $a2,$acc2
	mov $a3,$acc3
	add $bp,sp,#$M
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);

	add $bp,$rp_real,#32
	add $rp,$rp_real,#32
	bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);

	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x29,x30,[sp],#96
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#                             const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# The map() above describes the stack layout, with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));

$code.=<<___;
.globl ecp_nistz256_point_add
.type ecp_nistz256_point_add,%function
.align 5
ecp_nistz256_point_add:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	sub sp,sp,#32*12

	ldp $a0,$a1,[$bp,#64] // in2_z
	ldp $a2,$a3,[$bp,#64+16]
	mov $rp_real,$rp
	mov $ap_real,$ap
	mov $bp_real,$bp
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24
	orr $t0,$a0,$a1
	orr $t2,$a2,$a3
	orr $in2infty,$t0,$t2
	cmp $in2infty,#0
	csetm $in2infty,ne // ~in2infty
	add $rp,sp,#$Z2sqr
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);

	ldp $a0,$a1,[$ap_real,#64] // in1_z
	ldp $a2,$a3,[$ap_real,#64+16]
	orr $t0,$a0,$a1
	orr $t2,$a2,$a3
	orr $in1infty,$t0,$t2
	cmp $in1infty,#0
	csetm $in1infty,ne // ~in1infty
	add $rp,sp,#$Z1sqr
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

	ldr $bi,[$bp_real,#64]
	ldp $a0,$a1,[sp,#$Z2sqr]
	ldp $a2,$a3,[sp,#$Z2sqr+16]
	add $bp,$bp_real,#64
	add $rp,sp,#$S1
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);

	ldr $bi,[$ap_real,#64]
	ldp $a0,$a1,[sp,#$Z1sqr]
	ldp $a2,$a3,[sp,#$Z1sqr+16]
	add $bp,$ap_real,#64
	add $rp,sp,#$S2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

	ldr $bi,[$ap_real,#32]
	ldp $a0,$a1,[sp,#$S1]
	ldp $a2,$a3,[sp,#$S1+16]
	add $bp,$ap_real,#32
	add $rp,sp,#$S1
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);

	ldr $bi,[$bp_real,#32]
	ldp $a0,$a1,[sp,#$S2]
	ldp $a2,$a3,[sp,#$S2+16]
	add $bp,$bp_real,#32
	add $rp,sp,#$S2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

	add $bp,sp,#$S1
	ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont
	ldp $a0,$a1,[$ap_real]
	ldp $a2,$a3,[$ap_real,#16]
	add $rp,sp,#$R
	bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);

	orr $acc0,$acc0,$acc1 // see if result is zero
	orr $acc2,$acc2,$acc3
	orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2)

	add $bp,sp,#$Z2sqr
	add $rp,sp,#$U1
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);

	ldr $bi,[sp,#$Z1sqr]
	ldp $a0,$a1,[$bp_real]
	ldp $a2,$a3,[$bp_real,#16]
	add $bp,sp,#$Z1sqr
	add $rp,sp,#$U2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);

	add $bp,sp,#$U1
	ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont
	ldp $a2,$a3,[sp,#$R+16]
	add $rp,sp,#$H
	bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);

	orr $acc0,$acc0,$acc1 // see if result is zero
	orr $acc2,$acc2,$acc3
	orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2)

	mvn $temp1,$in1infty // -1/0 -> 0/-1
	mvn $temp2,$in2infty // -1/0 -> 0/-1
	orr $acc0,$acc0,$temp1
	orr $acc0,$acc0,$temp2
	orr $acc0,$acc0,$temp0
	cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov $ap,$ap_real
	mov $rp,$rp_real
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	add sp,sp,#32*(12-4) // difference in stack frames
	b .Ldouble_shortcut

.align 4
.Ladd_proceed:
	add $rp,sp,#$Rsqr
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

	ldr $bi,[$ap_real,#64]
	ldp $a0,$a1,[sp,#$H]
	ldp $a2,$a3,[sp,#$H+16]
	add $bp,$ap_real,#64
	add $rp,sp,#$res_z
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

	ldp $a0,$a1,[sp,#$H]
	ldp $a2,$a3,[sp,#$H+16]
	add $rp,sp,#$Hsqr
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

	ldr $bi,[$bp_real,#64]
	ldp $a0,$a1,[sp,#$res_z]
	ldp $a2,$a3,[sp,#$res_z+16]
	add $bp,$bp_real,#64
	add $rp,sp,#$res_z
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);

	ldr $bi,[sp,#$H]
	ldp $a0,$a1,[sp,#$Hsqr]
	ldp $a2,$a3,[sp,#$Hsqr+16]
	add $bp,sp,#$H
	add $rp,sp,#$Hcub
	bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

	ldr $bi,[sp,#$Hsqr]
	ldp $a0,$a1,[sp,#$U1]
	ldp $a2,$a3,[sp,#$U1+16]
	add $bp,sp,#$Hsqr
	add $rp,sp,#$U2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);

	mov $t0,$acc0
	mov $t1,$acc1
	mov $t2,$acc2
	mov $t3,$acc3
	add $rp,sp,#$Hsqr
	bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);

	add $bp,sp,#$Rsqr
	add $rp,sp,#$res_x
	bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

	add $bp,sp,#$Hcub
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

	add $bp,sp,#$U2
	ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont
	ldp $a0,$a1,[sp,#$S1]
	ldp $a2,$a3,[sp,#$S1+16]
	add $rp,sp,#$res_y
	bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

	add $bp,sp,#$Hcub
	add $rp,sp,#$S2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);

	ldr $bi,[sp,#$R]
	ldp $a0,$a1,[sp,#$res_y]
	ldp $a2,$a3,[sp,#$res_y+16]
	add $bp,sp,#$R
	add $rp,sp,#$res_y
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

	add $bp,sp,#$S2
	bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

	ldp $a0,$a1,[sp,#$res_x] // res
	ldp $a2,$a3,[sp,#$res_x+16]
	ldp $t0,$t1,[$bp_real] // in2
	ldp $t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp $acc0,$acc1,[$ap_real,#$i] // in1
	cmp $in1infty,#0 // ~$in1infty, remember?
	ldp $acc2,$acc3,[$ap_real,#$i+16]
	csel $t0,$a0,$t0,ne
	csel $t1,$a1,$t1,ne
	ldp $a0,$a1,[sp,#$res_x+$i+32] // res
	csel $t2,$a2,$t2,ne
	csel $t3,$a3,$t3,ne
	cmp $in2infty,#0 // ~$in2infty, remember?
	ldp $a2,$a3,[sp,#$res_x+$i+48]
	csel $acc0,$t0,$acc0,ne
	csel $acc1,$t1,$acc1,ne
	ldp $t0,$t1,[$bp_real,#$i+32] // in2
	csel $acc2,$t2,$acc2,ne
	csel $acc3,$t3,$acc3,ne
	ldp $t2,$t3,[$bp_real,#$i+48]
	stp $acc0,$acc1,[$rp_real,#$i]
	stp $acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp $acc0,$acc1,[$ap_real,#$i] // in1
	cmp $in1infty,#0 // ~$in1infty, remember?
	ldp $acc2,$acc3,[$ap_real,#$i+16]
	csel $t0,$a0,$t0,ne
	csel $t1,$a1,$t1,ne
	csel $t2,$a2,$t2,ne
	csel $t3,$a3,$t3,ne
	cmp $in2infty,#0 // ~$in2infty, remember?
	csel $acc0,$t0,$acc0,ne
	csel $acc1,$t1,$acc1,ne
	csel $acc2,$t2,$acc2,ne
	csel $acc3,$t3,$acc3,ne
	stp $acc0,$acc1,[$rp_real,#$i]
	stp $acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#                                    const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# The map() above describes the stack layout, with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl ecp_nistz256_point_add_affine
.type ecp_nistz256_point_add_affine,%function
.align 5
ecp_nistz256_point_add_affine:
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-80]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	sub sp,sp,#32*10

	mov $rp_real,$rp
	mov $ap_real,$ap
	mov $bp_real,$bp
	ldr $poly1,.Lpoly+8
	ldr $poly3,.Lpoly+24

	ldp $a0,$a1,[$ap,#64] // in1_z
	ldp $a2,$a3,[$ap,#64+16]
	orr $t0,$a0,$a1
	orr $t2,$a2,$a3
	orr $in1infty,$t0,$t2
	cmp $in1infty,#0
	csetm $in1infty,ne // ~in1infty

	ldp $acc0,$acc1,[$bp] // in2_x
	ldp $acc2,$acc3,[$bp,#16]
	ldp $t0,$t1,[$bp,#32] // in2_y
	ldp $t2,$t3,[$bp,#48]
	orr $acc0,$acc0,$acc1
	orr $acc2,$acc2,$acc3
	orr $t0,$t0,$t1
	orr $t2,$t2,$t3
	orr $acc0,$acc0,$acc2
	orr $t0,$t0,$t2
	orr $in2infty,$acc0,$t0
	cmp $in2infty,#0
	csetm $in2infty,ne // ~in2infty

	add $rp,sp,#$Z1sqr
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

	mov $a0,$acc0
	mov $a1,$acc1
	mov $a2,$acc2
	mov $a3,$acc3
	ldr $bi,[$bp_real]
	add $bp,$bp_real,#0
	add $rp,sp,#$U2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);

	add $bp,$ap_real,#0
	ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont
	ldp $a0,$a1,[sp,#$Z1sqr]
	ldp $a2,$a3,[sp,#$Z1sqr+16]
	add $rp,sp,#$H
	bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);

	add $bp,$ap_real,#64
	add $rp,sp,#$S2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

	ldr $bi,[$ap_real,#64]
	ldp $a0,$a1,[sp,#$H]
	ldp $a2,$a3,[sp,#$H+16]
	add $bp,$ap_real,#64
	add $rp,sp,#$res_z
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

	ldr $bi,[$bp_real,#32]
	ldp $a0,$a1,[sp,#$S2]
	ldp $a2,$a3,[sp,#$S2+16]
	add $bp,$bp_real,#32
	add $rp,sp,#$S2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

	add $bp,$ap_real,#32
	ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont
	ldp $a2,$a3,[sp,#$H+16]
	add $rp,sp,#$R
	bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);

	add $rp,sp,#$Hsqr
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

	ldp $a0,$a1,[sp,#$R]
	ldp $a2,$a3,[sp,#$R+16]
	add $rp,sp,#$Rsqr
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

	ldr $bi,[sp,#$H]
	ldp $a0,$a1,[sp,#$Hsqr]
	ldp $a2,$a3,[sp,#$Hsqr+16]
	add $bp,sp,#$H
	add $rp,sp,#$Hcub
	bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

	ldr $bi,[$ap_real]
	ldp $a0,$a1,[sp,#$Hsqr]
	ldp $a2,$a3,[sp,#$Hsqr+16]
	add $bp,$ap_real,#0
	add $rp,sp,#$U2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);

	mov $t0,$acc0
	mov $t1,$acc1
	mov $t2,$acc2
	mov $t3,$acc3
	add $rp,sp,#$Hsqr
	bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);

	add $bp,sp,#$Rsqr
	add $rp,sp,#$res_x
	bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

	add $bp,sp,#$Hcub
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

	add $bp,sp,#$U2
	ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont
	ldp $a0,$a1,[sp,#$Hcub]
	ldp $a2,$a3,[sp,#$Hcub+16]
	add $rp,sp,#$res_y
	bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

	add $bp,$ap_real,#32
	add $rp,sp,#$S2
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);

	ldr $bi,[sp,#$R]
	ldp $a0,$a1,[sp,#$res_y]
	ldp $a2,$a3,[sp,#$res_y+16]
	add $bp,sp,#$R
	add $rp,sp,#$res_y
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

	add $bp,sp,#$S2
	bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

	ldp $a0,$a1,[sp,#$res_x] // res
	ldp $a2,$a3,[sp,#$res_x+16]
	ldp $t0,$t1,[$bp_real] // in2
	ldp $t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp $acc0,$acc1,[$ap_real,#$i] // in1
	cmp $in1infty,#0 // ~$in1infty, remember?
	ldp $acc2,$acc3,[$ap_real,#$i+16]
	csel $t0,$a0,$t0,ne
	csel $t1,$a1,$t1,ne
	ldp $a0,$a1,[sp,#$res_x+$i+32] // res
	csel $t2,$a2,$t2,ne
	csel $t3,$a3,$t3,ne
	cmp $in2infty,#0 // ~$in2infty, remember?
	ldp $a2,$a3,[sp,#$res_x+$i+48]
	csel $acc0,$t0,$acc0,ne
	csel $acc1,$t1,$acc1,ne
	ldp $t0,$t1,[$bp_real,#$i+32] // in2
	csel $acc2,$t2,$acc2,ne
	csel $acc3,$t3,$acc3,ne
	ldp $t2,$t3,[$bp_real,#$i+48]
	stp $acc0,$acc1,[$rp_real,#$i]
	stp $acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___ if ($i == 0);
	adr $bp_real,.Lone_mont-64
___
}
$code.=<<___;
	ldp $acc0,$acc1,[$ap_real,#$i] // in1
	cmp $in1infty,#0 // ~$in1infty, remember?
	ldp $acc2,$acc3,[$ap_real,#$i+16]
	csel $t0,$a0,$t0,ne
	csel $t1,$a1,$t1,ne
	csel $t2,$a2,$t2,ne
	csel $t3,$a3,$t3,ne
	cmp $in2infty,#0 // ~$in2infty, remember?
	csel $acc0,$t0,$acc0,ne
	csel $acc1,$t1,$acc1,ne
	csel $acc2,$t2,$acc2,ne
	csel $acc3,$t3,$acc3,ne
	stp $acc0,$acc1,[$rp_real,#$i]
	stp $acc2,$acc3,[$rp_real,#$i+16]

	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x29,x30,[sp],#80
	.inst 0xd50323bf // autiasp
	ret
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl ecp_nistz256_ord_mul_mont
.type ecp_nistz256_ord_mul_mont,%function
.align 4
ecp_nistz256_ord_mul_mont:
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	adr $ordk,.Lord
	ldr $bi,[$bp] // bp[0]
	ldp $a0,$a1,[$ap]
	ldp $a2,$a3,[$ap,#16]

	ldp $ord0,$ord1,[$ordk,#0]
	ldp $ord2,$ord3,[$ordk,#16]
	ldr $ordk,[$ordk,#32]

	mul $acc0,$a0,$bi // a[0]*b[0]
	umulh $t0,$a0,$bi

	mul $acc1,$a1,$bi // a[1]*b[0]
	umulh $t1,$a1,$bi

	mul $acc2,$a2,$bi // a[2]*b[0]
	umulh $t2,$a2,$bi

	mul $acc3,$a3,$bi // a[3]*b[0]
	umulh $acc4,$a3,$bi

	mul $t4,$acc0,$ordk

	adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
	adcs $acc2,$acc2,$t1
	adcs $acc3,$acc3,$t2
	adc $acc4,$acc4,xzr
	mov $acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	# ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# * abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
	ldr $bi,[$bp,#8*$i] // b[i]

	lsl $t0,$t4,#32
	subs $acc2,$acc2,$t4
	lsr $t1,$t4,#32
	sbcs $acc3,$acc3,$t0
	sbcs $acc4,$acc4,$t1
	sbc $acc5,$acc5,xzr

	subs xzr,$acc0,#1
	umulh $t1,$ord0,$t4
	mul $t2,$ord1,$t4
	umulh $t3,$ord1,$t4

	adcs $t2,$t2,$t1
	mul $t0,$a0,$bi
	adc $t3,$t3,xzr
	mul $t1,$a1,$bi

	adds $acc0,$acc1,$t2
	mul $t2,$a2,$bi
	adcs $acc1,$acc2,$t3
	mul $t3,$a3,$bi
	adcs $acc2,$acc3,$t4
	adcs $acc3,$acc4,$t4
	adc $acc4,$acc5,xzr

	adds $acc0,$acc0,$t0 // accumulate low parts
	umulh $t0,$a0,$bi
	adcs $acc1,$acc1,$t1
	umulh $t1,$a1,$bi
	adcs $acc2,$acc2,$t2
	umulh $t2,$a2,$bi
	adcs $acc3,$acc3,$t3
	umulh $t3,$a3,$bi
	adc $acc4,$acc4,xzr
	mul $t4,$acc0,$ordk
	adds $acc1,$acc1,$t0 // accumulate high parts
	adcs $acc2,$acc2,$t1
	adcs $acc3,$acc3,$t2
	adcs $acc4,$acc4,$t3
	adc $acc5,xzr,xzr
___
}
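
# Self-check of the .LordK "magic" digit used above: the reduction only
# clears the least significant word if ordk is the negated inverse of the
# group order modulo 2^64, i.e. ord[0]*ordk == -1 (mod 2^64) (a sanity
# sketch, assuming the core Math::BigInt module).
{
	use Math::BigInt;
	my $n0   = Math::BigInt->from_hex("f3b9cac2fc632551");	# least significant word of .Lord
	my $k    = Math::BigInt->from_hex("ccd1c8aaee00bc4f");	# .LordK
	my $mask = Math::BigInt->bone->blsft(64)->bsub(1);
	die "bad .LordK" if $n0->bmul($k)->band($mask)->bcmp($mask);
}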
$code.=<<___;
	lsl $t0,$t4,#32 // last reduction
	subs $acc2,$acc2,$t4
	lsr $t1,$t4,#32
	sbcs $acc3,$acc3,$t0
	sbcs $acc4,$acc4,$t1
	sbc $acc5,$acc5,xzr

	subs xzr,$acc0,#1
	umulh $t1,$ord0,$t4
	mul $t2,$ord1,$t4
	umulh $t3,$ord1,$t4

	adcs $t2,$t2,$t1
	adc $t3,$t3,xzr

	adds $acc0,$acc1,$t2
	adcs $acc1,$acc2,$t3
	adcs $acc2,$acc3,$t4
	adcs $acc3,$acc4,$t4
	adc $acc4,$acc5,xzr

	subs $t0,$acc0,$ord0 // ret -= modulus
	sbcs $t1,$acc1,$ord1
	sbcs $t2,$acc2,$ord2
	sbcs $t3,$acc3,$ord3
	sbcs xzr,$acc4,xzr

	csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
	csel $acc1,$acc1,$t1,lo
	csel $acc2,$acc2,$t2,lo
	stp $acc0,$acc1,[$rp]
	csel $acc3,$acc3,$t3,lo
	stp $acc2,$acc3,[$rp,#16]

	ldp x19,x20,[sp,#16]
	ldp x21,x22,[sp,#32]
	ldp x23,x24,[sp,#48]
	ldr x29,[sp],#64
	ret
.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                int rep);
.globl ecp_nistz256_ord_sqr_mont
.type ecp_nistz256_ord_sqr_mont,%function
.align 4
ecp_nistz256_ord_sqr_mont:
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	adr $ordk,.Lord
	ldp $a0,$a1,[$ap]
	ldp $a2,$a3,[$ap,#16]

	ldp $ord0,$ord1,[$ordk,#0]
	ldp $ord2,$ord3,[$ordk,#16]
	ldr $ordk,[$ordk,#32]
	b .Loop_ord_sqr

.align 4
.Loop_ord_sqr:
	sub $bp,$bp,#1
	////////////////////////////////////////////////////////////////
	// | | | | | |a1*a0| |
	// | | | | |a2*a0| | |
	// | |a3*a2|a3*a0| | | |
	// | | | |a2*a1| | | |
	// | | |a3*a1| | | | |
	// *| | | | | | | | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	// The "can't overflow" remarks below mark carries into the high
	// part of a multiplication result; the addition can't overflow
	// because that high part can never be all ones.

	mul $acc1,$a1,$a0 // a[1]*a[0]
	umulh $t1,$a1,$a0
	mul $acc2,$a2,$a0 // a[2]*a[0]
	umulh $t2,$a2,$a0
	mul $acc3,$a3,$a0 // a[3]*a[0]
	umulh $acc4,$a3,$a0

	adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
	mul $t0,$a2,$a1 // a[2]*a[1]
	umulh $t1,$a2,$a1
	adcs $acc3,$acc3,$t2
	mul $t2,$a3,$a1 // a[3]*a[1]
	umulh $t3,$a3,$a1
	adc $acc4,$acc4,xzr // can't overflow

	mul $acc5,$a3,$a2 // a[3]*a[2]
	umulh $acc6,$a3,$a2

	adds $t1,$t1,$t2 // accumulate high parts of multiplication
	mul $acc0,$a0,$a0 // a[0]*a[0]
	adc $t2,$t3,xzr // can't overflow

	adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
	umulh $a0,$a0,$a0
	adcs $acc4,$acc4,$t1
	mul $t1,$a1,$a1 // a[1]*a[1]
	adcs $acc5,$acc5,$t2
	umulh $a1,$a1,$a1
	adc $acc6,$acc6,xzr // can't overflow

	adds $acc1,$acc1,$acc1 // acc[1-6]*=2
	mul $t2,$a2,$a2 // a[2]*a[2]
	adcs $acc2,$acc2,$acc2
	umulh $a2,$a2,$a2
	adcs $acc3,$acc3,$acc3
	mul $t3,$a3,$a3 // a[3]*a[3]
	adcs $acc4,$acc4,$acc4
	umulh $a3,$a3,$a3
	adcs $acc5,$acc5,$acc5
	adcs $acc6,$acc6,$acc6
	adc $acc7,xzr,xzr

	adds $acc1,$acc1,$a0 // +a[i]*a[i]
	mul $t4,$acc0,$ordk
	adcs $acc2,$acc2,$t1
	adcs $acc3,$acc3,$a1
	adcs $acc4,$acc4,$t2
	adcs $acc5,$acc5,$a2
	adcs $acc6,$acc6,$t3
	adc $acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {		# reductions
$code.=<<___;
	subs xzr,$acc0,#1
	umulh $t1,$ord0,$t4
	mul $t2,$ord1,$t4
	umulh $t3,$ord1,$t4

	adcs $t2,$t2,$t1
	adc $t3,$t3,xzr

	adds $acc0,$acc1,$t2
	adcs $acc1,$acc2,$t3
	adcs $acc2,$acc3,$t4
	adc $acc3,xzr,$t4 // can't overflow
___
$code.=<<___ if ($i<3);
	mul $t3,$acc0,$ordk
___
$code.=<<___;
	lsl $t0,$t4,#32
	subs $acc1,$acc1,$t4
	lsr $t1,$t4,#32
	sbcs $acc2,$acc2,$t0
	sbc $acc3,$acc3,$t1 // can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds $acc0,$acc0,$acc4 // accumulate upper half
	adcs $acc1,$acc1,$acc5
	adcs $acc2,$acc2,$acc6
	adcs $acc3,$acc3,$acc7
	adc $acc4,xzr,xzr

	subs $t0,$acc0,$ord0 // ret -= modulus
	sbcs $t1,$acc1,$ord1
	sbcs $t2,$acc2,$ord2
	sbcs $t3,$acc3,$ord3
	sbcs xzr,$acc4,xzr

	csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
	csel $a1,$acc1,$t1,lo
	csel $a2,$acc2,$t2,lo
	csel $a3,$acc3,$t3,lo

	cbnz $bp,.Loop_ord_sqr

	stp $a0,$a1,[$rp]
	stp $a2,$a3,[$rp,#16]

	ldp x19,x20,[sp,#16]
	ldp x21,x22,[sp,#32]
	ldp x23,x24,[sp,#48]
	ldr x29,[sp],#64
	ret
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
$code.=<<___;
// void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//                              int x2);
.globl ecp_nistz256_scatter_w5
.type ecp_nistz256_scatter_w5,%function
.align 4
ecp_nistz256_scatter_w5:
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	add $out,$out,$index,lsl#2

	ldp x4,x5,[$inp] // X
	ldp x6,x7,[$inp,#16]
	str w4,[$out,#64*0-4]
	lsr x4,x4,#32
	str w5,[$out,#64*1-4]
	lsr x5,x5,#32
	str w6,[$out,#64*2-4]
	lsr x6,x6,#32
	str w7,[$out,#64*3-4]
	lsr x7,x7,#32
	str w4,[$out,#64*4-4]
	str w5,[$out,#64*5-4]
	str w6,[$out,#64*6-4]
	str w7,[$out,#64*7-4]
	add $out,$out,#64*8

	ldp x4,x5,[$inp,#32] // Y
	ldp x6,x7,[$inp,#48]
	str w4,[$out,#64*0-4]
	lsr x4,x4,#32
	str w5,[$out,#64*1-4]
	lsr x5,x5,#32
	str w6,[$out,#64*2-4]
	lsr x6,x6,#32
	str w7,[$out,#64*3-4]
	lsr x7,x7,#32
	str w4,[$out,#64*4-4]
	str w5,[$out,#64*5-4]
	str w6,[$out,#64*6-4]
	str w7,[$out,#64*7-4]
	add $out,$out,#64*8

	ldp x4,x5,[$inp,#64] // Z
	ldp x6,x7,[$inp,#80]
	str w4,[$out,#64*0-4]
	lsr x4,x4,#32
	str w5,[$out,#64*1-4]
	lsr x5,x5,#32
	str w6,[$out,#64*2-4]
	lsr x6,x6,#32
	str w7,[$out,#64*3-4]
	lsr x7,x7,#32
	str w4,[$out,#64*4-4]
	str w5,[$out,#64*5-4]
	str w6,[$out,#64*6-4]
	str w7,[$out,#64*7-4]

	ldr x29,[sp],#16
	ret
.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//                             int x2);
.globl ecp_nistz256_gather_w5
.type ecp_nistz256_gather_w5,%function
.align 4
ecp_nistz256_gather_w5:
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	cmp $index,xzr
	csetm x3,ne
	add $index,$index,x3
	add $inp,$inp,$index,lsl#2

	ldr w4,[$inp,#64*0]
	ldr w5,[$inp,#64*1]
	ldr w6,[$inp,#64*2]
	ldr w7,[$inp,#64*3]
	ldr w8,[$inp,#64*4]
	ldr w9,[$inp,#64*5]
	ldr w10,[$inp,#64*6]
	ldr w11,[$inp,#64*7]
	add $inp,$inp,#64*8
	orr x4,x4,x8,lsl#32
	orr x5,x5,x9,lsl#32
	orr x6,x6,x10,lsl#32
	orr x7,x7,x11,lsl#32
	csel x4,x4,xzr,ne
	csel x5,x5,xzr,ne
	csel x6,x6,xzr,ne
	csel x7,x7,xzr,ne
	stp x4,x5,[$out] // X
	stp x6,x7,[$out,#16]

	ldr w4,[$inp,#64*0]
	ldr w5,[$inp,#64*1]
	ldr w6,[$inp,#64*2]
	ldr w7,[$inp,#64*3]
	ldr w8,[$inp,#64*4]
	ldr w9,[$inp,#64*5]
	ldr w10,[$inp,#64*6]
	ldr w11,[$inp,#64*7]
	add $inp,$inp,#64*8
	orr x4,x4,x8,lsl#32
	orr x5,x5,x9,lsl#32
	orr x6,x6,x10,lsl#32
	orr x7,x7,x11,lsl#32
	csel x4,x4,xzr,ne
	csel x5,x5,xzr,ne
	csel x6,x6,xzr,ne
	csel x7,x7,xzr,ne
	stp x4,x5,[$out,#32] // Y
	stp x6,x7,[$out,#48]

	ldr w4,[$inp,#64*0]
	ldr w5,[$inp,#64*1]
	ldr w6,[$inp,#64*2]
	ldr w7,[$inp,#64*3]
	ldr w8,[$inp,#64*4]
	ldr w9,[$inp,#64*5]
	ldr w10,[$inp,#64*6]
	ldr w11,[$inp,#64*7]
	orr x4,x4,x8,lsl#32
	orr x5,x5,x9,lsl#32
	orr x6,x6,x10,lsl#32
	orr x7,x7,x11,lsl#32
	csel x4,x4,xzr,ne
	csel x5,x5,xzr,ne
	csel x6,x6,xzr,ne
	csel x7,x7,xzr,ne
	stp x4,x5,[$out,#64] // Z
	stp x6,x7,[$out,#80]

	ldr x29,[sp],#16
	ret
.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//                              int x2);
.globl ecp_nistz256_scatter_w7
.type ecp_nistz256_scatter_w7,%function
.align 4
ecp_nistz256_scatter_w7:
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	add $out,$out,$index
	mov $index,#64/8
.Loop_scatter_w7:
	ldr x3,[$inp],#8
	subs $index,$index,#1
	prfm pstl1strm,[$out,#4096+64*0]
	prfm pstl1strm,[$out,#4096+64*1]
	prfm pstl1strm,[$out,#4096+64*2]
	prfm pstl1strm,[$out,#4096+64*3]
	prfm pstl1strm,[$out,#4096+64*4]
	prfm pstl1strm,[$out,#4096+64*5]
	prfm pstl1strm,[$out,#4096+64*6]
	prfm pstl1strm,[$out,#4096+64*7]
	strb w3,[$out,#64*0]
	lsr x3,x3,#8
	strb w3,[$out,#64*1]
	lsr x3,x3,#8
	strb w3,[$out,#64*2]
	lsr x3,x3,#8
	strb w3,[$out,#64*3]
	lsr x3,x3,#8
	strb w3,[$out,#64*4]
	lsr x3,x3,#8
	strb w3,[$out,#64*5]
	lsr x3,x3,#8
	strb w3,[$out,#64*6]
	lsr x3,x3,#8
	strb w3,[$out,#64*7]
	add $out,$out,#64*8
	b.ne .Loop_scatter_w7

	ldr x29,[sp],#16
	ret
.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//                             int x2);
.globl ecp_nistz256_gather_w7
.type ecp_nistz256_gather_w7,%function
.align 4
ecp_nistz256_gather_w7:
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	cmp $index,xzr
	csetm x3,ne
	add $index,$index,x3
	add $inp,$inp,$index
	mov $index,#64/8
	nop
.Loop_gather_w7:
	ldrb w4,[$inp,#64*0]
	prfm pldl1strm,[$inp,#4096+64*0]
	subs $index,$index,#1
	ldrb w5,[$inp,#64*1]
	prfm pldl1strm,[$inp,#4096+64*1]
	ldrb w6,[$inp,#64*2]
	prfm pldl1strm,[$inp,#4096+64*2]
	ldrb w7,[$inp,#64*3]
	prfm pldl1strm,[$inp,#4096+64*3]
	ldrb w8,[$inp,#64*4]
	prfm pldl1strm,[$inp,#4096+64*4]
	ldrb w9,[$inp,#64*5]
	prfm pldl1strm,[$inp,#4096+64*5]
	ldrb w10,[$inp,#64*6]
	prfm pldl1strm,[$inp,#4096+64*6]
	ldrb w11,[$inp,#64*7]
	prfm pldl1strm,[$inp,#4096+64*7]
	add $inp,$inp,#64*8
	orr x4,x4,x5,lsl#8
	orr x6,x6,x7,lsl#8
	orr x8,x8,x9,lsl#8
	orr x4,x4,x6,lsl#16
	orr x10,x10,x11,lsl#8
	orr x4,x4,x8,lsl#32
	orr x4,x4,x10,lsl#48
	and x4,x4,x3
	str x4,[$out],#8
	b.ne .Loop_gather_w7

	ldr x29,[sp],#16
	ret
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
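
# Plain-Perl model of the w7 scatter/gather convention implemented above:
# each byte of a 64-byte entry is stored at a 64-byte stride, gather reads
# slot index-1, and index 0 comes back all-zero (the masked, constant-time
# encoding of the point at infinity). A sanity sketch, not used by the
# generated code.
{
	my @tbl = (0) x (64*64);			# 64 slots of 64 scattered bytes
	my $scatter = sub { my ($idx,$bytes) = @_;	# cf. ecp_nistz256_scatter_w7
		$tbl[$_*64 + $idx] = $bytes->[$_] for (0..63);
	};
	my $gather = sub { my ($idx) = @_;		# cf. ecp_nistz256_gather_w7
		return [ (0) x 64 ] if $idx == 0;	# infinity -> all zeros
		[ map { $tbl[$_*64 + $idx-1] } (0..63) ];
	};
	my @sample = map { $_ & 0xff } (1..64);
	$scatter->(5,\@sample);
	die "w7 gather model mismatch"
		if join(",",@{$gather->(6)}) ne join(",",@sample);
}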

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT; # enforce flush