#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to the layout expected by
# ecp_nistz256_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c" or
open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# 64*16*37-1 is used because $#arr returns the last valid index of @arr,
# not the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes each P256_POINT_AFFINE into individual bytes
# at 64-byte intervals, similar to
#	1111222233334444
#	1234123412341234
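#
# A minimal index sketch of the resulting layout (an illustration
# mirroring the loop below, not executed by the build): byte $b (0..3)
# of 32-bit word $w (0..15) of table entry $e (0..63) lands at block
# offset
#	$e + 64*(4*$w + $b)
# so a constant-time gather can fetch entry $e by reading one byte from
# each of 64 consecutive 64-byte strides.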
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
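// .LordK is the Montgomery factor -1/ord mod 2^64 for the group order
// .Lord above (see the check sketched in ecp_nistz256_ord_mul_mont
// below).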
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
.align	6
ecp_nistz256_to_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,.LRR		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.LRR		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
.align	4
ecp_nistz256_from_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	mov	$bi,#1			// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.Lone		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	mov	$t0,$a0
	mov	$t1,$a1
	mov	$t2,$a2
	mov	$t3,$a3

	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
	# A reduction iteration is normally performed by accumulating the
	# result of multiplying the modulus by the "magic" digit [and
	# omitting the least significant word, which is guaranteed to
	# be 0], but thanks to the special form of the modulus and the
	# "magic" digit being equal to the least significant word, it can
	# be performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------

$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
480// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
481// to $a0-$a3
482.type __ecp_nistz256_sqr_mont,%function
483.align 4
484__ecp_nistz256_sqr_mont:
485 // | | | | | |a1*a0| |
486 // | | | | |a2*a0| | |
487 // | |a3*a2|a3*a0| | | |
488 // | | | |a2*a1| | | |
489 // | | |a3*a1| | | | |
490 // *| | | | | | | | 2|
491 // +|a3*a3|a2*a2|a1*a1|a0*a0|
492 // |--+--+--+--+--+--+--+--|
493 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
494 //
	// The "can't overflow" remarks below refer to carries into the
	// high part of a multiplication result, which cannot overflow
	// because that high part can never be all ones.
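	// For instance, even the worst case (2^64-1)*(2^64-1) =
	// 2^128-2^65+1 has high half 0xfffffffffffffffe, so adding a
	// single carry bit into it cannot wrap around.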

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {	# reductions, see commentary in
			# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
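	// If a is odd, adding the (odd) modulus makes the value even
	// without changing its residue class, so the shift below is an
	// exact division by 2 mod P; the carry out of the addition
	// supplies the shifted-in top bit.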
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c.
#
########################################################################
# void	ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	mov	$ap_real,$ap
	ldr	$poly1,.Lpoly+8
	mov	$t0,$acc0
	ldr	$poly3,.Lpoly+24
	mov	$t1,$acc1
	ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$M]
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void	ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#				       const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty
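	// \$in2infty (likewise \$in1infty below) ends up all-ones when
	// the input is not the point at infinity and zero otherwise; it
	// is consumed by the constant-time selection at the end.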
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[$ap_real]
	ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp,$acc0,$acc2

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2
	tst	$acc0,$acc0
	b.ne	.Ladd_proceed		// is_equal(U1,U2)?

	tst	$in1infty,$in2infty
	b.eq	.Ladd_proceed		// (in1infty || in2infty)?

	tst	$temp,$temp
	b.eq	.Ladd_double		// is_equal(S1,S2)?

	eor	$a0,$a0,$a0
	eor	$a1,$a1,$a1
	stp	$a0,$a1,[$rp_real]
	stp	$a0,$a1,[$rp_real,#16]
	stp	$a0,$a1,[$rp_real,#32]
	stp	$a0,$a1,[$rp_real,#48]
	stp	$a0,$a1,[$rp_real,#64]
	stp	$a0,$a1,[$rp_real,#80]
	b	.Ladd_done

.align	4
.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	add	sp,sp,#32*(12-4)	// difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// !\$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// !\$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// !\$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// !\$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void	ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#					      const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// !\$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// !\$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// !\$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// !\$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
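	#
	# A standalone check of this rewrite and of .LordK (an
	# illustrative sketch only, never executed by the build;
	# assumes the core Math::BigInt module):
	#
	#   use Math::BigInt;
	#   my $ord = Math::BigInt->from_hex("ffffffff00000000ffffffffffffffff".
	#                                    "bce6faada7179e84f3b9cac2fc632551");
	#   my $lo  = Math::BigInt->from_hex("bce6faada7179e84f3b9cac2fc632551");
	#   my $x   = Math::BigInt->new(2)->bpow(64)->bsub(1); # worst-case digit
	#   print "rewrite ok\n" if $x*$ord ==
	#       ($x<<256) - ($x<<224) + ($x<<192) - ($x<<128) + $x*$lo;
	#   # and .LordK is -1/ord mod 2^64:
	#   print "ordk ok\n" if 0 ==
	#       ($ord*Math::BigInt->from_hex("ccd1c8aaee00bc4f") + 1)
	#           ->bmod(Math::BigInt->new(2)->bpow(64));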
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	// The "can't overflow" remarks below refer to carries into the
	// high part of a multiplication result, which cannot overflow
	// because that high part can never be all ones (see the note in
	// __ecp_nistz256_sqr_mont above).

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
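	// index 0 means "return the point at infinity": x3 is an
	// all-ones mask iff index!=0 (no instruction below alters the
	// flags before the csel instructions consume them), and the
	// index is biased down by 1 because the table is populated with
	// 1-based indices (note the -4 bias in ecp_nistz256_scatter_w5).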
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//						     int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT;	# enforce flush