#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must.
# In fact, all contemporary AArch64 processors seem to have a limited
# multiplication issue rate, i.e. they can't issue a multiplication
# every cycle, which explains the moderate improvement coefficients in
# comparison to compiler-generated code. Recall that the compiler is
# instructed to use umulh and therefore uses the same number of
# multiplication instructions to do the job. Assembly's edge lies in
# minimizing the number of "collateral" instructions and, of course,
# in instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-60% depending on processor and key length.
# Overall improvement coefficients are always positive in comparison
# to compiler-generated code. On Cortex-A57 the improvement is still
# modest for the longest key lengths, while other processors exhibit
# e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25%
# faster on Cortex-A57 and ~60-100% faster on others.

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
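
# The pipe above feeds the generated source through the arm-xlate.pl
# "perlasm" translator before it reaches $output. A typical invocation
# (the flavour and output name here are only examples, not mandated by
# this script) would be:
#
#   perl armv8-mont.pl linux64 armv8-mont.S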

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";       # BN_ULONG *rp,
$ap="x1";       # const BN_ULONG *ap,
$bp="x2";       # const BN_ULONG *bp,
$np="x3";       # const BN_ULONG *np,
$n0="x4";       # const BN_ULONG *n0,
$num="x5";      # int num);

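# A minimal reference sketch (an addition for illustration, not used by this
# module; mont_mul_ref is a hypothetical name) of the operation bn_mul_mont
# computes, in plain Perl with Math::BigInt. Given n0 == -n^(-1) mod 2^64 and
# num 64-bit limbs, it returns a*b*2^(-64*num) mod n -- the Montgomery
# product that the assembly below accumulates one 64-bit digit at a time.
use Math::BigInt;
sub mont_mul_ref {
    my ($a, $b, $n, $n0, $num) = @_;            # Math::BigInt scalars
    my $w = Math::BigInt->new(2)->bpow(64);     # limb radix 2^64
    my $t = $a->copy->bmul($b);                 # double-width product
    for (1..$num) {                             # one reduction step per limb
        my $m = ($t % $w) * $n0 % $w;           # m = t[0]*n0 mod 2^64
        $t = ($t + $m * $n) / $w;               # t += m*n, drop zero low limb
    }
    $t->bsub($n) if $t->bcmp($n) >= 0;          # final conditional subtraction
    return $t;
}
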
$code.=<<___;
.text

.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
        tst $num,#7
        b.eq __bn_sqr8x_mont
        tst $num,#3
        b.eq __bn_mul4x_mont
.Lmul_mont:
        stp x29,x30,[sp,#-64]!
        add x29,sp,#0
        stp x19,x20,[sp,#16]
        stp x21,x22,[sp,#32]
        stp x23,x24,[sp,#48]

        ldr $m0,[$bp],#8 // bp[0]
        sub $tp,sp,$num,lsl#3
        ldp $hi0,$aj,[$ap],#16 // ap[0..1]
        lsl $num,$num,#3
        ldr $n0,[$n0] // *n0
        and $tp,$tp,#-16 // ABI says so
        ldp $hi1,$nj,[$np],#16 // np[0..1]

        mul $lo0,$hi0,$m0 // ap[0]*bp[0]
        sub $j,$num,#16 // j=num-2
        umulh $hi0,$hi0,$m0
        mul $alo,$aj,$m0 // ap[1]*bp[0]
        umulh $ahi,$aj,$m0

        mul $m1,$lo0,$n0 // "tp[0]"*n0
        mov sp,$tp // alloca

// (*)  mul $lo1,$hi1,$m1 // np[0]*m1
        umulh $hi1,$hi1,$m1
        mul $nlo,$nj,$m1 // np[1]*m1
// (*)  adds $lo1,$lo1,$lo0 // discarded
// (*)  On the removal of the first multiplication and addition
//      instructions: the outcome of the first addition is
//      guaranteed to be zero, which leaves two computationally
//      significant outcomes: it either carries or it doesn't.
//      So when does it carry, and is there an alternative way to
//      deduce it? If you follow the operations, you can observe
//      that the condition for carry is quite simple: $lo0 being
//      non-zero. The carry can therefore be computed by adding
//      -1 to $lo0, which is what the next instruction does.
        subs xzr,$lo0,#1 // (*)
        umulh $nhi,$nj,$m1
        adc $hi1,$hi1,xzr
        cbz $j,.L1st_skip

.L1st:
        ldr $aj,[$ap],#8
        adds $lo0,$alo,$hi0
        sub $j,$j,#8 // j--
        adc $hi0,$ahi,xzr

        ldr $nj,[$np],#8
        adds $lo1,$nlo,$hi1
        mul $alo,$aj,$m0 // ap[j]*bp[0]
        adc $hi1,$nhi,xzr
        umulh $ahi,$aj,$m0

        adds $lo1,$lo1,$lo0
        mul $nlo,$nj,$m1 // np[j]*m1
        adc $hi1,$hi1,xzr
        umulh $nhi,$nj,$m1
        str $lo1,[$tp],#8 // tp[j-1]
        cbnz $j,.L1st

.L1st_skip:
        adds $lo0,$alo,$hi0
        sub $ap,$ap,$num // rewind $ap
        adc $hi0,$ahi,xzr

        adds $lo1,$nlo,$hi1
        sub $np,$np,$num // rewind $np
        adc $hi1,$nhi,xzr

        adds $lo1,$lo1,$lo0
        sub $i,$num,#8 // i=num-1
        adcs $hi1,$hi1,$hi0

        adc $ovf,xzr,xzr // topmost overflow bit
        stp $lo1,$hi1,[$tp]

.Louter:
        ldr $m0,[$bp],#8 // bp[i]
        ldp $hi0,$aj,[$ap],#16
        ldr $tj,[sp] // tp[0]
        add $tp,sp,#8

        mul $lo0,$hi0,$m0 // ap[0]*bp[i]
        sub $j,$num,#16 // j=num-2
        umulh $hi0,$hi0,$m0
        ldp $hi1,$nj,[$np],#16
        mul $alo,$aj,$m0 // ap[1]*bp[i]
        adds $lo0,$lo0,$tj
        umulh $ahi,$aj,$m0
        adc $hi0,$hi0,xzr

        mul $m1,$lo0,$n0
        sub $i,$i,#8 // i--

// (*)  mul $lo1,$hi1,$m1 // np[0]*m1
        umulh $hi1,$hi1,$m1
        mul $nlo,$nj,$m1 // np[1]*m1
// (*)  adds $lo1,$lo1,$lo0
        subs xzr,$lo0,#1 // (*)
        umulh $nhi,$nj,$m1
        cbz $j,.Linner_skip

.Linner:
        ldr $aj,[$ap],#8
        adc $hi1,$hi1,xzr
        ldr $tj,[$tp],#8 // tp[j]
        adds $lo0,$alo,$hi0
        sub $j,$j,#8 // j--
        adc $hi0,$ahi,xzr

        adds $lo1,$nlo,$hi1
        ldr $nj,[$np],#8
        adc $hi1,$nhi,xzr

        mul $alo,$aj,$m0 // ap[j]*bp[i]
        adds $lo0,$lo0,$tj
        umulh $ahi,$aj,$m0
        adc $hi0,$hi0,xzr

        mul $nlo,$nj,$m1 // np[j]*m1
        adds $lo1,$lo1,$lo0
        umulh $nhi,$nj,$m1
        str $lo1,[$tp,#-16] // tp[j-1]
        cbnz $j,.Linner

.Linner_skip:
        ldr $tj,[$tp],#8 // tp[j]
        adc $hi1,$hi1,xzr
        adds $lo0,$alo,$hi0
        sub $ap,$ap,$num // rewind $ap
        adc $hi0,$ahi,xzr

        adds $lo1,$nlo,$hi1
        sub $np,$np,$num // rewind $np
        adcs $hi1,$nhi,$ovf
        adc $ovf,xzr,xzr

        adds $lo0,$lo0,$tj
        adc $hi0,$hi0,xzr

        adds $lo1,$lo1,$lo0
        adcs $hi1,$hi1,$hi0
        adc $ovf,$ovf,xzr // topmost overflow bit
        stp $lo1,$hi1,[$tp,#-16]

        cbnz $i,.Louter

        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
        // and conditionally copy original value.
        ldr $tj,[sp] // tp[0]
        add $tp,sp,#8
        ldr $nj,[$np],#8 // np[0]
        subs $j,$num,#8 // j=num-1 and clear borrow
        mov $ap,$rp
.Lsub:
        sbcs $aj,$tj,$nj // tp[j]-np[j]
        ldr $tj,[$tp],#8
        sub $j,$j,#8 // j--
        ldr $nj,[$np],#8
        str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
        cbnz $j,.Lsub

        sbcs $aj,$tj,$nj
        sbcs $ovf,$ovf,xzr // did it borrow?
        str $aj,[$ap],#8 // rp[num-1]

        ldr $tj,[sp] // tp[0]
        add $tp,sp,#8
        ldr $aj,[$rp],#8 // rp[0]
        sub $num,$num,#8 // num--
        nop
.Lcond_copy:
        sub $num,$num,#8 // num--
        csel $nj,$tj,$aj,lo // did it borrow?
        ldr $tj,[$tp],#8
        ldr $aj,[$rp],#8
        str xzr,[$tp,#-16] // wipe tp
        str $nj,[$rp,#-16]
        cbnz $num,.Lcond_copy

        csel $nj,$tj,$aj,lo
        str xzr,[$tp,#-8] // wipe tp
        str $nj,[$rp,#-8]

        ldp x19,x20,[x29,#16]
        mov sp,x29
        ldp x21,x22,[x29,#32]
        mov x0,#1
        ldp x23,x24,[x29,#48]
        ldr x29,[sp],#64
        ret
.size bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# What follows is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

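# An illustrative sketch (again an addition with a hypothetical name, using
# Math::BigInt as above) of the schedule used below: accumulate only the
# cross products a[i]*a[j] for i<j, double the running sum with a 1-bit
# shift, then add the diagonal a[i]*a[i] terms -- the job of the
# ".Lsqr4x_shift_n_add" pass.
sub sqr_ref {
    my ($a) = @_;                               # little-endian Math::BigInt limbs
    my $w = Math::BigInt->new(2)->bpow(64);
    my ($cross, $diag) = (Math::BigInt->bzero, Math::BigInt->bzero);
    for my $i (0 .. $#$a) {
        $diag += $a->[$i] * $a->[$i] * $w ** (2*$i);
        for my $j ($i+1 .. $#$a) {              # everything but a[i]*a[i]
            $cross += $a->[$i] * $a->[$j] * $w ** ($i+$j);
        }
    }
    return $cross->blsft(1)->badd($diag);       # 2*cross + diagonal == a*a
}
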
$code.=<<___;
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
        cmp $ap,$bp
        b.ne __bn_mul4x_mont
.Lsqr8x_mont:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-128]!
        add x29,sp,#0
        stp x19,x20,[sp,#16]
        stp x21,x22,[sp,#32]
        stp x23,x24,[sp,#48]
        stp x25,x26,[sp,#64]
        stp x27,x28,[sp,#80]
        stp $rp,$np,[sp,#96] // offload rp and np

        ldp $a0,$a1,[$ap,#8*0]
        ldp $a2,$a3,[$ap,#8*2]
        ldp $a4,$a5,[$ap,#8*4]
        ldp $a6,$a7,[$ap,#8*6]

        sub $tp,sp,$num,lsl#4
        lsl $num,$num,#3
        ldr $n0,[$n0] // *n0
        mov sp,$tp // alloca
        sub $cnt,$num,#8*8
        b .Lsqr8x_zero_start

.Lsqr8x_zero:
        sub $cnt,$cnt,#8*8
        stp xzr,xzr,[$tp,#8*0]
        stp xzr,xzr,[$tp,#8*2]
        stp xzr,xzr,[$tp,#8*4]
        stp xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
        stp xzr,xzr,[$tp,#8*8]
        stp xzr,xzr,[$tp,#8*10]
        stp xzr,xzr,[$tp,#8*12]
        stp xzr,xzr,[$tp,#8*14]
        add $tp,$tp,#8*16
        cbnz $cnt,.Lsqr8x_zero

        add $ap_end,$ap,$num
        add $ap,$ap,#8*8
        mov $acc0,xzr
        mov $acc1,xzr
        mov $acc2,xzr
        mov $acc3,xzr
        mov $acc4,xzr
        mov $acc5,xzr
        mov $acc6,xzr
        mov $acc7,xzr
        mov $tp,sp
        str $n0,[x29,#112] // offload n0

        // Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
        // a[1]a[0] (i)
        // a[2]a[0]
        // a[3]a[0]
        // a[4]a[0]
        // a[5]a[0]
        // a[6]a[0]
        // a[7]a[0]
        // a[2]a[1] (ii)
        // a[3]a[1]
        // a[4]a[1]
        // a[5]a[1]
        // a[6]a[1]
        // a[7]a[1]
        // a[3]a[2] (iii)
        // a[4]a[2]
        // a[5]a[2]
        // a[6]a[2]
        // a[7]a[2]
        // a[4]a[3] (iv)
        // a[5]a[3]
        // a[6]a[3]
        // a[7]a[3]
        // a[5]a[4] (v)
        // a[6]a[4]
        // a[7]a[4]
        // a[6]a[5] (vi)
        // a[7]a[5]
        // a[7]a[6] (vii)

        mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
        mul $t1,$a2,$a0
        mul $t2,$a3,$a0
        mul $t3,$a4,$a0
        adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
        mul $t0,$a5,$a0
        adcs $acc2,$acc2,$t1
        mul $t1,$a6,$a0
        adcs $acc3,$acc3,$t2
        mul $t2,$a7,$a0
        adcs $acc4,$acc4,$t3
        umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
        adcs $acc5,$acc5,$t0
        umulh $t0,$a2,$a0
        adcs $acc6,$acc6,$t1
        umulh $t1,$a3,$a0
        adcs $acc7,$acc7,$t2
        umulh $t2,$a4,$a0
        stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
        adc $acc0,xzr,xzr // t[8]
        adds $acc2,$acc2,$t3 // t[2]+hi(a[1]*a[0])
        umulh $t3,$a5,$a0
        adcs $acc3,$acc3,$t0
        umulh $t0,$a6,$a0
        adcs $acc4,$acc4,$t1
        umulh $t1,$a7,$a0
        adcs $acc5,$acc5,$t2
        mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
        adcs $acc6,$acc6,$t3
        mul $t3,$a3,$a1
        adcs $acc7,$acc7,$t0
        mul $t0,$a4,$a1
        adc $acc0,$acc0,$t1

        mul $t1,$a5,$a1
        adds $acc3,$acc3,$t2
        mul $t2,$a6,$a1
        adcs $acc4,$acc4,$t3
        mul $t3,$a7,$a1
        adcs $acc5,$acc5,$t0
        umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
        adcs $acc6,$acc6,$t1
        umulh $t1,$a3,$a1
        adcs $acc7,$acc7,$t2
        umulh $t2,$a4,$a1
        adcs $acc0,$acc0,$t3
        umulh $t3,$a5,$a1
        stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
        adc $acc1,xzr,xzr // t[9]
        adds $acc4,$acc4,$t0
        umulh $t0,$a6,$a1
        adcs $acc5,$acc5,$t1
        umulh $t1,$a7,$a1
        adcs $acc6,$acc6,$t2
        mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
        adcs $acc7,$acc7,$t3
        mul $t3,$a4,$a2
        adcs $acc0,$acc0,$t0
        mul $t0,$a5,$a2
        adc $acc1,$acc1,$t1

        mul $t1,$a6,$a2
        adds $acc5,$acc5,$t2
        mul $t2,$a7,$a2
        adcs $acc6,$acc6,$t3
        umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
        adcs $acc7,$acc7,$t0
        umulh $t0,$a4,$a2
        adcs $acc0,$acc0,$t1
        umulh $t1,$a5,$a2
        adcs $acc1,$acc1,$t2
        umulh $t2,$a6,$a2
        stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
        adc $acc2,xzr,xzr // t[10]
        adds $acc6,$acc6,$t3
        umulh $t3,$a7,$a2
        adcs $acc7,$acc7,$t0
        mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
        adcs $acc0,$acc0,$t1
        mul $t1,$a5,$a3
        adcs $acc1,$acc1,$t2
        mul $t2,$a6,$a3
        adc $acc2,$acc2,$t3

        mul $t3,$a7,$a3
        adds $acc7,$acc7,$t0
        umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
        adcs $acc0,$acc0,$t1
        umulh $t1,$a5,$a3
        adcs $acc1,$acc1,$t2
        umulh $t2,$a6,$a3
        adcs $acc2,$acc2,$t3
        umulh $t3,$a7,$a3
        stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
        adc $acc3,xzr,xzr // t[11]
        adds $acc0,$acc0,$t0
        mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
        adcs $acc1,$acc1,$t1
        mul $t1,$a6,$a4
        adcs $acc2,$acc2,$t2
        mul $t2,$a7,$a4
        adc $acc3,$acc3,$t3

        umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
        adds $acc1,$acc1,$t0
        umulh $t0,$a6,$a4
        adcs $acc2,$acc2,$t1
        umulh $t1,$a7,$a4
        adcs $acc3,$acc3,$t2
        mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
        adc $acc4,xzr,xzr // t[12]
        adds $acc2,$acc2,$t3
        mul $t3,$a7,$a5
        adcs $acc3,$acc3,$t0
        umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
        adc $acc4,$acc4,$t1

        umulh $t1,$a7,$a5
        adds $acc3,$acc3,$t2
        mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
        adcs $acc4,$acc4,$t3
        umulh $t3,$a7,$a6 // hi(a[7]*a[6])
        adc $acc5,xzr,xzr // t[13]
        adds $acc4,$acc4,$t0
        sub $cnt,$ap_end,$ap // done yet?
        adc $acc5,$acc5,$t1

        adds $acc5,$acc5,$t2
        sub $t0,$ap_end,$num // rewound ap
        adc $acc6,xzr,xzr // t[14]
        add $acc6,$acc6,$t3

        cbz $cnt,.Lsqr8x_outer_break

        mov $n0,$a0
        ldp $a0,$a1,[$tp,#8*0]
        ldp $a2,$a3,[$tp,#8*2]
        ldp $a4,$a5,[$tp,#8*4]
        ldp $a6,$a7,[$tp,#8*6]
        adds $acc0,$acc0,$a0
        adcs $acc1,$acc1,$a1
        ldp $a0,$a1,[$ap,#8*0]
        adcs $acc2,$acc2,$a2
        adcs $acc3,$acc3,$a3
        ldp $a2,$a3,[$ap,#8*2]
        adcs $acc4,$acc4,$a4
        adcs $acc5,$acc5,$a5
        ldp $a4,$a5,[$ap,#8*4]
        adcs $acc6,$acc6,$a6
        mov $rp,$ap
        adcs $acc7,xzr,$a7
        ldp $a6,$a7,[$ap,#8*6]
        add $ap,$ap,#8*8
        //adc $carry,xzr,xzr // moved below
        mov $cnt,#-8*8

        // a[8]a[0]
        // a[9]a[0]
        // a[a]a[0]
        // a[b]a[0]
        // a[c]a[0]
        // a[d]a[0]
        // a[e]a[0]
        // a[f]a[0]
        // a[8]a[1]
        // a[f]a[1]........................
        // a[8]a[2]
        // a[f]a[2]........................
        // a[8]a[3]
        // a[f]a[3]........................
        // a[8]a[4]
        // a[f]a[4]........................
        // a[8]a[5]
        // a[f]a[5]........................
        // a[8]a[6]
        // a[f]a[6]........................
        // a[8]a[7]
        // a[f]a[7]........................
.Lsqr8x_mul:
        mul $t0,$a0,$n0
        adc $carry,xzr,xzr // carry bit, modulo-scheduled
        mul $t1,$a1,$n0
        add $cnt,$cnt,#8
        mul $t2,$a2,$n0
        mul $t3,$a3,$n0
        adds $acc0,$acc0,$t0
        mul $t0,$a4,$n0
        adcs $acc1,$acc1,$t1
        mul $t1,$a5,$n0
        adcs $acc2,$acc2,$t2
        mul $t2,$a6,$n0
        adcs $acc3,$acc3,$t3
        mul $t3,$a7,$n0
        adcs $acc4,$acc4,$t0
        umulh $t0,$a0,$n0
        adcs $acc5,$acc5,$t1
        umulh $t1,$a1,$n0
        adcs $acc6,$acc6,$t2
        umulh $t2,$a2,$n0
        adcs $acc7,$acc7,$t3
        umulh $t3,$a3,$n0
        adc $carry,$carry,xzr
        str $acc0,[$tp],#8
        adds $acc0,$acc1,$t0
        umulh $t0,$a4,$n0
        adcs $acc1,$acc2,$t1
        umulh $t1,$a5,$n0
        adcs $acc2,$acc3,$t2
        umulh $t2,$a6,$n0
        adcs $acc3,$acc4,$t3
        umulh $t3,$a7,$n0
        ldr $n0,[$rp,$cnt]
        adcs $acc4,$acc5,$t0
        adcs $acc5,$acc6,$t1
        adcs $acc6,$acc7,$t2
        adcs $acc7,$carry,$t3
        //adc $carry,xzr,xzr // moved above
        cbnz $cnt,.Lsqr8x_mul
        // note that carry flag is guaranteed
        // to be zero at this point
        cmp $ap,$ap_end // done yet?
        b.eq .Lsqr8x_break

        ldp $a0,$a1,[$tp,#8*0]
        ldp $a2,$a3,[$tp,#8*2]
        ldp $a4,$a5,[$tp,#8*4]
        ldp $a6,$a7,[$tp,#8*6]
        adds $acc0,$acc0,$a0
        ldr $n0,[$rp,#-8*8]
        adcs $acc1,$acc1,$a1
        ldp $a0,$a1,[$ap,#8*0]
        adcs $acc2,$acc2,$a2
        adcs $acc3,$acc3,$a3
        ldp $a2,$a3,[$ap,#8*2]
        adcs $acc4,$acc4,$a4
        adcs $acc5,$acc5,$a5
        ldp $a4,$a5,[$ap,#8*4]
        adcs $acc6,$acc6,$a6
        mov $cnt,#-8*8
        adcs $acc7,$acc7,$a7
        ldp $a6,$a7,[$ap,#8*6]
        add $ap,$ap,#8*8
        //adc $carry,xzr,xzr // moved above
        b .Lsqr8x_mul

.align 4
.Lsqr8x_break:
        ldp $a0,$a1,[$rp,#8*0]
        add $ap,$rp,#8*8
        ldp $a2,$a3,[$rp,#8*2]
        sub $t0,$ap_end,$ap // is it last iteration?
        ldp $a4,$a5,[$rp,#8*4]
        sub $t1,$tp,$t0
        ldp $a6,$a7,[$rp,#8*6]
        cbz $t0,.Lsqr8x_outer_loop

        stp $acc0,$acc1,[$tp,#8*0]
        ldp $acc0,$acc1,[$t1,#8*0]
        stp $acc2,$acc3,[$tp,#8*2]
        ldp $acc2,$acc3,[$t1,#8*2]
        stp $acc4,$acc5,[$tp,#8*4]
        ldp $acc4,$acc5,[$t1,#8*4]
        stp $acc6,$acc7,[$tp,#8*6]
        mov $tp,$t1
        ldp $acc6,$acc7,[$t1,#8*6]
        b .Lsqr8x_outer_loop

.align 4
.Lsqr8x_outer_break:
        // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
        ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
        ldp $t1,$t2,[sp,#8*1]
        ldp $a5,$a7,[$t0,#8*2]
        add $ap,$t0,#8*4
        ldp $t3,$t0,[sp,#8*3]

        stp $acc0,$acc1,[$tp,#8*0]
        mul $acc0,$a1,$a1
        stp $acc2,$acc3,[$tp,#8*2]
        umulh $a1,$a1,$a1
        stp $acc4,$acc5,[$tp,#8*4]
        mul $a2,$a3,$a3
        stp $acc6,$acc7,[$tp,#8*6]
        mov $tp,sp
        umulh $a3,$a3,$a3
        adds $acc1,$a1,$t1,lsl#1
        extr $t1,$t2,$t1,#63
        sub $cnt,$num,#8*4

.Lsqr4x_shift_n_add:
        adcs $acc2,$a2,$t1
        extr $t2,$t3,$t2,#63
        sub $cnt,$cnt,#8*4
        adcs $acc3,$a3,$t2
        ldp $t1,$t2,[$tp,#8*5]
        mul $a4,$a5,$a5
        ldp $a1,$a3,[$ap],#8*2
        umulh $a5,$a5,$a5
        mul $a6,$a7,$a7
        umulh $a7,$a7,$a7
        extr $t3,$t0,$t3,#63
        stp $acc0,$acc1,[$tp,#8*0]
        adcs $acc4,$a4,$t3
        extr $t0,$t1,$t0,#63
        stp $acc2,$acc3,[$tp,#8*2]
        adcs $acc5,$a5,$t0
        ldp $t3,$t0,[$tp,#8*7]
        extr $t1,$t2,$t1,#63
        adcs $acc6,$a6,$t1
        extr $t2,$t3,$t2,#63
        adcs $acc7,$a7,$t2
        ldp $t1,$t2,[$tp,#8*9]
        mul $a0,$a1,$a1
        ldp $a5,$a7,[$ap],#8*2
        umulh $a1,$a1,$a1
        mul $a2,$a3,$a3
        umulh $a3,$a3,$a3
        stp $acc4,$acc5,[$tp,#8*4]
        extr $t3,$t0,$t3,#63
        stp $acc6,$acc7,[$tp,#8*6]
        add $tp,$tp,#8*8
        adcs $acc0,$a0,$t3
        extr $t0,$t1,$t0,#63
        adcs $acc1,$a1,$t0
        ldp $t3,$t0,[$tp,#8*3]
        extr $t1,$t2,$t1,#63
        cbnz $cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
        ldp $np,$n0,[x29,#104] // pull np and n0

        adcs $acc2,$a2,$t1
        extr $t2,$t3,$t2,#63
        adcs $acc3,$a3,$t2
        ldp $t1,$t2,[$tp,#8*5]
        mul $a4,$a5,$a5
        umulh $a5,$a5,$a5
        stp $acc0,$acc1,[$tp,#8*0]
        mul $a6,$a7,$a7
        umulh $a7,$a7,$a7
        stp $acc2,$acc3,[$tp,#8*2]
        extr $t3,$t0,$t3,#63
        adcs $acc4,$a4,$t3
        extr $t0,$t1,$t0,#63
        ldp $acc0,$acc1,[sp,#8*0]
        adcs $acc5,$a5,$t0
        extr $t1,$t2,$t1,#63
        ldp $a0,$a1,[$np,#8*0]
        adcs $acc6,$a6,$t1
        extr $t2,xzr,$t2,#63
        ldp $a2,$a3,[$np,#8*2]
        adc $acc7,$a7,$t2
        ldp $a4,$a5,[$np,#8*4]

        // Reduce by 512 bits per iteration
        mul $na0,$n0,$acc0 // t[0]*n0
        ldp $a6,$a7,[$np,#8*6]
        add $np_end,$np,$num
        ldp $acc2,$acc3,[sp,#8*2]
        stp $acc4,$acc5,[$tp,#8*4]
        ldp $acc4,$acc5,[sp,#8*4]
        stp $acc6,$acc7,[$tp,#8*6]
        ldp $acc6,$acc7,[sp,#8*6]
        add $np,$np,#8*8
        mov $topmost,xzr // initial top-most carry
        mov $tp,sp
        mov $cnt,#8

.Lsqr8x_reduction:
// (*)  mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
        mul $t1,$a1,$na0
        sub $cnt,$cnt,#1
        mul $t2,$a2,$na0
        str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
        mul $t3,$a3,$na0
// (*)  adds xzr,$acc0,$t0
        subs xzr,$acc0,#1 // (*)
        mul $t0,$a4,$na0
        adcs $acc0,$acc1,$t1
        mul $t1,$a5,$na0
        adcs $acc1,$acc2,$t2
        mul $t2,$a6,$na0
        adcs $acc2,$acc3,$t3
        mul $t3,$a7,$na0
        adcs $acc3,$acc4,$t0
        umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
        adcs $acc4,$acc5,$t1
        umulh $t1,$a1,$na0
        adcs $acc5,$acc6,$t2
        umulh $t2,$a2,$na0
        adcs $acc6,$acc7,$t3
        umulh $t3,$a3,$na0
        adc $acc7,xzr,xzr
        adds $acc0,$acc0,$t0
        umulh $t0,$a4,$na0
        adcs $acc1,$acc1,$t1
        umulh $t1,$a5,$na0
        adcs $acc2,$acc2,$t2
        umulh $t2,$a6,$na0
        adcs $acc3,$acc3,$t3
        umulh $t3,$a7,$na0
        mul $na0,$n0,$acc0 // next t[0]*n0
        adcs $acc4,$acc4,$t0
        adcs $acc5,$acc5,$t1
        adcs $acc6,$acc6,$t2
        adc $acc7,$acc7,$t3
        cbnz $cnt,.Lsqr8x_reduction

        ldp $t0,$t1,[$tp,#8*0]
        ldp $t2,$t3,[$tp,#8*2]
        mov $rp,$tp
        sub $cnt,$np_end,$np // done yet?
        adds $acc0,$acc0,$t0
        adcs $acc1,$acc1,$t1
        ldp $t0,$t1,[$tp,#8*4]
        adcs $acc2,$acc2,$t2
        adcs $acc3,$acc3,$t3
        ldp $t2,$t3,[$tp,#8*6]
        adcs $acc4,$acc4,$t0
        adcs $acc5,$acc5,$t1
        adcs $acc6,$acc6,$t2
        adcs $acc7,$acc7,$t3
        //adc $carry,xzr,xzr // moved below
        cbz $cnt,.Lsqr8x8_post_condition

        ldr $n0,[$tp,#-8*8]
        ldp $a0,$a1,[$np,#8*0]
        ldp $a2,$a3,[$np,#8*2]
        ldp $a4,$a5,[$np,#8*4]
        mov $cnt,#-8*8
        ldp $a6,$a7,[$np,#8*6]
        add $np,$np,#8*8

.Lsqr8x_tail:
        mul $t0,$a0,$n0
        adc $carry,xzr,xzr // carry bit, modulo-scheduled
        mul $t1,$a1,$n0
        add $cnt,$cnt,#8
        mul $t2,$a2,$n0
        mul $t3,$a3,$n0
        adds $acc0,$acc0,$t0
        mul $t0,$a4,$n0
        adcs $acc1,$acc1,$t1
        mul $t1,$a5,$n0
        adcs $acc2,$acc2,$t2
        mul $t2,$a6,$n0
        adcs $acc3,$acc3,$t3
        mul $t3,$a7,$n0
        adcs $acc4,$acc4,$t0
        umulh $t0,$a0,$n0
        adcs $acc5,$acc5,$t1
        umulh $t1,$a1,$n0
        adcs $acc6,$acc6,$t2
        umulh $t2,$a2,$n0
        adcs $acc7,$acc7,$t3
        umulh $t3,$a3,$n0
        adc $carry,$carry,xzr
        str $acc0,[$tp],#8
        adds $acc0,$acc1,$t0
        umulh $t0,$a4,$n0
        adcs $acc1,$acc2,$t1
        umulh $t1,$a5,$n0
        adcs $acc2,$acc3,$t2
        umulh $t2,$a6,$n0
        adcs $acc3,$acc4,$t3
        umulh $t3,$a7,$n0
        ldr $n0,[$rp,$cnt]
        adcs $acc4,$acc5,$t0
        adcs $acc5,$acc6,$t1
        adcs $acc6,$acc7,$t2
        adcs $acc7,$carry,$t3
        //adc $carry,xzr,xzr // moved above
        cbnz $cnt,.Lsqr8x_tail
        // note that carry flag is guaranteed
        // to be zero at this point
        ldp $a0,$a1,[$tp,#8*0]
        sub $cnt,$np_end,$np // done yet?
        sub $t2,$np_end,$num // rewound np
        ldp $a2,$a3,[$tp,#8*2]
        ldp $a4,$a5,[$tp,#8*4]
        ldp $a6,$a7,[$tp,#8*6]
        cbz $cnt,.Lsqr8x_tail_break

        ldr $n0,[$rp,#-8*8]
        adds $acc0,$acc0,$a0
        adcs $acc1,$acc1,$a1
        ldp $a0,$a1,[$np,#8*0]
        adcs $acc2,$acc2,$a2
        adcs $acc3,$acc3,$a3
        ldp $a2,$a3,[$np,#8*2]
        adcs $acc4,$acc4,$a4
        adcs $acc5,$acc5,$a5
        ldp $a4,$a5,[$np,#8*4]
        adcs $acc6,$acc6,$a6
        mov $cnt,#-8*8
        adcs $acc7,$acc7,$a7
        ldp $a6,$a7,[$np,#8*6]
        add $np,$np,#8*8
        //adc $carry,xzr,xzr // moved above
        b .Lsqr8x_tail

.align 4
.Lsqr8x_tail_break:
        ldr $n0,[x29,#112] // pull n0
        add $cnt,$tp,#8*8 // end of current t[num] window

        subs xzr,$topmost,#1 // "move" top-most carry to carry bit
        adcs $t0,$acc0,$a0
        adcs $t1,$acc1,$a1
        ldp $acc0,$acc1,[$rp,#8*0]
        adcs $acc2,$acc2,$a2
        ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
        adcs $acc3,$acc3,$a3
        ldp $a2,$a3,[$t2,#8*2]
        adcs $acc4,$acc4,$a4
        adcs $acc5,$acc5,$a5
        ldp $a4,$a5,[$t2,#8*4]
        adcs $acc6,$acc6,$a6
        adcs $acc7,$acc7,$a7
        ldp $a6,$a7,[$t2,#8*6]
        add $np,$t2,#8*8
        adc $topmost,xzr,xzr // top-most carry
        mul $na0,$n0,$acc0
        stp $t0,$t1,[$tp,#8*0]
        stp $acc2,$acc3,[$tp,#8*2]
        ldp $acc2,$acc3,[$rp,#8*2]
        stp $acc4,$acc5,[$tp,#8*4]
        ldp $acc4,$acc5,[$rp,#8*4]
        cmp $cnt,x29 // did we hit the bottom?
        stp $acc6,$acc7,[$tp,#8*6]
        mov $tp,$rp // slide the window
        ldp $acc6,$acc7,[$rp,#8*6]
        mov $cnt,#8
        b.ne .Lsqr8x_reduction

        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
        // and conditionally copy original value.
        ldr $rp,[x29,#96] // pull rp
        add $tp,$tp,#8*8
        subs $t0,$acc0,$a0
        sbcs $t1,$acc1,$a1
        sub $cnt,$num,#8*8
        mov $ap_end,$rp // $rp copy

.Lsqr8x_sub:
        sbcs $t2,$acc2,$a2
        ldp $a0,$a1,[$np,#8*0]
        sbcs $t3,$acc3,$a3
        stp $t0,$t1,[$rp,#8*0]
        sbcs $t0,$acc4,$a4
        ldp $a2,$a3,[$np,#8*2]
        sbcs $t1,$acc5,$a5
        stp $t2,$t3,[$rp,#8*2]
        sbcs $t2,$acc6,$a6
        ldp $a4,$a5,[$np,#8*4]
        sbcs $t3,$acc7,$a7
        ldp $a6,$a7,[$np,#8*6]
        add $np,$np,#8*8
        ldp $acc0,$acc1,[$tp,#8*0]
        sub $cnt,$cnt,#8*8
        ldp $acc2,$acc3,[$tp,#8*2]
        ldp $acc4,$acc5,[$tp,#8*4]
        ldp $acc6,$acc7,[$tp,#8*6]
        add $tp,$tp,#8*8
        stp $t0,$t1,[$rp,#8*4]
        sbcs $t0,$acc0,$a0
        stp $t2,$t3,[$rp,#8*6]
        add $rp,$rp,#8*8
        sbcs $t1,$acc1,$a1
        cbnz $cnt,.Lsqr8x_sub

        sbcs $t2,$acc2,$a2
        mov $tp,sp
        add $ap,sp,$num
        ldp $a0,$a1,[$ap_end,#8*0]
        sbcs $t3,$acc3,$a3
        stp $t0,$t1,[$rp,#8*0]
        sbcs $t0,$acc4,$a4
        ldp $a2,$a3,[$ap_end,#8*2]
        sbcs $t1,$acc5,$a5
        stp $t2,$t3,[$rp,#8*2]
        sbcs $t2,$acc6,$a6
        ldp $acc0,$acc1,[$ap,#8*0]
        sbcs $t3,$acc7,$a7
        ldp $acc2,$acc3,[$ap,#8*2]
        sbcs xzr,$topmost,xzr // did it borrow?
        ldr x30,[x29,#8] // pull return address
        stp $t0,$t1,[$rp,#8*4]
        stp $t2,$t3,[$rp,#8*6]

        sub $cnt,$num,#8*4
.Lsqr4x_cond_copy:
        sub $cnt,$cnt,#8*4
        csel $t0,$acc0,$a0,lo
        stp xzr,xzr,[$tp,#8*0]
        csel $t1,$acc1,$a1,lo
        ldp $a0,$a1,[$ap_end,#8*4]
        ldp $acc0,$acc1,[$ap,#8*4]
        csel $t2,$acc2,$a2,lo
        stp xzr,xzr,[$tp,#8*2]
        add $tp,$tp,#8*4
        csel $t3,$acc3,$a3,lo
        ldp $a2,$a3,[$ap_end,#8*6]
        ldp $acc2,$acc3,[$ap,#8*6]
        add $ap,$ap,#8*4
        stp $t0,$t1,[$ap_end,#8*0]
        stp $t2,$t3,[$ap_end,#8*2]
        add $ap_end,$ap_end,#8*4
        stp xzr,xzr,[$ap,#8*0]
        stp xzr,xzr,[$ap,#8*2]
        cbnz $cnt,.Lsqr4x_cond_copy

        csel $t0,$acc0,$a0,lo
        stp xzr,xzr,[$tp,#8*0]
        csel $t1,$acc1,$a1,lo
        stp xzr,xzr,[$tp,#8*2]
        csel $t2,$acc2,$a2,lo
        csel $t3,$acc3,$a3,lo
        stp $t0,$t1,[$ap_end,#8*0]
        stp $t2,$t3,[$ap_end,#8*2]

        b .Lsqr8x_done

.align 4
.Lsqr8x8_post_condition:
        adc $carry,xzr,xzr
        ldr x30,[x29,#8] // pull return address
        // $acc0-7,$carry hold result, $a0-7 hold modulus
        subs $a0,$acc0,$a0
        ldr $ap,[x29,#96] // pull rp
        sbcs $a1,$acc1,$a1
        stp xzr,xzr,[sp,#8*0]
        sbcs $a2,$acc2,$a2
        stp xzr,xzr,[sp,#8*2]
        sbcs $a3,$acc3,$a3
        stp xzr,xzr,[sp,#8*4]
        sbcs $a4,$acc4,$a4
        stp xzr,xzr,[sp,#8*6]
        sbcs $a5,$acc5,$a5
        stp xzr,xzr,[sp,#8*8]
        sbcs $a6,$acc6,$a6
        stp xzr,xzr,[sp,#8*10]
        sbcs $a7,$acc7,$a7
        stp xzr,xzr,[sp,#8*12]
        sbcs $carry,$carry,xzr // did it borrow?
        stp xzr,xzr,[sp,#8*14]

        // $a0-7 hold result-modulus
        csel $a0,$acc0,$a0,lo
        csel $a1,$acc1,$a1,lo
        csel $a2,$acc2,$a2,lo
        csel $a3,$acc3,$a3,lo
        stp $a0,$a1,[$ap,#8*0]
        csel $a4,$acc4,$a4,lo
        csel $a5,$acc5,$a5,lo
        stp $a2,$a3,[$ap,#8*2]
        csel $a6,$acc6,$a6,lo
        csel $a7,$acc7,$a7,lo
        stp $a4,$a5,[$ap,#8*4]
        stp $a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
        ldp x19,x20,[x29,#16]
        mov sp,x29
        ldp x21,x22,[x29,#32]
        mov x0,#1
        ldp x23,x24,[x29,#48]
        ldp x25,x26,[x29,#64]
        ldp x27,x28,[x29,#80]
        ldr x29,[sp],#128
        .inst 0xd50323bf // autiasp
        ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it's different in the sense that it
# performs reduction 256 bits at a time.

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");

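# One more illustrative sketch (an addition, hypothetical name): the
# constant-time finish shared by all routines here. "Comparison implies
# subtraction", so the assembly subtracts the modulus unconditionally and
# lets the borrow drive csel; functionally, with Math::BigInt values:
sub cond_sub_ref {
    my ($t, $n) = @_;                   # candidate result and modulus
    my $d = $t->copy->bsub($n);         # t - n; negative means "it borrowed"
    return $d->is_neg ? $t : $d;        # borrow -> keep t, else keep t - n
}
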
$code.=<<___;
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-128]!
        add x29,sp,#0
        stp x19,x20,[sp,#16]
        stp x21,x22,[sp,#32]
        stp x23,x24,[sp,#48]
        stp x25,x26,[sp,#64]
        stp x27,x28,[sp,#80]

        sub $tp,sp,$num,lsl#3
        lsl $num,$num,#3
        ldr $n0,[$n0] // *n0
        sub sp,$tp,#8*4 // alloca

        add $t0,$bp,$num
        add $ap_end,$ap,$num
        stp $rp,$t0,[x29,#96] // offload rp and &b[num]

        ldr $bi,[$bp,#8*0] // b[0]
        ldp $a0,$a1,[$ap,#8*0] // a[0..3]
        ldp $a2,$a3,[$ap,#8*2]
        add $ap,$ap,#8*4
        mov $acc0,xzr
        mov $acc1,xzr
        mov $acc2,xzr
        mov $acc3,xzr
        ldp $m0,$m1,[$np,#8*0] // n[0..3]
        ldp $m2,$m3,[$np,#8*2]
        adds $np,$np,#8*4 // clear carry bit
        mov $carry,xzr
        mov $cnt,#0
        mov $tp,sp

.Loop_mul4x_1st_reduction:
        mul $t0,$a0,$bi // lo(a[0..3]*b[0])
        adc $carry,$carry,xzr // modulo-scheduled
        mul $t1,$a1,$bi
        add $cnt,$cnt,#8
        mul $t2,$a2,$bi
        and $cnt,$cnt,#31
        mul $t3,$a3,$bi
        adds $acc0,$acc0,$t0
        umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
        adcs $acc1,$acc1,$t1
        mul $mi,$acc0,$n0 // t[0]*n0
        adcs $acc2,$acc2,$t2
        umulh $t1,$a1,$bi
        adcs $acc3,$acc3,$t3
        umulh $t2,$a2,$bi
        adc $acc4,xzr,xzr
        umulh $t3,$a3,$bi
        ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
        adds $acc1,$acc1,$t0
// (*)  mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
        str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
        adcs $acc2,$acc2,$t1
        mul $t1,$m1,$mi
        adcs $acc3,$acc3,$t2
        mul $t2,$m2,$mi
        adc $acc4,$acc4,$t3 // can't overflow
        mul $t3,$m3,$mi
// (*)  adds xzr,$acc0,$t0
        subs xzr,$acc0,#1 // (*)
        umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
        adcs $acc0,$acc1,$t1
        umulh $t1,$m1,$mi
        adcs $acc1,$acc2,$t2
        umulh $t2,$m2,$mi
        adcs $acc2,$acc3,$t3
        umulh $t3,$m3,$mi
        adcs $acc3,$acc4,$carry
        adc $carry,xzr,xzr
        adds $acc0,$acc0,$t0
        sub $t0,$ap_end,$ap
        adcs $acc1,$acc1,$t1
        adcs $acc2,$acc2,$t2
        adcs $acc3,$acc3,$t3
        //adc $carry,$carry,xzr
        cbnz $cnt,.Loop_mul4x_1st_reduction

        cbz $t0,.Lmul4x4_post_condition

        ldp $a0,$a1,[$ap,#8*0] // a[4..7]
        ldp $a2,$a3,[$ap,#8*2]
        add $ap,$ap,#8*4
        ldr $mi,[sp] // a[0]*n0
        ldp $m0,$m1,[$np,#8*0] // n[4..7]
        ldp $m2,$m3,[$np,#8*2]
        add $np,$np,#8*4

.Loop_mul4x_1st_tail:
        mul $t0,$a0,$bi // lo(a[4..7]*b[i])
        adc $carry,$carry,xzr // modulo-scheduled
        mul $t1,$a1,$bi
        add $cnt,$cnt,#8
        mul $t2,$a2,$bi
        and $cnt,$cnt,#31
        mul $t3,$a3,$bi
        adds $acc0,$acc0,$t0
        umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
        adcs $acc1,$acc1,$t1
        umulh $t1,$a1,$bi
        adcs $acc2,$acc2,$t2
        umulh $t2,$a2,$bi
        adcs $acc3,$acc3,$t3
        umulh $t3,$a3,$bi
        adc $acc4,xzr,xzr
        ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
        adds $acc1,$acc1,$t0
        mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
        adcs $acc2,$acc2,$t1
        mul $t1,$m1,$mi
        adcs $acc3,$acc3,$t2
        mul $t2,$m2,$mi
        adc $acc4,$acc4,$t3 // can't overflow
        mul $t3,$m3,$mi
        adds $acc0,$acc0,$t0
        umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
        adcs $acc1,$acc1,$t1
        umulh $t1,$m1,$mi
        adcs $acc2,$acc2,$t2
        umulh $t2,$m2,$mi
        adcs $acc3,$acc3,$t3
        adcs $acc4,$acc4,$carry
        umulh $t3,$m3,$mi
        adc $carry,xzr,xzr
        ldr $mi,[sp,$cnt] // next t[0]*n0
        str $acc0,[$tp],#8 // result!!!
        adds $acc0,$acc1,$t0
        sub $t0,$ap_end,$ap // done yet?
        adcs $acc1,$acc2,$t1
        adcs $acc2,$acc3,$t2
        adcs $acc3,$acc4,$t3
        //adc $carry,$carry,xzr
        cbnz $cnt,.Loop_mul4x_1st_tail

        sub $t1,$ap_end,$num // rewound $ap
        cbz $t0,.Lmul4x_proceed

        ldp $a0,$a1,[$ap,#8*0]
        ldp $a2,$a3,[$ap,#8*2]
        add $ap,$ap,#8*4
        ldp $m0,$m1,[$np,#8*0]
        ldp $m2,$m3,[$np,#8*2]
        add $np,$np,#8*4
        b .Loop_mul4x_1st_tail

.align 5
.Lmul4x_proceed:
        ldr $bi,[$bp,#8*4]! // *++b
        adc $topmost,$carry,xzr
        ldp $a0,$a1,[$t1,#8*0] // a[0..3]
        sub $np,$np,$num // rewind np
        ldp $a2,$a3,[$t1,#8*2]
        add $ap,$t1,#8*4

        stp $acc0,$acc1,[$tp,#8*0] // result!!!
        ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
        stp $acc2,$acc3,[$tp,#8*2] // result!!!
        ldp $acc2,$acc3,[sp,#8*6]

        ldp $m0,$m1,[$np,#8*0] // n[0..3]
        mov $tp,sp
        ldp $m2,$m3,[$np,#8*2]
        adds $np,$np,#8*4 // clear carry bit
        mov $carry,xzr

.align 4
.Loop_mul4x_reduction:
        mul $t0,$a0,$bi // lo(a[0..3]*b[4])
        adc $carry,$carry,xzr // modulo-scheduled
        mul $t1,$a1,$bi
        add $cnt,$cnt,#8
        mul $t2,$a2,$bi
        and $cnt,$cnt,#31
        mul $t3,$a3,$bi
        adds $acc0,$acc0,$t0
        umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
        adcs $acc1,$acc1,$t1
        mul $mi,$acc0,$n0 // t[0]*n0
        adcs $acc2,$acc2,$t2
        umulh $t1,$a1,$bi
        adcs $acc3,$acc3,$t3
        umulh $t2,$a2,$bi
        adc $acc4,xzr,xzr
        umulh $t3,$a3,$bi
        ldr $bi,[$bp,$cnt] // next b[i]
        adds $acc1,$acc1,$t0
// (*)  mul $t0,$m0,$mi
        str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
        adcs $acc2,$acc2,$t1
        mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
        adcs $acc3,$acc3,$t2
        mul $t2,$m2,$mi
        adc $acc4,$acc4,$t3 // can't overflow
        mul $t3,$m3,$mi
// (*)  adds xzr,$acc0,$t0
        subs xzr,$acc0,#1 // (*)
        umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
        adcs $acc0,$acc1,$t1
        umulh $t1,$m1,$mi
        adcs $acc1,$acc2,$t2
        umulh $t2,$m2,$mi
        adcs $acc2,$acc3,$t3
        umulh $t3,$m3,$mi
        adcs $acc3,$acc4,$carry
        adc $carry,xzr,xzr
        adds $acc0,$acc0,$t0
        adcs $acc1,$acc1,$t1
        adcs $acc2,$acc2,$t2
        adcs $acc3,$acc3,$t3
        //adc $carry,$carry,xzr
        cbnz $cnt,.Loop_mul4x_reduction

        adc $carry,$carry,xzr
        ldp $t0,$t1,[$tp,#8*4] // t[4..7]
        ldp $t2,$t3,[$tp,#8*6]
        ldp $a0,$a1,[$ap,#8*0] // a[4..7]
        ldp $a2,$a3,[$ap,#8*2]
        add $ap,$ap,#8*4
        adds $acc0,$acc0,$t0
        adcs $acc1,$acc1,$t1
        adcs $acc2,$acc2,$t2
        adcs $acc3,$acc3,$t3
        //adc $carry,$carry,xzr

        ldr $mi,[sp] // t[0]*n0
        ldp $m0,$m1,[$np,#8*0] // n[4..7]
        ldp $m2,$m3,[$np,#8*2]
        add $np,$np,#8*4

.align 4
.Loop_mul4x_tail:
        mul $t0,$a0,$bi // lo(a[4..7]*b[4])
        adc $carry,$carry,xzr // modulo-scheduled
        mul $t1,$a1,$bi
        add $cnt,$cnt,#8
        mul $t2,$a2,$bi
        and $cnt,$cnt,#31
        mul $t3,$a3,$bi
        adds $acc0,$acc0,$t0
        umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
        adcs $acc1,$acc1,$t1
        umulh $t1,$a1,$bi
        adcs $acc2,$acc2,$t2
        umulh $t2,$a2,$bi
        adcs $acc3,$acc3,$t3
        umulh $t3,$a3,$bi
        adc $acc4,xzr,xzr
        ldr $bi,[$bp,$cnt] // next b[i]
        adds $acc1,$acc1,$t0
        mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
        adcs $acc2,$acc2,$t1
        mul $t1,$m1,$mi
        adcs $acc3,$acc3,$t2
        mul $t2,$m2,$mi
        adc $acc4,$acc4,$t3 // can't overflow
        mul $t3,$m3,$mi
        adds $acc0,$acc0,$t0
        umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
        adcs $acc1,$acc1,$t1
        umulh $t1,$m1,$mi
        adcs $acc2,$acc2,$t2
        umulh $t2,$m2,$mi
        adcs $acc3,$acc3,$t3
        umulh $t3,$m3,$mi
        adcs $acc4,$acc4,$carry
        ldr $mi,[sp,$cnt] // next a[0]*n0
        adc $carry,xzr,xzr
        str $acc0,[$tp],#8 // result!!!
        adds $acc0,$acc1,$t0
        sub $t0,$ap_end,$ap // done yet?
        adcs $acc1,$acc2,$t1
        adcs $acc2,$acc3,$t2
        adcs $acc3,$acc4,$t3
        //adc $carry,$carry,xzr
        cbnz $cnt,.Loop_mul4x_tail

        sub $t1,$np,$num // rewound np
        adc $carry,$carry,xzr
        cbz $t0,.Loop_mul4x_break

        ldp $t0,$t1,[$tp,#8*4]
        ldp $t2,$t3,[$tp,#8*6]
        ldp $a0,$a1,[$ap,#8*0]
        ldp $a2,$a3,[$ap,#8*2]
        add $ap,$ap,#8*4
        adds $acc0,$acc0,$t0
        adcs $acc1,$acc1,$t1
        adcs $acc2,$acc2,$t2
        adcs $acc3,$acc3,$t3
        //adc $carry,$carry,xzr
        ldp $m0,$m1,[$np,#8*0]
        ldp $m2,$m3,[$np,#8*2]
        add $np,$np,#8*4
        b .Loop_mul4x_tail

.align 4
.Loop_mul4x_break:
        ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
        adds $acc0,$acc0,$topmost
        add $bp,$bp,#8*4 // bp++
        adcs $acc1,$acc1,xzr
        sub $ap,$ap,$num // rewind ap
        adcs $acc2,$acc2,xzr
        stp $acc0,$acc1,[$tp,#8*0] // result!!!
        adcs $acc3,$acc3,xzr
        ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
        adc $topmost,$carry,xzr
        stp $acc2,$acc3,[$tp,#8*2] // result!!!
        cmp $bp,$t3 // done yet?
        ldp $acc2,$acc3,[sp,#8*6]
        ldp $m0,$m1,[$t1,#8*0] // n[0..3]
        ldp $m2,$m3,[$t1,#8*2]
        add $np,$t1,#8*4
        b.eq .Lmul4x_post

        ldr $bi,[$bp]
        ldp $a0,$a1,[$ap,#8*0] // a[0..3]
        ldp $a2,$a3,[$ap,#8*2]
        adds $ap,$ap,#8*4 // clear carry bit
        mov $carry,xzr
        mov $tp,sp
        b .Loop_mul4x_reduction

.align 4
.Lmul4x_post:
        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
        // and conditionally copy original value.
        mov $rp,$t2
        mov $ap_end,$t2 // $rp copy
        subs $t0,$acc0,$m0
        add $tp,sp,#8*8
        sbcs $t1,$acc1,$m1
        sub $cnt,$num,#8*4

.Lmul4x_sub:
        sbcs $t2,$acc2,$m2
        ldp $m0,$m1,[$np,#8*0]
        sub $cnt,$cnt,#8*4
        ldp $acc0,$acc1,[$tp,#8*0]
        sbcs $t3,$acc3,$m3
        ldp $m2,$m3,[$np,#8*2]
        add $np,$np,#8*4
        ldp $acc2,$acc3,[$tp,#8*2]
        add $tp,$tp,#8*4
        stp $t0,$t1,[$rp,#8*0]
        sbcs $t0,$acc0,$m0
        stp $t2,$t3,[$rp,#8*2]
        add $rp,$rp,#8*4
        sbcs $t1,$acc1,$m1
        cbnz $cnt,.Lmul4x_sub

        sbcs $t2,$acc2,$m2
        mov $tp,sp
        add $ap,sp,#8*4
        ldp $a0,$a1,[$ap_end,#8*0]
        sbcs $t3,$acc3,$m3
        stp $t0,$t1,[$rp,#8*0]
        ldp $a2,$a3,[$ap_end,#8*2]
        stp $t2,$t3,[$rp,#8*2]
        ldp $acc0,$acc1,[$ap,#8*0]
        ldp $acc2,$acc3,[$ap,#8*2]
        sbcs xzr,$topmost,xzr // did it borrow?
        ldr x30,[x29,#8] // pull return address

        sub $cnt,$num,#8*4
.Lmul4x_cond_copy:
        sub $cnt,$cnt,#8*4
        csel $t0,$acc0,$a0,lo
        stp xzr,xzr,[$tp,#8*0]
        csel $t1,$acc1,$a1,lo
        ldp $a0,$a1,[$ap_end,#8*4]
        ldp $acc0,$acc1,[$ap,#8*4]
        csel $t2,$acc2,$a2,lo
        stp xzr,xzr,[$tp,#8*2]
        add $tp,$tp,#8*4
        csel $t3,$acc3,$a3,lo
        ldp $a2,$a3,[$ap_end,#8*6]
        ldp $acc2,$acc3,[$ap,#8*6]
        add $ap,$ap,#8*4
        stp $t0,$t1,[$ap_end,#8*0]
        stp $t2,$t3,[$ap_end,#8*2]
        add $ap_end,$ap_end,#8*4
        cbnz $cnt,.Lmul4x_cond_copy

        csel $t0,$acc0,$a0,lo
        stp xzr,xzr,[$tp,#8*0]
        csel $t1,$acc1,$a1,lo
        stp xzr,xzr,[$tp,#8*2]
        csel $t2,$acc2,$a2,lo
        stp xzr,xzr,[$tp,#8*3]
        csel $t3,$acc3,$a3,lo
        stp xzr,xzr,[$tp,#8*4]
        stp $t0,$t1,[$ap_end,#8*0]
        stp $t2,$t3,[$ap_end,#8*2]

        b .Lmul4x_done

.align 4
.Lmul4x4_post_condition:
        adc $carry,$carry,xzr
        ldr $ap,[x29,#96] // pull rp
        // $acc0-3,$carry hold result, $m0-3 hold modulus
        subs $a0,$acc0,$m0
        ldr x30,[x29,#8] // pull return address
        sbcs $a1,$acc1,$m1
        stp xzr,xzr,[sp,#8*0]
        sbcs $a2,$acc2,$m2
        stp xzr,xzr,[sp,#8*2]
        sbcs $a3,$acc3,$m3
        stp xzr,xzr,[sp,#8*4]
        sbcs xzr,$carry,xzr // did it borrow?
        stp xzr,xzr,[sp,#8*6]

        // $a0-3 hold result-modulus
        csel $a0,$acc0,$a0,lo
        csel $a1,$acc1,$a1,lo
        csel $a2,$acc2,$a2,lo
        csel $a3,$acc3,$a3,lo
        stp $a0,$a1,[$ap,#8*0]
        stp $a2,$a3,[$ap,#8*2]

.Lmul4x_done:
        ldp x19,x20,[x29,#16]
        mov sp,x29
        ldp x21,x22,[x29,#32]
        mov x0,#1
        ldp x23,x24,[x29,#48]
        ldp x25,x26,[x29,#64]
        ldp x27,x28,[x29,#80]
        ldr x29,[sp],#128
        .inst 0xd50323bf // autiasp
        ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___

print $code;

close STDOUT or die "error closing STDOUT: $!";