#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must.
# Well, actually all contemporary AArch64 processors seem to have a
# limited multiplication issue rate, i.e. they can't issue a
# multiplication every cycle, which explains the moderate improvement
# coefficients in comparison to compiler-generated code. Recall that
# the compiler is instructed to use umulh and therefore uses the same
# number of multiplication instructions to do the job. Assembly's edge
# is minimizing the number of "collateral" instructions and, of
# course, instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while others exhibit
# e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25%
# faster on Cortex-A57 and ~60-100% faster on others.

$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

58# int bn_mul_mont(
59$rp="x0"; # BN_ULONG *rp,
60$ap="x1"; # const BN_ULONG *ap,
61$bp="x2"; # const BN_ULONG *bp,
62$np="x3"; # const BN_ULONG *np,
63$n0="x4"; # const BN_ULONG *n0,
64$num="x5"; # int num);
65
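# n0 points at the standard Montgomery constant -1/np[0] mod 2^64, so
# each outer iteration below effectively computes
#	t = (t + ap[]*bp[i] + m1*np[]) / 2^64,
# where m1 = (t[0] + ap[0]*bp[i])*n0 mod 2^64 is chosen so that the
# lowest limb of the sum vanishes.
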
$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for the removal of the first multiplication and
	//	addition instructions: the outcome of the first addition
	//	is guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't.
	//	So when does it carry? Is there an alternative way to
	//	deduce it? If you follow the operations, you can observe
	//	that the condition for carry is quite simple: $lo0 being
	//	non-zero. So the carry can be calculated by adding -1 to
	//	$lo0, which is what the next instruction does.
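	//	(Indeed, $m1 = $lo0*n0 mod 2^64 with n0 = -1/np[0]
	//	mod 2^64, so lo(np[0]*$m1) = (2^64-$lo0) mod 2^64;
	//	the discarded sum is 0 when $lo0==0 and exactly 2^64,
	//	i.e. a carry, otherwise.)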
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
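	// The conditional copy below is branchless: after the
	// subtraction, "lo" (borrow) means tp[] was already fully
	// reduced and is kept, otherwise the subtracted value in rp[]
	// is kept; tp[] is wiped with zeros either way.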
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# Following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)
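	// i.e. all 8*7/2=28 off-diagonal products; the result is later
	// doubled and the diagonal a[i]*a[i] terms are added in
	// .Lsqr8x_outer_break below.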

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+hi(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
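	// (indices are hexadecimal, i.e. a[a]..a[f] denote a[10]..a[15])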
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
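	// The doubling is done on the fly: "extr xD,xHI,xLO,#63" computes
	// (xHI<<1)|(xLO>>63), i.e. the next limb of the doubled value.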
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
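	// (eight t[] limbs are eliminated per pass; the t[0]*n0 factors
	// are put aside below for the tail processing of the remaining
	// n[] limbs)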
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
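	// ($topmost, the redundant top limb of t[], is folded into the
	// borrow check right after .Lsqr8x_sub)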
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it's different in the sense that it
# performs reduction 256 bits at a time.

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");
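# ($rp is safe to reuse here because rp proper is offloaded to the
# stack frame right after the prologue)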

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

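	// Note: $cnt advances by 8 and is masked with #31 below, i.e. it
	// is a byte offset into b[] (and into the stash of t[0]*n0 values
	// at sp) that wraps around every four limbs.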
.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	mov	$tp,sp
	add	$ap,sp,#8*4
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	ldp	$acc0,$acc1,[$ap,#8*0]
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT;