#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.
#
# April 2015
#
# Squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 improvement
# is still modest on longest key lengths, while others exhibit e.g.
# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
# on Cortex-A57 and ~60-100% faster on others.

# Command line is: <flavour> <output>, both forwarded verbatim to the
# perlasm translator below.
$flavour = shift;
$output  = shift;

# Locate the shared arm-xlate.pl translator relative to this script's own
# location: first alongside the script, then in the canonical
# ../../perlasm directory of an OpenSSL source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator and alias STDOUT to the
# pipe so the $code emitters below need no explicit filehandle.  The open
# was previously unchecked, so a failure to spawn the translator silently
# produced empty output -- diagnose it instead.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
53 | ||
# Scratch-register roles for the generic Montgomery loop.  Values are
# AArch64 register names interpolated into the assembly templates below.
# x18 is deliberately skipped (reserved platform register per AAPCS64);
# the range therefore jumps from x17 to x19.
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf,$i,$j,$tp,$tj) = map { "x$_" } (6..17, 19..24);

# Argument registers of
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# in AAPCS64 order: rp=x0, ap=x1, bp=x2, np=x3, n0=x4, num=x5.
($rp,$ap,$bp,$np,$n0,$num) = map { "x$_" } (0..5);
65 | ||
# Generic word-by-word Montgomery multiplication: one word of bp[] per
# outer-loop iteration, num inner-loop steps each.  The entry point also
# dispatches to the specialized paths: num%8==0 -> __bn_sqr8x_mont
# (which itself falls through to mul4x when ap!=bp), num%4==0 ->
# __bn_mul4x_mont.  Flag-carry scheduling below is exact; see the (*)
# commentary for the removed first multiply/add of each reduction.
$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	$lo0 being non-zero. So that carry can be calculated
	//	by adding -1 to $lo0. That's what next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
# Handles num divisible by 8 with ap==bp (true squaring); otherwise it
# branches to __bn_mul4x_mont.  Register roles are remapped lexically
# below; note that $tp/$ap_end/$na0 alias the now-free argument
# registers $bp/$np ($bp is consumed up front, rp/np are offloaded to
# the frame at [x29,#96]).

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// schedule of cross products for one 8-word window:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewinded ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
# $ap/$ap_end are free after the doubling pass; reuse them for the
# modulus pointer during reduction.
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewinded np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}
1047 | ||
1048 | { | |
1049 | ######################################################################## | |
1050 | # Even though this might look as ARMv8 adaptation of mulx4x_mont from | |
1051 | # x86_64-mont5 module, it's different in sense that it performs | |
1052 | # reduction 256 bits at a time. | |
1053 | ||
# Register map for the mul4x path: x6..x17 and x19..x28 hold the working
# set ($a0-3 = four a[] limbs, $m0-3 = four n[] limbs, $t0-3 = mul/umulh
# partial products, $acc0-4 = running accumulator, $bi = current b[i],
# $mi = t[0]*n0 Montgomery factor, $tp = temp pointer, $cnt = 32-byte
# modulo counter driving the 4-iteration inner loops).
# NOTE: $carry deliberately aliases $rp — rp is offloaded to [x29,#96]
# right after the prologue and pulled back at .Loop_mul4x_break /
# .Lmul4x4_post_condition.  $topmost aliases x30 (link register), which
# is re-loaded from [x29,#8] before ret.
my ($a0,$a1,$a2,$a3, | |
1055 | $t0,$t1,$t2,$t3, | |
1056 | $m0,$m1,$m2,$m3, | |
1057 | $acc0,$acc1,$acc2,$acc3,$acc4, | |
1058 | $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); | |
1059 | my $bp_end=$rp; | |
1060 | my ($carry,$topmost) = ($rp,"x30"); | |
1061 | ||
# Emitted AArch64 assembly follows (heredoc = verbatim output; nothing
# inside it may be reworded without changing the generated .S file).
# The "(*)" lines document an elided multiplication: by the Montgomery
# invariant, lo(n[0]*$mi) is presumably the exact complement of $acc0,
# so "subs xzr,$acc0,#1" reproduces the carry the commented-out
# "adds xzr,$acc0,$t0" would have set — confirm against the sqr8x path
# above which uses the same trick.
$code.=<<___; | |
1063 | .type __bn_mul4x_mont,%function | |
1064 | .align 5 | |
1065 | __bn_mul4x_mont: | |
1066 | stp x29,x30,[sp,#-128]! | |
1067 | add x29,sp,#0 | |
1068 | stp x19,x20,[sp,#16] | |
1069 | stp x21,x22,[sp,#32] | |
1070 | stp x23,x24,[sp,#48] | |
1071 | stp x25,x26,[sp,#64] | |
1072 | stp x27,x28,[sp,#80] | |
1073 | ||
1074 | sub $tp,sp,$num,lsl#3 | |
1075 | lsl $num,$num,#3 | |
1076 | ldr $n0,[$n0] // *n0 | |
1077 | sub sp,$tp,#8*4 // alloca | |
1078 | ||
1079 | add $t0,$bp,$num | |
1080 | add $ap_end,$ap,$num | |
1081 | stp $rp,$t0,[x29,#96] // offload rp and &b[num] | |
1082 | ||
1083 | ldr $bi,[$bp,#8*0] // b[0] | |
1084 | ldp $a0,$a1,[$ap,#8*0] // a[0..3] | |
1085 | ldp $a2,$a3,[$ap,#8*2] | |
1086 | add $ap,$ap,#8*4 | |
1087 | mov $acc0,xzr | |
1088 | mov $acc1,xzr | |
1089 | mov $acc2,xzr | |
1090 | mov $acc3,xzr | |
1091 | ldp $m0,$m1,[$np,#8*0] // n[0..3] | |
1092 | ldp $m2,$m3,[$np,#8*2] | |
1093 | adds $np,$np,#8*4 // clear carry bit | |
1094 | mov $carry,xzr | |
1095 | mov $cnt,#0 | |
1096 | mov $tp,sp | |
1097 | ||
1098 | .Loop_mul4x_1st_reduction: | |
1099 | mul $t0,$a0,$bi // lo(a[0..3]*b[0]) | |
1100 | adc $carry,$carry,xzr // modulo-scheduled | |
1101 | mul $t1,$a1,$bi | |
1102 | add $cnt,$cnt,#8 | |
1103 | mul $t2,$a2,$bi | |
1104 | and $cnt,$cnt,#31 | |
1105 | mul $t3,$a3,$bi | |
1106 | adds $acc0,$acc0,$t0 | |
1107 | umulh $t0,$a0,$bi // hi(a[0..3]*b[0]) | |
1108 | adcs $acc1,$acc1,$t1 | |
1109 | mul $mi,$acc0,$n0 // t[0]*n0 | |
1110 | adcs $acc2,$acc2,$t2 | |
1111 | umulh $t1,$a1,$bi | |
1112 | adcs $acc3,$acc3,$t3 | |
1113 | umulh $t2,$a2,$bi | |
1114 | adc $acc4,xzr,xzr | |
1115 | umulh $t3,$a3,$bi | |
1116 | ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) | |
1117 | adds $acc1,$acc1,$t0 | |
1118 | // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0) | |
1119 | str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing | |
1120 | adcs $acc2,$acc2,$t1 | |
1121 | mul $t1,$m1,$mi | |
1122 | adcs $acc3,$acc3,$t2 | |
1123 | mul $t2,$m2,$mi | |
1124 | adc $acc4,$acc4,$t3 // can't overflow | |
1125 | mul $t3,$m3,$mi | |
1126 | // (*) adds xzr,$acc0,$t0 | |
1127 | subs xzr,$acc0,#1 // (*) | |
1128 | umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0) | |
1129 | adcs $acc0,$acc1,$t1 | |
1130 | umulh $t1,$m1,$mi | |
1131 | adcs $acc1,$acc2,$t2 | |
1132 | umulh $t2,$m2,$mi | |
1133 | adcs $acc2,$acc3,$t3 | |
1134 | umulh $t3,$m3,$mi | |
1135 | adcs $acc3,$acc4,$carry | |
1136 | adc $carry,xzr,xzr | |
1137 | adds $acc0,$acc0,$t0 | |
1138 | sub $t0,$ap_end,$ap | |
1139 | adcs $acc1,$acc1,$t1 | |
1140 | adcs $acc2,$acc2,$t2 | |
1141 | adcs $acc3,$acc3,$t3 | |
1142 | //adc $carry,$carry,xzr | |
1143 | cbnz $cnt,.Loop_mul4x_1st_reduction | |
1144 | ||
1145 | cbz $t0,.Lmul4x4_post_condition | |
1146 | ||
1147 | ldp $a0,$a1,[$ap,#8*0] // a[4..7] | |
1148 | ldp $a2,$a3,[$ap,#8*2] | |
1149 | add $ap,$ap,#8*4 | |
1150 | ldr $mi,[sp] // a[0]*n0 | |
1151 | ldp $m0,$m1,[$np,#8*0] // n[4..7] | |
1152 | ldp $m2,$m3,[$np,#8*2] | |
1153 | add $np,$np,#8*4 | |
1154 | ||
1155 | .Loop_mul4x_1st_tail: | |
1156 | mul $t0,$a0,$bi // lo(a[4..7]*b[i]) | |
1157 | adc $carry,$carry,xzr // modulo-scheduled | |
1158 | mul $t1,$a1,$bi | |
1159 | add $cnt,$cnt,#8 | |
1160 | mul $t2,$a2,$bi | |
1161 | and $cnt,$cnt,#31 | |
1162 | mul $t3,$a3,$bi | |
1163 | adds $acc0,$acc0,$t0 | |
1164 | umulh $t0,$a0,$bi // hi(a[4..7]*b[i]) | |
1165 | adcs $acc1,$acc1,$t1 | |
1166 | umulh $t1,$a1,$bi | |
1167 | adcs $acc2,$acc2,$t2 | |
1168 | umulh $t2,$a2,$bi | |
1169 | adcs $acc3,$acc3,$t3 | |
1170 | umulh $t3,$a3,$bi | |
1171 | adc $acc4,xzr,xzr | |
1172 | ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) | |
1173 | adds $acc1,$acc1,$t0 | |
1174 | mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0) | |
1175 | adcs $acc2,$acc2,$t1 | |
1176 | mul $t1,$m1,$mi | |
1177 | adcs $acc3,$acc3,$t2 | |
1178 | mul $t2,$m2,$mi | |
1179 | adc $acc4,$acc4,$t3 // can't overflow | |
1180 | mul $t3,$m3,$mi | |
1181 | adds $acc0,$acc0,$t0 | |
1182 | umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0) | |
1183 | adcs $acc1,$acc1,$t1 | |
1184 | umulh $t1,$m1,$mi | |
1185 | adcs $acc2,$acc2,$t2 | |
1186 | umulh $t2,$m2,$mi | |
1187 | adcs $acc3,$acc3,$t3 | |
1188 | adcs $acc4,$acc4,$carry | |
1189 | umulh $t3,$m3,$mi | |
1190 | adc $carry,xzr,xzr | |
1191 | ldr $mi,[sp,$cnt] // next t[0]*n0 | |
1192 | str $acc0,[$tp],#8 // result!!! | |
1193 | adds $acc0,$acc1,$t0 | |
1194 | sub $t0,$ap_end,$ap // done yet? | |
1195 | adcs $acc1,$acc2,$t1 | |
1196 | adcs $acc2,$acc3,$t2 | |
1197 | adcs $acc3,$acc4,$t3 | |
1198 | //adc $carry,$carry,xzr | |
1199 | cbnz $cnt,.Loop_mul4x_1st_tail | |
1200 | ||
1201 | sub $t1,$ap_end,$num // rewinded $ap | |
1202 | cbz $t0,.Lmul4x_proceed | |
1203 | ||
1204 | ldp $a0,$a1,[$ap,#8*0] | |
1205 | ldp $a2,$a3,[$ap,#8*2] | |
1206 | add $ap,$ap,#8*4 | |
1207 | ldp $m0,$m1,[$np,#8*0] | |
1208 | ldp $m2,$m3,[$np,#8*2] | |
1209 | add $np,$np,#8*4 | |
1210 | b .Loop_mul4x_1st_tail | |
1211 | ||
1212 | .align 5 | |
1213 | .Lmul4x_proceed: | |
1214 | ldr $bi,[$bp,#8*4]! // *++b | |
1215 | adc $topmost,$carry,xzr | |
1216 | ldp $a0,$a1,[$t1,#8*0] // a[0..3] | |
1217 | sub $np,$np,$num // rewind np | |
1218 | ldp $a2,$a3,[$t1,#8*2] | |
1219 | add $ap,$t1,#8*4 | |
1220 | ||
1221 | stp $acc0,$acc1,[$tp,#8*0] // result!!! | |
1222 | ldp $acc0,$acc1,[sp,#8*4] // t[0..3] | |
1223 | stp $acc2,$acc3,[$tp,#8*2] // result!!! | |
1224 | ldp $acc2,$acc3,[sp,#8*6] | |
1225 | ||
1226 | ldp $m0,$m1,[$np,#8*0] // n[0..3] | |
1227 | mov $tp,sp | |
1228 | ldp $m2,$m3,[$np,#8*2] | |
1229 | adds $np,$np,#8*4 // clear carry bit | |
1230 | mov $carry,xzr | |
1231 | ||
1232 | .align 4 | |
1233 | .Loop_mul4x_reduction: | |
1234 | mul $t0,$a0,$bi // lo(a[0..3]*b[4]) | |
1235 | adc $carry,$carry,xzr // modulo-scheduled | |
1236 | mul $t1,$a1,$bi | |
1237 | add $cnt,$cnt,#8 | |
1238 | mul $t2,$a2,$bi | |
1239 | and $cnt,$cnt,#31 | |
1240 | mul $t3,$a3,$bi | |
1241 | adds $acc0,$acc0,$t0 | |
1242 | umulh $t0,$a0,$bi // hi(a[0..3]*b[4]) | |
1243 | adcs $acc1,$acc1,$t1 | |
1244 | mul $mi,$acc0,$n0 // t[0]*n0 | |
1245 | adcs $acc2,$acc2,$t2 | |
1246 | umulh $t1,$a1,$bi | |
1247 | adcs $acc3,$acc3,$t3 | |
1248 | umulh $t2,$a2,$bi | |
1249 | adc $acc4,xzr,xzr | |
1250 | umulh $t3,$a3,$bi | |
1251 | ldr $bi,[$bp,$cnt] // next b[i] | |
1252 | adds $acc1,$acc1,$t0 | |
1253 | // (*) mul $t0,$m0,$mi | |
1254 | str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing | |
1255 | adcs $acc2,$acc2,$t1 | |
1256 | mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0 | |
1257 | adcs $acc3,$acc3,$t2 | |
1258 | mul $t2,$m2,$mi | |
1259 | adc $acc4,$acc4,$t3 // can't overflow | |
1260 | mul $t3,$m3,$mi | |
1261 | // (*) adds xzr,$acc0,$t0 | |
1262 | subs xzr,$acc0,#1 // (*) | |
1263 | umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0 | |
1264 | adcs $acc0,$acc1,$t1 | |
1265 | umulh $t1,$m1,$mi | |
1266 | adcs $acc1,$acc2,$t2 | |
1267 | umulh $t2,$m2,$mi | |
1268 | adcs $acc2,$acc3,$t3 | |
1269 | umulh $t3,$m3,$mi | |
1270 | adcs $acc3,$acc4,$carry | |
1271 | adc $carry,xzr,xzr | |
1272 | adds $acc0,$acc0,$t0 | |
1273 | adcs $acc1,$acc1,$t1 | |
1274 | adcs $acc2,$acc2,$t2 | |
1275 | adcs $acc3,$acc3,$t3 | |
1276 | //adc $carry,$carry,xzr | |
1277 | cbnz $cnt,.Loop_mul4x_reduction | |
1278 | ||
1279 | adc $carry,$carry,xzr | |
1280 | ldp $t0,$t1,[$tp,#8*4] // t[4..7] | |
1281 | ldp $t2,$t3,[$tp,#8*6] | |
1282 | ldp $a0,$a1,[$ap,#8*0] // a[4..7] | |
1283 | ldp $a2,$a3,[$ap,#8*2] | |
1284 | add $ap,$ap,#8*4 | |
1285 | adds $acc0,$acc0,$t0 | |
1286 | adcs $acc1,$acc1,$t1 | |
1287 | adcs $acc2,$acc2,$t2 | |
1288 | adcs $acc3,$acc3,$t3 | |
1289 | //adc $carry,$carry,xzr | |
1290 | ||
1291 | ldr $mi,[sp] // t[0]*n0 | |
1292 | ldp $m0,$m1,[$np,#8*0] // n[4..7] | |
1293 | ldp $m2,$m3,[$np,#8*2] | |
1294 | add $np,$np,#8*4 | |
1295 | ||
1296 | .align 4 | |
1297 | .Loop_mul4x_tail: | |
1298 | mul $t0,$a0,$bi // lo(a[4..7]*b[4]) | |
1299 | adc $carry,$carry,xzr // modulo-scheduled | |
1300 | mul $t1,$a1,$bi | |
1301 | add $cnt,$cnt,#8 | |
1302 | mul $t2,$a2,$bi | |
1303 | and $cnt,$cnt,#31 | |
1304 | mul $t3,$a3,$bi | |
1305 | adds $acc0,$acc0,$t0 | |
1306 | umulh $t0,$a0,$bi // hi(a[4..7]*b[4]) | |
1307 | adcs $acc1,$acc1,$t1 | |
1308 | umulh $t1,$a1,$bi | |
1309 | adcs $acc2,$acc2,$t2 | |
1310 | umulh $t2,$a2,$bi | |
1311 | adcs $acc3,$acc3,$t3 | |
1312 | umulh $t3,$a3,$bi | |
1313 | adc $acc4,xzr,xzr | |
1314 | ldr $bi,[$bp,$cnt] // next b[i] | |
1315 | adds $acc1,$acc1,$t0 | |
1316 | mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0) | |
1317 | adcs $acc2,$acc2,$t1 | |
1318 | mul $t1,$m1,$mi | |
1319 | adcs $acc3,$acc3,$t2 | |
1320 | mul $t2,$m2,$mi | |
1321 | adc $acc4,$acc4,$t3 // can't overflow | |
1322 | mul $t3,$m3,$mi | |
1323 | adds $acc0,$acc0,$t0 | |
1324 | umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0) | |
1325 | adcs $acc1,$acc1,$t1 | |
1326 | umulh $t1,$m1,$mi | |
1327 | adcs $acc2,$acc2,$t2 | |
1328 | umulh $t2,$m2,$mi | |
1329 | adcs $acc3,$acc3,$t3 | |
1330 | umulh $t3,$m3,$mi | |
1331 | adcs $acc4,$acc4,$carry | |
1332 | ldr $mi,[sp,$cnt] // next a[0]*n0 | |
1333 | adc $carry,xzr,xzr | |
1334 | str $acc0,[$tp],#8 // result!!! | |
1335 | adds $acc0,$acc1,$t0 | |
1336 | sub $t0,$ap_end,$ap // done yet? | |
1337 | adcs $acc1,$acc2,$t1 | |
1338 | adcs $acc2,$acc3,$t2 | |
1339 | adcs $acc3,$acc4,$t3 | |
1340 | //adc $carry,$carry,xzr | |
1341 | cbnz $cnt,.Loop_mul4x_tail | |
1342 | ||
1343 | sub $t1,$np,$num // rewinded np? | |
1344 | adc $carry,$carry,xzr | |
1345 | cbz $t0,.Loop_mul4x_break | |
1346 | ||
1347 | ldp $t0,$t1,[$tp,#8*4] | |
1348 | ldp $t2,$t3,[$tp,#8*6] | |
1349 | ldp $a0,$a1,[$ap,#8*0] | |
1350 | ldp $a2,$a3,[$ap,#8*2] | |
1351 | add $ap,$ap,#8*4 | |
1352 | adds $acc0,$acc0,$t0 | |
1353 | adcs $acc1,$acc1,$t1 | |
1354 | adcs $acc2,$acc2,$t2 | |
1355 | adcs $acc3,$acc3,$t3 | |
1356 | //adc $carry,$carry,xzr | |
1357 | ldp $m0,$m1,[$np,#8*0] | |
1358 | ldp $m2,$m3,[$np,#8*2] | |
1359 | add $np,$np,#8*4 | |
1360 | b .Loop_mul4x_tail | |
1361 | ||
1362 | .align 4 | |
1363 | .Loop_mul4x_break: | |
1364 | ldp $t2,$t3,[x29,#96] // pull rp and &b[num] | |
1365 | adds $acc0,$acc0,$topmost | |
1366 | add $bp,$bp,#8*4 // bp++ | |
1367 | adcs $acc1,$acc1,xzr | |
1368 | sub $ap,$ap,$num // rewind ap | |
1369 | adcs $acc2,$acc2,xzr | |
1370 | stp $acc0,$acc1,[$tp,#8*0] // result!!! | |
1371 | adcs $acc3,$acc3,xzr | |
1372 | ldp $acc0,$acc1,[sp,#8*4] // t[0..3] | |
1373 | adc $topmost,$carry,xzr | |
1374 | stp $acc2,$acc3,[$tp,#8*2] // result!!! | |
1375 | cmp $bp,$t3 // done yet? | |
1376 | ldp $acc2,$acc3,[sp,#8*6] | |
1377 | ldp $m0,$m1,[$t1,#8*0] // n[0..3] | |
1378 | ldp $m2,$m3,[$t1,#8*2] | |
1379 | add $np,$t1,#8*4 | |
1380 | b.eq .Lmul4x_post | |
1381 | ||
1382 | ldr $bi,[$bp] | |
1383 | ldp $a0,$a1,[$ap,#8*0] // a[0..3] | |
1384 | ldp $a2,$a3,[$ap,#8*2] | |
1385 | adds $ap,$ap,#8*4 // clear carry bit | |
1386 | mov $carry,xzr | |
1387 | mov $tp,sp | |
1388 | b .Loop_mul4x_reduction | |
1389 | ||
1390 | .align 4 | |
1391 | .Lmul4x_post: | |
1392 | // Final step. We see if result is larger than modulus, and | |
1393 | // if it is, subtract the modulus. But comparison implies | |
1394 | // subtraction. So we subtract modulus, see if it borrowed, | |
1395 | // and conditionally copy original value. | |
1396 | mov $rp,$t2 | |
1397 | mov $ap_end,$t2 // $rp copy | |
1398 | subs $t0,$acc0,$m0 | |
1399 | add $tp,sp,#8*8 | |
1400 | sbcs $t1,$acc1,$m1 | |
1401 | sub $cnt,$num,#8*4 | |
1402 | ||
1403 | .Lmul4x_sub: | |
1404 | sbcs $t2,$acc2,$m2 | |
1405 | ldp $m0,$m1,[$np,#8*0] | |
1406 | sub $cnt,$cnt,#8*4 | |
1407 | ldp $acc0,$acc1,[$tp,#8*0] | |
1408 | sbcs $t3,$acc3,$m3 | |
1409 | ldp $m2,$m3,[$np,#8*2] | |
1410 | add $np,$np,#8*4 | |
1411 | ldp $acc2,$acc3,[$tp,#8*2] | |
1412 | add $tp,$tp,#8*4 | |
1413 | stp $t0,$t1,[$rp,#8*0] | |
1414 | sbcs $t0,$acc0,$m0 | |
1415 | stp $t2,$t3,[$rp,#8*2] | |
1416 | add $rp,$rp,#8*4 | |
1417 | sbcs $t1,$acc1,$m1 | |
1418 | cbnz $cnt,.Lmul4x_sub | |
1419 | ||
1420 | sbcs $t2,$acc2,$m2 | |
1421 | mov $tp,sp | |
1422 | add $ap,sp,#8*4 | |
1423 | ldp $a0,$a1,[$ap_end,#8*0] | |
1424 | sbcs $t3,$acc3,$m3 | |
1425 | stp $t0,$t1,[$rp,#8*0] | |
1426 | ldp $a2,$a3,[$ap_end,#8*2] | |
1427 | stp $t2,$t3,[$rp,#8*2] | |
1428 | ldp $acc0,$acc1,[$ap,#8*0] | |
1429 | ldp $acc2,$acc3,[$ap,#8*2] | |
1430 | sbcs xzr,$topmost,xzr // did it borrow? | |
1431 | ldr x30,[x29,#8] // pull return address | |
1432 | ||
1433 | sub $cnt,$num,#8*4 | |
1434 | .Lmul4x_cond_copy: | |
1435 | sub $cnt,$cnt,#8*4 | |
1436 | csel $t0,$acc0,$a0,lo | |
1437 | stp xzr,xzr,[$tp,#8*0] | |
1438 | csel $t1,$acc1,$a1,lo | |
1439 | ldp $a0,$a1,[$ap_end,#8*4] | |
1440 | ldp $acc0,$acc1,[$ap,#8*4] | |
1441 | csel $t2,$acc2,$a2,lo | |
1442 | stp xzr,xzr,[$tp,#8*2] | |
1443 | add $tp,$tp,#8*4 | |
1444 | csel $t3,$acc3,$a3,lo | |
1445 | ldp $a2,$a3,[$ap_end,#8*6] | |
1446 | ldp $acc2,$acc3,[$ap,#8*6] | |
1447 | add $ap,$ap,#8*4 | |
1448 | stp $t0,$t1,[$ap_end,#8*0] | |
1449 | stp $t2,$t3,[$ap_end,#8*2] | |
1450 | add $ap_end,$ap_end,#8*4 | |
1451 | cbnz $cnt,.Lmul4x_cond_copy | |
1452 | ||
1453 | csel $t0,$acc0,$a0,lo | |
1454 | stp xzr,xzr,[$tp,#8*0] | |
1455 | csel $t1,$acc1,$a1,lo | |
1456 | stp xzr,xzr,[$tp,#8*2] | |
1457 | csel $t2,$acc2,$a2,lo | |
1458 | stp xzr,xzr,[$tp,#8*3] | |
1459 | csel $t3,$acc3,$a3,lo | |
1460 | stp xzr,xzr,[$tp,#8*4] | |
1461 | stp $t0,$t1,[$ap_end,#8*0] | |
1462 | stp $t2,$t3,[$ap_end,#8*2] | |
1463 | ||
1464 | b .Lmul4x_done | |
1465 | ||
1466 | .align 4 | |
1467 | .Lmul4x4_post_condition: | |
1468 | adc $carry,$carry,xzr | |
1469 | ldr $ap,[x29,#96] // pull rp | |
1470 | // $acc0-3,$carry hold result, $m0-7 hold modulus | |
1471 | subs $a0,$acc0,$m0 | |
1472 | ldr x30,[x29,#8] // pull return address | |
1473 | sbcs $a1,$acc1,$m1 | |
1474 | stp xzr,xzr,[sp,#8*0] | |
1475 | sbcs $a2,$acc2,$m2 | |
1476 | stp xzr,xzr,[sp,#8*2] | |
1477 | sbcs $a3,$acc3,$m3 | |
1478 | stp xzr,xzr,[sp,#8*4] | |
1479 | sbcs xzr,$carry,xzr // did it borrow? | |
1480 | stp xzr,xzr,[sp,#8*6] | |
1481 | ||
1482 | // $a0-3 hold result-modulus | |
1483 | csel $a0,$acc0,$a0,lo | |
1484 | csel $a1,$acc1,$a1,lo | |
1485 | csel $a2,$acc2,$a2,lo | |
1486 | csel $a3,$acc3,$a3,lo | |
1487 | stp $a0,$a1,[$ap,#8*0] | |
1488 | stp $a2,$a3,[$ap,#8*2] | |
1489 | ||
1490 | .Lmul4x_done: | |
1491 | ldp x19,x20,[x29,#16] | |
1492 | mov sp,x29 | |
1493 | ldp x21,x22,[x29,#32] | |
1494 | mov x0,#1 | |
1495 | ldp x23,x24,[x29,#48] | |
1496 | ldp x25,x26,[x29,#64] | |
1497 | ldp x27,x28,[x29,#80] | |
1498 | ldr x29,[sp],#128 | |
1499 | ret | |
1500 | .size __bn_mul4x_mont,.-__bn_mul4x_mont | |
1501 | ___ | |
1502 | } | |
1503 | $code.=<<___; | |
cb2ed545 AP |
1504 | .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" | |
1505 | .align 4 | |
1506 | ___ | |
1507 | ||
# Flush the generated assembly to stdout.  Per perlasm convention stdout
# is often a pipe into the assembler, so a write error (ENOSPC, EPIPE)
# may only surface when buffers are flushed at close; the bare
# "close STDOUT;" silently discarded that error and could leave a
# truncated .S file looking like a success.  Check it and die.
print $code; | |
1509 | ||
1510 | close STDOUT or die "error closing STDOUT: $!"; | |