#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, i.e. 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for the integer-only code. The NEON path is
# nevertheless chosen for execution on all NEON-capable processors,
# because the gain on others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.

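# Editorial addition (not from the original author): the reference model
# below documents what bn_mul_mont computes. It is never called, does not
# affect the generated assembly, and uses the full-width formulation of
# Montgomery reduction rather than the word-serial loops emitted below;
# the name bn_mont_mul_ref and its interface are hypothetical. It assumes
# n is odd, a,b < n and R = 2^(32*num).
sub bn_mont_mul_ref {
    my ($a, $b, $n, $num) = @_;     # Math::BigInt values, $num = 32-bit limbs in n
    require Math::BigInt;
    my $R = Math::BigInt->new(1)->blsft(32*$num);
    # pick m so that a*b + m*n is divisible by R, then divide exactly by R
    my $m = $a->copy->bmul($b)->bmul($n->copy->bmodinv($R)->bneg)->bmod($R);
    my $t = $a->copy->bmul($b)->badd($m->bmul($n))->bdiv($R);
    $t->bsub($n) if $t->bcmp($n) >= 0;  # final conditional subtraction
    return $t;                          # a*b*R^(-1) mod n
}
# The assembly below computes the same value word by word: each outer
# iteration adds a*b[i] to the accumulator, then a multiple of n derived
# from tp[0]*n0 (n0 = -n[0]^(-1) mod 2^32) so that the low word clears,
# and finally shifts the accumulator down by one word.
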
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;

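# For reference, the C prototype this routine implements (repeated here
# from memory rather than quoted from the OpenSSL headers, so treat the
# exact spelling as an assumption) is roughly:
#
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# Under the ARM AAPCS the four pointers rp..np arrive in r0-r3 while n0
# and num spill to the stack, which is why the code below loads num from
# [sp,#4] and stashes {r0,r2} so those registers can be reused as the
# comments above describe.
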
$code=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.text

#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
# ifdef _WIN32
.word OPENSSL_armcap_P
# else
.word OPENSSL_armcap_P-.Lbn_mul_mont
# endif
#endif

.global bn_mul_mont
.type bn_mul_mont,%function

.align 5
bn_mul_mont:
.Lbn_mul_mont:
    ldr ip,[sp,#4] @ load num
    stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
    tst ip,#7
    bne .Lialu
    ldr r0,.LOPENSSL_armcap
# if !defined(_WIN32)
    adr r2,.Lbn_mul_mont
    ldr r0,[r0,r2]
# endif
# if defined(__APPLE__) || defined(_WIN32)
    ldr r0,[r0]
# endif
    tst r0,#ARMV7_NEON @ NEON available?
    ldmia sp, {r0,r2}
    beq .Lialu
    add sp,sp,#8
    b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
    cmp ip,#2
    mov $num,ip @ load num
#ifdef __thumb2__
    ittt lt
#endif
    movlt r0,#0
    addlt sp,sp,#2*4
    blt .Labrt

    stmdb sp!,{r4-r12,lr} @ save 10 registers

    mov $num,$num,lsl#2 @ rescale $num for byte count
    sub sp,sp,$num @ alloca(4*num)
    sub sp,sp,#4 @ +extra dword
    sub $num,$num,#4 @ "num=num-1"
    add $tp,$bp,$num @ &bp[num-1]

    add $num,sp,$num @ $num to point at &tp[num-1]
    ldr $n0,[$_n0] @ &n0
    ldr $bi,[$bp] @ bp[0]
    ldr $aj,[$ap],#4 @ ap[0],ap++
    ldr $nj,[$np],#4 @ np[0],np++
    ldr $n0,[$n0] @ *n0
    str $tp,[$_bpend] @ save &bp[num]

    umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
    str $n0,[$_n0] @ save n0 value
    mul $n0,$alo,$n0 @ "tp[0]"*n0
    mov $nlo,#0
    umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
    mov $tp,sp

.L1st:
    ldr $aj,[$ap],#4 @ ap[j],ap++
    mov $alo,$ahi
    ldr $nj,[$np],#4 @ np[j],np++
    mov $ahi,#0
    umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
    mov $nhi,#0
    umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
    adds $nlo,$nlo,$alo
    str $nlo,[$tp],#4 @ tp[j-1]=,tp++
    adc $nlo,$nhi,#0
    cmp $tp,$num
    bne .L1st

    adds $nlo,$nlo,$ahi
    ldr $tp,[$_bp] @ restore bp
    mov $nhi,#0
    ldr $n0,[$_n0] @ restore n0
    adc $nhi,$nhi,#0
    str $nlo,[$num] @ tp[num-1]=
    mov $tj,sp
    str $nhi,[$num,#4] @ tp[num]=

.Louter:
    sub $tj,$num,$tj @ "original" $num-1 value
    sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
    ldr $bi,[$tp,#4]! @ *(++bp)
    sub $np,$np,$tj @ "rewind" np to &np[1]
    ldr $aj,[$ap,#-4] @ ap[0]
    ldr $alo,[sp] @ tp[0]
    ldr $nj,[$np,#-4] @ np[0]
    ldr $tj,[sp,#4] @ tp[1]

    mov $ahi,#0
    umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
    str $tp,[$_bp] @ save bp
    mul $n0,$alo,$n0
    mov $nlo,#0
    umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
    mov $tp,sp

.Linner:
    ldr $aj,[$ap],#4 @ ap[j],ap++
    adds $alo,$ahi,$tj @ +=tp[j]
    ldr $nj,[$np],#4 @ np[j],np++
    mov $ahi,#0
    umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
    mov $nhi,#0
    umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
    adc $ahi,$ahi,#0
    ldr $tj,[$tp,#8] @ tp[j+1]
    adds $nlo,$nlo,$alo
    str $nlo,[$tp],#4 @ tp[j-1]=,tp++
    adc $nlo,$nhi,#0
    cmp $tp,$num
    bne .Linner

    adds $nlo,$nlo,$ahi
    mov $nhi,#0
    ldr $tp,[$_bp] @ restore bp
    adc $nhi,$nhi,#0
    ldr $n0,[$_n0] @ restore n0
    adds $nlo,$nlo,$tj
    ldr $tj,[$_bpend] @ restore &bp[num]
    adc $nhi,$nhi,#0
    str $nlo,[$num] @ tp[num-1]=
    str $nhi,[$num,#4] @ tp[num]=

    cmp $tp,$tj
#ifdef __thumb2__
    itt ne
#endif
    movne $tj,sp
    bne .Louter

    ldr $rp,[$_rp] @ pull rp
    mov $aj,sp
    add $num,$num,#4 @ $num to point at &tp[num]
    sub $aj,$num,$aj @ "original" num value
    mov $tp,sp @ "rewind" $tp
    mov $ap,$tp @ "borrow" $ap
    sub $np,$np,$aj @ "rewind" $np to &np[0]

    subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
    ldr $nj,[$np],#4
    sbcs $tj,$tj,$nj @ tp[j]-np[j]
    str $tj,[$rp],#4 @ rp[j]=
    teq $tp,$num @ preserve carry
    bne .Lsub
    sbcs $nhi,$nhi,#0 @ upmost carry
    mov $tp,sp @ "rewind" $tp
    sub $rp,$rp,$aj @ "rewind" $rp

.Lcopy: ldr $tj,[$tp] @ conditional copy
    ldr $aj,[$rp]
    str sp,[$tp],#4 @ zap tp
#ifdef __thumb2__
    it cc
#endif
    movcc $aj,$tj
    str $aj,[$rp],#4
    teq $tp,$num @ preserve carry
    bne .Lcopy

    mov sp,$num
    add sp,sp,#4 @ skip over tp[num+1]
    ldmia sp!,{r4-r12,lr} @ restore registers
    add sp,sp,#2*4 @ skip over {r0,r2}
    mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
    ret @ bx lr
#else
    tst lr,#1
    moveq pc,lr @ be binary compatible with V4, yet
    bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

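# Editorial note on the NEON path below (an interpretation added for
# documentation, not the original author's wording): vzip.16 against a
# zeroed register "smashes" each 32-bit b[i] and each Montgomery factor
# into two 16-bit halves held in separate 32-bit lanes, so every
# vmlal.u32 product stays below 2^48 and many of them can be accumulated
# in 64-bit lanes without intermediate carry handling; the deferred
# carries are then propagated 16 bits at a time in the .LNEON_tail loop
# before the result is written out and conditionally reduced.
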
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
    mov ip,sp
    stmdb sp!,{r4-r11}
    vstmdb sp!,{d8-d15} @ ABI specification says so
    ldmia ip,{r4-r5} @ load rest of parameter block
    mov ip,sp

    cmp $num,#8
    bhi .LNEON_8n

    @ special case for $num==8, everything is in register bank...

    vld1.32 {${Bi}[0]}, [$bptr,:32]!
    veor $zero,$zero,$zero
    sub $toutptr,sp,$num,lsl#4
    vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
    and $toutptr,$toutptr,#-64
    vld1.32 {${M0}[0]}, [$n0,:32]
    mov sp,$toutptr @ alloca
    vzip.16 $Bi,$zero

    vmull.u32 @ACC[0],$Bi,${A0}[0]
    vmull.u32 @ACC[1],$Bi,${A0}[1]
    vmull.u32 @ACC[2],$Bi,${A1}[0]
    vshl.i64 $Ni,@ACC[0]#hi,#16
    vmull.u32 @ACC[3],$Bi,${A1}[1]

    vadd.u64 $Ni,$Ni,@ACC[0]#lo
    veor $zero,$zero,$zero
    vmul.u32 $Ni,$Ni,$M0

    vmull.u32 @ACC[4],$Bi,${A2}[0]
    vld1.32 {$N0-$N3}, [$nptr]!
    vmull.u32 @ACC[5],$Bi,${A2}[1]
    vmull.u32 @ACC[6],$Bi,${A3}[0]
    vzip.16 $Ni,$zero
    vmull.u32 @ACC[7],$Bi,${A3}[1]

    vmlal.u32 @ACC[0],$Ni,${N0}[0]
    sub $outer,$num,#1
    vmlal.u32 @ACC[1],$Ni,${N0}[1]
    vmlal.u32 @ACC[2],$Ni,${N1}[0]
    vmlal.u32 @ACC[3],$Ni,${N1}[1]

    vmlal.u32 @ACC[4],$Ni,${N2}[0]
    vmov $Temp,@ACC[0]
    vmlal.u32 @ACC[5],$Ni,${N2}[1]
    vmov @ACC[0],@ACC[1]
    vmlal.u32 @ACC[6],$Ni,${N3}[0]
    vmov @ACC[1],@ACC[2]
    vmlal.u32 @ACC[7],$Ni,${N3}[1]
    vmov @ACC[2],@ACC[3]
    vmov @ACC[3],@ACC[4]
    vshr.u64 $temp,$temp,#16
    vmov @ACC[4],@ACC[5]
    vmov @ACC[5],@ACC[6]
    vadd.u64 $temp,$temp,$Temp#hi
    vmov @ACC[6],@ACC[7]
    veor @ACC[7],@ACC[7]
    vshr.u64 $temp,$temp,#16

    b .LNEON_outer8

.align 4
.LNEON_outer8:
    vld1.32 {${Bi}[0]}, [$bptr,:32]!
    veor $zero,$zero,$zero
    vzip.16 $Bi,$zero
    vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp

    vmlal.u32 @ACC[0],$Bi,${A0}[0]
    vmlal.u32 @ACC[1],$Bi,${A0}[1]
    vmlal.u32 @ACC[2],$Bi,${A1}[0]
    vshl.i64 $Ni,@ACC[0]#hi,#16
    vmlal.u32 @ACC[3],$Bi,${A1}[1]

    vadd.u64 $Ni,$Ni,@ACC[0]#lo
    veor $zero,$zero,$zero
    subs $outer,$outer,#1
    vmul.u32 $Ni,$Ni,$M0

    vmlal.u32 @ACC[4],$Bi,${A2}[0]
    vmlal.u32 @ACC[5],$Bi,${A2}[1]
    vmlal.u32 @ACC[6],$Bi,${A3}[0]
    vzip.16 $Ni,$zero
    vmlal.u32 @ACC[7],$Bi,${A3}[1]

    vmlal.u32 @ACC[0],$Ni,${N0}[0]
    vmlal.u32 @ACC[1],$Ni,${N0}[1]
    vmlal.u32 @ACC[2],$Ni,${N1}[0]
    vmlal.u32 @ACC[3],$Ni,${N1}[1]

    vmlal.u32 @ACC[4],$Ni,${N2}[0]
    vmov $Temp,@ACC[0]
    vmlal.u32 @ACC[5],$Ni,${N2}[1]
    vmov @ACC[0],@ACC[1]
    vmlal.u32 @ACC[6],$Ni,${N3}[0]
    vmov @ACC[1],@ACC[2]
    vmlal.u32 @ACC[7],$Ni,${N3}[1]
    vmov @ACC[2],@ACC[3]
    vmov @ACC[3],@ACC[4]
    vshr.u64 $temp,$temp,#16
    vmov @ACC[4],@ACC[5]
    vmov @ACC[5],@ACC[6]
    vadd.u64 $temp,$temp,$Temp#hi
    vmov @ACC[6],@ACC[7]
    veor @ACC[7],@ACC[7]
    vshr.u64 $temp,$temp,#16

    bne .LNEON_outer8

    vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
    mov $toutptr,sp
    vshr.u64 $temp,@ACC[0]#lo,#16
    mov $inner,$num
    vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
    add $tinptr,sp,#96
    vshr.u64 $temp,@ACC[0]#hi,#16
    vzip.16 @ACC[0]#lo,@ACC[0]#hi

    b .LNEON_tail_entry

.align 4
.LNEON_8n:
    veor @ACC[0],@ACC[0],@ACC[0]
    sub $toutptr,sp,#128
    veor @ACC[1],@ACC[1],@ACC[1]
    sub $toutptr,$toutptr,$num,lsl#4
    veor @ACC[2],@ACC[2],@ACC[2]
    and $toutptr,$toutptr,#-64
    veor @ACC[3],@ACC[3],@ACC[3]
    mov sp,$toutptr @ alloca
    veor @ACC[4],@ACC[4],@ACC[4]
    add $toutptr,$toutptr,#256
    veor @ACC[5],@ACC[5],@ACC[5]
    sub $inner,$num,#8
    veor @ACC[6],@ACC[6],@ACC[6]
    veor @ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
    vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
    subs $inner,$inner,#8
    vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
    vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
    vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
    bne .LNEON_8n_init

    add $tinptr,sp,#256
    vld1.32 {$A0-$A3},[$aptr]!
    add $bnptr,sp,#8
    vld1.32 {${M0}[0]},[$n0,:32]
    mov $outer,$num
    b .LNEON_8n_outer

.align 4
.LNEON_8n_outer:
    vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
    veor $zero,$zero,$zero
    vzip.16 $Bi,$zero
    add $toutptr,sp,#128
    vld1.32 {$N0-$N3},[$nptr]!

    vmlal.u32 @ACC[0],$Bi,${A0}[0]
    vmlal.u32 @ACC[1],$Bi,${A0}[1]
    veor $zero,$zero,$zero
    vmlal.u32 @ACC[2],$Bi,${A1}[0]
    vshl.i64 $Ni,@ACC[0]#hi,#16
    vmlal.u32 @ACC[3],$Bi,${A1}[1]
    vadd.u64 $Ni,$Ni,@ACC[0]#lo
    vmlal.u32 @ACC[4],$Bi,${A2}[0]
    vmul.u32 $Ni,$Ni,$M0
    vmlal.u32 @ACC[5],$Bi,${A2}[1]
    vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
    vmlal.u32 @ACC[6],$Bi,${A3}[0]
    vzip.16 $Ni,$zero
    vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
    vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
    vmlal.u32 @ACC[0],$Ni,${N0}[0]
    veor $temp,$temp,$temp
    vmlal.u32 @ACC[1],$Ni,${N0}[1]
    vzip.16 $Bi,$temp
    vmlal.u32 @ACC[2],$Ni,${N1}[0]
    vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
    vmlal.u32 @ACC[3],$Ni,${N1}[1]
    vmlal.u32 @ACC[4],$Ni,${N2}[0]
    vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
    vmlal.u32 @ACC[5],$Ni,${N2}[1]
    vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
    vmlal.u32 @ACC[6],$Ni,${N3}[0]
    vmlal.u32 @ACC[7],$Ni,${N3}[1]
    vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
    vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
    push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
    vmlal.u32 @ACC[0],$Bi,${A0}[0]
    vld1.64 {@ACC[7]},[$tinptr,:128]!
    vmlal.u32 @ACC[1],$Bi,${A0}[1]
    veor $zero,$zero,$zero
    vmlal.u32 @ACC[2],$Bi,${A1}[0]
    vshl.i64 $Ni,@ACC[0]#hi,#16
    vmlal.u32 @ACC[3],$Bi,${A1}[1]
    vadd.u64 $Ni,$Ni,@ACC[0]#lo
    vmlal.u32 @ACC[4],$Bi,${A2}[0]
    vmul.u32 $Ni,$Ni,$M0
    vmlal.u32 @ACC[5],$Bi,${A2}[1]
    vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
    vmlal.u32 @ACC[6],$Bi,${A3}[0]
    vzip.16 $Ni,$zero
    vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
    vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
    vmlal.u32 @ACC[0],$Ni,${N0}[0]
    vld1.32 {$A0-$A3},[$aptr]!
    vmlal.u32 @ACC[1],$Ni,${N0}[1]
    vmlal.u32 @ACC[2],$Ni,${N1}[0]
    vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
    vmlal.u32 @ACC[3],$Ni,${N1}[1]
    vmlal.u32 @ACC[4],$Ni,${N2}[0]
    vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
    vmlal.u32 @ACC[5],$Ni,${N2}[1]
    vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
    vmlal.u32 @ACC[6],$Ni,${N3}[0]
    vmlal.u32 @ACC[7],$Ni,${N3}[1]
    vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
    vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
    add $bnptr,sp,#8 @ rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
    sub $inner,$num,#8
    b .LNEON_8n_inner

.align 4
.LNEON_8n_inner:
    subs $inner,$inner,#8
    vmlal.u32 @ACC[0],$Bi,${A0}[0]
    vld1.64 {@ACC[7]},[$tinptr,:128]
    vmlal.u32 @ACC[1],$Bi,${A0}[1]
    vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
    vmlal.u32 @ACC[2],$Bi,${A1}[0]
    vld1.32 {$N0-$N3},[$nptr]!
    vmlal.u32 @ACC[3],$Bi,${A1}[1]
    it ne
    addne $tinptr,$tinptr,#16 @ don't advance in last iteration
    vmlal.u32 @ACC[4],$Bi,${A2}[0]
    vmlal.u32 @ACC[5],$Bi,${A2}[1]
    vmlal.u32 @ACC[6],$Bi,${A3}[0]
    vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
    vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
    vmlal.u32 @ACC[0],$Ni,${N0}[0]
    vmlal.u32 @ACC[1],$Ni,${N0}[1]
    vmlal.u32 @ACC[2],$Ni,${N1}[0]
    vmlal.u32 @ACC[3],$Ni,${N1}[1]
    vmlal.u32 @ACC[4],$Ni,${N2}[0]
    vmlal.u32 @ACC[5],$Ni,${N2}[1]
    vmlal.u32 @ACC[6],$Ni,${N3}[0]
    vmlal.u32 @ACC[7],$Ni,${N3}[1]
    vst1.64 {@ACC[0]},[$toutptr,:128]!
___
    push(@ACC,shift(@ACC));
$code.=<<___;
    vmlal.u32 @ACC[0],$Bi,${A0}[0]
    vld1.64 {@ACC[7]},[$tinptr,:128]
    vmlal.u32 @ACC[1],$Bi,${A0}[1]
    vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
    vmlal.u32 @ACC[2],$Bi,${A1}[0]
    it ne
    addne $tinptr,$tinptr,#16 @ don't advance in last iteration
    vmlal.u32 @ACC[3],$Bi,${A1}[1]
    vmlal.u32 @ACC[4],$Bi,${A2}[0]
    vmlal.u32 @ACC[5],$Bi,${A2}[1]
    vmlal.u32 @ACC[6],$Bi,${A3}[0]
    vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
    it eq
    subeq $aptr,$aptr,$num,lsl#2 @ rewind
    vmlal.u32 @ACC[0],$Ni,${N0}[0]
    vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
    vmlal.u32 @ACC[1],$Ni,${N0}[1]
    vld1.32 {$A0-$A3},[$aptr]!
    vmlal.u32 @ACC[2],$Ni,${N1}[0]
    add $bnptr,sp,#8 @ rewind
    vmlal.u32 @ACC[3],$Ni,${N1}[1]
    vmlal.u32 @ACC[4],$Ni,${N2}[0]
    vmlal.u32 @ACC[5],$Ni,${N2}[1]
    vmlal.u32 @ACC[6],$Ni,${N3}[0]
    vst1.64 {@ACC[0]},[$toutptr,:128]!
    vmlal.u32 @ACC[7],$Ni,${N3}[1]

    bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
    add $tinptr,sp,#128
    vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
    veor q2,q2,q2 @ $N0-$N1
    vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
    veor q3,q3,q3 @ $N2-$N3
    vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
    vst1.64 {@ACC[6]},[$toutptr,:128]

    subs $outer,$outer,#8
    vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
    vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
    vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
    vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!

    itt ne
    subne $nptr,$nptr,$num,lsl#2 @ rewind
    bne .LNEON_8n_outer

    add $toutptr,sp,#128
    vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
    vshr.u64 $temp,@ACC[0]#lo,#16
    vst1.64 {q2-q3},[sp,:256]!
    vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
    vst1.64 {q2-q3}, [sp,:256]!
    vshr.u64 $temp,@ACC[0]#hi,#16
    vst1.64 {q2-q3}, [sp,:256]!
    vzip.16 @ACC[0]#lo,@ACC[0]#hi

    mov $inner,$num
    b .LNEON_tail_entry

.align 4
.LNEON_tail:
    vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
    vshr.u64 $temp,@ACC[0]#lo,#16
    vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
    vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
    vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
    vshr.u64 $temp,@ACC[0]#hi,#16
    vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
    vzip.16 @ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
    vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
    vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
    vshr.u64 $temp,@ACC[1]#lo,#16
    vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
    vshr.u64 $temp,@ACC[1]#hi,#16
    vzip.16 @ACC[1]#lo,@ACC[1]#hi
___
    push(@ACC,shift(@ACC));
}
push(@ACC,shift(@ACC));
$code.=<<___;
    vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
    subs $inner,$inner,#8
    vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
    bne .LNEON_tail

    vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
    sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
    subs $aptr,sp,#0 @ clear carry flag
    add $bptr,sp,$num,lsl#2

.LNEON_sub:
    ldmia $aptr!, {r4-r7}
    ldmia $nptr!, {r8-r11}
    sbcs r8, r4,r8
    sbcs r9, r5,r9
    sbcs r10,r6,r10
    sbcs r11,r7,r11
    teq $aptr,$bptr @ preserves carry
    stmia $rptr!, {r8-r11}
    bne .LNEON_sub

    ldr r10, [$aptr] @ load top-most bit
    mov r11,sp
    veor q0,q0,q0
    sub r11,$bptr,r11 @ this is num*4
    veor q1,q1,q1
    mov $aptr,sp
    sub $rptr,$rptr,r11 @ rewind $rptr
    mov $nptr,$bptr @ second 3/4th of frame
    sbcs r10,r10,#0 @ result is carry flag

.LNEON_copy_n_zap:
    ldmia $aptr!, {r4-r7}
    ldmia $rptr, {r8-r11}
    it cc
    movcc r8, r4
    vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
    itt cc
    movcc r9, r5
    movcc r10,r6
    vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
    it cc
    movcc r11,r7
    ldmia $aptr, {r4-r7}
    stmia $rptr!, {r8-r11}
    sub $aptr,$aptr,#16
    ldmia $rptr, {r8-r11}
    it cc
    movcc r8, r4
    vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
    itt cc
    movcc r9, r5
    movcc r10,r6
    vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
    it cc
    movcc r11,r7
    teq $aptr,$bptr @ preserves carry
    stmia $rptr!, {r8-r11}
    bne .LNEON_copy_n_zap

    mov sp,ip
    vldmia sp!,{d8-d15}
    ldmia sp!,{r4-r11}
    ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
    s/\bret\b/bx lr/g or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4

    print $_,"\n";
}

close STDOUT;