# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide a +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont code
# base and to compiler-generated code with inlined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
# The code is interoperable with the Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it would be
# exclusively about decorations; ABI and instruction syntax are identical.
# Add a NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for the integer-only code. The NEON code is
# nevertheless chosen for execution on all NEON-capable processors,
# because the gain on others outweighs the marginal loss on Cortex-A9.
# Align Cortex-A9 performance with the November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than the integer-only one on this
# processor. This optimization further improved performance even on
# other processors: the NEON code path is ~45-180% faster than the
# original integer-only code on Cortex-A8, ~10-210% on Cortex-A15, and
# ~70-450% on Snapdragon S4.
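
# For reference, below is a minimal Perl sketch (an illustrative addition,
# not part of the original CRYPTOGAMS module, and never called by this
# script) of the operation bn_mul_mont implements: the Montgomery product
# a*b*R^-1 mod n, where R = 2^(32*num) and n is odd.
use Math::BigInt;

sub mont_mul_ref {
	my ($a,$b,$n,$num) = @_;	# Math::BigInt operands, $num 32-bit words in n
	my $bits = 32*$num;
	my $R  = Math::BigInt->bone->blsft($bits);
	my $t  = $a->copy->bmul($b);				# t = a*b
	my $n0 = $n->copy->bmodinv($R)->bneg->bmod($R);		# n0 = -n^-1 mod R
	my $m  = $t->copy->bmod($R)->bmul($n0)->bmod($R);	# Montgomery multiplier
	$t->badd($m->bmul($n))->brsft($bits);	# t = (t+m*n)/R, exact by choice of m
	$t->bsub($n) if $t->bcmp($n) >= 0;	# final conditional subtraction
	return $t;				# == a*b*R^-1 mod n
}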
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
###########	# r13 is stack pointer
###########	# r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;
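# A sketch of the resulting frame, assuming the entry sequence further
# below (offsets relative to $num, i.e. &tp[num-1]; higher addresses
# first):
#
#	$num+15*4		num argument (slot later reused for &bp[num], $_bpend)
#	$num+14*4		n0 argument ($_n0)
#	$num+13*4		saved r2, i.e. bp ($_bp)
#	$num+12*4		saved r0, i.e. rp ($_rp)
#	$num+2*4..$num+11*4	saved r4-r12,lr
#	$num+4			tp[num]
#	sp..$num		tp[0]..tp[num-1]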
#if defined(__thumb2__)
#if __ARM_MAX_ARCH__>=7
.word	OPENSSL_armcap_P
.word	OPENSSL_armcap_P-.Lbn_mul_mont
.type	bn_mul_mont,%function
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	ldr	r0,.LOPENSSL_armcap
# if defined(__APPLE__) || defined(_WIN32)
	tst	r0,#ARMV7_NEON		@ NEON available?
	mov	$num,ip			@ load num
	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]
	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	str	$tp,[$_bpend]		@ save &bp[num]
	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
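	@ $n0 now holds tp[0]*n0 mod 2^32, where the n0 argument is the
	@ usual -np[0]^-1 mod 2^32, so the low 32 bits of np[0]*$n0+tp[0]
	@ cancel and the running sum stays divisible by 2^32: word-wise
	@ Montgomery reduction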
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	ldr	$nj,[$np],#4		@ np[j],np++
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	ldr	$tp,[$_bp]		@ restore bp
	ldr	$n0,[$_n0]		@ restore n0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	ldr	$tp,[$_bp]		@ restore bp
	ldr	$n0,[$_n0]		@ restore n0
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=
	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	sbcs	$nhi,$nhi,#0		@ upmost carry
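	@ the reduced value in tp[] (including its top word) is below
	@ 2*np[], so the single subtraction pass above suffices; rp[] now
	@ holds tp-np, and the .Lcopy pass below restores tp[] into rp[]
	@ if that subtraction borrowed, zapping tp[] with the non-secret
	@ sp value on the way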
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp
.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	str	sp,[$tp],#4		@ zap tp
	teq	$tp,$num		@ preserve carry
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA :-)
.size	bn_mul_mont,.-bn_mul_mont
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
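# A note on the NEON strategy below: products are accumulated with
# vmlal.u32 into 64-bit lanes without immediate carry propagation;
# carries are resolved later, 16 bits at a time, by the
# vshr.u64 #16/vzip.16 sequences, which is why intermediate limbs are
# kept in 64-bit storage.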
#if __ARM_MAX_ARCH__>=7
.type	bn_mul8x_mont_neon,%function
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	@ special case for $num==8, everything is in register bank...
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	sub		$toutptr,sp,$num,lsl#4
	vld1.32		{$A0-$A3}, [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
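	@ $num*16 bytes of scratch, rounded down to a 64-byte boundary,
	@ which satisfies the :256 (32-byte) alignment of the vst1/vld1
	@ accesses below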
	vmull.u32	@ACC[0],$Bi,${A0}[0]
	vmull.u32	@ACC[1],$Bi,${A0}[1]
	vmull.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmull.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero

	vmull.u32	@ACC[4],$Bi,${A2}[0]
	vld1.32		{$N0-$N3}, [$nptr]!
	vmull.u32	@ACC[5],$Bi,${A2}[1]
	vmull.u32	@ACC[6],$Bi,${A3}[0]
	vmull.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	vshr.u64	$temp,$temp,#16
	vadd.u64	$temp,$temp,$Temp#hi
	vshr.u64	$temp,$temp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1

	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	vshr.u64	$temp,$temp,#16
	vadd.u64	$temp,$temp,$Temp#hi
	vshr.u64	$temp,$temp,#16
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vshr.u64	$temp,@ACC[0]#hi,#16
	vzip.16		@ACC[0]#lo,@ACC[0]#hi
	veor		@ACC[0],@ACC[0],@ACC[0]
	veor		@ACC[1],@ACC[1],@ACC[1]
	sub		$toutptr,$toutptr,$num,lsl#4
	veor		@ACC[2],@ACC[2],@ACC[2]
	and		$toutptr,$toutptr,#-64
	veor		@ACC[3],@ACC[3],@ACC[3]
	mov		sp,$toutptr		@ alloca
	veor		@ACC[4],@ACC[4],@ACC[4]
	add		$toutptr,$toutptr,#256
	veor		@ACC[5],@ACC[5],@ACC[5]
	veor		@ACC[6],@ACC[6],@ACC[6]
	veor		@ACC[7],@ACC[7],@ACC[7]

	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs		$inner,$inner,#8
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
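	@ the stores above zero-initialize the 64-bit scratch limbs ahead
	@ of the first accumulation pass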
	vld1.32		{$A0-$A3},[$aptr]!
	vld1.32		{${M0}[0]},[$n0,:32]

	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	veor		$zero,$zero,$zero
	vld1.32		{$N0-$N3},[$nptr]!

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	veor		$temp,$temp,$temp
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
	push(@ACC,shift(@ACC));	$i++;
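# rotating @ACC slides the accumulator window, so the generated code can
# always address the least-significant live accumulator as @ACC[0]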
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]!
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add		$bnptr,sp,#8		@ rewind

	push(@ACC,shift(@ACC));
	subs		$inner,$inner,#8
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32		{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
for ($i=1; $i<8; $i++) {
	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
	push(@ACC,shift(@ACC));
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add		$bnptr,sp,#8		@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	push(@ACC,shift(@ACC));
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor		q2,q2,q2		@ $N0-$N1
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor		q3,q3,q3		@ $N2-$N3
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]},[$toutptr,:128]
	subs		$outer,$outer,#8
	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	subne		$nptr,$nptr,$num,lsl#2	@ rewind
	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64		{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64		{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64		{q2-q3}, [sp,:256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64		{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64		{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64		{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi
for ($i=1; $i<8; $i++) {
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32		{@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64	$temp,@ACC[1]#lo,#16
	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64	$temp,@ACC[1]#hi,#16
	vzip.16		@ACC[1]#lo,@ACC[1]#hi
	push(@ACC,shift(@ACC));
	push(@ACC,shift(@ACC));
	vld1.64		{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs		$inner,$inner,#8
	vst1.32		{@ACC[7]#lo[0]}, [$toutptr, :32]!

	vst1.32		{${temp}[0]}, [$toutptr, :32]	@ top-most bit
	sub		$nptr,$nptr,$num,lsl#2	@ rewind $nptr
	subs		$aptr,sp,#0		@ clear carry flag
	add		$bptr,sp,$num,lsl#2
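	@ .LNEON_copy_n_zap below interleaves the conditional copy of the
	@ result into rp[] with wiping the scratch frame; teq is used so
	@ that the borrow survives in the carry flag throughout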
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	teq	$aptr,$bptr		@ preserves carry
	stmia	$rptr!, {r8-r11}

	ldr	r10, [$aptr]		@ load top-most bit
	sub	r11,$bptr,r11		@ this is num*4
	sub	$rptr,$rptr,r11		@ rewind $rptr
	mov	$nptr,$bptr		@ second 3/4th of frame
	sbcs	r10,r10,#0		@ result is carry flag
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr, {r8-r11}
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	stmia	$rptr!, {r8-r11}

	ldmia	$rptr, {r8-r11}
	vst1.64	{q0-q1}, [$aptr,:256]!	@ wipe
	vst1.64	{q0-q1}, [$nptr,:256]!	@ wipe
	teq	$aptr,$bptr		@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
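	# q<n>#lo/q<n>#hi is this module's private notation for the two
	# d registers aliasing a q register; rewrite them as d<2n>/d<2n+1>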
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";