]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
da1c088f | 2 | # Copyright 2007-2023 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
367ace68 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
43b8fe1c AP |
9 | |
10 | # ==================================================================== | |
d1671f4f | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
43b8fe1c AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | ||
17 | # January 2007. | |
18 | ||
19 | # Montgomery multiplication for ARMv4. | |
20 | # | |
21 | # Performance improvement naturally varies among CPU implementations | |
22 | # and compilers. The code was observed to provide +65-35% improvement | |
23 | # [depending on key length, less for longer keys] on ARM920T, and | |
24 | # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code | |
25 | # base and compiler generated code with in-lined umull and even umlal | |
609b0852 | 26 | # instructions. The latter means that this code didn't really have an |
43b8fe1c AP |
27 | # "advantage" of utilizing some "secret" instruction. |
28 | # | |
29 | # The code is interoperable with Thumb ISA and is rather compact, less | |
30 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively | |
31 | # about decorations, ABI and instruction syntax are identical. | |
32 | ||
d1671f4f AP |
33 | # November 2013 |
34 | # | |
35 | # Add NEON code path, which handles lengths divisible by 8. RSA/DSA | |
36 | # performance improvement on Cortex-A8 is ~45-100% depending on key | |
37 | # length, more for longer keys. On Cortex-A15 the span is ~10-105%. | |
38 | # On Snapdragon S4 improvement was measured to vary from ~70% to | |
39 | # incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is | |
40 | # rather because original integer-only code seems to perform | |
41 | # suboptimally on S4. Situation on Cortex-A9 is unfortunately | |
42 | # different. It's being looked into, but the trouble is that | |
43 | # performance for vectors longer than 256 bits is actually a couple | |
44 | # of percent worse than for integer-only code. The code is chosen | |
45 | # for execution on all NEON-capable processors, because gain on | |
46 | # others outweighs the marginal loss on Cortex-A9. | |
47 | ||
8eed3289 AP |
48 | # September 2015 |
49 | # | |
50 | # Align Cortex-A9 performance with November 2013 improvements, i.e. | |
51 | # NEON code is now ~20-105% faster than integer-only one on this | |
52 | # processor. But this optimization further improved performance even | |
53 | # on other processors: NEON code path is ~45-180% faster than original | |
54 | # integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on | |
55 | # Snapdragon S4. | |
56 | ||
1aa89a7a RL |
57 | # $output is the last argument if it looks like a file (it has an extension) |
58 | # $flavour is the first argument if it doesn't look like a file | |
59 | my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
60 | my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | |
313e6ec1 AP |
61 | |
# Select where the generated assembly goes: with a real flavour, pipe it
# through arm-xlate.pl (looked up next to this script, then in
# ../../perlasm/); otherwise write directly to $output when one was given.
62 | if ($flavour && $flavour ne "void") { | |
63 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
64 | ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or | |
65 | ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or | |
66 | die "can't locate arm-xlate.pl"; | |
67 | ||
1aa89a7a RL |
# Report the OS error ($!) on failure; $1 would only repeat the directory
# captured from $0 above.
68 | open STDOUT,"| \"$^X\" $xlate $flavour \"$output\"" |
69 | or die "can't call $xlate: $!"; | |
313e6ec1 | 70 | } else { |
# No output file means keep the inherited STDOUT; a failed open is fatal
# rather than silently printing the assembly to the wrong place.
1aa89a7a | 71 | $output and (open(STDOUT,">$output") or die "can't open $output: $!"); |
313e6ec1 | 72 | } |
e216cd6e | 73 | |
43b8fe1c AP |
74 | $num="r0"; # starts as num argument, but holds &tp[num-1] |
75 | $ap="r1"; | |
76 | $bp="r2"; $bi="r2"; $rp="r2"; | |
77 | $np="r3"; | |
78 | $tp="r4"; | |
79 | $aj="r5"; | |
80 | $nj="r6"; | |
81 | $tj="r7"; | |
82 | $n0="r8"; | |
83 | ########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer | |
84 | $alo="r10"; # sl, gcc uses it to keep @GOT | |
85 | $ahi="r11"; # fp | |
86 | $nlo="r12"; # ip | |
87 | ########### # r13 is stack pointer | |
88 | $nhi="r14"; # lr | |
89 | ########### # r15 is program counter | |
90 | ||
91 | #### argument block layout relative to &tp[num-1], a.k.a. $num | |
92 | $_rp="$num,#12*4"; | |
93 | # ap permanently resides in r1 | |
94 | $_bp="$num,#13*4"; | |
95 | # np permanently resides in r3 | |
96 | $_n0="$num,#14*4"; | |
97 | $_num="$num,#15*4"; $_bpend=$_num; | |
98 | ||
99 | $code=<<___; | |
d1671f4f AP |
100 | #include "arm_arch.h" |
101 | ||
a2859927 | 102 | #if defined(__thumb2__) |
11208dcf AP |
103 | .syntax unified |
104 | .thumb | |
105 | #else | |
d1671f4f | 106 | .code 32 |
11208dcf | 107 | #endif |
d1671f4f | 108 | |
3405db97 AP |
109 | .text |
110 | ||
c1669e1c | 111 | #if __ARM_MAX_ARCH__>=7 |
d1671f4f AP |
112 | .align 5 |
113 | .LOPENSSL_armcap: | |
3405db97 AP |
114 | # ifdef _WIN32 |
115 | .word OPENSSL_armcap_P | |
116 | # else | |
313e6ec1 | 117 | .word OPENSSL_armcap_P-.Lbn_mul_mont |
3405db97 | 118 | # endif |
d1671f4f | 119 | #endif |
43b8fe1c AP |
120 | |
121 | .global bn_mul_mont | |
122 | .type bn_mul_mont,%function | |
123 | ||
d1671f4f | 124 | .align 5 |
43b8fe1c | 125 | bn_mul_mont: |
313e6ec1 | 126 | .Lbn_mul_mont: |
d1671f4f | 127 | ldr ip,[sp,#4] @ load num |
43b8fe1c | 128 | stmdb sp!,{r0,r2} @ sp points at argument block |
c1669e1c | 129 | #if __ARM_MAX_ARCH__>=7 |
d1671f4f AP |
130 | tst ip,#7 |
131 | bne .Lialu | |
3405db97 AP |
132 | ldr r0,.LOPENSSL_armcap |
133 | #if !defined(_WIN32) | |
134 | adr r2,.Lbn_mul_mont | |
d1671f4f | 135 | ldr r0,[r0,r2] |
3405db97 AP |
136 | # endif |
137 | # if defined(__APPLE__) || defined(_WIN32) | |
313e6ec1 | 138 | ldr r0,[r0] |
3405db97 | 139 | # endif |
bdbd3aea | 140 | tst r0,#ARMV7_NEON @ NEON available? |
d1671f4f AP |
141 | ldmia sp, {r0,r2} |
142 | beq .Lialu | |
143 | add sp,sp,#8 | |
144 | b bn_mul8x_mont_neon | |
145 | .align 4 | |
146 | .Lialu: | |
147 | #endif | |
148 | cmp ip,#2 | |
149 | mov $num,ip @ load num | |
11208dcf AP |
150 | #ifdef __thumb2__ |
151 | ittt lt | |
152 | #endif | |
43b8fe1c AP |
153 | movlt r0,#0 |
154 | addlt sp,sp,#2*4 | |
7d9cf7c0 | 155 | blt .Labrt |
43b8fe1c AP |
156 | |
157 | stmdb sp!,{r4-r12,lr} @ save 10 registers | |
158 | ||
159 | mov $num,$num,lsl#2 @ rescale $num for byte count | |
160 | sub sp,sp,$num @ alloca(4*num) | |
161 | sub sp,sp,#4 @ +extra dword | |
162 | sub $num,$num,#4 @ "num=num-1" | |
163 | add $tp,$bp,$num @ &bp[num-1] | |
164 | ||
165 | add $num,sp,$num @ $num to point at &tp[num-1] | |
166 | ldr $n0,[$_n0] @ &n0 | |
167 | ldr $bi,[$bp] @ bp[0] | |
168 | ldr $aj,[$ap],#4 @ ap[0],ap++ | |
169 | ldr $nj,[$np],#4 @ np[0],np++ | |
170 | ldr $n0,[$n0] @ *n0 | |
171 | str $tp,[$_bpend] @ save &bp[num] | |
172 | ||
173 | umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] | |
174 | str $n0,[$_n0] @ save n0 value | |
175 | mul $n0,$alo,$n0 @ "tp[0]"*n0 | |
176 | mov $nlo,#0 | |
177 | umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" | |
178 | mov $tp,sp | |
179 | ||
180 | .L1st: | |
181 | ldr $aj,[$ap],#4 @ ap[j],ap++ | |
182 | mov $alo,$ahi | |
10bd69bf | 183 | ldr $nj,[$np],#4 @ np[j],np++ |
43b8fe1c AP |
184 | mov $ahi,#0 |
185 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] | |
43b8fe1c AP |
186 | mov $nhi,#0 |
187 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | |
188 | adds $nlo,$nlo,$alo | |
189 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | |
190 | adc $nlo,$nhi,#0 | |
191 | cmp $tp,$num | |
192 | bne .L1st | |
193 | ||
194 | adds $nlo,$nlo,$ahi | |
10bd69bf | 195 | ldr $tp,[$_bp] @ restore bp |
43b8fe1c | 196 | mov $nhi,#0 |
10bd69bf | 197 | ldr $n0,[$_n0] @ restore n0 |
43b8fe1c | 198 | adc $nhi,$nhi,#0 |
43b8fe1c | 199 | str $nlo,[$num] @ tp[num-1]= |
11208dcf | 200 | mov $tj,sp |
43b8fe1c AP |
201 | str $nhi,[$num,#4] @ tp[num]= |
202 | \f | |
203 | .Louter: | |
11208dcf | 204 | sub $tj,$num,$tj @ "original" $num-1 value |
43b8fe1c | 205 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] |
43b8fe1c | 206 | ldr $bi,[$tp,#4]! @ *(++bp) |
10bd69bf | 207 | sub $np,$np,$tj @ "rewind" np to &np[1] |
43b8fe1c | 208 | ldr $aj,[$ap,#-4] @ ap[0] |
43b8fe1c | 209 | ldr $alo,[sp] @ tp[0] |
10bd69bf | 210 | ldr $nj,[$np,#-4] @ np[0] |
43b8fe1c AP |
211 | ldr $tj,[sp,#4] @ tp[1] |
212 | ||
213 | mov $ahi,#0 | |
214 | umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] | |
215 | str $tp,[$_bp] @ save bp | |
216 | mul $n0,$alo,$n0 | |
217 | mov $nlo,#0 | |
218 | umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" | |
219 | mov $tp,sp | |
220 | ||
221 | .Linner: | |
222 | ldr $aj,[$ap],#4 @ ap[j],ap++ | |
223 | adds $alo,$ahi,$tj @ +=tp[j] | |
10bd69bf | 224 | ldr $nj,[$np],#4 @ np[j],np++ |
43b8fe1c AP |
225 | mov $ahi,#0 |
226 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] | |
43b8fe1c AP |
227 | mov $nhi,#0 |
228 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | |
43b8fe1c | 229 | adc $ahi,$ahi,#0 |
10bd69bf | 230 | ldr $tj,[$tp,#8] @ tp[j+1] |
43b8fe1c AP |
231 | adds $nlo,$nlo,$alo |
232 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | |
233 | adc $nlo,$nhi,#0 | |
234 | cmp $tp,$num | |
235 | bne .Linner | |
236 | ||
237 | adds $nlo,$nlo,$ahi | |
238 | mov $nhi,#0 | |
10bd69bf | 239 | ldr $tp,[$_bp] @ restore bp |
43b8fe1c | 240 | adc $nhi,$nhi,#0 |
10bd69bf | 241 | ldr $n0,[$_n0] @ restore n0 |
43b8fe1c | 242 | adds $nlo,$nlo,$tj |
43b8fe1c | 243 | ldr $tj,[$_bpend] @ restore &bp[num] |
10bd69bf | 244 | adc $nhi,$nhi,#0 |
43b8fe1c | 245 | str $nlo,[$num] @ tp[num-1]= |
43b8fe1c AP |
246 | str $nhi,[$num,#4] @ tp[num]= |
247 | ||
248 | cmp $tp,$tj | |
11208dcf AP |
249 | #ifdef __thumb2__ |
250 | itt ne | |
251 | #endif | |
252 | movne $tj,sp | |
43b8fe1c AP |
253 | bne .Louter |
254 | \f | |
255 | ldr $rp,[$_rp] @ pull rp | |
11208dcf | 256 | mov $aj,sp |
43b8fe1c | 257 | add $num,$num,#4 @ $num to point at &tp[num] |
11208dcf | 258 | sub $aj,$num,$aj @ "original" num value |
43b8fe1c | 259 | mov $tp,sp @ "rewind" $tp |
7d9cf7c0 | 260 | mov $ap,$tp @ "borrow" $ap |
43b8fe1c AP |
261 | sub $np,$np,$aj @ "rewind" $np to &np[0] |
262 | ||
7d9cf7c0 | 263 | subs $tj,$tj,$tj @ "clear" carry flag |
43b8fe1c AP |
264 | .Lsub: ldr $tj,[$tp],#4 |
265 | ldr $nj,[$np],#4 | |
266 | sbcs $tj,$tj,$nj @ tp[j]-np[j] | |
267 | str $tj,[$rp],#4 @ rp[j]= | |
268 | teq $tp,$num @ preserve carry | |
269 | bne .Lsub | |
270 | sbcs $nhi,$nhi,#0 @ upmost carry | |
271 | mov $tp,sp @ "rewind" $tp | |
272 | sub $rp,$rp,$aj @ "rewind" $rp | |
43b8fe1c | 273 | |
774ff8fe AP |
274 | .Lcopy: ldr $tj,[$tp] @ conditional copy |
275 | ldr $aj,[$rp] | |
7d9cf7c0 | 276 | str sp,[$tp],#4 @ zap tp |
774ff8fe AP |
277 | #ifdef __thumb2__ |
278 | it cc | |
279 | #endif | |
280 | movcc $aj,$tj | |
281 | str $aj,[$rp],#4 | |
282 | teq $tp,$num @ preserve carry | |
7d9cf7c0 AP |
283 | bne .Lcopy |
284 | ||
11208dcf AP |
285 | mov sp,$num |
286 | add sp,sp,#4 @ skip over tp[num+1] | |
7d9cf7c0 AP |
287 | ldmia sp!,{r4-r12,lr} @ restore registers |
288 | add sp,sp,#2*4 @ skip over {r0,r2} | |
289 | mov r0,#1 | |
5dcf70a1 AP |
290 | .Labrt: |
291 | #if __ARM_ARCH__>=5 | |
292 | ret @ bx lr | |
293 | #else | |
294 | tst lr,#1 | |
7d9cf7c0 AP |
295 | moveq pc,lr @ be binary compatible with V4, yet |
296 | bx lr @ interoperable with Thumb ISA:-) | |
5dcf70a1 | 297 | #endif |
43b8fe1c | 298 | .size bn_mul_mont,.-bn_mul_mont |
d1671f4f AP |
299 | ___ |
300 | { | |
d1671f4f AP |
301 | my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); |
302 | my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); | |
303 | my ($Z,$Temp)=("q4","q5"); | |
8eed3289 | 304 | my @ACC=map("q$_",(6..13)); |
d1671f4f | 305 | my ($Bi,$Ni,$M0)=map("d$_",(28..31)); |
8eed3289 AP |
306 | my $zero="$Z#lo"; |
307 | my $temp="$Temp#lo"; | |
d1671f4f AP |
308 | |
309 | my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); | |
8eed3289 | 310 | my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11)); |
d1671f4f AP |
311 | |
312 | $code.=<<___; | |
c1669e1c AP |
313 | #if __ARM_MAX_ARCH__>=7 |
314 | .arch armv7-a | |
d1671f4f AP |
315 | .fpu neon |
316 | ||
317 | .type bn_mul8x_mont_neon,%function | |
318 | .align 5 | |
319 | bn_mul8x_mont_neon: | |
320 | mov ip,sp | |
321 | stmdb sp!,{r4-r11} | |
322 | vstmdb sp!,{d8-d15} @ ABI specification says so | |
323 | ldmia ip,{r4-r5} @ load rest of parameter block | |
11208dcf | 324 | mov ip,sp |
d1671f4f | 325 | |
8eed3289 AP |
326 | cmp $num,#8 |
327 | bhi .LNEON_8n | |
328 | ||
329 | @ special case for $num==8, everything is in register bank... | |
330 | ||
d1671f4f | 331 | vld1.32 {${Bi}[0]}, [$bptr,:32]! |
8eed3289 AP |
332 | veor $zero,$zero,$zero |
333 | sub $toutptr,sp,$num,lsl#4 | |
d1671f4f AP |
334 | vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( |
335 | and $toutptr,$toutptr,#-64 | |
336 | vld1.32 {${M0}[0]}, [$n0,:32] | |
337 | mov sp,$toutptr @ alloca | |
d1671f4f AP |
338 | vzip.16 $Bi,$zero |
339 | ||
8eed3289 AP |
340 | vmull.u32 @ACC[0],$Bi,${A0}[0] |
341 | vmull.u32 @ACC[1],$Bi,${A0}[1] | |
342 | vmull.u32 @ACC[2],$Bi,${A1}[0] | |
343 | vshl.i64 $Ni,@ACC[0]#hi,#16 | |
344 | vmull.u32 @ACC[3],$Bi,${A1}[1] | |
d1671f4f | 345 | |
8eed3289 | 346 | vadd.u64 $Ni,$Ni,@ACC[0]#lo |
d1671f4f | 347 | veor $zero,$zero,$zero |
8eed3289 | 348 | vmul.u32 $Ni,$Ni,$M0 |
d1671f4f | 349 | |
8eed3289 | 350 | vmull.u32 @ACC[4],$Bi,${A2}[0] |
d1671f4f | 351 | vld1.32 {$N0-$N3}, [$nptr]! |
8eed3289 AP |
352 | vmull.u32 @ACC[5],$Bi,${A2}[1] |
353 | vmull.u32 @ACC[6],$Bi,${A3}[0] | |
d1671f4f | 354 | vzip.16 $Ni,$zero |
8eed3289 | 355 | vmull.u32 @ACC[7],$Bi,${A3}[1] |
d1671f4f | 356 | |
8eed3289 | 357 | vmlal.u32 @ACC[0],$Ni,${N0}[0] |
d1671f4f | 358 | sub $outer,$num,#1 |
8eed3289 AP |
359 | vmlal.u32 @ACC[1],$Ni,${N0}[1] |
360 | vmlal.u32 @ACC[2],$Ni,${N1}[0] | |
361 | vmlal.u32 @ACC[3],$Ni,${N1}[1] | |
362 | ||
363 | vmlal.u32 @ACC[4],$Ni,${N2}[0] | |
364 | vmov $Temp,@ACC[0] | |
365 | vmlal.u32 @ACC[5],$Ni,${N2}[1] | |
366 | vmov @ACC[0],@ACC[1] | |
367 | vmlal.u32 @ACC[6],$Ni,${N3}[0] | |
368 | vmov @ACC[1],@ACC[2] | |
369 | vmlal.u32 @ACC[7],$Ni,${N3}[1] | |
370 | vmov @ACC[2],@ACC[3] | |
371 | vmov @ACC[3],@ACC[4] | |
d1671f4f | 372 | vshr.u64 $temp,$temp,#16 |
8eed3289 AP |
373 | vmov @ACC[4],@ACC[5] |
374 | vmov @ACC[5],@ACC[6] | |
375 | vadd.u64 $temp,$temp,$Temp#hi | |
376 | vmov @ACC[6],@ACC[7] | |
377 | veor @ACC[7],@ACC[7] | |
d1671f4f AP |
378 | vshr.u64 $temp,$temp,#16 |
379 | ||
380 | b .LNEON_outer8 | |
381 | ||
382 | .align 4 | |
383 | .LNEON_outer8: | |
384 | vld1.32 {${Bi}[0]}, [$bptr,:32]! | |
385 | veor $zero,$zero,$zero | |
386 | vzip.16 $Bi,$zero | |
8eed3289 | 387 | vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp |
d1671f4f | 388 | |
8eed3289 AP |
389 | vmlal.u32 @ACC[0],$Bi,${A0}[0] |
390 | vmlal.u32 @ACC[1],$Bi,${A0}[1] | |
391 | vmlal.u32 @ACC[2],$Bi,${A1}[0] | |
392 | vshl.i64 $Ni,@ACC[0]#hi,#16 | |
393 | vmlal.u32 @ACC[3],$Bi,${A1}[1] | |
d1671f4f | 394 | |
8eed3289 | 395 | vadd.u64 $Ni,$Ni,@ACC[0]#lo |
d1671f4f AP |
396 | veor $zero,$zero,$zero |
397 | subs $outer,$outer,#1 | |
8eed3289 | 398 | vmul.u32 $Ni,$Ni,$M0 |
d1671f4f | 399 | |
8eed3289 AP |
400 | vmlal.u32 @ACC[4],$Bi,${A2}[0] |
401 | vmlal.u32 @ACC[5],$Bi,${A2}[1] | |
402 | vmlal.u32 @ACC[6],$Bi,${A3}[0] | |
d1671f4f | 403 | vzip.16 $Ni,$zero |
8eed3289 AP |
404 | vmlal.u32 @ACC[7],$Bi,${A3}[1] |
405 | ||
406 | vmlal.u32 @ACC[0],$Ni,${N0}[0] | |
407 | vmlal.u32 @ACC[1],$Ni,${N0}[1] | |
408 | vmlal.u32 @ACC[2],$Ni,${N1}[0] | |
409 | vmlal.u32 @ACC[3],$Ni,${N1}[1] | |
410 | ||
411 | vmlal.u32 @ACC[4],$Ni,${N2}[0] | |
412 | vmov $Temp,@ACC[0] | |
413 | vmlal.u32 @ACC[5],$Ni,${N2}[1] | |
414 | vmov @ACC[0],@ACC[1] | |
415 | vmlal.u32 @ACC[6],$Ni,${N3}[0] | |
416 | vmov @ACC[1],@ACC[2] | |
417 | vmlal.u32 @ACC[7],$Ni,${N3}[1] | |
418 | vmov @ACC[2],@ACC[3] | |
419 | vmov @ACC[3],@ACC[4] | |
d1671f4f | 420 | vshr.u64 $temp,$temp,#16 |
8eed3289 AP |
421 | vmov @ACC[4],@ACC[5] |
422 | vmov @ACC[5],@ACC[6] | |
423 | vadd.u64 $temp,$temp,$Temp#hi | |
424 | vmov @ACC[6],@ACC[7] | |
425 | veor @ACC[7],@ACC[7] | |
d1671f4f AP |
426 | vshr.u64 $temp,$temp,#16 |
427 | ||
428 | bne .LNEON_outer8 | |
429 | ||
8eed3289 | 430 | vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp |
d1671f4f | 431 | mov $toutptr,sp |
8eed3289 | 432 | vshr.u64 $temp,@ACC[0]#lo,#16 |
d1671f4f | 433 | mov $inner,$num |
8eed3289 AP |
434 | vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp |
435 | add $tinptr,sp,#96 | |
436 | vshr.u64 $temp,@ACC[0]#hi,#16 | |
437 | vzip.16 @ACC[0]#lo,@ACC[0]#hi | |
d1671f4f | 438 | |
8eed3289 | 439 | b .LNEON_tail_entry |
d1671f4f AP |
440 | |
441 | .align 4 | |
8eed3289 AP |
442 | .LNEON_8n: |
443 | veor @ACC[0],@ACC[0],@ACC[0] | |
444 | sub $toutptr,sp,#128 | |
445 | veor @ACC[1],@ACC[1],@ACC[1] | |
446 | sub $toutptr,$toutptr,$num,lsl#4 | |
447 | veor @ACC[2],@ACC[2],@ACC[2] | |
448 | and $toutptr,$toutptr,#-64 | |
449 | veor @ACC[3],@ACC[3],@ACC[3] | |
450 | mov sp,$toutptr @ alloca | |
451 | veor @ACC[4],@ACC[4],@ACC[4] | |
452 | add $toutptr,$toutptr,#256 | |
453 | veor @ACC[5],@ACC[5],@ACC[5] | |
454 | sub $inner,$num,#8 | |
455 | veor @ACC[6],@ACC[6],@ACC[6] | |
456 | veor @ACC[7],@ACC[7],@ACC[7] | |
457 | ||
458 | .LNEON_8n_init: | |
459 | vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! | |
d1671f4f | 460 | subs $inner,$inner,#8 |
8eed3289 AP |
461 | vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! |
462 | vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! | |
463 | vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]! | |
464 | bne .LNEON_8n_init | |
465 | ||
466 | add $tinptr,sp,#256 | |
467 | vld1.32 {$A0-$A3},[$aptr]! | |
468 | add $bnptr,sp,#8 | |
469 | vld1.32 {${M0}[0]},[$n0,:32] | |
470 | mov $outer,$num | |
471 | b .LNEON_8n_outer | |
d1671f4f AP |
472 | |
473 | .align 4 | |
8eed3289 AP |
474 | .LNEON_8n_outer: |
475 | vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ | |
d1671f4f | 476 | veor $zero,$zero,$zero |
d1671f4f | 477 | vzip.16 $Bi,$zero |
8eed3289 AP |
478 | add $toutptr,sp,#128 |
479 | vld1.32 {$N0-$N3},[$nptr]! | |
480 | ||
481 | vmlal.u32 @ACC[0],$Bi,${A0}[0] | |
482 | vmlal.u32 @ACC[1],$Bi,${A0}[1] | |
483 | veor $zero,$zero,$zero | |
484 | vmlal.u32 @ACC[2],$Bi,${A1}[0] | |
485 | vshl.i64 $Ni,@ACC[0]#hi,#16 | |
486 | vmlal.u32 @ACC[3],$Bi,${A1}[1] | |
487 | vadd.u64 $Ni,$Ni,@ACC[0]#lo | |
488 | vmlal.u32 @ACC[4],$Bi,${A2}[0] | |
489 | vmul.u32 $Ni,$Ni,$M0 | |
490 | vmlal.u32 @ACC[5],$Bi,${A2}[1] | |
491 | vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0] | |
492 | vmlal.u32 @ACC[6],$Bi,${A3}[0] | |
493 | vzip.16 $Ni,$zero | |
494 | vmlal.u32 @ACC[7],$Bi,${A3}[1] | |
495 | ___ | |
496 | for ($i=0; $i<7;) { | |
497 | $code.=<<___; | |
498 | vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ | |
499 | vmlal.u32 @ACC[0],$Ni,${N0}[0] | |
500 | veor $temp,$temp,$temp | |
501 | vmlal.u32 @ACC[1],$Ni,${N0}[1] | |
502 | vzip.16 $Bi,$temp | |
503 | vmlal.u32 @ACC[2],$Ni,${N1}[0] | |
504 | vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 | |
505 | vmlal.u32 @ACC[3],$Ni,${N1}[1] | |
506 | vmlal.u32 @ACC[4],$Ni,${N2}[0] | |
507 | vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi | |
508 | vmlal.u32 @ACC[5],$Ni,${N2}[1] | |
509 | vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 | |
510 | vmlal.u32 @ACC[6],$Ni,${N3}[0] | |
511 | vmlal.u32 @ACC[7],$Ni,${N3}[1] | |
512 | vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo | |
513 | vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i] | |
514 | ___ | |
515 | push(@ACC,shift(@ACC)); $i++; | |
516 | $code.=<<___; | |
517 | vmlal.u32 @ACC[0],$Bi,${A0}[0] | |
518 | vld1.64 {@ACC[7]},[$tinptr,:128]! | |
519 | vmlal.u32 @ACC[1],$Bi,${A0}[1] | |
520 | veor $zero,$zero,$zero | |
521 | vmlal.u32 @ACC[2],$Bi,${A1}[0] | |
522 | vshl.i64 $Ni,@ACC[0]#hi,#16 | |
523 | vmlal.u32 @ACC[3],$Bi,${A1}[1] | |
524 | vadd.u64 $Ni,$Ni,@ACC[0]#lo | |
525 | vmlal.u32 @ACC[4],$Bi,${A2}[0] | |
526 | vmul.u32 $Ni,$Ni,$M0 | |
527 | vmlal.u32 @ACC[5],$Bi,${A2}[1] | |
528 | vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i] | |
529 | vmlal.u32 @ACC[6],$Bi,${A3}[0] | |
530 | vzip.16 $Ni,$zero | |
531 | vmlal.u32 @ACC[7],$Bi,${A3}[1] | |
532 | ___ | |
533 | } | |
534 | $code.=<<___; | |
535 | vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] | |
536 | vmlal.u32 @ACC[0],$Ni,${N0}[0] | |
537 | vld1.32 {$A0-$A3},[$aptr]! | |
538 | vmlal.u32 @ACC[1],$Ni,${N0}[1] | |
539 | vmlal.u32 @ACC[2],$Ni,${N1}[0] | |
540 | vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 | |
541 | vmlal.u32 @ACC[3],$Ni,${N1}[1] | |
542 | vmlal.u32 @ACC[4],$Ni,${N2}[0] | |
543 | vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi | |
544 | vmlal.u32 @ACC[5],$Ni,${N2}[1] | |
545 | vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 | |
546 | vmlal.u32 @ACC[6],$Ni,${N3}[0] | |
547 | vmlal.u32 @ACC[7],$Ni,${N3}[1] | |
548 | vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo | |
549 | vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i] | |
550 | add $bnptr,sp,#8 @ rewind | |
551 | ___ | |
552 | push(@ACC,shift(@ACC)); | |
553 | $code.=<<___; | |
d1671f4f | 554 | sub $inner,$num,#8 |
8eed3289 | 555 | b .LNEON_8n_inner |
d1671f4f | 556 | |
8eed3289 AP |
557 | .align 4 |
558 | .LNEON_8n_inner: | |
559 | subs $inner,$inner,#8 | |
560 | vmlal.u32 @ACC[0],$Bi,${A0}[0] | |
561 | vld1.64 {@ACC[7]},[$tinptr,:128] | |
562 | vmlal.u32 @ACC[1],$Bi,${A0}[1] | |
563 | vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0] | |
564 | vmlal.u32 @ACC[2],$Bi,${A1}[0] | |
565 | vld1.32 {$N0-$N3},[$nptr]! | |
566 | vmlal.u32 @ACC[3],$Bi,${A1}[1] | |
567 | it ne | |
568 | addne $tinptr,$tinptr,#16 @ don't advance in last iteration | |
569 | vmlal.u32 @ACC[4],$Bi,${A2}[0] | |
570 | vmlal.u32 @ACC[5],$Bi,${A2}[1] | |
571 | vmlal.u32 @ACC[6],$Bi,${A3}[0] | |
572 | vmlal.u32 @ACC[7],$Bi,${A3}[1] | |
573 | ___ | |
574 | for ($i=1; $i<8; $i++) { | |
575 | $code.=<<___; | |
576 | vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i] | |
577 | vmlal.u32 @ACC[0],$Ni,${N0}[0] | |
578 | vmlal.u32 @ACC[1],$Ni,${N0}[1] | |
579 | vmlal.u32 @ACC[2],$Ni,${N1}[0] | |
580 | vmlal.u32 @ACC[3],$Ni,${N1}[1] | |
581 | vmlal.u32 @ACC[4],$Ni,${N2}[0] | |
582 | vmlal.u32 @ACC[5],$Ni,${N2}[1] | |
583 | vmlal.u32 @ACC[6],$Ni,${N3}[0] | |
584 | vmlal.u32 @ACC[7],$Ni,${N3}[1] | |
585 | vst1.64 {@ACC[0]},[$toutptr,:128]! | |
586 | ___ | |
587 | push(@ACC,shift(@ACC)); | |
588 | $code.=<<___; | |
589 | vmlal.u32 @ACC[0],$Bi,${A0}[0] | |
590 | vld1.64 {@ACC[7]},[$tinptr,:128] | |
591 | vmlal.u32 @ACC[1],$Bi,${A0}[1] | |
592 | vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i] | |
593 | vmlal.u32 @ACC[2],$Bi,${A1}[0] | |
594 | it ne | |
595 | addne $tinptr,$tinptr,#16 @ don't advance in last iteration | |
596 | vmlal.u32 @ACC[3],$Bi,${A1}[1] | |
597 | vmlal.u32 @ACC[4],$Bi,${A2}[0] | |
598 | vmlal.u32 @ACC[5],$Bi,${A2}[1] | |
599 | vmlal.u32 @ACC[6],$Bi,${A3}[0] | |
600 | vmlal.u32 @ACC[7],$Bi,${A3}[1] | |
601 | ___ | |
602 | } | |
603 | $code.=<<___; | |
604 | it eq | |
605 | subeq $aptr,$aptr,$num,lsl#2 @ rewind | |
606 | vmlal.u32 @ACC[0],$Ni,${N0}[0] | |
607 | vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] | |
608 | vmlal.u32 @ACC[1],$Ni,${N0}[1] | |
609 | vld1.32 {$A0-$A3},[$aptr]! | |
610 | vmlal.u32 @ACC[2],$Ni,${N1}[0] | |
611 | add $bnptr,sp,#8 @ rewind | |
612 | vmlal.u32 @ACC[3],$Ni,${N1}[1] | |
613 | vmlal.u32 @ACC[4],$Ni,${N2}[0] | |
614 | vmlal.u32 @ACC[5],$Ni,${N2}[1] | |
615 | vmlal.u32 @ACC[6],$Ni,${N3}[0] | |
616 | vst1.64 {@ACC[0]},[$toutptr,:128]! | |
617 | vmlal.u32 @ACC[7],$Ni,${N3}[1] | |
618 | ||
619 | bne .LNEON_8n_inner | |
620 | ___ | |
621 | push(@ACC,shift(@ACC)); | |
622 | $code.=<<___; | |
623 | add $tinptr,sp,#128 | |
624 | vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! | |
625 | veor q2,q2,q2 @ $N0-$N1 | |
626 | vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! | |
627 | veor q3,q3,q3 @ $N2-$N3 | |
628 | vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! | |
629 | vst1.64 {@ACC[6]},[$toutptr,:128] | |
630 | ||
631 | subs $outer,$outer,#8 | |
632 | vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]! | |
633 | vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]! | |
634 | vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]! | |
635 | vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]! | |
636 | ||
637 | itt ne | |
638 | subne $nptr,$nptr,$num,lsl#2 @ rewind | |
639 | bne .LNEON_8n_outer | |
640 | ||
641 | add $toutptr,sp,#128 | |
642 | vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame | |
643 | vshr.u64 $temp,@ACC[0]#lo,#16 | |
644 | vst1.64 {q2-q3},[sp,:256]! | |
645 | vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp | |
646 | vst1.64 {q2-q3}, [sp,:256]! | |
647 | vshr.u64 $temp,@ACC[0]#hi,#16 | |
648 | vst1.64 {q2-q3}, [sp,:256]! | |
649 | vzip.16 @ACC[0]#lo,@ACC[0]#hi | |
d1671f4f | 650 | |
d1671f4f | 651 | mov $inner,$num |
8eed3289 | 652 | b .LNEON_tail_entry |
d1671f4f | 653 | |
8eed3289 | 654 | .align 4 |
d1671f4f | 655 | .LNEON_tail: |
8eed3289 AP |
656 | vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp |
657 | vshr.u64 $temp,@ACC[0]#lo,#16 | |
658 | vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]! | |
659 | vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp | |
660 | vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]! | |
661 | vshr.u64 $temp,@ACC[0]#hi,#16 | |
662 | vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]! | |
663 | vzip.16 @ACC[0]#lo,@ACC[0]#hi | |
664 | ||
665 | .LNEON_tail_entry: | |
666 | ___ | |
667 | for ($i=1; $i<8; $i++) { | |
668 | $code.=<<___; | |
669 | vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp | |
670 | vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]! | |
671 | vshr.u64 $temp,@ACC[1]#lo,#16 | |
672 | vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp | |
673 | vshr.u64 $temp,@ACC[1]#hi,#16 | |
674 | vzip.16 @ACC[1]#lo,@ACC[1]#hi | |
675 | ___ | |
676 | push(@ACC,shift(@ACC)); | |
677 | } | |
678 | push(@ACC,shift(@ACC)); | |
679 | $code.=<<___; | |
680 | vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]! | |
d1671f4f | 681 | subs $inner,$inner,#8 |
8eed3289 | 682 | vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]! |
d1671f4f AP |
683 | bne .LNEON_tail |
684 | ||
685 | vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit | |
686 | sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr | |
687 | subs $aptr,sp,#0 @ clear carry flag | |
688 | add $bptr,sp,$num,lsl#2 | |
689 | ||
690 | .LNEON_sub: | |
691 | ldmia $aptr!, {r4-r7} | |
692 | ldmia $nptr!, {r8-r11} | |
693 | sbcs r8, r4,r8 | |
694 | sbcs r9, r5,r9 | |
695 | sbcs r10,r6,r10 | |
696 | sbcs r11,r7,r11 | |
697 | teq $aptr,$bptr @ preserves carry | |
698 | stmia $rptr!, {r8-r11} | |
699 | bne .LNEON_sub | |
700 | ||
701 | ldr r10, [$aptr] @ load top-most bit | |
11208dcf | 702 | mov r11,sp |
d1671f4f | 703 | veor q0,q0,q0 |
11208dcf | 704 | sub r11,$bptr,r11 @ this is num*4 |
d1671f4f AP |
705 | veor q1,q1,q1 |
706 | mov $aptr,sp | |
707 | sub $rptr,$rptr,r11 @ rewind $rptr | |
708 | mov $nptr,$bptr @ second 3/4th of frame | |
709 | sbcs r10,r10,#0 @ result is carry flag | |
710 | ||
711 | .LNEON_copy_n_zap: | |
712 | ldmia $aptr!, {r4-r7} | |
713 | ldmia $rptr, {r8-r11} | |
11208dcf | 714 | it cc |
d1671f4f AP |
715 | movcc r8, r4 |
716 | vst1.64 {q0-q1}, [$nptr,:256]! @ wipe | |
11208dcf | 717 | itt cc |
d1671f4f AP |
718 | movcc r9, r5 |
719 | movcc r10,r6 | |
720 | vst1.64 {q0-q1}, [$nptr,:256]! @ wipe | |
11208dcf | 721 | it cc |
d1671f4f AP |
722 | movcc r11,r7 |
723 | ldmia $aptr, {r4-r7} | |
724 | stmia $rptr!, {r8-r11} | |
725 | sub $aptr,$aptr,#16 | |
726 | ldmia $rptr, {r8-r11} | |
11208dcf | 727 | it cc |
d1671f4f AP |
728 | movcc r8, r4 |
729 | vst1.64 {q0-q1}, [$aptr,:256]! @ wipe | |
11208dcf | 730 | itt cc |
d1671f4f AP |
731 | movcc r9, r5 |
732 | movcc r10,r6 | |
733 | vst1.64 {q0-q1}, [$nptr,:256]! @ wipe | |
11208dcf | 734 | it cc |
d1671f4f AP |
735 | movcc r11,r7 |
736 | teq $aptr,$bptr @ preserves carry | |
737 | stmia $rptr!, {r8-r11} | |
738 | bne .LNEON_copy_n_zap | |
739 | ||
11208dcf | 740 | mov sp,ip |
d1671f4f AP |
741 | vldmia sp!,{d8-d15} |
742 | ldmia sp!,{r4-r11} | |
5dcf70a1 | 743 | ret @ bx lr |
d1671f4f AP |
744 | .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon |
745 | #endif | |
746 | ___ | |
747 | } | |
748 | $code.=<<___; | |
749 | .asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | |
97a6a01f | 750 | .align 2 |
c1669e1c | 751 | #if __ARM_MAX_ARCH__>=7 |
7b508cd1 | 752 | .extern OPENSSL_armcap_P |
d1671f4f | 753 | #endif |
43b8fe1c AP |
754 | ___ |
755 | ||
8eed3289 AP |
# Post-process the accumulated $code one line at a time and print each
# resulting line to STDOUT.
756 | foreach (split("\n",$code)) { |
# Replace every `...` span with the result of eval'ing its contents.
757 | s/\`([^\`]*)\`/eval $1/ge; | |
758 | ||
# The substitutions below are chained with "or", so at most one of them is
# applied to a given line, tried in this order:
#   1. q<N>#lo / q<N>#hi  ->  d<2N> / d<2N+1>
#   2. "ret"              ->  "bx lr"
#   3. literal "bx lr"    ->  the raw instruction word 0xe12fff1e
# A "bx lr" produced by step 2 is therefore left as-is, not turned into .word.
759 | s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or | |
760 | s/\bret\b/bx lr/g or | |
761 | s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 | |
762 | ||
763 | print $_,"\n"; | |
764 | } | |
765 | ||
a21314db | 766 | close STDOUT or die "error closing STDOUT: $!"; |