1 #! /usr/bin/env perl
2 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # ECP_NISTZ256 module for ARMv4.
18 #
19 # October 2014.
20 #
21 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22 # http://eprint.iacr.org/2013/816. In the process of adaptation
23 # original .c module was made 32-bit savvy in order to make this
24 # implementation possible.
25 #
26 # with/without -DECP_NISTZ256_ASM
27 # Cortex-A8 +53-170%
28 # Cortex-A9 +76-205%
29 # Cortex-A15 +100-316%
30 # Snapdragon S4 +66-187%
31 #
32 # Ranges denote minimum and maximum improvement coefficients depending
33 # on benchmark. Lower coefficients are for ECDSA sign, server-side
34 # operation. Keep in mind that +200% means 3x improvement.
35
36 $flavour = shift;
37 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
38 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
39
40 if ($flavour && $flavour ne "void") {
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
43 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
44 die "can't locate arm-xlate.pl";
45
46 open STDOUT,"| \"$^X\" $xlate $flavour $output";
47 } else {
48 open STDOUT,">$output";
49 }
50
51 $code.=<<___;
52 #include "arm_arch.h"
53
54 .text
55 #if defined(__thumb2__)
56 .syntax unified
57 .thumb
58 #else
59 .code 32
60 #endif
61 ___
62 ########################################################################
63 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
64 #
65 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
66 open TABLE,"<ecp_nistz256_table.c" or
67 open TABLE,"<${dir}../ecp_nistz256_table.c" or
68 die "failed to open ecp_nistz256_table.c:",$!;
69
70 use integer;
71
72 foreach(<TABLE>) {
73 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
74 }
75 close TABLE;
76
77 # See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
78 # 64*16*37-1 is because $#arr returns the last valid index of @arr, not
79 # the number of elements.
80 die "insane number of elements" if ($#arr != 64*16*37-1);
81
82 $code.=<<___;
83 .globl ecp_nistz256_precomputed
84 .type ecp_nistz256_precomputed,%object
85 .align 12
86 ecp_nistz256_precomputed:
87 ___
88 ########################################################################
89 # this conversion smashes P256_POINT_AFFINE into individual bytes at
90 # a 64-byte interval, similar to
91 # 1111222233334444
92 # 1234123412341234
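# In other words, byte k of point j within each 64-point sub-table ends
# up at offset 64*k+j, so ecp_nistz256_gather_w7 below can assemble a
# point one byte per ldrb with a fixed stride of 64.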
93 for(1..37) {
94 @tbl = splice(@arr,0,64*16);
95 for($i=0;$i<64;$i++) {
96 undef @line;
97 for($j=0;$j<64;$j++) {
98 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
99 }
100 $code.=".byte\t";
101 $code.=join(',',map { sprintf "0x%02x",$_} @line);
102 $code.="\n";
103 }
104 }
105 $code.=<<___;
106 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
107 .align 5
108 .LRR: @ 2^512 mod P precomputed for NIST P256 polynomial
109 .long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
110 .long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
111 .Lone:
112 .long 1,0,0,0,0,0,0,0
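@ Montgomery aside: mul_mont(a,b) returns a*b*2^-256 mod P, so
@ multiplying by .LRR = 2^512 mod P yields a*2^256 mod P (conversion
@ to Montgomery form), while multiplying by .Lone = 1 yields
@ a*2^-256 mod P (conversion back), which is all to_mont/from_mont do.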
113 .asciz "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
114 .align 6
115 ___
116
117 ########################################################################
118 # Common register layout. Note that $t2 is the link register, so if an
119 # internal subroutine uses $t2, it has to offload lr...
120
121 ($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
122 map("r$_",(0..12,14));
123 ($t0,$t3)=($ff,$a_ptr);
124
125 $code.=<<___;
126 @ void ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
127 .globl ecp_nistz256_to_mont
128 .type ecp_nistz256_to_mont,%function
129 ecp_nistz256_to_mont:
130 adr $b_ptr,.LRR
131 b .Lecp_nistz256_mul_mont
132 .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
133
134 @ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
135 .globl ecp_nistz256_from_mont
136 .type ecp_nistz256_from_mont,%function
137 ecp_nistz256_from_mont:
138 adr $b_ptr,.Lone
139 b .Lecp_nistz256_mul_mont
140 .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
141
142 @ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
143 .globl ecp_nistz256_mul_by_2
144 .type ecp_nistz256_mul_by_2,%function
145 .align 4
146 ecp_nistz256_mul_by_2:
147 stmdb sp!,{r4-r12,lr}
148 bl __ecp_nistz256_mul_by_2
149 #if __ARM_ARCH__>=5 || !defined(__thumb__)
150 ldmia sp!,{r4-r12,pc}
151 #else
152 ldmia sp!,{r4-r12,lr}
153 bx lr @ interoperable with Thumb ISA:-)
154 #endif
155 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
156
157 .type __ecp_nistz256_mul_by_2,%function
158 .align 4
159 __ecp_nistz256_mul_by_2:
160 ldr $a0,[$a_ptr,#0]
161 ldr $a1,[$a_ptr,#4]
162 ldr $a2,[$a_ptr,#8]
163 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself
164 ldr $a3,[$a_ptr,#12]
165 adcs $a1,$a1,$a1
166 ldr $a4,[$a_ptr,#16]
167 adcs $a2,$a2,$a2
168 ldr $a5,[$a_ptr,#20]
169 adcs $a3,$a3,$a3
170 ldr $a6,[$a_ptr,#24]
171 adcs $a4,$a4,$a4
172 ldr $a7,[$a_ptr,#28]
173 adcs $a5,$a5,$a5
174 adcs $a6,$a6,$a6
175 mov $ff,#0
176 adcs $a7,$a7,$a7
177 #ifdef __thumb2__
178 it cs
179 #endif
180 movcs $ff,#-1 @ $ff = carry ? -1 : 0
181
182 b .Lreduce_by_sub
183 .size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
184
185 @ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
186 @ const BN_ULONG r2[8]);
187 .globl ecp_nistz256_add
188 .type ecp_nistz256_add,%function
189 .align 4
190 ecp_nistz256_add:
191 stmdb sp!,{r4-r12,lr}
192 bl __ecp_nistz256_add
193 #if __ARM_ARCH__>=5 || !defined(__thumb__)
194 ldmia sp!,{r4-r12,pc}
195 #else
196 ldmia sp!,{r4-r12,lr}
197 bx lr @ interoperable with Thumb ISA:-)
198 #endif
199 .size ecp_nistz256_add,.-ecp_nistz256_add
200
201 .type __ecp_nistz256_add,%function
202 .align 4
203 __ecp_nistz256_add:
204 str lr,[sp,#-4]! @ push lr
205
206 ldr $a0,[$a_ptr,#0]
207 ldr $a1,[$a_ptr,#4]
208 ldr $a2,[$a_ptr,#8]
209 ldr $a3,[$a_ptr,#12]
210 ldr $a4,[$a_ptr,#16]
211 ldr $t0,[$b_ptr,#0]
212 ldr $a5,[$a_ptr,#20]
213 ldr $t1,[$b_ptr,#4]
214 ldr $a6,[$a_ptr,#24]
215 ldr $t2,[$b_ptr,#8]
216 ldr $a7,[$a_ptr,#28]
217 ldr $t3,[$b_ptr,#12]
218 adds $a0,$a0,$t0
219 ldr $t0,[$b_ptr,#16]
220 adcs $a1,$a1,$t1
221 ldr $t1,[$b_ptr,#20]
222 adcs $a2,$a2,$t2
223 ldr $t2,[$b_ptr,#24]
224 adcs $a3,$a3,$t3
225 ldr $t3,[$b_ptr,#28]
226 adcs $a4,$a4,$t0
227 adcs $a5,$a5,$t1
228 adcs $a6,$a6,$t2
229 mov $ff,#0
230 adcs $a7,$a7,$t3
231 #ifdef __thumb2__
232 it cs
233 #endif
234 movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
235 ldr lr,[sp],#4 @ pop lr
236
237 .Lreduce_by_sub:
238
239 @ if a+b carries, subtract modulus.
240 @
241 @ Note that because mod has special form, i.e. consists of
242 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
243 @ using value of broadcasted carry as a whole or extracting
244 @ single bit. Follow $ff register...
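@ For reference, the modulus in 32-bit words, least significant word
@ first, is 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 1, 0xffffffff,
@ which is why words 0-2 and 7 get $ff, word 6 gets $ff,lsr#31, and
@ words 3-5 get plain 0 below.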
245
246 subs $a0,$a0,$ff @ subtract synthesized modulus
247 sbcs $a1,$a1,$ff
248 str $a0,[$r_ptr,#0]
249 sbcs $a2,$a2,$ff
250 str $a1,[$r_ptr,#4]
251 sbcs $a3,$a3,#0
252 str $a2,[$r_ptr,#8]
253 sbcs $a4,$a4,#0
254 str $a3,[$r_ptr,#12]
255 sbcs $a5,$a5,#0
256 str $a4,[$r_ptr,#16]
257 sbcs $a6,$a6,$ff,lsr#31
258 str $a5,[$r_ptr,#20]
259 sbcs $a7,$a7,$ff
260 str $a6,[$r_ptr,#24]
261 str $a7,[$r_ptr,#28]
262
263 mov pc,lr
264 .size __ecp_nistz256_add,.-__ecp_nistz256_add
265
266 @ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
267 .globl ecp_nistz256_mul_by_3
268 .type ecp_nistz256_mul_by_3,%function
269 .align 4
270 ecp_nistz256_mul_by_3:
271 stmdb sp!,{r4-r12,lr}
272 bl __ecp_nistz256_mul_by_3
273 #if __ARM_ARCH__>=5 || !defined(__thumb__)
274 ldmia sp!,{r4-r12,pc}
275 #else
276 ldmia sp!,{r4-r12,lr}
277 bx lr @ interoperable with Thumb ISA:-)
278 #endif
279 .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
280
281 .type __ecp_nistz256_mul_by_3,%function
282 .align 4
283 __ecp_nistz256_mul_by_3:
284 str lr,[sp,#-4]! @ push lr
285
286 @ As multiplication by 3 is performed as 2*n+n, below are inline
287 @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
288 @ corresponding subroutines for details.
289
290 ldr $a0,[$a_ptr,#0]
291 ldr $a1,[$a_ptr,#4]
292 ldr $a2,[$a_ptr,#8]
293 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
294 ldr $a3,[$a_ptr,#12]
295 adcs $a1,$a1,$a1
296 ldr $a4,[$a_ptr,#16]
297 adcs $a2,$a2,$a2
298 ldr $a5,[$a_ptr,#20]
299 adcs $a3,$a3,$a3
300 ldr $a6,[$a_ptr,#24]
301 adcs $a4,$a4,$a4
302 ldr $a7,[$a_ptr,#28]
303 adcs $a5,$a5,$a5
304 adcs $a6,$a6,$a6
305 mov $ff,#0
306 adcs $a7,$a7,$a7
307 #ifdef __thumb2__
308 it cs
309 #endif
310 movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
311
312 subs $a0,$a0,$ff @ subtract synthesized modulus, see
313 @ .Lreduce_by_sub for details, except
314 @ that we don't write anything to
315 @ memory, but keep intermediate
316 @ results in registers...
317 sbcs $a1,$a1,$ff
318 sbcs $a2,$a2,$ff
319 sbcs $a3,$a3,#0
320 sbcs $a4,$a4,#0
321 ldr $b_ptr,[$a_ptr,#0]
322 sbcs $a5,$a5,#0
323 ldr $t1,[$a_ptr,#4]
324 sbcs $a6,$a6,$ff,lsr#31
325 ldr $t2,[$a_ptr,#8]
326 sbcs $a7,$a7,$ff
327
328 ldr $t0,[$a_ptr,#12]
329 adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
330 ldr $b_ptr,[$a_ptr,#16]
331 adcs $a1,$a1,$t1
332 ldr $t1,[$a_ptr,#20]
333 adcs $a2,$a2,$t2
334 ldr $t2,[$a_ptr,#24]
335 adcs $a3,$a3,$t0
336 ldr $t3,[$a_ptr,#28]
337 adcs $a4,$a4,$b_ptr
338 adcs $a5,$a5,$t1
339 adcs $a6,$a6,$t2
340 mov $ff,#0
341 adcs $a7,$a7,$t3
342 #ifdef __thumb2__
343 it cs
344 #endif
345 movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
346 ldr lr,[sp],#4 @ pop lr
347
348 b .Lreduce_by_sub
349 .size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
350
351 @ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
352 .globl ecp_nistz256_div_by_2
353 .type ecp_nistz256_div_by_2,%function
354 .align 4
355 ecp_nistz256_div_by_2:
356 stmdb sp!,{r4-r12,lr}
357 bl __ecp_nistz256_div_by_2
358 #if __ARM_ARCH__>=5 || !defined(__thumb__)
359 ldmia sp!,{r4-r12,pc}
360 #else
361 ldmia sp!,{r4-r12,lr}
362 bx lr @ interoperable with Thumb ISA:-)
363 #endif
364 .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
365
366 .type __ecp_nistz256_div_by_2,%function
367 .align 4
368 __ecp_nistz256_div_by_2:
369 @ ret = (a is odd ? a+mod : a) >> 1
370
371 ldr $a0,[$a_ptr,#0]
372 ldr $a1,[$a_ptr,#4]
373 ldr $a2,[$a_ptr,#8]
374 mov $ff,$a0,lsl#31 @ place least significant bit to most
375 @ significant position, now arithmetic
376 @ right shift by 31 will produce -1 or
377 @ 0, while logical right shift 1 or 0,
378 @ this is how modulus is conditionally
379 @ synthesized in this case...
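@ Concretely: if a is odd, $ff = 0x80000000, so $ff,asr#31 gives
@ 0xffffffff and $ff,lsr#31 gives 1, exactly the modulus words; if a
@ is even, both shifts give 0 and nothing is added.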
380 ldr $a3,[$a_ptr,#12]
381 adds $a0,$a0,$ff,asr#31
382 ldr $a4,[$a_ptr,#16]
383 adcs $a1,$a1,$ff,asr#31
384 ldr $a5,[$a_ptr,#20]
385 adcs $a2,$a2,$ff,asr#31
386 ldr $a6,[$a_ptr,#24]
387 adcs $a3,$a3,#0
388 ldr $a7,[$a_ptr,#28]
389 adcs $a4,$a4,#0
390 mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early
391 @ because it doesn't affect flags
392 adcs $a5,$a5,#0
393 orr $a0,$a0,$a1,lsl#31
394 adcs $a6,$a6,$ff,lsr#31
395 mov $b_ptr,#0
396 adcs $a7,$a7,$ff,asr#31
397 mov $a1,$a1,lsr#1
398 adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition
399
400 orr $a1,$a1,$a2,lsl#31
401 mov $a2,$a2,lsr#1
402 str $a0,[$r_ptr,#0]
403 orr $a2,$a2,$a3,lsl#31
404 mov $a3,$a3,lsr#1
405 str $a1,[$r_ptr,#4]
406 orr $a3,$a3,$a4,lsl#31
407 mov $a4,$a4,lsr#1
408 str $a2,[$r_ptr,#8]
409 orr $a4,$a4,$a5,lsl#31
410 mov $a5,$a5,lsr#1
411 str $a3,[$r_ptr,#12]
412 orr $a5,$a5,$a6,lsl#31
413 mov $a6,$a6,lsr#1
414 str $a4,[$r_ptr,#16]
415 orr $a6,$a6,$a7,lsl#31
416 mov $a7,$a7,lsr#1
417 str $a5,[$r_ptr,#20]
418 orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit
419 str $a6,[$r_ptr,#24]
420 str $a7,[$r_ptr,#28]
421
422 mov pc,lr
423 .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
424
425 @ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
426 @ const BN_ULONG r2[8]);
427 .globl ecp_nistz256_sub
428 .type ecp_nistz256_sub,%function
429 .align 4
430 ecp_nistz256_sub:
431 stmdb sp!,{r4-r12,lr}
432 bl __ecp_nistz256_sub
433 #if __ARM_ARCH__>=5 || !defined(__thumb__)
434 ldmia sp!,{r4-r12,pc}
435 #else
436 ldmia sp!,{r4-r12,lr}
437 bx lr @ interoperable with Thumb ISA:-)
438 #endif
439 .size ecp_nistz256_sub,.-ecp_nistz256_sub
440
441 .type __ecp_nistz256_sub,%function
442 .align 4
443 __ecp_nistz256_sub:
444 str lr,[sp,#-4]! @ push lr
445
446 ldr $a0,[$a_ptr,#0]
447 ldr $a1,[$a_ptr,#4]
448 ldr $a2,[$a_ptr,#8]
449 ldr $a3,[$a_ptr,#12]
450 ldr $a4,[$a_ptr,#16]
451 ldr $t0,[$b_ptr,#0]
452 ldr $a5,[$a_ptr,#20]
453 ldr $t1,[$b_ptr,#4]
454 ldr $a6,[$a_ptr,#24]
455 ldr $t2,[$b_ptr,#8]
456 ldr $a7,[$a_ptr,#28]
457 ldr $t3,[$b_ptr,#12]
458 subs $a0,$a0,$t0
459 ldr $t0,[$b_ptr,#16]
460 sbcs $a1,$a1,$t1
461 ldr $t1,[$b_ptr,#20]
462 sbcs $a2,$a2,$t2
463 ldr $t2,[$b_ptr,#24]
464 sbcs $a3,$a3,$t3
465 ldr $t3,[$b_ptr,#28]
466 sbcs $a4,$a4,$t0
467 sbcs $a5,$a5,$t1
468 sbcs $a6,$a6,$t2
469 sbcs $a7,$a7,$t3
470 sbc $ff,$ff,$ff @ broadcast borrow bit
471 ldr lr,[sp],#4 @ pop lr
472
473 .Lreduce_by_add:
474
475 @ if a-b borrows, add modulus.
476 @
477 @ Note that because mod has special form, i.e. consists of
478 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
479 @ broadcasting borrow bit to a register, $ff, and using it as
480 @ a whole or extracting single bit.
481
482 adds $a0,$a0,$ff @ add synthesized modulus
483 adcs $a1,$a1,$ff
484 str $a0,[$r_ptr,#0]
485 adcs $a2,$a2,$ff
486 str $a1,[$r_ptr,#4]
487 adcs $a3,$a3,#0
488 str $a2,[$r_ptr,#8]
489 adcs $a4,$a4,#0
490 str $a3,[$r_ptr,#12]
491 adcs $a5,$a5,#0
492 str $a4,[$r_ptr,#16]
493 adcs $a6,$a6,$ff,lsr#31
494 str $a5,[$r_ptr,#20]
495 adcs $a7,$a7,$ff
496 str $a6,[$r_ptr,#24]
497 str $a7,[$r_ptr,#28]
498
499 mov pc,lr
500 .size __ecp_nistz256_sub,.-__ecp_nistz256_sub
501
502 @ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
503 .globl ecp_nistz256_neg
504 .type ecp_nistz256_neg,%function
505 .align 4
506 ecp_nistz256_neg:
507 stmdb sp!,{r4-r12,lr}
508 bl __ecp_nistz256_neg
509 #if __ARM_ARCH__>=5 || !defined(__thumb__)
510 ldmia sp!,{r4-r12,pc}
511 #else
512 ldmia sp!,{r4-r12,lr}
513 bx lr @ interoperable with Thumb ISA:-)
514 #endif
515 .size ecp_nistz256_neg,.-ecp_nistz256_neg
516
517 .type __ecp_nistz256_neg,%function
518 .align 4
519 __ecp_nistz256_neg:
520 ldr $a0,[$a_ptr,#0]
521 eor $ff,$ff,$ff
522 ldr $a1,[$a_ptr,#4]
523 ldr $a2,[$a_ptr,#8]
524 subs $a0,$ff,$a0
525 ldr $a3,[$a_ptr,#12]
526 sbcs $a1,$ff,$a1
527 ldr $a4,[$a_ptr,#16]
528 sbcs $a2,$ff,$a2
529 ldr $a5,[$a_ptr,#20]
530 sbcs $a3,$ff,$a3
531 ldr $a6,[$a_ptr,#24]
532 sbcs $a4,$ff,$a4
533 ldr $a7,[$a_ptr,#28]
534 sbcs $a5,$ff,$a5
535 sbcs $a6,$ff,$a6
536 sbcs $a7,$ff,$a7
537 sbc $ff,$ff,$ff
538
539 b .Lreduce_by_add
540 .size __ecp_nistz256_neg,.-__ecp_nistz256_neg
541 ___
542 {
543 my @acc=map("r$_",(3..11));
544 my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
545
546 $code.=<<___;
547 @ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
548 .globl ecp_nistz256_sqr_mont
549 .type ecp_nistz256_sqr_mont,%function
550 .align 4
551 ecp_nistz256_sqr_mont:
552 mov $b_ptr,$a_ptr
553 b .Lecp_nistz256_mul_mont
554 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
555
556 @ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
557 @ const BN_ULONG r2[8]);
558 .globl ecp_nistz256_mul_mont
559 .type ecp_nistz256_mul_mont,%function
560 .align 4
561 ecp_nistz256_mul_mont:
562 .Lecp_nistz256_mul_mont:
563 stmdb sp!,{r4-r12,lr}
564 bl __ecp_nistz256_mul_mont
565 #if __ARM_ARCH__>=5 || !defined(__thumb__)
566 ldmia sp!,{r4-r12,pc}
567 #else
568 ldmia sp!,{r4-r12,lr}
569 bx lr @ interoperable with Thumb ISA:-)
570 #endif
571 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
572
573 .type __ecp_nistz256_mul_mont,%function
574 .align 4
575 __ecp_nistz256_mul_mont:
576 stmdb sp!,{r0-r2,lr} @ make a copy of arguments too
577
578 ldr $bj,[$b_ptr,#0] @ b[0]
579 ldmia $a_ptr,{@acc[1]-@acc[8]}
580
581 umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0]
582 stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so
583 @ that it can be addressed
584 @ without spending register
585 @ on address
586 umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0]
587 umull @acc[2],$t1,@acc[3],$bj
588 adds @acc[1],@acc[1],$t3 @ accumulate high part of mult
589 umull @acc[3],$t2,@acc[4],$bj
590 adcs @acc[2],@acc[2],$t0
591 umull @acc[4],$t3,@acc[5],$bj
592 adcs @acc[3],@acc[3],$t1
593 umull @acc[5],$t0,@acc[6],$bj
594 adcs @acc[4],@acc[4],$t2
595 umull @acc[6],$t1,@acc[7],$bj
596 adcs @acc[5],@acc[5],$t3
597 umull @acc[7],$t2,@acc[8],$bj
598 adcs @acc[6],@acc[6],$t0
599 adcs @acc[7],@acc[7],$t1
600 eor $t3,$t3,$t3 @ first overflow bit is zero
601 adc @acc[8],$t2,#0
602 ___
603 for(my $i=1;$i<8;$i++) {
604 my $t4=@acc[0];
605
606 # Reduction iteration is normally performed by accumulating
607 # result of multiplication of modulus by "magic" digit [and
608 # omitting least significant word, which is guaranteed to
609 # be 0], but thanks to special form of modulus and "magic"
610 # digit being equal to least significant word, it can be
611 # performed with additions and subtractions alone. Indeed:
612 #
613 # ffff.0001.0000.0000.0000.ffff.ffff.ffff
614 # * abcd
615 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
616 #
617 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
618 # rewrite above as:
619 #
620 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
621 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
622 # - abcd.0000.0000.0000.0000.0000.0000.abcd
623 #
624 # or marking redundant operations:
625 #
626 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
627 # + abcd.0000.abcd.0000.0000.abcd.----.----.----
628 # - abcd.----.----.----.----.----.----.----
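# The "magic" digit equals the least significant word because the
# Montgomery constant n0 = -P^-1 mod 2^32 is 1 for this modulus:
# P mod 2^32 = 0xffffffff = -1, hence -P^-1 = 1 and r[0]*n0 = r[0].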
629
630 $code.=<<___;
631 @ multiplication-less reduction $i
632 adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0]
633 ldr $bj,[sp,#40] @ restore b_ptr
634 adcs @acc[4],@acc[4],#0 @ r[4]+=0
635 adcs @acc[5],@acc[5],#0 @ r[5]+=0
636 adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0]
637 ldr $t1,[sp,#0] @ load a[0]
638 adcs @acc[7],@acc[7],#0 @ r[7]+=0
639 ldr $bj,[$bj,#4*$i] @ load b[i]
640 adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0]
641 eor $t0,$t0,$t0
642 adc $t3,$t3,#0 @ overflow bit
643 subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0]
644 ldr $t2,[sp,#4] @ a[1]
645 sbcs @acc[8],@acc[8],#0 @ r[8]-=0
646 umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i]
647 eor $t1,$t1,$t1
648 sbc @acc[0],$t3,#0 @ overflow bit, keep in mind
649 @ that the net result is
650 @ addition of a value which
651 @ makes underflow impossible
652
653 ldr $t3,[sp,#8] @ a[2]
654 umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i]
655 str @acc[0],[sp,#36] @ temporarily offload overflow
656 eor $t2,$t2,$t2
657 ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0]
658 umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i]
659 eor $t3,$t3,$t3
660 adds @acc[2],@acc[2],$t0 @ accumulate high part of mult
661 ldr $t0,[sp,#16] @ a[4]
662 umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i]
663 eor $t4,$t4,$t4
664 adcs @acc[3],@acc[3],$t1
665 ldr $t1,[sp,#20] @ a[5]
666 umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i]
667 eor $t0,$t0,$t0
668 adcs @acc[4],@acc[4],$t2
669 ldr $t2,[sp,#24] @ a[6]
670 umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i]
671 eor $t1,$t1,$t1
672 adcs @acc[5],@acc[5],$t3
673 ldr $t3,[sp,#28] @ a[7]
674 umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i]
675 eor $t2,$t2,$t2
676 adcs @acc[6],@acc[6],$t4
677 ldr @acc[0],[sp,#36] @ restore overflow bit
678 umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i]
679 eor $t3,$t3,$t3
680 adcs @acc[7],@acc[7],$t0
681 adcs @acc[8],@acc[8],$t1
682 adcs @acc[0],$acc[0],$t2
683 adc $t3,$t3,#0 @ new overflow bit
684 ___
685 push(@acc,shift(@acc)); # rotate registers, so that
686 # "r[i]" becomes r[i]
687 }
688 $code.=<<___;
689 @ last multiplication-less reduction
690 adds @acc[3],@acc[3],@acc[0]
691 ldr $r_ptr,[sp,#32] @ restore r_ptr
692 adcs @acc[4],@acc[4],#0
693 adcs @acc[5],@acc[5],#0
694 adcs @acc[6],@acc[6],@acc[0]
695 adcs @acc[7],@acc[7],#0
696 adcs @acc[8],@acc[8],@acc[0]
697 adc $t3,$t3,#0
698 subs @acc[7],@acc[7],@acc[0]
699 sbcs @acc[8],@acc[8],#0
700 sbc @acc[0],$t3,#0 @ overflow bit
701
702 @ Final step is "if result > mod, subtract mod", but we do it
703 @ "other way around", namely subtract modulus from result
704 @ and if it borrowed, add modulus back.
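@ The modulus words that are 0xffffffff cannot be encoded as ARM
@ immediates, so their subtraction is expressed as adds/adcs with
@ #1/#0 (same result and carry); the equivalent subs/sbcs forms are
@ shown in the comments on the right.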
705
706 adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1
707 adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1
708 adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1
709 sbcs @acc[4],@acc[4],#0
710 sbcs @acc[5],@acc[5],#0
711 sbcs @acc[6],@acc[6],#0
712 sbcs @acc[7],@acc[7],#1
713 adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1
714 ldr lr,[sp,#44] @ restore lr
715 sbc @acc[0],@acc[0],#0 @ broadcast borrow bit
716 add sp,sp,#48
717
718 @ Note that because mod has special form, i.e. consists of
719 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
720 @ broadcasting borrow bit to a register, @acc[0], and using it as
721 @ a whole or extracting single bit.
722
723 adds @acc[1],@acc[1],@acc[0] @ add modulus or zero
724 adcs @acc[2],@acc[2],@acc[0]
725 str @acc[1],[$r_ptr,#0]
726 adcs @acc[3],@acc[3],@acc[0]
727 str @acc[2],[$r_ptr,#4]
728 adcs @acc[4],@acc[4],#0
729 str @acc[3],[$r_ptr,#8]
730 adcs @acc[5],@acc[5],#0
731 str @acc[4],[$r_ptr,#12]
732 adcs @acc[6],@acc[6],#0
733 str @acc[5],[$r_ptr,#16]
734 adcs @acc[7],@acc[7],@acc[0],lsr#31
735 str @acc[6],[$r_ptr,#20]
736 adc @acc[8],@acc[8],@acc[0]
737 str @acc[7],[$r_ptr,#24]
738 str @acc[8],[$r_ptr,#28]
739
740 mov pc,lr
741 .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
742 ___
743 }
744
745 {
746 my ($out,$inp,$index,$mask)=map("r$_",(0..3));
747 $code.=<<___;
748 @ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
749 @ int r2);
750 .globl ecp_nistz256_scatter_w5
751 .type ecp_nistz256_scatter_w5,%function
752 .align 5
753 ecp_nistz256_scatter_w5:
754 stmdb sp!,{r4-r11}
755
756 add $out,$out,$index,lsl#2
757
758 ldmia $inp!,{r4-r11} @ X
759 str r4,[$out,#64*0-4]
760 str r5,[$out,#64*1-4]
761 str r6,[$out,#64*2-4]
762 str r7,[$out,#64*3-4]
763 str r8,[$out,#64*4-4]
764 str r9,[$out,#64*5-4]
765 str r10,[$out,#64*6-4]
766 str r11,[$out,#64*7-4]
767 add $out,$out,#64*8
768
769 ldmia $inp!,{r4-r11} @ Y
770 str r4,[$out,#64*0-4]
771 str r5,[$out,#64*1-4]
772 str r6,[$out,#64*2-4]
773 str r7,[$out,#64*3-4]
774 str r8,[$out,#64*4-4]
775 str r9,[$out,#64*5-4]
776 str r10,[$out,#64*6-4]
777 str r11,[$out,#64*7-4]
778 add $out,$out,#64*8
779
780 ldmia $inp,{r4-r11} @ Z
781 str r4,[$out,#64*0-4]
782 str r5,[$out,#64*1-4]
783 str r6,[$out,#64*2-4]
784 str r7,[$out,#64*3-4]
785 str r8,[$out,#64*4-4]
786 str r9,[$out,#64*5-4]
787 str r10,[$out,#64*6-4]
788 str r11,[$out,#64*7-4]
789
790 ldmia sp!,{r4-r11}
791 #if __ARM_ARCH__>=5 || defined(__thumb__)
792 bx lr
793 #else
794 mov pc,lr
795 #endif
796 .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
797
798 @ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
799 @ int r2);
800 .globl ecp_nistz256_gather_w5
801 .type ecp_nistz256_gather_w5,%function
802 .align 5
803 ecp_nistz256_gather_w5:
804 stmdb sp!,{r4-r11}
805
806 cmp $index,#0
807 mov $mask,#0
808 #ifdef __thumb2__
809 itt ne
810 #endif
811 subne $index,$index,#1
812 movne $mask,#-1
813 add $inp,$inp,$index,lsl#2
814
815 ldr r4,[$inp,#64*0]
816 ldr r5,[$inp,#64*1]
817 ldr r6,[$inp,#64*2]
818 and r4,r4,$mask
819 ldr r7,[$inp,#64*3]
820 and r5,r5,$mask
821 ldr r8,[$inp,#64*4]
822 and r6,r6,$mask
823 ldr r9,[$inp,#64*5]
824 and r7,r7,$mask
825 ldr r10,[$inp,#64*6]
826 and r8,r8,$mask
827 ldr r11,[$inp,#64*7]
828 add $inp,$inp,#64*8
829 and r9,r9,$mask
830 and r10,r10,$mask
831 and r11,r11,$mask
832 stmia $out!,{r4-r11} @ X
833
834 ldr r4,[$inp,#64*0]
835 ldr r5,[$inp,#64*1]
836 ldr r6,[$inp,#64*2]
837 and r4,r4,$mask
838 ldr r7,[$inp,#64*3]
839 and r5,r5,$mask
840 ldr r8,[$inp,#64*4]
841 and r6,r6,$mask
842 ldr r9,[$inp,#64*5]
843 and r7,r7,$mask
844 ldr r10,[$inp,#64*6]
845 and r8,r8,$mask
846 ldr r11,[$inp,#64*7]
847 add $inp,$inp,#64*8
848 and r9,r9,$mask
849 and r10,r10,$mask
850 and r11,r11,$mask
851 stmia $out!,{r4-r11} @ Y
852
853 ldr r4,[$inp,#64*0]
854 ldr r5,[$inp,#64*1]
855 ldr r6,[$inp,#64*2]
856 and r4,r4,$mask
857 ldr r7,[$inp,#64*3]
858 and r5,r5,$mask
859 ldr r8,[$inp,#64*4]
860 and r6,r6,$mask
861 ldr r9,[$inp,#64*5]
862 and r7,r7,$mask
863 ldr r10,[$inp,#64*6]
864 and r8,r8,$mask
865 ldr r11,[$inp,#64*7]
866 and r9,r9,$mask
867 and r10,r10,$mask
868 and r11,r11,$mask
869 stmia $out,{r4-r11} @ Z
870
871 ldmia sp!,{r4-r11}
872 #if __ARM_ARCH__>=5 || defined(__thumb__)
873 bx lr
874 #else
875 mov pc,lr
876 #endif
877 .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
878
879 @ void ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
880 @ int r2);
881 .globl ecp_nistz256_scatter_w7
882 .type ecp_nistz256_scatter_w7,%function
883 .align 5
884 ecp_nistz256_scatter_w7:
885 add $out,$out,$index
886 mov $index,#64/4
887 .Loop_scatter_w7:
888 ldr $mask,[$inp],#4
889 subs $index,$index,#1
890 strb $mask,[$out,#64*0-1]
891 mov $mask,$mask,lsr#8
892 strb $mask,[$out,#64*1-1]
893 mov $mask,$mask,lsr#8
894 strb $mask,[$out,#64*2-1]
895 mov $mask,$mask,lsr#8
896 strb $mask,[$out,#64*3-1]
897 add $out,$out,#64*4
898 bne .Loop_scatter_w7
899
900 #if __ARM_ARCH__>=5 || defined(__thumb__)
901 bx lr
902 #else
903 mov pc,lr
904 #endif
905 .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
906
907 @ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
908 @ int r2);
909 .globl ecp_nistz256_gather_w7
910 .type ecp_nistz256_gather_w7,%function
911 .align 5
912 ecp_nistz256_gather_w7:
913 stmdb sp!,{r4-r7}
914
915 cmp $index,#0
916 mov $mask,#0
917 #ifdef __thumb2__
918 itt ne
919 #endif
920 subne $index,$index,#1
921 movne $mask,#-1
922 add $inp,$inp,$index
923 mov $index,#64/4
924 nop
925 .Loop_gather_w7:
926 ldrb r4,[$inp,#64*0]
927 subs $index,$index,#1
928 ldrb r5,[$inp,#64*1]
929 ldrb r6,[$inp,#64*2]
930 ldrb r7,[$inp,#64*3]
931 add $inp,$inp,#64*4
932 orr r4,r4,r5,lsl#8
933 orr r4,r4,r6,lsl#16
934 orr r4,r4,r7,lsl#24
935 and r4,r4,$mask
936 str r4,[$out],#4
937 bne .Loop_gather_w7
938
939 ldmia sp!,{r4-r7}
940 #if __ARM_ARCH__>=5 || defined(__thumb__)
941 bx lr
942 #else
943 mov pc,lr
944 #endif
945 .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
946 ___
947 }
948 if (0) {
949 # In comparison to integer-only equivalent of below subroutine:
950 #
951 # Cortex-A8 +10%
952 # Cortex-A9 -10%
953 # Snapdragon S4 +5%
954 #
955 # As not all time is spent in multiplication, overall impact is deemed
956 # too low to care about.
957
958 my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
959 my $mask="q4";
960 my $mult="q5";
961 my @AxB=map("q$_",(8..15));
962
963 my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
964
965 $code.=<<___;
966 #if __ARM_ARCH__>=7
967 .fpu neon
968
969 .globl ecp_nistz256_mul_mont_neon
970 .type ecp_nistz256_mul_mont_neon,%function
971 .align 5
972 ecp_nistz256_mul_mont_neon:
973 mov ip,sp
974 stmdb sp!,{r4-r9}
975 vstmdb sp!,{q4-q5} @ ABI specification says so
976
977 sub $toutptr,sp,#40
978 vld1.32 {${Bi}[0]},[$bptr,:32]!
979 veor $zero,$zero,$zero
980 vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-(
981 vzip.16 $Bi,$zero
982 mov sp,$toutptr @ alloca
983 vmov.i64 $mask,#0xffff
984
985 vmull.u32 @AxB[0],$Bi,${A0}[0]
986 vmull.u32 @AxB[1],$Bi,${A0}[1]
987 vmull.u32 @AxB[2],$Bi,${A1}[0]
988 vmull.u32 @AxB[3],$Bi,${A1}[1]
989 vshr.u64 $temp,@AxB[0]#lo,#16
990 vmull.u32 @AxB[4],$Bi,${A2}[0]
991 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
992 vmull.u32 @AxB[5],$Bi,${A2}[1]
993 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0]
994 vmull.u32 @AxB[6],$Bi,${A3}[0]
995 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
996 vmull.u32 @AxB[7],$Bi,${A3}[1]
997 ___
998 for($i=1;$i<8;$i++) {
999 $code.=<<___;
1000 vld1.32 {${Bi}[0]},[$bptr,:32]!
1001 veor $zero,$zero,$zero
1002 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction
1003 vshl.u64 $mult,@AxB[0],#32
1004 vadd.u64 @AxB[3],@AxB[3],@AxB[0]
1005 vsub.u64 $mult,$mult,@AxB[0]
1006 vzip.16 $Bi,$zero
1007 vadd.u64 @AxB[6],@AxB[6],@AxB[0]
1008 vadd.u64 @AxB[7],@AxB[7],$mult
1009 ___
1010 push(@AxB,shift(@AxB));
1011 $code.=<<___;
1012 vmlal.u32 @AxB[0],$Bi,${A0}[0]
1013 vmlal.u32 @AxB[1],$Bi,${A0}[1]
1014 vmlal.u32 @AxB[2],$Bi,${A1}[0]
1015 vmlal.u32 @AxB[3],$Bi,${A1}[1]
1016 vshr.u64 $temp,@AxB[0]#lo,#16
1017 vmlal.u32 @AxB[4],$Bi,${A2}[0]
1018 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
1019 vmlal.u32 @AxB[5],$Bi,${A2}[1]
1020 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0]
1021 vmlal.u32 @AxB[6],$Bi,${A3}[0]
1022 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
1023 vmull.u32 @AxB[7],$Bi,${A3}[1]
1024 ___
1025 }
1026 $code.=<<___;
1027 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction
1028 vshl.u64 $mult,@AxB[0],#32
1029 vadd.u64 @AxB[3],@AxB[3],@AxB[0]
1030 vsub.u64 $mult,$mult,@AxB[0]
1031 vadd.u64 @AxB[6],@AxB[6],@AxB[0]
1032 vadd.u64 @AxB[7],@AxB[7],$mult
1033
1034 vshr.u64 $temp,@AxB[1]#lo,#16 @ convert
1035 vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
1036 vshr.u64 $temp,@AxB[1]#hi,#16
1037 vzip.16 @AxB[1]#lo,@AxB[1]#hi
1038 ___
1039 foreach (2..7) {
1040 $code.=<<___;
1041 vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
1042 vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]!
1043 vshr.u64 $temp,@AxB[$_]#lo,#16
1044 vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
1045 vshr.u64 $temp,@AxB[$_]#hi,#16
1046 vzip.16 @AxB[$_]#lo,@AxB[$_]#hi
1047 ___
1048 }
1049 $code.=<<___;
1050 vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]!
1051 vst1.32 {$temp},[$toutptr] @ upper 33 bits
1052
1053 ldr r1,[sp,#0]
1054 ldr r2,[sp,#4]
1055 ldr r3,[sp,#8]
1056 subs r1,r1,#-1
1057 ldr r4,[sp,#12]
1058 sbcs r2,r2,#-1
1059 ldr r5,[sp,#16]
1060 sbcs r3,r3,#-1
1061 ldr r6,[sp,#20]
1062 sbcs r4,r4,#0
1063 ldr r7,[sp,#24]
1064 sbcs r5,r5,#0
1065 ldr r8,[sp,#28]
1066 sbcs r6,r6,#0
1067 ldr r9,[sp,#32] @ top-most bit
1068 sbcs r7,r7,#1
1069 sub sp,ip,#40+16
1070 sbcs r8,r8,#-1
1071 sbc r9,r9,#0
1072 vldmia sp!,{q4-q5}
1073
1074 adds r1,r1,r9
1075 adcs r2,r2,r9
1076 str r1,[$rptr,#0]
1077 adcs r3,r3,r9
1078 str r2,[$rptr,#4]
1079 adcs r4,r4,#0
1080 str r3,[$rptr,#8]
1081 adcs r5,r5,#0
1082 str r4,[$rptr,#12]
1083 adcs r6,r6,#0
1084 str r5,[$rptr,#16]
1085 adcs r7,r7,r9,lsr#31
1086 str r6,[$rptr,#20]
1087 adcs r8,r8,r9
1088 str r7,[$rptr,#24]
1089 str r8,[$rptr,#28]
1090
1091 ldmia sp!,{r4-r9}
1092 bx lr
1093 .size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
1094 #endif
1095 ___
1096 }
1097
1098 {{{
1099 ########################################################################
1100 # Below $aN assignment matches order in which 256-bit result appears in
1101 # register bank at return from __ecp_nistz256_mul_mont, so that we can
1102 # skip over reloading it from memory. This means that the functions below
1103 # use a custom calling sequence accepting 256-bit input in registers,
1104 # output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
1105 #
1106 # See their "normal" counterparts for insights on calculations.
1107
1108 my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
1109 $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
1110 my $ff=$b_ptr;
1111
1112 $code.=<<___;
1113 .type __ecp_nistz256_sub_from,%function
1114 .align 5
1115 __ecp_nistz256_sub_from:
1116 str lr,[sp,#-4]! @ push lr
1117
1118 ldr $t0,[$b_ptr,#0]
1119 ldr $t1,[$b_ptr,#4]
1120 ldr $t2,[$b_ptr,#8]
1121 ldr $t3,[$b_ptr,#12]
1122 subs $a0,$a0,$t0
1123 ldr $t0,[$b_ptr,#16]
1124 sbcs $a1,$a1,$t1
1125 ldr $t1,[$b_ptr,#20]
1126 sbcs $a2,$a2,$t2
1127 ldr $t2,[$b_ptr,#24]
1128 sbcs $a3,$a3,$t3
1129 ldr $t3,[$b_ptr,#28]
1130 sbcs $a4,$a4,$t0
1131 sbcs $a5,$a5,$t1
1132 sbcs $a6,$a6,$t2
1133 sbcs $a7,$a7,$t3
1134 sbc $ff,$ff,$ff @ broadcast borrow bit
1135 ldr lr,[sp],#4 @ pop lr
1136
1137 adds $a0,$a0,$ff @ add synthesized modulus
1138 adcs $a1,$a1,$ff
1139 str $a0,[$r_ptr,#0]
1140 adcs $a2,$a2,$ff
1141 str $a1,[$r_ptr,#4]
1142 adcs $a3,$a3,#0
1143 str $a2,[$r_ptr,#8]
1144 adcs $a4,$a4,#0
1145 str $a3,[$r_ptr,#12]
1146 adcs $a5,$a5,#0
1147 str $a4,[$r_ptr,#16]
1148 adcs $a6,$a6,$ff,lsr#31
1149 str $a5,[$r_ptr,#20]
1150 adcs $a7,$a7,$ff
1151 str $a6,[$r_ptr,#24]
1152 str $a7,[$r_ptr,#28]
1153
1154 mov pc,lr
1155 .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
1156
1157 .type __ecp_nistz256_sub_morf,%function
1158 .align 5
1159 __ecp_nistz256_sub_morf:
1160 str lr,[sp,#-4]! @ push lr
1161
1162 ldr $t0,[$b_ptr,#0]
1163 ldr $t1,[$b_ptr,#4]
1164 ldr $t2,[$b_ptr,#8]
1165 ldr $t3,[$b_ptr,#12]
1166 subs $a0,$t0,$a0
1167 ldr $t0,[$b_ptr,#16]
1168 sbcs $a1,$t1,$a1
1169 ldr $t1,[$b_ptr,#20]
1170 sbcs $a2,$t2,$a2
1171 ldr $t2,[$b_ptr,#24]
1172 sbcs $a3,$t3,$a3
1173 ldr $t3,[$b_ptr,#28]
1174 sbcs $a4,$t0,$a4
1175 sbcs $a5,$t1,$a5
1176 sbcs $a6,$t2,$a6
1177 sbcs $a7,$t3,$a7
1178 sbc $ff,$ff,$ff @ broadcast borrow bit
1179 ldr lr,[sp],#4 @ pop lr
1180
1181 adds $a0,$a0,$ff @ add synthesized modulus
1182 adcs $a1,$a1,$ff
1183 str $a0,[$r_ptr,#0]
1184 adcs $a2,$a2,$ff
1185 str $a1,[$r_ptr,#4]
1186 adcs $a3,$a3,#0
1187 str $a2,[$r_ptr,#8]
1188 adcs $a4,$a4,#0
1189 str $a3,[$r_ptr,#12]
1190 adcs $a5,$a5,#0
1191 str $a4,[$r_ptr,#16]
1192 adcs $a6,$a6,$ff,lsr#31
1193 str $a5,[$r_ptr,#20]
1194 adcs $a7,$a7,$ff
1195 str $a6,[$r_ptr,#24]
1196 str $a7,[$r_ptr,#28]
1197
1198 mov pc,lr
1199 .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
1200
1201 .type __ecp_nistz256_add_self,%function
1202 .align 4
1203 __ecp_nistz256_add_self:
1204 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
1205 adcs $a1,$a1,$a1
1206 adcs $a2,$a2,$a2
1207 adcs $a3,$a3,$a3
1208 adcs $a4,$a4,$a4
1209 adcs $a5,$a5,$a5
1210 adcs $a6,$a6,$a6
1211 mov $ff,#0
1212 adcs $a7,$a7,$a7
1213 #ifdef __thumb2__
1214 it cs
1215 #endif
1216 movcs $ff,#-1 @ $ff = carry ? -1 : 0
1217
1218 subs $a0,$a0,$ff @ subtract synthesized modulus
1219 sbcs $a1,$a1,$ff
1220 str $a0,[$r_ptr,#0]
1221 sbcs $a2,$a2,$ff
1222 str $a1,[$r_ptr,#4]
1223 sbcs $a3,$a3,#0
1224 str $a2,[$r_ptr,#8]
1225 sbcs $a4,$a4,#0
1226 str $a3,[$r_ptr,#12]
1227 sbcs $a5,$a5,#0
1228 str $a4,[$r_ptr,#16]
1229 sbcs $a6,$a6,$ff,lsr#31
1230 str $a5,[$r_ptr,#20]
1231 sbcs $a7,$a7,$ff
1232 str $a6,[$r_ptr,#24]
1233 str $a7,[$r_ptr,#28]
1234
1235 mov pc,lr
1236 .size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self
1237
1238 ___
1239
1240 ########################################################################
1241 # The following subroutines are a "literal" implementation of those found
1242 # in ecp_nistz256.c
1243 #
1244 ########################################################################
1245 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1246 #
1247 {
1248 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1249 # above map() describes stack layout with 5 temporary
1250 # 256-bit vectors on top. Then note that we push
1251 # starting from r0, which means that we have a copy of the
1252 # input arguments just below these temporary vectors.
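# In particular, the saved r0 (result pointer) sits at [sp,#32*5] and
# the saved r1 (input pointer) at [sp,#32*5+4], which is where the code
# below reloads them from.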
1253
1254 $code.=<<___;
1255 .globl ecp_nistz256_point_double
1256 .type ecp_nistz256_point_double,%function
1257 .align 5
1258 ecp_nistz256_point_double:
1259 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1260 sub sp,sp,#32*5
1261
1262 .Lpoint_double_shortcut:
1263 add r3,sp,#$in_x
1264 ldmia $a_ptr!,{r4-r11} @ copy in_x
1265 stmia r3,{r4-r11}
1266
1267 add $r_ptr,sp,#$S
1268 bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
1269
1270 add $b_ptr,$a_ptr,#32
1271 add $a_ptr,$a_ptr,#32
1272 add $r_ptr,sp,#$Zsqr
1273 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
1274
1275 add $a_ptr,sp,#$S
1276 add $b_ptr,sp,#$S
1277 add $r_ptr,sp,#$S
1278 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
1279
1280 ldr $b_ptr,[sp,#32*5+4]
1281 add $a_ptr,$b_ptr,#32
1282 add $b_ptr,$b_ptr,#64
1283 add $r_ptr,sp,#$tmp0
1284 bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
1285
1286 ldr $r_ptr,[sp,#32*5]
1287 add $r_ptr,$r_ptr,#64
1288 bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
1289
1290 add $a_ptr,sp,#$in_x
1291 add $b_ptr,sp,#$Zsqr
1292 add $r_ptr,sp,#$M
1293 bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
1294
1295 add $a_ptr,sp,#$in_x
1296 add $b_ptr,sp,#$Zsqr
1297 add $r_ptr,sp,#$Zsqr
1298 bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
1299
1300 add $a_ptr,sp,#$S
1301 add $b_ptr,sp,#$S
1302 add $r_ptr,sp,#$tmp0
1303 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
1304
1305 add $a_ptr,sp,#$Zsqr
1306 add $b_ptr,sp,#$M
1307 add $r_ptr,sp,#$M
1308 bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
1309
1310 ldr $r_ptr,[sp,#32*5]
1311 add $a_ptr,sp,#$tmp0
1312 add $r_ptr,$r_ptr,#32
1313 bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
1314
1315 add $a_ptr,sp,#$M
1316 add $r_ptr,sp,#$M
1317 bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
1318
1319 add $a_ptr,sp,#$in_x
1320 add $b_ptr,sp,#$S
1321 add $r_ptr,sp,#$S
1322 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
1323
1324 add $r_ptr,sp,#$tmp0
1325 bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
1326
1327 ldr $r_ptr,[sp,#32*5]
1328 add $a_ptr,sp,#$M
1329 add $b_ptr,sp,#$M
1330 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
1331
1332 add $b_ptr,sp,#$tmp0
1333 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
1334
1335 add $b_ptr,sp,#$S
1336 add $r_ptr,sp,#$S
1337 bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x);
1338
1339 add $a_ptr,sp,#$M
1340 add $b_ptr,sp,#$S
1341 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
1342
1343 ldr $r_ptr,[sp,#32*5]
1344 add $b_ptr,$r_ptr,#32
1345 add $r_ptr,$r_ptr,#32
1346 bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y);
1347
1348 add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3"
1349 #if __ARM_ARCH__>=5 || !defined(__thumb__)
1350 ldmia sp!,{r4-r12,pc}
1351 #else
1352 ldmia sp!,{r4-r12,lr}
1353 bx lr @ interoperable with Thumb ISA:-)
1354 #endif
1355 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
1356 ___
1357 }
1358
1359 ########################################################################
1360 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1361 # const P256_POINT *in2);
1362 {
1363 my ($res_x,$res_y,$res_z,
1364 $in1_x,$in1_y,$in1_z,
1365 $in2_x,$in2_y,$in2_z,
1366 $H,$Hsqr,$R,$Rsqr,$Hcub,
1367 $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1368 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
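# $Z1sqr and $Z2sqr can share the $Hsqr and $Rsqr slots because both
# squares are fully consumed (for S1, S2, U1, U2) before Hsqr and Rsqr
# are ever written.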
1369 # above map() describes stack layout with 18 temporary
1370 # 256-bit vectors on top. Then note that we push
1371 # starting from r0, which means that we have a copy of the
1372 # input arguments just below these temporary vectors.
1373 # We use three of them for !in1infty, !in2infty and the
1374 # result of the zero check.
1375
1376 $code.=<<___;
1377 .globl ecp_nistz256_point_add
1378 .type ecp_nistz256_point_add,%function
1379 .align 5
1380 ecp_nistz256_point_add:
1381 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1382 sub sp,sp,#32*18+16
1383
1384 ldmia $b_ptr!,{r4-r11} @ copy in2
1385 add r3,sp,#$in2_x
1386 orr r12,r4,r5
1387 orr r12,r12,r6
1388 orr r12,r12,r7
1389 orr r12,r12,r8
1390 orr r12,r12,r9
1391 orr r12,r12,r10
1392 orr r12,r12,r11
1393 stmia r3!,{r4-r11}
1394 ldmia $b_ptr!,{r4-r11}
1395 orr r12,r12,r4
1396 orr r12,r12,r5
1397 orr r12,r12,r6
1398 orr r12,r12,r7
1399 orr r12,r12,r8
1400 orr r12,r12,r9
1401 orr r12,r12,r10
1402 orr r12,r12,r11
1403 stmia r3!,{r4-r11}
1404 ldmia $b_ptr,{r4-r11}
1405 cmp r12,#0
1406 #ifdef __thumb2__
1407 it ne
1408 #endif
1409 movne r12,#-1
1410 stmia r3,{r4-r11}
1411 str r12,[sp,#32*18+8] @ !in2infty
1412
1413 ldmia $a_ptr!,{r4-r11} @ copy in1
1414 add r3,sp,#$in1_x
1415 orr r12,r4,r5
1416 orr r12,r12,r6
1417 orr r12,r12,r7
1418 orr r12,r12,r8
1419 orr r12,r12,r9
1420 orr r12,r12,r10
1421 orr r12,r12,r11
1422 stmia r3!,{r4-r11}
1423 ldmia $a_ptr!,{r4-r11}
1424 orr r12,r12,r4
1425 orr r12,r12,r5
1426 orr r12,r12,r6
1427 orr r12,r12,r7
1428 orr r12,r12,r8
1429 orr r12,r12,r9
1430 orr r12,r12,r10
1431 orr r12,r12,r11
1432 stmia r3!,{r4-r11}
1433 ldmia $a_ptr,{r4-r11}
1434 cmp r12,#0
1435 #ifdef __thumb2__
1436 it ne
1437 #endif
1438 movne r12,#-1
1439 stmia r3,{r4-r11}
1440 str r12,[sp,#32*18+4] @ !in1infty
1441
1442 add $a_ptr,sp,#$in2_z
1443 add $b_ptr,sp,#$in2_z
1444 add $r_ptr,sp,#$Z2sqr
1445 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z);
1446
1447 add $a_ptr,sp,#$in1_z
1448 add $b_ptr,sp,#$in1_z
1449 add $r_ptr,sp,#$Z1sqr
1450 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
1451
1452 add $a_ptr,sp,#$in2_z
1453 add $b_ptr,sp,#$Z2sqr
1454 add $r_ptr,sp,#$S1
1455 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z);
1456
1457 add $a_ptr,sp,#$in1_z
1458 add $b_ptr,sp,#$Z1sqr
1459 add $r_ptr,sp,#$S2
1460 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
1461
1462 add $a_ptr,sp,#$in1_y
1463 add $b_ptr,sp,#$S1
1464 add $r_ptr,sp,#$S1
1465 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y);
1466
1467 add $a_ptr,sp,#$in2_y
1468 add $b_ptr,sp,#$S2
1469 add $r_ptr,sp,#$S2
1470 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
1471
1472 add $b_ptr,sp,#$S1
1473 add $r_ptr,sp,#$R
1474 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1);
1475
1476 orr $a0,$a0,$a1 @ see if result is zero
1477 orr $a2,$a2,$a3
1478 orr $a4,$a4,$a5
1479 orr $a0,$a0,$a2
1480 orr $a4,$a4,$a6
1481 orr $a0,$a0,$a7
1482 add $a_ptr,sp,#$in1_x
1483 orr $a0,$a0,$a4
1484 add $b_ptr,sp,#$Z2sqr
1485 str $a0,[sp,#32*18+12]
1486
1487 add $r_ptr,sp,#$U1
1488 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr);
1489
1490 add $a_ptr,sp,#$in2_x
1491 add $b_ptr,sp,#$Z1sqr
1492 add $r_ptr,sp,#$U2
1493 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr);
1494
1495 add $b_ptr,sp,#$U1
1496 add $r_ptr,sp,#$H
1497 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1);
1498
1499 orr $a0,$a0,$a1 @ see if result is zero
1500 orr $a2,$a2,$a3
1501 orr $a4,$a4,$a5
1502 orr $a0,$a0,$a2
1503 orr $a4,$a4,$a6
1504 orr $a0,$a0,$a7
1505 orrs $a0,$a0,$a4
1506
1507 bne .Ladd_proceed @ is_equal(U1,U2)?
1508
1509 ldr $t0,[sp,#32*18+4]
1510 ldr $t1,[sp,#32*18+8]
1511 ldr $t2,[sp,#32*18+12]
1512 tst $t0,$t1
1513 beq .Ladd_proceed @ (in1infty || in2infty)?
1514 tst $t2,$t2
1515 beq .Ladd_double @ is_equal(S1,S2)?
1516
1517 ldr $r_ptr,[sp,#32*18+16]
1518 eor r4,r4,r4
1519 eor r5,r5,r5
1520 eor r6,r6,r6
1521 eor r7,r7,r7
1522 eor r8,r8,r8
1523 eor r9,r9,r9
1524 eor r10,r10,r10
1525 eor r11,r11,r11
1526 stmia $r_ptr!,{r4-r11}
1527 stmia $r_ptr!,{r4-r11}
1528 stmia $r_ptr!,{r4-r11}
1529 b .Ladd_done
1530
1531 .align 4
1532 .Ladd_double:
1533 ldr $a_ptr,[sp,#32*18+20]
1534 add sp,sp,#32*(18-5)+16 @ difference in frame sizes
1535 b .Lpoint_double_shortcut
1536
1537 .align 4
1538 .Ladd_proceed:
1539 add $a_ptr,sp,#$R
1540 add $b_ptr,sp,#$R
1541 add $r_ptr,sp,#$Rsqr
1542 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
1543
1544 add $a_ptr,sp,#$H
1545 add $b_ptr,sp,#$in1_z
1546 add $r_ptr,sp,#$res_z
1547 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
1548
1549 add $a_ptr,sp,#$H
1550 add $b_ptr,sp,#$H
1551 add $r_ptr,sp,#$Hsqr
1552 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
1553
1554 add $a_ptr,sp,#$in2_z
1555 add $b_ptr,sp,#$res_z
1556 add $r_ptr,sp,#$res_z
1557 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z);
1558
1559 add $a_ptr,sp,#$H
1560 add $b_ptr,sp,#$Hsqr
1561 add $r_ptr,sp,#$Hcub
1562 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
1563
1564 add $a_ptr,sp,#$Hsqr
1565 add $b_ptr,sp,#$U1
1566 add $r_ptr,sp,#$U2
1567 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr);
1568
1569 add $r_ptr,sp,#$Hsqr
1570 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
1571
1572 add $b_ptr,sp,#$Rsqr
1573 add $r_ptr,sp,#$res_x
1574 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
1575
1576 add $b_ptr,sp,#$Hcub
1577 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
1578
1579 add $b_ptr,sp,#$U2
1580 add $r_ptr,sp,#$res_y
1581 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
1582
1583 add $a_ptr,sp,#$Hcub
1584 add $b_ptr,sp,#$S1
1585 add $r_ptr,sp,#$S2
1586 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub);
1587
1588 add $a_ptr,sp,#$R
1589 add $b_ptr,sp,#$res_y
1590 add $r_ptr,sp,#$res_y
1591 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
1592
1593 add $b_ptr,sp,#$S2
1594 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
1595
1596 ldr r11,[sp,#32*18+4] @ !in1infty
1597 ldr r12,[sp,#32*18+8] @ !in2infty
1598 add r1,sp,#$res_x
1599 add r2,sp,#$in2_x
1600 and r10,r11,r12
1601 mvn r11,r11
1602 add r3,sp,#$in1_x
1603 and r11,r11,r12
1604 mvn r12,r12
1605 ldr $r_ptr,[sp,#32*18+16]
1606 ___
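# Constant-time selection of the result: r10, r11 and r12 are all-ones
# masks for "both inputs finite" (keep the computed result), "only in1
# at infinity" (return in2) and "in2 at infinity" (return in1).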
1607 for($i=0;$i<96;$i+=8) { # conditional moves
1608 $code.=<<___;
1609 ldmia r1!,{r4-r5} @ res_x
1610 ldmia r2!,{r6-r7} @ in2_x
1611 ldmia r3!,{r8-r9} @ in1_x
1612 and r4,r4,r10
1613 and r5,r5,r10
1614 and r6,r6,r11
1615 and r7,r7,r11
1616 and r8,r8,r12
1617 and r9,r9,r12
1618 orr r4,r4,r6
1619 orr r5,r5,r7
1620 orr r4,r4,r8
1621 orr r5,r5,r9
1622 stmia $r_ptr!,{r4-r5}
1623 ___
1624 }
1625 $code.=<<___;
1626 .Ladd_done:
1627 add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3"
1628 #if __ARM_ARCH__>=5 || !defined(__thumb__)
1629 ldmia sp!,{r4-r12,pc}
1630 #else
1631 ldmia sp!,{r4-r12,lr}
1632 bx lr @ interoperable with Thumb ISA:-)
1633 #endif
1634 .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1635 ___
1636 }
1637
1638 ########################################################################
1639 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1640 # const P256_POINT_AFFINE *in2);
1641 {
1642 my ($res_x,$res_y,$res_z,
1643 $in1_x,$in1_y,$in1_z,
1644 $in2_x,$in2_y,
1645 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1646 my $Z1sqr = $S2;
1647 # above map() describes stack layout with 15 temporary
1648 # 256-bit vectors on top. Then note that we push
1649 # starting from r0, which means that we have a copy of the
1650 # input arguments just below these temporary vectors.
1651 # We use two of them for !in1infty and !in2infty.
1652
1653 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
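# @ONE_mont is 1 in Montgomery form, i.e. 2^256 mod P, written as
# 32-bit words with -1 and -2 standing for 0xffffffff and 0xfffffffe;
# it supplies the implicit Z=1 coordinate of the affine second input.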
1654
1655 $code.=<<___;
1656 .globl ecp_nistz256_point_add_affine
1657 .type ecp_nistz256_point_add_affine,%function
1658 .align 5
1659 ecp_nistz256_point_add_affine:
1660 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1661 sub sp,sp,#32*15
1662
1663 ldmia $a_ptr!,{r4-r11} @ copy in1
1664 add r3,sp,#$in1_x
1665 orr r12,r4,r5
1666 orr r12,r12,r6
1667 orr r12,r12,r7
1668 orr r12,r12,r8
1669 orr r12,r12,r9
1670 orr r12,r12,r10
1671 orr r12,r12,r11
1672 stmia r3!,{r4-r11}
1673 ldmia $a_ptr!,{r4-r11}
1674 orr r12,r12,r4
1675 orr r12,r12,r5
1676 orr r12,r12,r6
1677 orr r12,r12,r7
1678 orr r12,r12,r8
1679 orr r12,r12,r9
1680 orr r12,r12,r10
1681 orr r12,r12,r11
1682 stmia r3!,{r4-r11}
1683 ldmia $a_ptr,{r4-r11}
1684 cmp r12,#0
1685 #ifdef __thumb2__
1686 it ne
1687 #endif
1688 movne r12,#-1
1689 stmia r3,{r4-r11}
1690 str r12,[sp,#32*15+4] @ !in1infty
1691
1692 ldmia $b_ptr!,{r4-r11} @ copy in2
1693 add r3,sp,#$in2_x
1694 orr r12,r4,r5
1695 orr r12,r12,r6
1696 orr r12,r12,r7
1697 orr r12,r12,r8
1698 orr r12,r12,r9
1699 orr r12,r12,r10
1700 orr r12,r12,r11
1701 stmia r3!,{r4-r11}
1702 ldmia $b_ptr!,{r4-r11}
1703 orr r12,r12,r4
1704 orr r12,r12,r5
1705 orr r12,r12,r6
1706 orr r12,r12,r7
1707 orr r12,r12,r8
1708 orr r12,r12,r9
1709 orr r12,r12,r10
1710 orr r12,r12,r11
1711 stmia r3!,{r4-r11}
1712 cmp r12,#0
1713 #ifdef __thumb2__
1714 it ne
1715 #endif
1716 movne r12,#-1
1717 str r12,[sp,#32*15+8] @ !in2infty
1718
1719 add $a_ptr,sp,#$in1_z
1720 add $b_ptr,sp,#$in1_z
1721 add $r_ptr,sp,#$Z1sqr
1722 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
1723
1724 add $a_ptr,sp,#$Z1sqr
1725 add $b_ptr,sp,#$in2_x
1726 add $r_ptr,sp,#$U2
1727 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x);
1728
1729 add $b_ptr,sp,#$in1_x
1730 add $r_ptr,sp,#$H
1731 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x);
1732
1733 add $a_ptr,sp,#$Z1sqr
1734 add $b_ptr,sp,#$in1_z
1735 add $r_ptr,sp,#$S2
1736 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
1737
1738 add $a_ptr,sp,#$H
1739 add $b_ptr,sp,#$in1_z
1740 add $r_ptr,sp,#$res_z
1741 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
1742
1743 add $a_ptr,sp,#$in2_y
1744 add $b_ptr,sp,#$S2
1745 add $r_ptr,sp,#$S2
1746 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
1747
1748 add $b_ptr,sp,#$in1_y
1749 add $r_ptr,sp,#$R
1750 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y);
1751
1752 add $a_ptr,sp,#$H
1753 add $b_ptr,sp,#$H
1754 add $r_ptr,sp,#$Hsqr
1755 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
1756
1757 add $a_ptr,sp,#$R
1758 add $b_ptr,sp,#$R
1759 add $r_ptr,sp,#$Rsqr
1760 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
1761
1762 add $a_ptr,sp,#$H
1763 add $b_ptr,sp,#$Hsqr
1764 add $r_ptr,sp,#$Hcub
1765 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
1766
1767 add $a_ptr,sp,#$Hsqr
1768 add $b_ptr,sp,#$in1_x
1769 add $r_ptr,sp,#$U2
1770 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr);
1771
1772 add $r_ptr,sp,#$Hsqr
1773 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
1774
1775 add $b_ptr,sp,#$Rsqr
1776 add $r_ptr,sp,#$res_x
1777 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
1778
1779 add $b_ptr,sp,#$Hcub
1780 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
1781
1782 add $b_ptr,sp,#$U2
1783 add $r_ptr,sp,#$res_y
1784 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
1785
1786 add $a_ptr,sp,#$Hcub
1787 add $b_ptr,sp,#$in1_y
1788 add $r_ptr,sp,#$S2
1789 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub);
1790
1791 add $a_ptr,sp,#$R
1792 add $b_ptr,sp,#$res_y
1793 add $r_ptr,sp,#$res_y
1794 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
1795
1796 add $b_ptr,sp,#$S2
1797 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
1798
1799 ldr r11,[sp,#32*15+4] @ !in1infty
1800 ldr r12,[sp,#32*15+8] @ !in2infty
1801 add r1,sp,#$res_x
1802 add r2,sp,#$in2_x
1803 and r10,r11,r12
1804 mvn r11,r11
1805 add r3,sp,#$in1_x
1806 and r11,r11,r12
1807 mvn r12,r12
1808 ldr $r_ptr,[sp,#32*15]
1809 ___
1810 for($i=0;$i<64;$i+=8) { # conditional moves
1811 $code.=<<___;
1812 ldmia r1!,{r4-r5} @ res_x
1813 ldmia r2!,{r6-r7} @ in2_x
1814 ldmia r3!,{r8-r9} @ in1_x
1815 and r4,r4,r10
1816 and r5,r5,r10
1817 and r6,r6,r11
1818 and r7,r7,r11
1819 and r8,r8,r12
1820 and r9,r9,r12
1821 orr r4,r4,r6
1822 orr r5,r5,r7
1823 orr r4,r4,r8
1824 orr r5,r5,r9
1825 stmia $r_ptr!,{r4-r5}
1826 ___
1827 }
1828 for(;$i<96;$i+=8) {
1829 my $j=($i-64)/4;
1830 $code.=<<___;
1831 ldmia r1!,{r4-r5} @ res_z
1832 ldmia r3!,{r8-r9} @ in1_z
1833 and r4,r4,r10
1834 and r5,r5,r10
1835 and r6,r11,#@ONE_mont[$j]
1836 and r7,r11,#@ONE_mont[$j+1]
1837 and r8,r8,r12
1838 and r9,r9,r12
1839 orr r4,r4,r6
1840 orr r5,r5,r7
1841 orr r4,r4,r8
1842 orr r5,r5,r9
1843 stmia $r_ptr!,{r4-r5}
1844 ___
1845 }
1846 $code.=<<___;
1847 add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3"
1848 #if __ARM_ARCH__>=5 || !defined(__thumb__)
1849 ldmia sp!,{r4-r12,pc}
1850 #else
1851 ldmia sp!,{r4-r12,lr}
1852 bx lr @ interoperable with Thumb ISA:-)
1853 #endif
1854 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1855 ___
1856 } }}}
1857
1858 foreach (split("\n",$code)) {
1859 s/\`([^\`]*)\`/eval $1/geo;
1860
1861 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1862
1863 print $_,"\n";
1864 }
1865 close STDOUT; # enforce flush