#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c" or
open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;
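# For example, a hypothetical entry TOBN(0x11223344, 0x55667788) would be
# pushed onto @arr as the little-endian word pair 0x55667788, 0x11223344
# (values illustrative, not actual table contents).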

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# It's 64*16*37-1 below because $#arr returns the last valid index of
# @arr, not the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.rodata
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes at
# a 64-byte interval, similar to
#	1111222233334444
#	1234123412341234
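# i.e. byte k of point j within each 64-point block lands at offset
# 64*k+j, so ecp_nistz_gather_w7 can assemble one point with single-byte
# loads at a fixed 64-byte stride.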
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed

.text
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___

########################################################################
# common register layout; note that $t2 is the link register, so that
# if an internal subroutine uses $t2, it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

$code.=<<___;
@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
	adr	$b_ptr,.LRR
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because the modulus has a special form, i.e. consists
	@ of 0xffffffff, 1 and 0 words, we can conditionally synthesize it
	@ by using the borrow value as a whole or extracting a single bit.
	@ Follow the $ff register...
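	@
	@ Recall that p256 = 2^256-2^224+2^192+2^96-1, i.e. its little-endian
	@ words are 0xffffffff,0xffffffff,0xffffffff,0,0,0,1,0xffffffff, which
	@ is why words 0, 1, 2 and 7 take $ff itself, word 6 takes $ff,lsr#31
	@ and the remaining words take 0.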

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit in most
					@ significant position; now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift gives 1
					@ or 0; this is how the modulus is
					@ conditionally synthesized in this case...
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because the modulus has a special form, i.e. consists
	@ of 0xffffffff, 1 and 0 words, we can conditionally synthesize it
	@ by broadcasting the borrow bit to a register, $ff, and using it
	@ as a whole or extracting a single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}		@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]		@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj	@ r[0]=a[0]*b[0]
	stmdb	sp!,{$acc[1]-@acc[8]}	@ copy a[0-7] to stack, so
					@ that it can be addressed
					@ without spending register
					@ on address
	umull	@acc[1],$t0,@acc[2],$bj	@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3	@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3		@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                         abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# -      abcd.----.----.----.----.----.----.----
$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]	@ r[3]+=r[0]
	ldr	$bj,[sp,#40]		@ restore b_ptr
	adcs	@acc[4],@acc[4],#0	@ r[4]+=0
	adcs	@acc[5],@acc[5],#0	@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]	@ r[6]+=r[0]
	ldr	$t1,[sp,#0]		@ load a[0]
	adcs	@acc[7],@acc[7],#0	@ r[7]+=0
	ldr	$bj,[$bj,#4*$i]		@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]	@ r[8]+=r[0]
	eor	$t0,$t0,$t0
	adc	$t3,$t3,#0		@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]	@ r[7]-=r[0]
	ldr	$t2,[sp,#4]		@ a[1]
	sbcs	@acc[8],@acc[8],#0	@ r[8]-=0
	umlal	@acc[1],$t0,$t1,$bj	@ "r[0]"+=a[0]*b[i]
	eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0		@ overflow bit, keep in mind
					@ that net result is addition
					@ of a value which makes
					@ underflow impossible

	ldr	$t3,[sp,#8]		@ a[2]
	umlal	@acc[2],$t1,$t2,$bj	@ "r[1]"+=a[1]*b[i]
	str	@acc[0],[sp,#36]	@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]		@ a[3], $t4 is alias @acc[0]
	umlal	@acc[3],$t2,$t3,$bj	@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0	@ accumulate high part of mult
	ldr	$t0,[sp,#16]		@ a[4]
	umlal	@acc[4],$t3,$t4,$bj	@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]		@ a[5]
	umlal	@acc[5],$t4,$t0,$bj	@ "r[4]"+=a[4]*b[i]
	eor	$t0,$t0,$t0
	adcs	@acc[4],@acc[4],$t2
	ldr	$t2,[sp,#24]		@ a[6]
	umlal	@acc[6],$t0,$t1,$bj	@ "r[5]"+=a[5]*b[i]
	eor	$t1,$t1,$t1
	adcs	@acc[5],@acc[5],$t3
	ldr	$t3,[sp,#28]		@ a[7]
	umlal	@acc[7],$t1,$t2,$bj	@ "r[6]"+=a[6]*b[i]
	eor	$t2,$t2,$t2
	adcs	@acc[6],@acc[6],$t4
	ldr	@acc[0],[sp,#36]	@ restore overflow bit
	umlal	@acc[8],$t2,$t3,$bj	@ "r[7]"+=a[7]*b[i]
	eor	$t3,$t3,$t3
	adcs	@acc[7],@acc[7],$t0
	adcs	@acc[8],@acc[8],$t1
	adcs	@acc[0],$acc[0],$t2
	adc	$t3,$t3,#0		@ new overflow bit
___
	push(@acc,shift(@acc));		# rotate registers, so that
					# "r[i]" becomes r[i]
}
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]		@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0		@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	@acc[1],@acc[1],#1	@ subs @acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0	@ sbcs @acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0	@ sbcs @acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0	@ sbcs @acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	@acc[0],@acc[0],#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because the modulus has a special form, i.e. consists
	@ of 0xffffffff, 1 and 0 words, we can conditionally synthesize it
	@ by broadcasting the borrow bit to a register, @acc[0], and using
	@ it as a whole or extracting a single bit.

	adds	@acc[1],@acc[1],@acc[0]	@ add modulus or zero
	adcs	@acc[2],@acc[2],@acc[0]
	str	@acc[1],[$r_ptr,#0]
	adcs	@acc[3],@acc[3],@acc[0]
	str	@acc[2],[$r_ptr,#4]
	adcs	@acc[4],@acc[4],#0
	str	@acc[3],[$r_ptr,#8]
	adcs	@acc[5],@acc[5],#0
	str	@acc[4],[$r_ptr,#12]
	adcs	@acc[6],@acc[6],#0
	str	@acc[5],[$r_ptr,#16]
	adcs	@acc[7],@acc[7],@acc[0],lsr#31
	str	@acc[6],[$r_ptr,#20]
	adc	@acc[8],@acc[8],@acc[0]
	str	@acc[7],[$r_ptr,#24]
	str	@acc[8],[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}

{
my ($out,$inp,$index,$mask)=map("r$_",(0..3));
$code.=<<___;
@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	5
ecp_nistz256_scatter_w5:
	stmdb	sp!,{r4-r11}

	add	$out,$out,$index,lsl#2

	ldmia	$inp!,{r4-r11}		@ X
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp!,{r4-r11}		@ Y
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp,{r4-r11}		@ Z
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
@					      int r2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	5
ecp_nistz256_gather_w5:
	stmdb	sp!,{r4-r11}

	cmp	$index,#0
	mov	$mask,#0
#ifdef	__thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
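	@ (index 0 requested the all-zero point: $mask stays 0 and wipes
	@ the loads below; otherwise the index is now 0-based and $mask
	@ is all ones.)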
	add	$inp,$inp,$index,lsl#2

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ X

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ Y

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out,{r4-r11}		@ Z

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	5
ecp_nistz256_scatter_w7:
	add	$out,$out,$index
	mov	$index,#64/4
.Loop_scatter_w7:
	ldr	$mask,[$inp],#4
	subs	$index,$index,#1
	strb	$mask,[$out,#64*0]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*1]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*2]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*3]
	add	$out,$out,#64*4
	bne	.Loop_scatter_w7

#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
@						      int r2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	5
ecp_nistz256_gather_w7:
	stmdb	sp!,{r4-r7}

	cmp	$index,#0
	mov	$mask,#0
#ifdef	__thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index
	mov	$index,#64/4
	nop
.Loop_gather_w7:
	ldrb	r4,[$inp,#64*0]
	subs	$index,$index,#1
	ldrb	r5,[$inp,#64*1]
	ldrb	r6,[$inp,#64*2]
	ldrb	r7,[$inp,#64*3]
	add	$inp,$inp,#64*4
	orr	r4,r4,r5,lsl#8
	orr	r4,r4,r6,lsl#16
	orr	r4,r4,r7,lsl#24
	and	r4,r4,$mask
	str	r4,[$out],#4
	bne	.Loop_gather_w7

	ldmia	sp!,{r4-r7}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if (0) {
# In comparison to integer-only equivalent of below subroutine:
#
# Cortex-A8	+10%
# Cortex-A9	-10%
# Snapdragon S4	+5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.

my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));

my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.globl	ecp_nistz256_mul_mont_neon
.type	ecp_nistz256_mul_mont_neon,%function
.align	5
ecp_nistz256_mul_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r9}
	vstmdb	sp!,{q4-q5}		@ ABI specification says so

	sub	$toutptr,sp,#40
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vld1.32	{$A0-$A3}, [$aptr]	@ can't specify :32 :-(
	vzip.16	$Bi,$zero
	mov	sp,$toutptr		@ alloca
	vmov.i64 $mask,#0xffff

	vmull.u32 @AxB[0],$Bi,${A0}[0]
	vmull.u32 @AxB[1],$Bi,${A0}[1]
	vmull.u32 @AxB[2],$Bi,${A1}[0]
	vmull.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmull.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmull.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 32 bits of a[0]*b[0]
	vmull.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vzip.16	$Bi,$zero
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult
___
	push(@AxB,shift(@AxB));
$code.=<<___;
	vmlal.u32 @AxB[0],$Bi,${A0}[0]
	vmlal.u32 @AxB[1],$Bi,${A0}[1]
	vmlal.u32 @AxB[2],$Bi,${A1}[0]
	vmlal.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmlal.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmlal.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 33 bits of a[0]*b[i]+t[0]
	vmlal.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult

	vshr.u64 $temp,@AxB[1]#lo,#16	@ convert
	vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
	vshr.u64 $temp,@AxB[1]#hi,#16
	vzip.16	@AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
	vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
	vst1.32	{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
	vshr.u64 $temp,@AxB[$_]#lo,#16
	vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
	vshr.u64 $temp,@AxB[$_]#hi,#16
	vzip.16	@AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
	vst1.32	{@AxB[7]#lo[0]},[$toutptr,:32]!
	vst1.32	{$temp},[$toutptr]	@ upper 33 bits

	ldr	r1,[sp,#0]
	ldr	r2,[sp,#4]
	ldr	r3,[sp,#8]
	subs	r1,r1,#-1
	ldr	r4,[sp,#12]
	sbcs	r2,r2,#-1
	ldr	r5,[sp,#16]
	sbcs	r3,r3,#-1
	ldr	r6,[sp,#20]
	sbcs	r4,r4,#0
	ldr	r7,[sp,#24]
	sbcs	r5,r5,#0
	ldr	r8,[sp,#28]
	sbcs	r6,r6,#0
	ldr	r9,[sp,#32]		@ top-most bit
	sbcs	r7,r7,#1
	sub	sp,ip,#40+16
	sbcs	r8,r8,#-1
	sbc	r9,r9,#0
	vldmia	sp!,{q4-q5}

	adds	r1,r1,r9
	adcs	r2,r2,r9
	str	r1,[$rptr,#0]
	adcs	r3,r3,r9
	str	r2,[$rptr,#4]
	adcs	r4,r4,#0
	str	r3,[$rptr,#8]
	adcs	r5,r5,#0
	str	r4,[$rptr,#12]
	adcs	r6,r6,#0
	str	r5,[$rptr,#16]
	adcs	r7,r7,r9,lsr#31
	str	r6,[$rptr,#20]
	adcs	r8,r8,r9
	str	r7,[$rptr,#24]
	str	r8,[$rptr,#28]

	ldmia	sp!,{r4-r9}
	bx	lr
.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}

{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that the functions
# below use a custom calling sequence, accepting 256-bit input in
# registers, output pointer in r0, $r_ptr, and optional pointer in
# r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;
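# Note that $ff aliases $b_ptr; this is safe because by the time the
# borrow is broadcast into $ff, all eight words have already been
# loaded through $b_ptr.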

$code.=<<___;
.type	__ecp_nistz256_sub_from,%function
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$t0,$a0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$t1,$a1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$t2,$a2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$t3,$a3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$t0,$a4
	sbcs	$a5,$t1,$a5
	sbcs	$a6,$t2,$a6
	sbcs	$a7,$t3,$a7
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_add_self,%function
.align	4
__ecp_nistz256_add_self:
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	adcs	$a1,$a1,$a1
	adcs	$a2,$a2,$a2
	adcs	$a3,$a3,$a3
	adcs	$a4,$a4,$a4
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because the modulus has a special form, i.e. consists
	@ of 0xffffffff, 1 and 0 words, we can conditionally synthesize it
	@ by using the borrow value as a whole or extracting a single bit.
	@ Follow the $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self

___

########################################################################
# The following subroutines are "literal" implementations of those found
# in ecp_nistz256.c.
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have a copy of
# the input arguments just below these temporary vectors.
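#
# Resulting frame, from sp upwards (a sketch):
#   S | M | Zsqr | in_x | tmp0 | saved r0-r3 | saved r4-r12,lr
# so the output pointer (r0) is reloaded from [sp,#32*5] and the
# input pointer (r1) from [sp,#32*5+4].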

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#$in_x
	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
	stmia	r3,{r4-r11}

	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	$b_ptr,$a_ptr,#32
	add	$a_ptr,$a_ptr,#32
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	$b_ptr,[sp,#32*5+4]
	add	$a_ptr,$b_ptr,#32
	add	$b_ptr,$b_ptr,#64
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	$r_ptr,[sp,#32*5]
	add	$r_ptr,$r_ptr,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	$a_ptr,sp,#$Zsqr
	add	$b_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$tmp0
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	$a_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	$b_ptr,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	$r_ptr,[sp,#32*5]
	add	$b_ptr,$r_ptr,#32
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have a copy of
# the input arguments just below these temporary vectors.
# We use three of the extra slots for !in1infty, !in2infty and
# the result of the check for zero.
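# The extra slots live in the 16 bytes between the vectors and the
# saved r0-r3: !in1infty at [sp,#32*18+4], !in2infty at [sp,#32*18+8],
# the is_equal(S1,S2) check at [sp,#32*18+12]; the saved result
# pointer is then at [sp,#32*18+16].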

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*18+16

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+8]	@ !in2infty

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+4]	@ !in1infty

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$in2_z
	add	$r_ptr,sp,#$Z2sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$Z2sqr
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$in1_y
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);

	orr	$a0,$a0,$a1	@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	add	$a_ptr,sp,#$in1_x
	orr	$a0,$a0,$a4
	add	$b_ptr,sp,#$Z2sqr
	str	$a0,[sp,#32*18+12]

	add	$r_ptr,sp,#$U1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);

	add	$a_ptr,sp,#$in2_x
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);

	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);

	orr	$a0,$a0,$a1	@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	orrs	$a0,$a0,$a4

	bne	.Ladd_proceed		@ is_equal(U1,U2)?

	ldr	$t0,[sp,#32*18+4]
	ldr	$t1,[sp,#32*18+8]
	ldr	$t2,[sp,#32*18+12]
	tst	$t0,$t1
	beq	.Ladd_proceed		@ (in1infty || in2infty)?
	tst	$t2,$t2
	beq	.Ladd_double		@ is_equal(S1,S2)?

	ldr	$r_ptr,[sp,#32*18+16]
	eor	r4,r4,r4
	eor	r5,r5,r5
	eor	r6,r6,r6
	eor	r7,r7,r7
	eor	r8,r8,r8
	eor	r9,r9,r9
	eor	r10,r10,r10
	eor	r11,r11,r11
	stmia	$r_ptr!,{r4-r11}
	stmia	$r_ptr!,{r4-r11}
	stmia	$r_ptr!,{r4-r11}
	b	.Ladd_done

.align	4
.Ladd_double:
	ldr	$a_ptr,[sp,#32*18+20]
	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
	b	.Lpoint_double_shortcut

.align	4
.Ladd_proceed:
	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$res_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*18+4]	@ !in1infty
	ldr	r12,[sp,#32*18+8]	@ !in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12
	mvn	r12,r12
	ldr	$r_ptr,[sp,#32*18+16]
___
for($i=0;$i<96;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r6,r11
	and	r7,r7,r11
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
.Ladd_done:
	add	sp,sp,#32*18+16+16	@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
my $Z1sqr = $S2;
# above map() describes stack layout with 15 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have a copy of
# the input arguments just below these temporary vectors.
# We use two of the extra slots for !in1infty and !in2infty.

my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
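# @ONE_mont is 2^256 mod p256, i.e. the value 1 in Montgomery
# representation; -1 and -2 stand for 0xffffffff and 0xfffffffe.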

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*15

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*15+4]	@ !in1infty

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	orr	r12,r12,r4
	orr	r12,r12,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	str	r12,[sp,#32*15+8]	@ !in2infty

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in2_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);

	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*15+4]	@ !in1infty
	ldr	r12,[sp,#32*15+8]	@ !in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12
	mvn	r12,r12
	ldr	$r_ptr,[sp,#32*15]
___
for($i=0;$i<64;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r6,r11
	and	r7,r7,r11
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
for(;$i<96;$i+=8) {
my $j=($i-64)/4;
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_z
	ldmia	r3!,{r8-r9}		@ in1_z
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r11,#@ONE_mont[$j]
	and	r7,r11,#@ONE_mont[$j+1]
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
	add	sp,sp,#32*15+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}	}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;	# enforce flush