#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.
35 | ||
36 | $flavour = shift; | |
a5aa63a4 RL |
37 | if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } |
38 | else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } | |
313e6ec1 AP |
39 | |
40 | if ($flavour && $flavour ne "void") { | |
41 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
42 | ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or | |
43 | ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or | |
44 | die "can't locate arm-xlate.pl"; | |
45 | ||
46 | open STDOUT,"| \"$^X\" $xlate $flavour $output"; | |
47 | } else { | |
48 | open STDOUT,">$output"; | |
49 | } | |
7a6c9a2e AP |
50 | |
$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c" or
open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr,
# not the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.rodata
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed

.text
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
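@ multiplying by .LRR converts an input to Montgomery form (see
@ to_mont below), while multiplying by .Lone, i.e. by 1, merely
@ performs a Montgomery reduction and so converts back (from_mont)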
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___

########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);
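# note the aliasing: $t0 shares r3 with $ff and $t3 shares r1 with
# $a_ptr, so subroutines use them as scratch only at points where the
# original values are no longer live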
126 | ||
127 | $code.=<<___; | |
128 | @ void ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); | |
129 | .globl ecp_nistz256_to_mont | |
130 | .type ecp_nistz256_to_mont,%function | |
131 | ecp_nistz256_to_mont: | |
132 | adr $b_ptr,.LRR | |
133 | b .Lecp_nistz256_mul_mont | |
134 | .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont | |
135 | ||
136 | @ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); | |
137 | .globl ecp_nistz256_from_mont | |
138 | .type ecp_nistz256_from_mont,%function | |
139 | ecp_nistz256_from_mont: | |
140 | adr $b_ptr,.Lone | |
141 | b .Lecp_nistz256_mul_mont | |
142 | .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont | |
143 | ||
144 | @ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]); | |
145 | .globl ecp_nistz256_mul_by_2 | |
146 | .type ecp_nistz256_mul_by_2,%function | |
147 | .align 4 | |
148 | ecp_nistz256_mul_by_2: | |
149 | stmdb sp!,{r4-r12,lr} | |
313e6ec1 | 150 | bl __ecp_nistz256_mul_by_2 |
7a6c9a2e AP |
151 | #if __ARM_ARCH__>=5 || !defined(__thumb__) |
152 | ldmia sp!,{r4-r12,pc} | |
153 | #else | |
154 | ldmia sp!,{r4-r12,lr} | |
155 | bx lr @ interoperable with Thumb ISA:-) | |
156 | #endif | |
157 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 | |
158 | ||
313e6ec1 | 159 | .type __ecp_nistz256_mul_by_2,%function |
7a6c9a2e | 160 | .align 4 |
313e6ec1 | 161 | __ecp_nistz256_mul_by_2: |
7a6c9a2e AP |
162 | ldr $a0,[$a_ptr,#0] |
163 | ldr $a1,[$a_ptr,#4] | |
164 | ldr $a2,[$a_ptr,#8] | |
165 | adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself | |
166 | ldr $a3,[$a_ptr,#12] | |
167 | adcs $a1,$a1,$a1 | |
168 | ldr $a4,[$a_ptr,#16] | |
169 | adcs $a2,$a2,$a2 | |
170 | ldr $a5,[$a_ptr,#20] | |
171 | adcs $a3,$a3,$a3 | |
172 | ldr $a6,[$a_ptr,#24] | |
173 | adcs $a4,$a4,$a4 | |
174 | ldr $a7,[$a_ptr,#28] | |
175 | adcs $a5,$a5,$a5 | |
176 | adcs $a6,$a6,$a6 | |
177 | mov $ff,#0 | |
178 | adcs $a7,$a7,$a7 | |
dfde4219 | 179 | adc $ff,$ff,#0 |
7a6c9a2e AP |
180 | |
181 | b .Lreduce_by_sub | |
313e6ec1 | 182 | .size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 |

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...
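	@ At this point $ff is 0 (no borrow, result was already fully
	@ reduced) or -1, i.e. 0xffffffff (borrow). Modulus words are
	@ rebuilt below as p[0,1,2,7]=$ff, p[3,4,5]=0, p[6]=$ff>>31,
	@ which yields the full modulus when $ff is all ones and zero
	@ otherwise.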

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
359 | ||
360 | @ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]); | |
361 | .globl ecp_nistz256_div_by_2 | |
362 | .type ecp_nistz256_div_by_2,%function | |
363 | .align 4 | |
364 | ecp_nistz256_div_by_2: | |
365 | stmdb sp!,{r4-r12,lr} | |
313e6ec1 | 366 | bl __ecp_nistz256_div_by_2 |
7a6c9a2e AP |
367 | #if __ARM_ARCH__>=5 || !defined(__thumb__) |
368 | ldmia sp!,{r4-r12,pc} | |
369 | #else | |
370 | ldmia sp!,{r4-r12,lr} | |
371 | bx lr @ interoperable with Thumb ISA:-) | |
372 | #endif | |
373 | .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 | |
374 | ||
313e6ec1 | 375 | .type __ecp_nistz256_div_by_2,%function |
7a6c9a2e | 376 | .align 4 |
313e6ec1 | 377 | __ecp_nistz256_div_by_2: |
7a6c9a2e AP |
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
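					@ e.g. for odd a the value of
					@ $ff is 0x80000000: asr#31
					@ yields 0xffffffff, lsr#31
					@ yields 1, together adding the
					@ modulus so that a+mod is even;
					@ for even a nothing is added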
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
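# r0-r2 double as scratch ($t0,$t1,$bj) inside __ecp_nistz256_mul_mont,
# which is why the prologue below saves the original arguments on the
# stack and reloads them from there as needed.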
554 | ||
555 | $code.=<<___; | |
556 | @ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); | |
557 | .globl ecp_nistz256_sqr_mont | |
558 | .type ecp_nistz256_sqr_mont,%function | |
559 | .align 4 | |
560 | ecp_nistz256_sqr_mont: | |
561 | mov $b_ptr,$a_ptr | |
562 | b .Lecp_nistz256_mul_mont | |
563 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont | |
564 | ||
565 | @ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8], | |
566 | @ const BN_ULONG r2[8]); | |
567 | .globl ecp_nistz256_mul_mont | |
568 | .type ecp_nistz256_mul_mont,%function | |
569 | .align 4 | |
570 | ecp_nistz256_mul_mont: | |
571 | .Lecp_nistz256_mul_mont: | |
572 | stmdb sp!,{r4-r12,lr} | |
313e6ec1 | 573 | bl __ecp_nistz256_mul_mont |
7a6c9a2e AP |
574 | #if __ARM_ARCH__>=5 || !defined(__thumb__) |
575 | ldmia sp!,{r4-r12,pc} | |
576 | #else | |
577 | ldmia sp!,{r4-r12,lr} | |
578 | bx lr @ interoperable with Thumb ISA:-) | |
579 | #endif | |
580 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont | |
581 | ||
313e6ec1 | 582 | .type __ecp_nistz256_mul_mont,%function |
7a6c9a2e | 583 | .align 4 |
313e6ec1 | 584 | __ecp_nistz256_mul_mont: |
7a6c9a2e AP |
585 | stmdb sp!,{r0-r2,lr} @ make a copy of arguments too |
586 | ||
587 | ldr $bj,[$b_ptr,#0] @ b[0] | |
588 | ldmia $a_ptr,{@acc[1]-@acc[8]} | |
589 | ||
590 | umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0] | |
591 | stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so | |
592 | @ that it can be addressed | |
593 | @ without spending register | |
594 | @ on address | |
595 | umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0] | |
596 | umull @acc[2],$t1,@acc[3],$bj | |
597 | adds @acc[1],@acc[1],$t3 @ accumulate high part of mult | |
598 | umull @acc[3],$t2,@acc[4],$bj | |
599 | adcs @acc[2],@acc[2],$t0 | |
600 | umull @acc[4],$t3,@acc[5],$bj | |
601 | adcs @acc[3],@acc[3],$t1 | |
602 | umull @acc[5],$t0,@acc[6],$bj | |
603 | adcs @acc[4],@acc[4],$t2 | |
604 | umull @acc[6],$t1,@acc[7],$bj | |
605 | adcs @acc[5],@acc[5],$t3 | |
606 | umull @acc[7],$t2,@acc[8],$bj | |
607 | adcs @acc[6],@acc[6],$t0 | |
608 | adcs @acc[7],@acc[7],$t1 | |
609 | eor $t3,$t3,$t3 @ first overflow bit is zero | |
610 | adc @acc[8],$t2,#0 | |
611 | ___ | |
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                         abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# - abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# - abcd.----.----.----.----.----.----.----
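	#
	# In word terms (least significant first) this boils down to
	# r[3]+=r[0], r[6]+=r[0], r[8]+=r[0], then r[7]-=r[0] with
	# borrow propagating into r[8] and the overflow word, which is
	# exactly the add/sub ladder emitted below.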
638 | ||
639 | $code.=<<___; | |
640 | @ multiplication-less reduction $i | |
641 | adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0] | |
642 | ldr $bj,[sp,#40] @ restore b_ptr | |
643 | adcs @acc[4],@acc[4],#0 @ r[4]+=0 | |
644 | adcs @acc[5],@acc[5],#0 @ r[5]+=0 | |
645 | adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0] | |
646 | ldr $t1,[sp,#0] @ load a[0] | |
647 | adcs @acc[7],@acc[7],#0 @ r[7]+=0 | |
648 | ldr $bj,[$bj,#4*$i] @ load b[i] | |
649 | adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0] | |
650 | eor $t0,$t0,$t0 | |
651 | adc $t3,$t3,#0 @ overflow bit | |
652 | subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0] | |
653 | ldr $t2,[sp,#4] @ a[1] | |
654 | sbcs @acc[8],@acc[8],#0 @ r[8]-=0 | |
655 | umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i] | |
656 | eor $t1,$t1,$t1 | |
657 | sbc @acc[0],$t3,#0 @ overflow bit, keep in mind | |
658 | @ that netto result is | |
659 | @ addition of a value which | |
660 | @ makes underflow impossible | |
661 | ||
662 | ldr $t3,[sp,#8] @ a[2] | |
663 | umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i] | |
664 | str @acc[0],[sp,#36] @ temporarily offload overflow | |
665 | eor $t2,$t2,$t2 | |
666 | ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0] | |
667 | umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i] | |
668 | eor $t3,$t3,$t3 | |
669 | adds @acc[2],@acc[2],$t0 @ accumulate high part of mult | |
670 | ldr $t0,[sp,#16] @ a[4] | |
671 | umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i] | |
672 | eor $t4,$t4,$t4 | |
673 | adcs @acc[3],@acc[3],$t1 | |
674 | ldr $t1,[sp,#20] @ a[5] | |
675 | umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i] | |
676 | eor $t0,$t0,$t0 | |
677 | adcs @acc[4],@acc[4],$t2 | |
678 | ldr $t2,[sp,#24] @ a[6] | |
679 | umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i] | |
680 | eor $t1,$t1,$t1 | |
681 | adcs @acc[5],@acc[5],$t3 | |
682 | ldr $t3,[sp,#28] @ a[7] | |
683 | umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i] | |
684 | eor $t2,$t2,$t2 | |
685 | adcs @acc[6],@acc[6],$t4 | |
686 | ldr @acc[0],[sp,#36] @ restore overflow bit | |
687 | umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i] | |
688 | eor $t3,$t3,$t3 | |
689 | adcs @acc[7],@acc[7],$t0 | |
690 | adcs @acc[8],@acc[8],$t1 | |
691 | adcs @acc[0],$acc[0],$t2 | |
692 | adc $t3,$t3,#0 @ new overflow bit | |
693 | ___ | |
694 | push(@acc,shift(@acc)); # rotate registers, so that | |
695 | # "r[i]" becomes r[i] | |
696 | } | |
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]		@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0		@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	@acc[1],@acc[1],#1	@ subs @acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0	@ sbcs @acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0	@ sbcs @acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0	@ sbcs @acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	@acc[0],@acc[0],#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, @acc[0], and using it as
	@ a whole or extracting single bit.

	adds	@acc[1],@acc[1],@acc[0]	@ add modulus or zero
	adcs	@acc[2],@acc[2],@acc[0]
	str	@acc[1],[$r_ptr,#0]
	adcs	@acc[3],@acc[3],@acc[0]
	str	@acc[2],[$r_ptr,#4]
	adcs	@acc[4],@acc[4],#0
	str	@acc[3],[$r_ptr,#8]
	adcs	@acc[5],@acc[5],#0
	str	@acc[4],[$r_ptr,#12]
	adcs	@acc[6],@acc[6],#0
	str	@acc[5],[$r_ptr,#16]
	adcs	@acc[7],@acc[7],@acc[0],lsr#31
	str	@acc[6],[$r_ptr,#20]
	adc	@acc[8],@acc[8],@acc[0]
	str	@acc[7],[$r_ptr,#24]
	str	@acc[8],[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}
753 | ||
754 | { | |
755 | my ($out,$inp,$index,$mask)=map("r$_",(0..3)); | |
756 | $code.=<<___; | |
757 | @ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1, | |
758 | @ int r2); | |
759 | .globl ecp_nistz256_scatter_w5 | |
760 | .type ecp_nistz256_scatter_w5,%function | |
761 | .align 5 | |
762 | ecp_nistz256_scatter_w5: | |
763 | stmdb sp!,{r4-r11} | |
764 | ||
765 | add $out,$out,$index,lsl#2 | |
766 | ||
767 | ldmia $inp!,{r4-r11} @ X | |
768 | str r4,[$out,#64*0-4] | |
769 | str r5,[$out,#64*1-4] | |
770 | str r6,[$out,#64*2-4] | |
771 | str r7,[$out,#64*3-4] | |
772 | str r8,[$out,#64*4-4] | |
773 | str r9,[$out,#64*5-4] | |
774 | str r10,[$out,#64*6-4] | |
775 | str r11,[$out,#64*7-4] | |
776 | add $out,$out,#64*8 | |
777 | ||
778 | ldmia $inp!,{r4-r11} @ Y | |
779 | str r4,[$out,#64*0-4] | |
780 | str r5,[$out,#64*1-4] | |
781 | str r6,[$out,#64*2-4] | |
782 | str r7,[$out,#64*3-4] | |
783 | str r8,[$out,#64*4-4] | |
784 | str r9,[$out,#64*5-4] | |
785 | str r10,[$out,#64*6-4] | |
786 | str r11,[$out,#64*7-4] | |
787 | add $out,$out,#64*8 | |
788 | ||
789 | ldmia $inp,{r4-r11} @ Z | |
790 | str r4,[$out,#64*0-4] | |
791 | str r5,[$out,#64*1-4] | |
792 | str r6,[$out,#64*2-4] | |
793 | str r7,[$out,#64*3-4] | |
794 | str r8,[$out,#64*4-4] | |
795 | str r9,[$out,#64*5-4] | |
796 | str r10,[$out,#64*6-4] | |
797 | str r11,[$out,#64*7-4] | |
798 | ||
799 | ldmia sp!,{r4-r11} | |
800 | #if __ARM_ARCH__>=5 || defined(__thumb__) | |
801 | bx lr | |
802 | #else | |
803 | mov pc,lr | |
804 | #endif | |
805 | .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 | |
806 | ||
807 | @ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1, | |
808 | @ int r2); | |
809 | .globl ecp_nistz256_gather_w5 | |
810 | .type ecp_nistz256_gather_w5,%function | |
811 | .align 5 | |
812 | ecp_nistz256_gather_w5: | |
813 | stmdb sp!,{r4-r11} | |
814 | ||
815 | cmp $index,#0 | |
816 | mov $mask,#0 | |
11208dcf AP |
817 | #ifdef __thumb2__ |
818 | itt ne | |
819 | #endif | |
7a6c9a2e AP |
820 | subne $index,$index,#1 |
821 | movne $mask,#-1 | |
822 | add $inp,$inp,$index,lsl#2 | |
823 | ||
824 | ldr r4,[$inp,#64*0] | |
825 | ldr r5,[$inp,#64*1] | |
826 | ldr r6,[$inp,#64*2] | |
827 | and r4,r4,$mask | |
828 | ldr r7,[$inp,#64*3] | |
829 | and r5,r5,$mask | |
830 | ldr r8,[$inp,#64*4] | |
831 | and r6,r6,$mask | |
832 | ldr r9,[$inp,#64*5] | |
833 | and r7,r7,$mask | |
834 | ldr r10,[$inp,#64*6] | |
835 | and r8,r8,$mask | |
836 | ldr r11,[$inp,#64*7] | |
837 | add $inp,$inp,#64*8 | |
838 | and r9,r9,$mask | |
839 | and r10,r10,$mask | |
840 | and r11,r11,$mask | |
841 | stmia $out!,{r4-r11} @ X | |
842 | ||
843 | ldr r4,[$inp,#64*0] | |
844 | ldr r5,[$inp,#64*1] | |
845 | ldr r6,[$inp,#64*2] | |
846 | and r4,r4,$mask | |
847 | ldr r7,[$inp,#64*3] | |
848 | and r5,r5,$mask | |
849 | ldr r8,[$inp,#64*4] | |
850 | and r6,r6,$mask | |
851 | ldr r9,[$inp,#64*5] | |
852 | and r7,r7,$mask | |
853 | ldr r10,[$inp,#64*6] | |
854 | and r8,r8,$mask | |
855 | ldr r11,[$inp,#64*7] | |
856 | add $inp,$inp,#64*8 | |
857 | and r9,r9,$mask | |
858 | and r10,r10,$mask | |
859 | and r11,r11,$mask | |
860 | stmia $out!,{r4-r11} @ Y | |
861 | ||
862 | ldr r4,[$inp,#64*0] | |
863 | ldr r5,[$inp,#64*1] | |
864 | ldr r6,[$inp,#64*2] | |
865 | and r4,r4,$mask | |
866 | ldr r7,[$inp,#64*3] | |
867 | and r5,r5,$mask | |
868 | ldr r8,[$inp,#64*4] | |
869 | and r6,r6,$mask | |
870 | ldr r9,[$inp,#64*5] | |
871 | and r7,r7,$mask | |
872 | ldr r10,[$inp,#64*6] | |
873 | and r8,r8,$mask | |
874 | ldr r11,[$inp,#64*7] | |
875 | and r9,r9,$mask | |
876 | and r10,r10,$mask | |
877 | and r11,r11,$mask | |
878 | stmia $out,{r4-r11} @ Z | |
879 | ||
880 | ldmia sp!,{r4-r11} | |
881 | #if __ARM_ARCH__>=5 || defined(__thumb__) | |
882 | bx lr | |
883 | #else | |
884 | mov pc,lr | |
885 | #endif | |
886 | .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 | |
887 | ||
888 | @ void ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1, | |
889 | @ int r2); | |
890 | .globl ecp_nistz256_scatter_w7 | |
891 | .type ecp_nistz256_scatter_w7,%function | |
892 | .align 5 | |
893 | ecp_nistz256_scatter_w7: | |
894 | add $out,$out,$index | |
895 | mov $index,#64/4 | |
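	@ w7 table is byte-sliced: byte b of 64-byte entry i lives at
	@ offset 64*b+i, matching the ecp_nistz256_precomputed layout
	@ generated above, i.e. each 32-bit word is stored as four
	@ byte lanes 64 bytes apart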
.Loop_scatter_w7:
	ldr	$mask,[$inp],#4
	subs	$index,$index,#1
	strb	$mask,[$out,#64*0]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*1]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*2]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*3]
	add	$out,$out,#64*4
	bne	.Loop_scatter_w7

#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
915 | ||
916 | @ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1, | |
917 | @ int r2); | |
918 | .globl ecp_nistz256_gather_w7 | |
919 | .type ecp_nistz256_gather_w7,%function | |
920 | .align 5 | |
921 | ecp_nistz256_gather_w7: | |
922 | stmdb sp!,{r4-r7} | |
923 | ||
924 | cmp $index,#0 | |
925 | mov $mask,#0 | |
11208dcf AP |
926 | #ifdef __thumb2__ |
927 | itt ne | |
928 | #endif | |
7a6c9a2e AP |
929 | subne $index,$index,#1 |
930 | movne $mask,#-1 | |
931 | add $inp,$inp,$index | |
932 | mov $index,#64/4 | |
933 | nop | |
934 | .Loop_gather_w7: | |
935 | ldrb r4,[$inp,#64*0] | |
936 | subs $index,$index,#1 | |
937 | ldrb r5,[$inp,#64*1] | |
938 | ldrb r6,[$inp,#64*2] | |
939 | ldrb r7,[$inp,#64*3] | |
940 | add $inp,$inp,#64*4 | |
941 | orr r4,r4,r5,lsl#8 | |
942 | orr r4,r4,r6,lsl#16 | |
943 | orr r4,r4,r7,lsl#24 | |
944 | and r4,r4,$mask | |
945 | str r4,[$out],#4 | |
946 | bne .Loop_gather_w7 | |
947 | ||
948 | ldmia sp!,{r4-r7} | |
949 | #if __ARM_ARCH__>=5 || defined(__thumb__) | |
950 | bx lr | |
951 | #else | |
952 | mov pc,lr | |
953 | #endif | |
954 | .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 | |
955 | ___ | |
956 | } | |
if (0) {
# In comparison to the integer-only equivalent of the subroutine
# below:
#
# Cortex-A8	+10%
# Cortex-A9	-10%
# Snapdragon S4	+5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.

my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));

my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.globl	ecp_nistz256_mul_mont_neon
.type	ecp_nistz256_mul_mont_neon,%function
.align	5
ecp_nistz256_mul_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r9}
	vstmdb	sp!,{q4-q5}		@ ABI specification says so

	sub	$toutptr,sp,#40
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vld1.32	{$A0-$A3}, [$aptr]	@ can't specify :32 :-(
	vzip.16	$Bi,$zero
	mov	sp,$toutptr		@ alloca
	vmov.i64 $mask,#0xffff

	vmull.u32 @AxB[0],$Bi,${A0}[0]
	vmull.u32 @AxB[1],$Bi,${A0}[1]
	vmull.u32 @AxB[2],$Bi,${A1}[0]
	vmull.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmull.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmull.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 32 bits of a[0]*b[0]
	vmull.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vzip.16	$Bi,$zero
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult
___
	push(@AxB,shift(@AxB));
$code.=<<___;
	vmlal.u32 @AxB[0],$Bi,${A0}[0]
	vmlal.u32 @AxB[1],$Bi,${A0}[1]
	vmlal.u32 @AxB[2],$Bi,${A1}[0]
	vmlal.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmlal.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmlal.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 33 bits of a[0]*b[i]+t[0]
	vmlal.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult

	vshr.u64 $temp,@AxB[1]#lo,#16	@ convert
	vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
	vshr.u64 $temp,@AxB[1]#hi,#16
	vzip.16	@AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
	vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
	vst1.32	{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
	vshr.u64 $temp,@AxB[$_]#lo,#16
	vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
	vshr.u64 $temp,@AxB[$_]#hi,#16
	vzip.16	@AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
	vst1.32	{@AxB[7]#lo[0]},[$toutptr,:32]!
	vst1.32	{$temp},[$toutptr]	@ upper 33 bits

	ldr	r1,[sp,#0]
	ldr	r2,[sp,#4]
	ldr	r3,[sp,#8]
	subs	r1,r1,#-1
	ldr	r4,[sp,#12]
	sbcs	r2,r2,#-1
	ldr	r5,[sp,#16]
	sbcs	r3,r3,#-1
	ldr	r6,[sp,#20]
	sbcs	r4,r4,#0
	ldr	r7,[sp,#24]
	sbcs	r5,r5,#0
	ldr	r8,[sp,#28]
	sbcs	r6,r6,#0
	ldr	r9,[sp,#32]		@ top-most bit
	sbcs	r7,r7,#1
	sub	sp,ip,#40+16
	sbcs	r8,r8,#-1
	sbc	r9,r9,#0
	vldmia	sp!,{q4-q5}

	adds	r1,r1,r9
	adcs	r2,r2,r9
	str	r1,[$rptr,#0]
	adcs	r3,r3,r9
	str	r2,[$rptr,#4]
	adcs	r4,r4,#0
	str	r3,[$rptr,#8]
	adcs	r5,r5,#0
	str	r4,[$rptr,#12]
	adcs	r6,r6,#0
	str	r5,[$rptr,#16]
	adcs	r7,r7,r9,lsr#31
	str	r6,[$rptr,#20]
	adcs	r8,r8,r9
	str	r7,[$rptr,#24]
	str	r8,[$rptr,#28]

	ldmia	sp!,{r4-r9}
	bx	lr
.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}
1106 | ||
1107 | {{{ | |
1108 | ######################################################################## | |
1109 | # Below $aN assignment matches order in which 256-bit result appears in | |
313e6ec1 | 1110 | # register bank at return from __ecp_nistz256_mul_mont, so that we can |
7a6c9a2e AP |
1111 | # skip over reloading it from memory. This means that below functions |
1112 | # use custom calling sequence accepting 256-bit input in registers, | |
1113 | # output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr. | |
1114 | # | |
1115 | # See their "normal" counterparts for insights on calculations. | |
1116 | ||
1117 | my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7, | |
1118 | $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1)); | |
1119 | my $ff=$b_ptr; | |
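# $ff aliases $b_ptr: by the time the borrow is broadcast into it the
# second operand pointer is no longer needed, so its register can be
# reused safely.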
1120 | ||
1121 | $code.=<<___; | |
1122 | .type __ecp_nistz256_sub_from,%function | |
1123 | .align 5 | |
1124 | __ecp_nistz256_sub_from: | |
1125 | str lr,[sp,#-4]! @ push lr | |
1126 | ||
1127 | ldr $t0,[$b_ptr,#0] | |
1128 | ldr $t1,[$b_ptr,#4] | |
1129 | ldr $t2,[$b_ptr,#8] | |
1130 | ldr $t3,[$b_ptr,#12] | |
1131 | subs $a0,$a0,$t0 | |
1132 | ldr $t0,[$b_ptr,#16] | |
1133 | sbcs $a1,$a1,$t1 | |
1134 | ldr $t1,[$b_ptr,#20] | |
1135 | sbcs $a2,$a2,$t2 | |
1136 | ldr $t2,[$b_ptr,#24] | |
1137 | sbcs $a3,$a3,$t3 | |
1138 | ldr $t3,[$b_ptr,#28] | |
1139 | sbcs $a4,$a4,$t0 | |
1140 | sbcs $a5,$a5,$t1 | |
1141 | sbcs $a6,$a6,$t2 | |
1142 | sbcs $a7,$a7,$t3 | |
1143 | sbc $ff,$ff,$ff @ broadcast borrow bit | |
1144 | ldr lr,[sp],#4 @ pop lr | |
1145 | ||
1146 | adds $a0,$a0,$ff @ add synthesized modulus | |
1147 | adcs $a1,$a1,$ff | |
1148 | str $a0,[$r_ptr,#0] | |
1149 | adcs $a2,$a2,$ff | |
1150 | str $a1,[$r_ptr,#4] | |
1151 | adcs $a3,$a3,#0 | |
1152 | str $a2,[$r_ptr,#8] | |
1153 | adcs $a4,$a4,#0 | |
1154 | str $a3,[$r_ptr,#12] | |
1155 | adcs $a5,$a5,#0 | |
1156 | str $a4,[$r_ptr,#16] | |
1157 | adcs $a6,$a6,$ff,lsr#31 | |
1158 | str $a5,[$r_ptr,#20] | |
1159 | adcs $a7,$a7,$ff | |
1160 | str $a6,[$r_ptr,#24] | |
1161 | str $a7,[$r_ptr,#28] | |
1162 | ||
1163 | mov pc,lr | |
1164 | .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from | |
1165 | ||
1166 | .type __ecp_nistz256_sub_morf,%function | |
1167 | .align 5 | |
1168 | __ecp_nistz256_sub_morf: | |
1169 | str lr,[sp,#-4]! @ push lr | |
1170 | ||
1171 | ldr $t0,[$b_ptr,#0] | |
1172 | ldr $t1,[$b_ptr,#4] | |
1173 | ldr $t2,[$b_ptr,#8] | |
1174 | ldr $t3,[$b_ptr,#12] | |
1175 | subs $a0,$t0,$a0 | |
1176 | ldr $t0,[$b_ptr,#16] | |
1177 | sbcs $a1,$t1,$a1 | |
1178 | ldr $t1,[$b_ptr,#20] | |
1179 | sbcs $a2,$t2,$a2 | |
1180 | ldr $t2,[$b_ptr,#24] | |
1181 | sbcs $a3,$t3,$a3 | |
1182 | ldr $t3,[$b_ptr,#28] | |
1183 | sbcs $a4,$t0,$a4 | |
1184 | sbcs $a5,$t1,$a5 | |
1185 | sbcs $a6,$t2,$a6 | |
1186 | sbcs $a7,$t3,$a7 | |
1187 | sbc $ff,$ff,$ff @ broadcast borrow bit | |
1188 | ldr lr,[sp],#4 @ pop lr | |
1189 | ||
1190 | adds $a0,$a0,$ff @ add synthesized modulus | |
1191 | adcs $a1,$a1,$ff | |
1192 | str $a0,[$r_ptr,#0] | |
1193 | adcs $a2,$a2,$ff | |
1194 | str $a1,[$r_ptr,#4] | |
1195 | adcs $a3,$a3,#0 | |
1196 | str $a2,[$r_ptr,#8] | |
1197 | adcs $a4,$a4,#0 | |
1198 | str $a3,[$r_ptr,#12] | |
1199 | adcs $a5,$a5,#0 | |
1200 | str $a4,[$r_ptr,#16] | |
1201 | adcs $a6,$a6,$ff,lsr#31 | |
1202 | str $a5,[$r_ptr,#20] | |
1203 | adcs $a7,$a7,$ff | |
1204 | str $a6,[$r_ptr,#24] | |
1205 | str $a7,[$r_ptr,#28] | |
1206 | ||
1207 | mov pc,lr | |
1208 | .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf | |
1209 | ||
313e6ec1 | 1210 | .type __ecp_nistz256_add_self,%function |
7a6c9a2e | 1211 | .align 4 |
313e6ec1 | 1212 | __ecp_nistz256_add_self: |
7a6c9a2e AP |
1213 | adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] |
1214 | adcs $a1,$a1,$a1 | |
1215 | adcs $a2,$a2,$a2 | |
1216 | adcs $a3,$a3,$a3 | |
1217 | adcs $a4,$a4,$a4 | |
1218 | adcs $a5,$a5,$a5 | |
1219 | adcs $a6,$a6,$a6 | |
1220 | mov $ff,#0 | |
1221 | adcs $a7,$a7,$a7 | |
dfde4219 AP |
1222 | adc $ff,$ff,#0 |
1223 | ||
1224 | @ if a+b >= modulus, subtract modulus. | |
1225 | @ | |
1226 | @ But since comparison implies subtraction, we subtract | |
46f4e1be | 1227 | @ modulus and then add it back if subtraction borrowed. |
dfde4219 AP |
1228 | |
1229 | subs $a0,$a0,#-1 | |
1230 | sbcs $a1,$a1,#-1 | |
1231 | sbcs $a2,$a2,#-1 | |
1232 | sbcs $a3,$a3,#0 | |
1233 | sbcs $a4,$a4,#0 | |
1234 | sbcs $a5,$a5,#0 | |
1235 | sbcs $a6,$a6,#1 | |
1236 | sbcs $a7,$a7,#-1 | |
1237 | sbc $ff,$ff,#0 | |
7a6c9a2e | 1238 | |
dfde4219 AP |
1239 | @ Note that because mod has special form, i.e. consists of |
1240 | @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by | |
1241 | @ using value of borrow as a whole or extracting single bit. | |
1242 | @ Follow $ff register... | |
1243 | ||
1244 | adds $a0,$a0,$ff @ add synthesized modulus | |
1245 | adcs $a1,$a1,$ff | |
7a6c9a2e | 1246 | str $a0,[$r_ptr,#0] |
dfde4219 | 1247 | adcs $a2,$a2,$ff |
7a6c9a2e | 1248 | str $a1,[$r_ptr,#4] |
dfde4219 | 1249 | adcs $a3,$a3,#0 |
7a6c9a2e | 1250 | str $a2,[$r_ptr,#8] |
dfde4219 | 1251 | adcs $a4,$a4,#0 |
7a6c9a2e | 1252 | str $a3,[$r_ptr,#12] |
dfde4219 | 1253 | adcs $a5,$a5,#0 |
7a6c9a2e | 1254 | str $a4,[$r_ptr,#16] |
dfde4219 | 1255 | adcs $a6,$a6,$ff,lsr#31 |
7a6c9a2e | 1256 | str $a5,[$r_ptr,#20] |
dfde4219 | 1257 | adcs $a7,$a7,$ff |
7a6c9a2e AP |
1258 | str $a6,[$r_ptr,#24] |
1259 | str $a7,[$r_ptr,#28] | |
1260 | ||
1261 | mov pc,lr | |
313e6ec1 | 1262 | .size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self |
7a6c9a2e AP |
1263 | |
1264 | ___ | |
1265 | ||
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
	# above map() describes stack layout with 5 temporary
	# 256-bit vectors on top. Then note that we push
	# starting from r0, which means that we have a copy of
	# the input arguments just below these temporary vectors.

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#$in_x
	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
	stmia	r3,{r4-r11}

	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	$b_ptr,$a_ptr,#32
	add	$a_ptr,$a_ptr,#32
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	$b_ptr,[sp,#32*5+4]
	add	$a_ptr,$b_ptr,#32
	add	$b_ptr,$b_ptr,#64
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	$r_ptr,[sp,#32*5]
	add	$r_ptr,$r_ptr,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	$a_ptr,sp,#$Zsqr
	add	$b_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$tmp0
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	$a_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	$b_ptr,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	$r_ptr,[sp,#32*5]
	add	$b_ptr,$r_ptr,#32
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}
1384 | ||
1385 | ######################################################################## | |
1386 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | |
1387 | # const P256_POINT *in2); | |
1388 | { | |
1389 | my ($res_x,$res_y,$res_z, | |
1390 | $in1_x,$in1_y,$in1_z, | |
1391 | $in2_x,$in2_y,$in2_z, | |
1392 | $H,$Hsqr,$R,$Rsqr,$Hcub, | |
1393 | $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); | |
1394 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | |
1395 | # above map() describes stack layout with 18 temporary | |
1396 | # 256-bit vectors on top. Then note that we push | |
1397 | # starting from r0, which means that we have copy of | |
1398 | # input arguments just below these temporary vectors. | |
1399 | # We use three of them for !in1infty, !in2intfy and | |
1400 | # result of check for zero. | |
1401 | ||
$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*18+16

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+8]	@ !in2infty

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+4]	@ !in1infty
1451 | ||
1452 | add $a_ptr,sp,#$in2_z | |
1453 | add $b_ptr,sp,#$in2_z | |
1454 | add $r_ptr,sp,#$Z2sqr | |
313e6ec1 | 1455 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z); |
7a6c9a2e AP |
1456 | |
1457 | add $a_ptr,sp,#$in1_z | |
1458 | add $b_ptr,sp,#$in1_z | |
1459 | add $r_ptr,sp,#$Z1sqr | |
313e6ec1 | 1460 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); |
7a6c9a2e AP |
1461 | |
1462 | add $a_ptr,sp,#$in2_z | |
1463 | add $b_ptr,sp,#$Z2sqr | |
1464 | add $r_ptr,sp,#$S1 | |
313e6ec1 | 1465 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z); |
7a6c9a2e AP |
1466 | |
1467 | add $a_ptr,sp,#$in1_z | |
1468 | add $b_ptr,sp,#$Z1sqr | |
1469 | add $r_ptr,sp,#$S2 | |
313e6ec1 | 1470 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); |
7a6c9a2e AP |
1471 | |
1472 | add $a_ptr,sp,#$in1_y | |
1473 | add $b_ptr,sp,#$S1 | |
1474 | add $r_ptr,sp,#$S1 | |
313e6ec1 | 1475 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y); |
7a6c9a2e AP |
1476 | |
1477 | add $a_ptr,sp,#$in2_y | |
1478 | add $b_ptr,sp,#$S2 | |
1479 | add $r_ptr,sp,#$S2 | |
313e6ec1 | 1480 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); |
7a6c9a2e AP |
1481 | |
1482 | add $b_ptr,sp,#$S1 | |
1483 | add $r_ptr,sp,#$R | |
1484 | bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1); | |
1485 | ||
1486 | orr $a0,$a0,$a1 @ see if result is zero | |
1487 | orr $a2,$a2,$a3 | |
1488 | orr $a4,$a4,$a5 | |
1489 | orr $a0,$a0,$a2 | |
1490 | orr $a4,$a4,$a6 | |
1491 | orr $a0,$a0,$a7 | |
1492 | add $a_ptr,sp,#$in1_x | |
1493 | orr $a0,$a0,$a4 | |
1494 | add $b_ptr,sp,#$Z2sqr | |
1495 | str $a0,[sp,#32*18+12] | |
1496 | ||
1497 | add $r_ptr,sp,#$U1 | |
313e6ec1 | 1498 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr); |
7a6c9a2e AP |
1499 | |
1500 | add $a_ptr,sp,#$in2_x | |
1501 | add $b_ptr,sp,#$Z1sqr | |
1502 | add $r_ptr,sp,#$U2 | |
313e6ec1 | 1503 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr); |
7a6c9a2e AP |
1504 | |
1505 | add $b_ptr,sp,#$U1 | |
1506 | add $r_ptr,sp,#$H | |
1507 | bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1); | |
1508 | ||
1509 | orr $a0,$a0,$a1 @ see if result is zero | |
1510 | orr $a2,$a2,$a3 | |
1511 | orr $a4,$a4,$a5 | |
1512 | orr $a0,$a0,$a2 | |
1513 | orr $a4,$a4,$a6 | |
1514 | orr $a0,$a0,$a7 | |
1515 | orrs $a0,$a0,$a4 | |
1516 | ||
1517 | bne .Ladd_proceed @ is_equal(U1,U2)? | |
1518 | ||
1519 | ldr $t0,[sp,#32*18+4] | |
1520 | ldr $t1,[sp,#32*18+8] | |
1521 | ldr $t2,[sp,#32*18+12] | |
1522 | tst $t0,$t1 | |
1523 | beq .Ladd_proceed @ (in1infty || in2infty)? | |
1524 | tst $t2,$t2 | |
143ee099 | 1525 | beq .Ladd_double @ is_equal(S1,S2)? |
7a6c9a2e | 1526 | |
143ee099 | 1527 | ldr $r_ptr,[sp,#32*18+16] |
7a6c9a2e AP |
1528 | eor r4,r4,r4 |
1529 | eor r5,r5,r5 | |
1530 | eor r6,r6,r6 | |
1531 | eor r7,r7,r7 | |
1532 | eor r8,r8,r8 | |
1533 | eor r9,r9,r9 | |
1534 | eor r10,r10,r10 | |
1535 | eor r11,r11,r11 | |
1536 | stmia $r_ptr!,{r4-r11} | |
1537 | stmia $r_ptr!,{r4-r11} | |
1538 | stmia $r_ptr!,{r4-r11} | |
1539 | b .Ladd_done | |
1540 | ||
143ee099 AP |
1541 | .align 4 |
1542 | .Ladd_double: | |
1543 | ldr $a_ptr,[sp,#32*18+20] | |
1544 | add sp,sp,#32*(18-5)+16 @ difference in frame sizes | |
1545 | b .Lpoint_double_shortcut | |
1546 | ||
7a6c9a2e AP |
1547 | .align 4 |
1548 | .Ladd_proceed: | |
1549 | add $a_ptr,sp,#$R | |
1550 | add $b_ptr,sp,#$R | |
1551 | add $r_ptr,sp,#$Rsqr | |
313e6ec1 | 1552 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); |
7a6c9a2e AP |
1553 | |
1554 | add $a_ptr,sp,#$H | |
1555 | add $b_ptr,sp,#$in1_z | |
1556 | add $r_ptr,sp,#$res_z | |
313e6ec1 | 1557 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); |
7a6c9a2e AP |
1558 | |
1559 | add $a_ptr,sp,#$H | |
1560 | add $b_ptr,sp,#$H | |
1561 | add $r_ptr,sp,#$Hsqr | |
313e6ec1 | 1562 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); |
7a6c9a2e AP |
1563 | |
1564 | add $a_ptr,sp,#$in2_z | |
1565 | add $b_ptr,sp,#$res_z | |
1566 | add $r_ptr,sp,#$res_z | |
313e6ec1 | 1567 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z); |
7a6c9a2e AP |
1568 | |
1569 | add $a_ptr,sp,#$H | |
1570 | add $b_ptr,sp,#$Hsqr | |
1571 | add $r_ptr,sp,#$Hcub | |
313e6ec1 | 1572 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); |
7a6c9a2e AP |
1573 | |
1574 | add $a_ptr,sp,#$Hsqr | |
1575 | add $b_ptr,sp,#$U1 | |
1576 | add $r_ptr,sp,#$U2 | |
313e6ec1 | 1577 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr); |
7a6c9a2e AP |
1578 | |
1579 | add $r_ptr,sp,#$Hsqr | |
313e6ec1 | 1580 | bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); |
7a6c9a2e AP |
1581 | |
1582 | add $b_ptr,sp,#$Rsqr | |
1583 | add $r_ptr,sp,#$res_x | |
1584 | bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); | |
1585 | ||
1586 | add $b_ptr,sp,#$Hcub | |
1587 | bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); | |
1588 | ||
1589 | add $b_ptr,sp,#$U2 | |
1590 | add $r_ptr,sp,#$res_y | |
1591 | bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); | |
1592 | ||
1593 | add $a_ptr,sp,#$Hcub | |
1594 | add $b_ptr,sp,#$S1 | |
1595 | add $r_ptr,sp,#$S2 | |
313e6ec1 | 1596 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub); |
7a6c9a2e AP |
1597 | |
1598 | add $a_ptr,sp,#$R | |
1599 | add $b_ptr,sp,#$res_y | |
1600 | add $r_ptr,sp,#$res_y | |
313e6ec1 | 1601 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); |
7a6c9a2e AP |
1602 | |
1603 | add $b_ptr,sp,#$S2 | |
1604 | bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); | |
1605 | ||
	ldr	r11,[sp,#32*18+4]	@ !in1infty
	ldr	r12,[sp,#32*18+8]	@ !in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12
	mvn	r12,r12
	ldr	$r_ptr,[sp,#32*18+16]
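	@ r10/r11/r12 are now all-ones-or-all-zero masks selecting, limb
	@ by limb, the computed result, in2 or in1 respectively: res when
	@ neither input is infinity, in2 when only in1 is, in1 otherwise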
___
for($i=0;$i<96;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}	@ res_x
	ldmia	r2!,{r6-r7}	@ in2_x
	ldmia	r3!,{r8-r9}	@ in1_x
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r6,r11
	and	r7,r7,r11
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
.Ladd_done:
	add	sp,sp,#32*18+16+16	@ +16 means "also skip the saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
my $Z1sqr = $S2;
# The above map() describes the stack layout with 15 temporary
# 256-bit vectors on top. Note that we push starting from r0, so a
# copy of the input arguments sits just below these temporary
# vectors; two of those slots are reused for !in1infty and !in2infty.

my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
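# @ONE_mont is 1 converted to the Montgomery domain, i.e. 2^256 mod
# P-256, stored as eight 32-bit limbs, least significant first
# (-1 and -2 stand for 0xffffffff and 0xfffffffe).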

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*15

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
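	@ in1 is the point at infinity iff its Z-coordinate is zero;
	@ OR all eight limbs of in1_z together to find out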
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*15+4]	@ !in1infty

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
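	@ an affine point is encoded as infinity when both of its
	@ coordinates are zero, so accumulate all in2_x and in2_y
	@ limbs into r12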
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	orr	r12,r12,r4
	orr	r12,r12,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	str	r12,[sp,#32*15+8]	@ !in2infty

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in2_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);

	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*15+4]	@ !in1infty
	ldr	r12,[sp,#32*15+8]	@ !in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12
	mvn	r12,r12
	ldr	$r_ptr,[sp,#32*15]
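	@ same selection as in ecp_nistz256_point_add: r10/r11/r12 pick
	@ the computed result, in2 or in1 per limb, without branching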
___
for($i=0;$i<64;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}	@ res_x
	ldmia	r2!,{r6-r7}	@ in2_x
	ldmia	r3!,{r8-r9}	@ in1_x
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r6,r11
	and	r7,r7,r11
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
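# in2 is affine and carries no Z-coordinate: when in2 has to be
# selected, the Montgomery form of 1 (@ONE_mont) is substituted for
# res_z instead.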
for(;$i<96;$i+=8) {
my $j=($i-64)/4;
$code.=<<___;
	ldmia	r1!,{r4-r5}	@ res_z
	ldmia	r3!,{r8-r9}	@ in1_z
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r11,#@ONE_mont[$j]
	and	r7,r11,#@ONE_mont[$j+1]
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
	add	sp,sp,#32*15+16		@ +16 means "also skip the saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}	}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush