]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
3c7d0945 | 2 | # Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
a7f182b7 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
e1613e7c AP |
9 | |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # ECP_NISTZ256 module for ARMv8. | |
18 | # | |
19 | # February 2015. | |
20 | # | |
21 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in | |
22 | # http://eprint.iacr.org/2013/816. | |
23 | # | |
d38f1b39 | 24 | # with/without -DECP_NISTZ256_ASM |
ab4f2026 AP |
25 | # Apple A7 +190-360% |
26 | # Cortex-A53 +190-400% | |
27 | # Cortex-A57 +190-350% | |
28 | # Denver +230-400% | |
e1613e7c AP |
29 | # |
30 | # Ranges denote minimum and maximum improvement coefficients depending | |
31 | # on benchmark. Lower coefficients are for ECDSA sign, server-side | |
d38f1b39 | 32 | # operation. Keep in mind that +400% means 5x improvement. |
e1613e7c AP |
33 | |
# Command line: <flavour> [other args ...] <output file>.
# The first argument is handed to arm-xlate.pl as the assembler flavour.
# Remaining arguments are consumed until one looks like a file name
# (word characters followed by a ".ext" suffix); if none does, $output
# ends up false and no output argument is passed on.
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Locate arm-xlate.pl either next to this script or in ../../perlasm
# relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Check the pipe open so that a failure to start the
# translator is reported instead of silently producing nothing, and
# quote the output name so that a path containing spaces reaches the
# translator as a single argument.
my $output_arg = (defined($output) && length($output)) ? "\"$output\"" : "";
open OUT,"| \"$^X\" $xlate $flavour $output_arg"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
44 | ||
45 | { | |
46 | my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, | |
47 | $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = | |
48 | map("x$_",(0..17,19,20)); | |
49 | ||
50 | my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont | |
51 | ||
52 | $code.=<<___; | |
53 | #include "arm_arch.h" | |
54 | ||
55 | .text | |
56 | ___ | |
57 | ######################################################################## | |
58 | # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 | |
59 | # | |
# Directory part of this script's path; used as the fallback location
# for the table source below.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

# Look for the table in the current directory first, then one level
# above the directory this script lives in.
my $table_fh;
open($table_fh,'<',"ecp_nistz256_table.c") or
open($table_fh,'<',"${dir}../ecp_nistz256_table.c") or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Each TOBN(a,b) initializer contributes two entries to @arr, with the
# second argument pushed before the first.  The file is read line by
# line; several TOBN() initializers on one line are all picked up.
while (my $table_line = <$table_fh>) {
    while ($table_line =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
        push @arr,hex($2),hex($1);
    }
}
close $table_fh;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);
76 | ||
77 | $code.=<<___; | |
78 | .globl ecp_nistz256_precomputed | |
79 | .type ecp_nistz256_precomputed,%object | |
80 | .align 12 | |
81 | ecp_nistz256_precomputed: | |
82 | ___ | |
83 | ######################################################################## | |
84 | # this conversion smashes P256_POINT_AFFINE by individual bytes with | |
85 | # 64 byte interval, similar to | |
86 | # 1111222233334444 | |
87 | # 1234123412341234 | |
# Consume @arr in 37 chunks of 64*16 entries.  For each chunk emit 64
# ".byte" rows of 64 values: row $i holds, for every one of the 64
# 16-entry groups in the chunk, byte ($i % 4) of entry ($i / 4) of that
# group.  The index arithmetic relies on "use integer" being in effect.
for(1..37) {
    @tbl = splice(@arr,0,64*16);
    for($i=0;$i<64;$i++) {
        undef @line;
        for($j=0;$j<64;$j++) {
            # A single element is wanted here, so index with
            # $tbl[...] rather than the one-element slice
            # @tbl[...].
            push @line,($tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
        }
        $code.=".byte\t";
        $code.=join(',',map { sprintf "0x%02x",$_} @line);
        $code.="\n";
    }
}
100 | $code.=<<___; | |
101 | .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed | |
102 | .align 5 | |
103 | .Lpoly: | |
104 | .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 | |
105 | .LRR: // 2^512 mod P precomputed for NIST P256 polynomial | |
106 | .quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd | |
107 | .Lone_mont: | |
108 | .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe | |
109 | .Lone: | |
110 | .quad 1,0,0,0 | |
ab4f2026 AP |
111 | .Lord: |
112 | .quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 | |
113 | .LordK: | |
114 | .quad 0xccd1c8aaee00bc4f | |
e1613e7c AP |
115 | .asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" |
116 | ||
117 | // void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
118 | .globl ecp_nistz256_to_mont | |
119 | .type ecp_nistz256_to_mont,%function | |
120 | .align 6 | |
121 | ecp_nistz256_to_mont: | |
9a18aae5 | 122 | .inst 0xd503233f // paciasp |
e1613e7c AP |
123 | stp x29,x30,[sp,#-32]! |
124 | add x29,sp,#0 | |
125 | stp x19,x20,[sp,#16] | |
126 | ||
127 | ldr $bi,.LRR // bp[0] | |
128 | ldp $a0,$a1,[$ap] | |
129 | ldp $a2,$a3,[$ap,#16] | |
130 | ldr $poly1,.Lpoly+8 | |
131 | ldr $poly3,.Lpoly+24 | |
132 | adr $bp,.LRR // &bp[0] | |
133 | ||
134 | bl __ecp_nistz256_mul_mont | |
135 | ||
136 | ldp x19,x20,[sp,#16] | |
137 | ldp x29,x30,[sp],#32 | |
9a18aae5 | 138 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
139 | ret |
140 | .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont | |
141 | ||
142 | // void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
143 | .globl ecp_nistz256_from_mont | |
144 | .type ecp_nistz256_from_mont,%function | |
145 | .align 4 | |
146 | ecp_nistz256_from_mont: | |
9a18aae5 | 147 | .inst 0xd503233f // paciasp |
e1613e7c AP |
148 | stp x29,x30,[sp,#-32]! |
149 | add x29,sp,#0 | |
150 | stp x19,x20,[sp,#16] | |
151 | ||
152 | mov $bi,#1 // bp[0] | |
153 | ldp $a0,$a1,[$ap] | |
154 | ldp $a2,$a3,[$ap,#16] | |
155 | ldr $poly1,.Lpoly+8 | |
156 | ldr $poly3,.Lpoly+24 | |
157 | adr $bp,.Lone // &bp[0] | |
158 | ||
159 | bl __ecp_nistz256_mul_mont | |
160 | ||
161 | ldp x19,x20,[sp,#16] | |
162 | ldp x29,x30,[sp],#32 | |
9a18aae5 | 163 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
164 | ret |
165 | .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont | |
166 | ||
167 | // void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], | |
168 | // const BN_ULONG x2[4]); | |
169 | .globl ecp_nistz256_mul_mont | |
170 | .type ecp_nistz256_mul_mont,%function | |
171 | .align 4 | |
172 | ecp_nistz256_mul_mont: | |
9a18aae5 | 173 | .inst 0xd503233f // paciasp |
e1613e7c AP |
174 | stp x29,x30,[sp,#-32]! |
175 | add x29,sp,#0 | |
176 | stp x19,x20,[sp,#16] | |
177 | ||
178 | ldr $bi,[$bp] // bp[0] | |
179 | ldp $a0,$a1,[$ap] | |
180 | ldp $a2,$a3,[$ap,#16] | |
181 | ldr $poly1,.Lpoly+8 | |
182 | ldr $poly3,.Lpoly+24 | |
183 | ||
184 | bl __ecp_nistz256_mul_mont | |
185 | ||
186 | ldp x19,x20,[sp,#16] | |
187 | ldp x29,x30,[sp],#32 | |
9a18aae5 | 188 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
189 | ret |
190 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont | |
191 | ||
192 | // void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
193 | .globl ecp_nistz256_sqr_mont | |
194 | .type ecp_nistz256_sqr_mont,%function | |
195 | .align 4 | |
196 | ecp_nistz256_sqr_mont: | |
9a18aae5 | 197 | .inst 0xd503233f // paciasp |
e1613e7c AP |
198 | stp x29,x30,[sp,#-32]! |
199 | add x29,sp,#0 | |
200 | stp x19,x20,[sp,#16] | |
201 | ||
202 | ldp $a0,$a1,[$ap] | |
203 | ldp $a2,$a3,[$ap,#16] | |
204 | ldr $poly1,.Lpoly+8 | |
205 | ldr $poly3,.Lpoly+24 | |
206 | ||
207 | bl __ecp_nistz256_sqr_mont | |
208 | ||
209 | ldp x19,x20,[sp,#16] | |
210 | ldp x29,x30,[sp],#32 | |
9a18aae5 | 211 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
212 | ret |
213 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont | |
214 | ||
215 | // void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], | |
216 | // const BN_ULONG x2[4]); | |
217 | .globl ecp_nistz256_add | |
218 | .type ecp_nistz256_add,%function | |
219 | .align 4 | |
220 | ecp_nistz256_add: | |
9a18aae5 | 221 | .inst 0xd503233f // paciasp |
e1613e7c AP |
222 | stp x29,x30,[sp,#-16]! |
223 | add x29,sp,#0 | |
224 | ||
225 | ldp $acc0,$acc1,[$ap] | |
226 | ldp $t0,$t1,[$bp] | |
227 | ldp $acc2,$acc3,[$ap,#16] | |
228 | ldp $t2,$t3,[$bp,#16] | |
229 | ldr $poly1,.Lpoly+8 | |
230 | ldr $poly3,.Lpoly+24 | |
231 | ||
232 | bl __ecp_nistz256_add | |
233 | ||
234 | ldp x29,x30,[sp],#16 | |
9a18aae5 | 235 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
236 | ret |
237 | .size ecp_nistz256_add,.-ecp_nistz256_add | |
238 | ||
239 | // void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
240 | .globl ecp_nistz256_div_by_2 | |
241 | .type ecp_nistz256_div_by_2,%function | |
242 | .align 4 | |
243 | ecp_nistz256_div_by_2: | |
9a18aae5 | 244 | .inst 0xd503233f // paciasp |
e1613e7c AP |
245 | stp x29,x30,[sp,#-16]! |
246 | add x29,sp,#0 | |
247 | ||
248 | ldp $acc0,$acc1,[$ap] | |
249 | ldp $acc2,$acc3,[$ap,#16] | |
250 | ldr $poly1,.Lpoly+8 | |
251 | ldr $poly3,.Lpoly+24 | |
252 | ||
253 | bl __ecp_nistz256_div_by_2 | |
254 | ||
255 | ldp x29,x30,[sp],#16 | |
9a18aae5 | 256 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
257 | ret |
258 | .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 | |
259 | ||
260 | // void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
261 | .globl ecp_nistz256_mul_by_2 | |
262 | .type ecp_nistz256_mul_by_2,%function | |
263 | .align 4 | |
264 | ecp_nistz256_mul_by_2: | |
9a18aae5 | 265 | .inst 0xd503233f // paciasp |
e1613e7c AP |
266 | stp x29,x30,[sp,#-16]! |
267 | add x29,sp,#0 | |
268 | ||
269 | ldp $acc0,$acc1,[$ap] | |
270 | ldp $acc2,$acc3,[$ap,#16] | |
271 | ldr $poly1,.Lpoly+8 | |
272 | ldr $poly3,.Lpoly+24 | |
273 | mov $t0,$acc0 | |
274 | mov $t1,$acc1 | |
275 | mov $t2,$acc2 | |
276 | mov $t3,$acc3 | |
277 | ||
278 | bl __ecp_nistz256_add // ret = a+a // 2*a | |
279 | ||
280 | ldp x29,x30,[sp],#16 | |
9a18aae5 | 281 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
282 | ret |
283 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 | |
284 | ||
285 | // void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
286 | .globl ecp_nistz256_mul_by_3 | |
287 | .type ecp_nistz256_mul_by_3,%function | |
288 | .align 4 | |
289 | ecp_nistz256_mul_by_3: | |
9a18aae5 | 290 | .inst 0xd503233f // paciasp |
e1613e7c AP |
291 | stp x29,x30,[sp,#-16]! |
292 | add x29,sp,#0 | |
293 | ||
294 | ldp $acc0,$acc1,[$ap] | |
295 | ldp $acc2,$acc3,[$ap,#16] | |
296 | ldr $poly1,.Lpoly+8 | |
297 | ldr $poly3,.Lpoly+24 | |
298 | mov $t0,$acc0 | |
299 | mov $t1,$acc1 | |
300 | mov $t2,$acc2 | |
301 | mov $t3,$acc3 | |
302 | mov $a0,$acc0 | |
303 | mov $a1,$acc1 | |
304 | mov $a2,$acc2 | |
305 | mov $a3,$acc3 | |
306 | ||
307 | bl __ecp_nistz256_add // ret = a+a // 2*a | |
308 | ||
309 | mov $t0,$a0 | |
310 | mov $t1,$a1 | |
311 | mov $t2,$a2 | |
312 | mov $t3,$a3 | |
313 | ||
314 | bl __ecp_nistz256_add // ret += a // 2*a+a=3*a | |
315 | ||
316 | ldp x29,x30,[sp],#16 | |
9a18aae5 | 317 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
318 | ret |
319 | .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 | |
320 | ||
321 | // void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], | |
322 | // const BN_ULONG x2[4]); | |
323 | .globl ecp_nistz256_sub | |
324 | .type ecp_nistz256_sub,%function | |
325 | .align 4 | |
326 | ecp_nistz256_sub: | |
9a18aae5 | 327 | .inst 0xd503233f // paciasp |
e1613e7c AP |
328 | stp x29,x30,[sp,#-16]! |
329 | add x29,sp,#0 | |
330 | ||
331 | ldp $acc0,$acc1,[$ap] | |
332 | ldp $acc2,$acc3,[$ap,#16] | |
333 | ldr $poly1,.Lpoly+8 | |
334 | ldr $poly3,.Lpoly+24 | |
335 | ||
336 | bl __ecp_nistz256_sub_from | |
337 | ||
338 | ldp x29,x30,[sp],#16 | |
9a18aae5 | 339 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
340 | ret |
341 | .size ecp_nistz256_sub,.-ecp_nistz256_sub | |
342 | ||
343 | // void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
344 | .globl ecp_nistz256_neg | |
345 | .type ecp_nistz256_neg,%function | |
346 | .align 4 | |
347 | ecp_nistz256_neg: | |
9a18aae5 | 348 | .inst 0xd503233f // paciasp |
e1613e7c AP |
349 | stp x29,x30,[sp,#-16]! |
350 | add x29,sp,#0 | |
351 | ||
352 | mov $bp,$ap | |
353 | mov $acc0,xzr // a = 0 | |
354 | mov $acc1,xzr | |
355 | mov $acc2,xzr | |
356 | mov $acc3,xzr | |
357 | ldr $poly1,.Lpoly+8 | |
358 | ldr $poly3,.Lpoly+24 | |
359 | ||
360 | bl __ecp_nistz256_sub_from | |
361 | ||
362 | ldp x29,x30,[sp],#16 | |
9a18aae5 | 363 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
364 | ret |
365 | .size ecp_nistz256_neg,.-ecp_nistz256_neg | |
366 | ||
367 | // note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded | |
368 | // to $a0-$a3 and b[0] - to $bi | |
369 | .type __ecp_nistz256_mul_mont,%function | |
370 | .align 4 | |
371 | __ecp_nistz256_mul_mont: | |
372 | mul $acc0,$a0,$bi // a[0]*b[0] | |
373 | umulh $t0,$a0,$bi | |
374 | ||
375 | mul $acc1,$a1,$bi // a[1]*b[0] | |
376 | umulh $t1,$a1,$bi | |
377 | ||
378 | mul $acc2,$a2,$bi // a[2]*b[0] | |
379 | umulh $t2,$a2,$bi | |
380 | ||
381 | mul $acc3,$a3,$bi // a[3]*b[0] | |
382 | umulh $t3,$a3,$bi | |
383 | ldr $bi,[$bp,#8] // b[1] | |
384 | ||
385 | adds $acc1,$acc1,$t0 // accumulate high parts of multiplication | |
386 | lsl $t0,$acc0,#32 | |
387 | adcs $acc2,$acc2,$t1 | |
388 | lsr $t1,$acc0,#32 | |
389 | adcs $acc3,$acc3,$t2 | |
390 | adc $acc4,xzr,$t3 | |
391 | mov $acc5,xzr | |
392 | ___ | |
393 | for($i=1;$i<4;$i++) { | |
394 | # Reduction iteration is normally performed by accumulating | |
395 | # result of multiplication of modulus by "magic" digit [and | |
396 | # omitting least significant word, which is guaranteed to | |
397 | # be 0], but thanks to special form of modulus and "magic" | |
398 | # digit being equal to least significant word, it can be | |
399 | # performed with additions and subtractions alone. Indeed: | |
400 | # | |
401 | # ffff0001.00000000.0000ffff.ffffffff | |
402 | # * abcdefgh | |
403 | # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh | |
404 | # | |
405 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | |
406 | # rewrite above as: | |
407 | # | |
408 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh | |
409 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 | |
410 | # - 0000abcd.efgh0000.00000000.00000000.abcdefgh | |
411 | # | |
412 | # or marking redundant operations: | |
413 | # | |
414 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- | |
415 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- | |
416 | # - 0000abcd.efgh0000.--------.--------.-------- | |
417 | ||
418 | $code.=<<___; | |
419 | subs $t2,$acc0,$t0 // "*0xffff0001" | |
420 | sbc $t3,$acc0,$t1 | |
421 | adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] | |
422 | mul $t0,$a0,$bi // lo(a[0]*b[i]) | |
423 | adcs $acc1,$acc2,$t1 | |
424 | mul $t1,$a1,$bi // lo(a[1]*b[i]) | |
425 | adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 | |
426 | mul $t2,$a2,$bi // lo(a[2]*b[i]) | |
427 | adcs $acc3,$acc4,$t3 | |
428 | mul $t3,$a3,$bi // lo(a[3]*b[i]) | |
429 | adc $acc4,$acc5,xzr | |
430 | ||
431 | adds $acc0,$acc0,$t0 // accumulate low parts of multiplication | |
432 | umulh $t0,$a0,$bi // hi(a[0]*b[i]) | |
433 | adcs $acc1,$acc1,$t1 | |
434 | umulh $t1,$a1,$bi // hi(a[1]*b[i]) | |
435 | adcs $acc2,$acc2,$t2 | |
436 | umulh $t2,$a2,$bi // hi(a[2]*b[i]) | |
437 | adcs $acc3,$acc3,$t3 | |
438 | umulh $t3,$a3,$bi // hi(a[3]*b[i]) | |
439 | adc $acc4,$acc4,xzr | |
440 | ___ | |
441 | $code.=<<___ if ($i<3); | |
442 | ldr $bi,[$bp,#8*($i+1)] // b[$i+1] | |
443 | ___ | |
444 | $code.=<<___; | |
445 | adds $acc1,$acc1,$t0 // accumulate high parts of multiplication | |
446 | lsl $t0,$acc0,#32 | |
447 | adcs $acc2,$acc2,$t1 | |
448 | lsr $t1,$acc0,#32 | |
449 | adcs $acc3,$acc3,$t2 | |
450 | adcs $acc4,$acc4,$t3 | |
451 | adc $acc5,xzr,xzr | |
452 | ___ | |
453 | } | |
454 | $code.=<<___; | |
455 | // last reduction | |
456 | subs $t2,$acc0,$t0 // "*0xffff0001" | |
457 | sbc $t3,$acc0,$t1 | |
458 | adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] | |
459 | adcs $acc1,$acc2,$t1 | |
460 | adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 | |
461 | adcs $acc3,$acc4,$t3 | |
462 | adc $acc4,$acc5,xzr | |
463 | ||
464 | adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus | |
465 | sbcs $t1,$acc1,$poly1 | |
466 | sbcs $t2,$acc2,xzr | |
467 | sbcs $t3,$acc3,$poly3 | |
468 | sbcs xzr,$acc4,xzr // did it borrow? | |
469 | ||
470 | csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus | |
471 | csel $acc1,$acc1,$t1,lo | |
472 | csel $acc2,$acc2,$t2,lo | |
473 | stp $acc0,$acc1,[$rp] | |
474 | csel $acc3,$acc3,$t3,lo | |
475 | stp $acc2,$acc3,[$rp,#16] | |
476 | ||
477 | ret | |
478 | .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont | |
479 | ||
480 | // note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded | |
481 | // to $a0-$a3 | |
482 | .type __ecp_nistz256_sqr_mont,%function | |
483 | .align 4 | |
484 | __ecp_nistz256_sqr_mont: | |
485 | // | | | | | |a1*a0| | | |
486 | // | | | | |a2*a0| | | | |
487 | // | |a3*a2|a3*a0| | | | | |
488 | // | | | |a2*a1| | | | | |
489 | // | | |a3*a1| | | | | | |
490 | // *| | | | | | | | 2| | |
491 | // +|a3*a3|a2*a2|a1*a1|a0*a0| | |
492 | // |--+--+--+--+--+--+--+--| | |
493 | // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx | |
494 | // | |
495 | // "can't overflow" below mark carrying into high part of | |
496 | // multiplication result, which can't overflow, because it | |
497 | // can never be all ones. | |
498 | ||
499 | mul $acc1,$a1,$a0 // a[1]*a[0] | |
500 | umulh $t1,$a1,$a0 | |
501 | mul $acc2,$a2,$a0 // a[2]*a[0] | |
502 | umulh $t2,$a2,$a0 | |
503 | mul $acc3,$a3,$a0 // a[3]*a[0] | |
504 | umulh $acc4,$a3,$a0 | |
505 | ||
506 | adds $acc2,$acc2,$t1 // accumulate high parts of multiplication | |
507 | mul $t0,$a2,$a1 // a[2]*a[1] | |
508 | umulh $t1,$a2,$a1 | |
509 | adcs $acc3,$acc3,$t2 | |
510 | mul $t2,$a3,$a1 // a[3]*a[1] | |
511 | umulh $t3,$a3,$a1 | |
512 | adc $acc4,$acc4,xzr // can't overflow | |
513 | ||
514 | mul $acc5,$a3,$a2 // a[3]*a[2] | |
515 | umulh $acc6,$a3,$a2 | |
516 | ||
517 | adds $t1,$t1,$t2 // accumulate high parts of multiplication | |
518 | mul $acc0,$a0,$a0 // a[0]*a[0] | |
519 | adc $t2,$t3,xzr // can't overflow | |
520 | ||
521 | adds $acc3,$acc3,$t0 // accumulate low parts of multiplication | |
522 | umulh $a0,$a0,$a0 | |
523 | adcs $acc4,$acc4,$t1 | |
524 | mul $t1,$a1,$a1 // a[1]*a[1] | |
525 | adcs $acc5,$acc5,$t2 | |
526 | umulh $a1,$a1,$a1 | |
527 | adc $acc6,$acc6,xzr // can't overflow | |
528 | ||
529 | adds $acc1,$acc1,$acc1 // acc[1-6]*=2 | |
530 | mul $t2,$a2,$a2 // a[2]*a[2] | |
531 | adcs $acc2,$acc2,$acc2 | |
532 | umulh $a2,$a2,$a2 | |
533 | adcs $acc3,$acc3,$acc3 | |
534 | mul $t3,$a3,$a3 // a[3]*a[3] | |
535 | adcs $acc4,$acc4,$acc4 | |
536 | umulh $a3,$a3,$a3 | |
537 | adcs $acc5,$acc5,$acc5 | |
538 | adcs $acc6,$acc6,$acc6 | |
539 | adc $acc7,xzr,xzr | |
540 | ||
541 | adds $acc1,$acc1,$a0 // +a[i]*a[i] | |
542 | adcs $acc2,$acc2,$t1 | |
543 | adcs $acc3,$acc3,$a1 | |
544 | adcs $acc4,$acc4,$t2 | |
545 | adcs $acc5,$acc5,$a2 | |
546 | lsl $t0,$acc0,#32 | |
547 | adcs $acc6,$acc6,$t3 | |
548 | lsr $t1,$acc0,#32 | |
549 | adc $acc7,$acc7,$a3 | |
550 | ___ | |
551 | for($i=0;$i<3;$i++) { # reductions, see commentary in | |
552 | # multiplication for details | |
553 | $code.=<<___; | |
554 | subs $t2,$acc0,$t0 // "*0xffff0001" | |
555 | sbc $t3,$acc0,$t1 | |
556 | adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] | |
557 | adcs $acc1,$acc2,$t1 | |
558 | lsl $t0,$acc0,#32 | |
559 | adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 | |
560 | lsr $t1,$acc0,#32 | |
561 | adc $acc3,$t3,xzr // can't overflow | |
562 | ___ | |
563 | } | |
564 | $code.=<<___; | |
565 | subs $t2,$acc0,$t0 // "*0xffff0001" | |
566 | sbc $t3,$acc0,$t1 | |
567 | adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] | |
568 | adcs $acc1,$acc2,$t1 | |
569 | adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 | |
570 | adc $acc3,$t3,xzr // can't overflow | |
571 | ||
572 | adds $acc0,$acc0,$acc4 // accumulate upper half | |
573 | adcs $acc1,$acc1,$acc5 | |
574 | adcs $acc2,$acc2,$acc6 | |
575 | adcs $acc3,$acc3,$acc7 | |
576 | adc $acc4,xzr,xzr | |
577 | ||
578 | adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus | |
579 | sbcs $t1,$acc1,$poly1 | |
580 | sbcs $t2,$acc2,xzr | |
581 | sbcs $t3,$acc3,$poly3 | |
582 | sbcs xzr,$acc4,xzr // did it borrow? | |
583 | ||
584 | csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus | |
585 | csel $acc1,$acc1,$t1,lo | |
586 | csel $acc2,$acc2,$t2,lo | |
587 | stp $acc0,$acc1,[$rp] | |
588 | csel $acc3,$acc3,$t3,lo | |
589 | stp $acc2,$acc3,[$rp,#16] | |
590 | ||
591 | ret | |
592 | .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont | |
593 | ||
594 | // Note that __ecp_nistz256_add expects both input vectors pre-loaded to | |
595 | // $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple | |
596 | // contexts, e.g. in multiplication by 2 and 3... | |
597 | .type __ecp_nistz256_add,%function | |
598 | .align 4 | |
599 | __ecp_nistz256_add: | |
600 | adds $acc0,$acc0,$t0 // ret = a+b | |
601 | adcs $acc1,$acc1,$t1 | |
602 | adcs $acc2,$acc2,$t2 | |
603 | adcs $acc3,$acc3,$t3 | |
604 | adc $ap,xzr,xzr // zap $ap | |
605 | ||
606 | adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus | |
607 | sbcs $t1,$acc1,$poly1 | |
608 | sbcs $t2,$acc2,xzr | |
dfde4219 AP |
609 | sbcs $t3,$acc3,$poly3 |
610 | sbcs xzr,$ap,xzr // did subtraction borrow? | |
e1613e7c | 611 | |
dfde4219 AP |
612 | csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus |
613 | csel $acc1,$acc1,$t1,lo | |
614 | csel $acc2,$acc2,$t2,lo | |
e1613e7c | 615 | stp $acc0,$acc1,[$rp] |
dfde4219 | 616 | csel $acc3,$acc3,$t3,lo |
e1613e7c AP |
617 | stp $acc2,$acc3,[$rp,#16] |
618 | ||
619 | ret | |
620 | .size __ecp_nistz256_add,.-__ecp_nistz256_add | |
621 | ||
622 | .type __ecp_nistz256_sub_from,%function | |
623 | .align 4 | |
624 | __ecp_nistz256_sub_from: | |
625 | ldp $t0,$t1,[$bp] | |
626 | ldp $t2,$t3,[$bp,#16] | |
627 | subs $acc0,$acc0,$t0 // ret = a-b | |
628 | sbcs $acc1,$acc1,$t1 | |
629 | sbcs $acc2,$acc2,$t2 | |
630 | sbcs $acc3,$acc3,$t3 | |
631 | sbc $ap,xzr,xzr // zap $ap | |
632 | ||
633 | subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus | |
634 | adcs $t1,$acc1,$poly1 | |
635 | adcs $t2,$acc2,xzr | |
636 | adc $t3,$acc3,$poly3 | |
637 | cmp $ap,xzr // did subtraction borrow? | |
638 | ||
639 | csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret | |
640 | csel $acc1,$acc1,$t1,eq | |
641 | csel $acc2,$acc2,$t2,eq | |
642 | stp $acc0,$acc1,[$rp] | |
643 | csel $acc3,$acc3,$t3,eq | |
644 | stp $acc2,$acc3,[$rp,#16] | |
645 | ||
646 | ret | |
647 | .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from | |
648 | ||
649 | .type __ecp_nistz256_sub_morf,%function | |
650 | .align 4 | |
651 | __ecp_nistz256_sub_morf: | |
652 | ldp $t0,$t1,[$bp] | |
653 | ldp $t2,$t3,[$bp,#16] | |
654 | subs $acc0,$t0,$acc0 // ret = b-a | |
655 | sbcs $acc1,$t1,$acc1 | |
656 | sbcs $acc2,$t2,$acc2 | |
657 | sbcs $acc3,$t3,$acc3 | |
658 | sbc $ap,xzr,xzr // zap $ap | |
659 | ||
660 | subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus | |
661 | adcs $t1,$acc1,$poly1 | |
662 | adcs $t2,$acc2,xzr | |
663 | adc $t3,$acc3,$poly3 | |
664 | cmp $ap,xzr // did subtraction borrow? | |
665 | ||
666 | csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret | |
667 | csel $acc1,$acc1,$t1,eq | |
668 | csel $acc2,$acc2,$t2,eq | |
669 | stp $acc0,$acc1,[$rp] | |
670 | csel $acc3,$acc3,$t3,eq | |
671 | stp $acc2,$acc3,[$rp,#16] | |
672 | ||
673 | ret | |
674 | .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf | |
675 | ||
676 | .type __ecp_nistz256_div_by_2,%function | |
677 | .align 4 | |
678 | __ecp_nistz256_div_by_2: | |
679 | subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus | |
680 | adcs $t1,$acc1,$poly1 | |
681 | adcs $t2,$acc2,xzr | |
682 | adcs $t3,$acc3,$poly3 | |
683 | adc $ap,xzr,xzr // zap $ap | |
684 | tst $acc0,#1 // is a even? | |
685 | ||
609b0852 | 686 | csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus |
e1613e7c AP |
687 | csel $acc1,$acc1,$t1,eq |
688 | csel $acc2,$acc2,$t2,eq | |
689 | csel $acc3,$acc3,$t3,eq | |
690 | csel $ap,xzr,$ap,eq | |
691 | ||
692 | lsr $acc0,$acc0,#1 // ret >>= 1 | |
693 | orr $acc0,$acc0,$acc1,lsl#63 | |
694 | lsr $acc1,$acc1,#1 | |
695 | orr $acc1,$acc1,$acc2,lsl#63 | |
696 | lsr $acc2,$acc2,#1 | |
697 | orr $acc2,$acc2,$acc3,lsl#63 | |
698 | lsr $acc3,$acc3,#1 | |
699 | stp $acc0,$acc1,[$rp] | |
700 | orr $acc3,$acc3,$ap,lsl#63 | |
701 | stp $acc2,$acc3,[$rp,#16] | |
702 | ||
703 | ret | |
704 | .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 | |
705 | ___ | |
706 | ######################################################################## | |
085b3860 | 707 | # following subroutines are "literal" implementations of those found in |
e1613e7c AP |
708 | # ecp_nistz256.c |
709 | # | |
710 | ######################################################################## | |
711 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | |
712 | # | |
713 | { | |
714 | my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); | |
715 | # above map() describes stack layout with 4 temporary | |
716 | # 256-bit vectors on top. | |
717 | my ($rp_real,$ap_real) = map("x$_",(21,22)); | |
718 | ||
719 | $code.=<<___; | |
720 | .globl ecp_nistz256_point_double | |
721 | .type ecp_nistz256_point_double,%function | |
722 | .align 5 | |
723 | ecp_nistz256_point_double: | |
9a18aae5 | 724 | .inst 0xd503233f // paciasp |
143ee099 | 725 | stp x29,x30,[sp,#-80]! |
e1613e7c AP |
726 | add x29,sp,#0 |
727 | stp x19,x20,[sp,#16] | |
728 | stp x21,x22,[sp,#32] | |
729 | sub sp,sp,#32*4 | |
730 | ||
143ee099 | 731 | .Ldouble_shortcut: |
e1613e7c AP |
732 | ldp $acc0,$acc1,[$ap,#32] |
733 | mov $rp_real,$rp | |
734 | ldp $acc2,$acc3,[$ap,#48] | |
735 | mov $ap_real,$ap | |
736 | ldr $poly1,.Lpoly+8 | |
737 | mov $t0,$acc0 | |
738 | ldr $poly3,.Lpoly+24 | |
739 | mov $t1,$acc1 | |
740 | ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont | |
741 | mov $t2,$acc2 | |
742 | mov $t3,$acc3 | |
743 | ldp $a2,$a3,[$ap_real,#64+16] | |
744 | add $rp,sp,#$S | |
745 | bl __ecp_nistz256_add // p256_mul_by_2(S, in_y); | |
746 | ||
747 | add $rp,sp,#$Zsqr | |
748 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); | |
749 | ||
750 | ldp $t0,$t1,[$ap_real] | |
751 | ldp $t2,$t3,[$ap_real,#16] | |
752 | mov $a0,$acc0 // put Zsqr aside for p256_sub | |
753 | mov $a1,$acc1 | |
754 | mov $a2,$acc2 | |
755 | mov $a3,$acc3 | |
756 | add $rp,sp,#$M | |
757 | bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x); | |
758 | ||
759 | add $bp,$ap_real,#0 | |
760 | mov $acc0,$a0 // restore Zsqr | |
761 | mov $acc1,$a1 | |
762 | ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont | |
763 | mov $acc2,$a2 | |
764 | mov $acc3,$a3 | |
765 | ldp $a2,$a3,[sp,#$S+16] | |
766 | add $rp,sp,#$Zsqr | |
767 | bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); | |
768 | ||
769 | add $rp,sp,#$S | |
770 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); | |
771 | ||
772 | ldr $bi,[$ap_real,#32] | |
773 | ldp $a0,$a1,[$ap_real,#64] | |
774 | ldp $a2,$a3,[$ap_real,#64+16] | |
775 | add $bp,$ap_real,#32 | |
776 | add $rp,sp,#$tmp0 | |
777 | bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); | |
778 | ||
779 | mov $t0,$acc0 | |
780 | mov $t1,$acc1 | |
781 | ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont | |
782 | mov $t2,$acc2 | |
783 | mov $t3,$acc3 | |
784 | ldp $a2,$a3,[sp,#$S+16] | |
785 | add $rp,$rp_real,#64 | |
786 | bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0); | |
787 | ||
788 | add $rp,sp,#$tmp0 | |
789 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); | |
790 | ||
791 | ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont | |
792 | ldp $a0,$a1,[sp,#$M] | |
793 | ldp $a2,$a3,[sp,#$M+16] | |
794 | add $rp,$rp_real,#32 | |
795 | bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); | |
796 | ||
797 | add $bp,sp,#$Zsqr | |
798 | add $rp,sp,#$M | |
799 | bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); | |
800 | ||
801 | mov $t0,$acc0 // duplicate M | |
802 | mov $t1,$acc1 | |
803 | mov $t2,$acc2 | |
804 | mov $t3,$acc3 | |
805 | mov $a0,$acc0 // put M aside | |
806 | mov $a1,$acc1 | |
807 | mov $a2,$acc2 | |
808 | mov $a3,$acc3 | |
809 | add $rp,sp,#$M | |
810 | bl __ecp_nistz256_add | |
811 | mov $t0,$a0 // restore M | |
812 | mov $t1,$a1 | |
813 | ldr $bi,[$ap_real] // forward load for p256_mul_mont | |
814 | mov $t2,$a2 | |
815 | ldp $a0,$a1,[sp,#$S] | |
816 | mov $t3,$a3 | |
817 | ldp $a2,$a3,[sp,#$S+16] | |
818 | bl __ecp_nistz256_add // p256_mul_by_3(M, M); | |
819 | ||
820 | add $bp,$ap_real,#0 | |
821 | add $rp,sp,#$S | |
822 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); | |
823 | ||
824 | mov $t0,$acc0 | |
825 | mov $t1,$acc1 | |
826 | ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont | |
827 | mov $t2,$acc2 | |
828 | mov $t3,$acc3 | |
829 | ldp $a2,$a3,[sp,#$M+16] | |
830 | add $rp,sp,#$tmp0 | |
831 | bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S); | |
832 | ||
833 | add $rp,$rp_real,#0 | |
834 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); | |
835 | ||
836 | add $bp,sp,#$tmp0 | |
837 | bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); | |
838 | ||
839 | add $bp,sp,#$S | |
840 | add $rp,sp,#$S | |
841 | bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); | |
842 | ||
843 | ldr $bi,[sp,#$M] | |
844 | mov $a0,$acc0 // copy S | |
845 | mov $a1,$acc1 | |
846 | mov $a2,$acc2 | |
847 | mov $a3,$acc3 | |
848 | add $bp,sp,#$M | |
849 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); | |
850 | ||
851 | add $bp,$rp_real,#32 | |
852 | add $rp,$rp_real,#32 | |
853 | bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); | |
854 | ||
855 | add sp,x29,#0 // destroy frame | |
856 | ldp x19,x20,[x29,#16] | |
857 | ldp x21,x22,[x29,#32] | |
143ee099 | 858 | ldp x29,x30,[sp],#80 |
9a18aae5 | 859 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
860 | ret |
861 | .size ecp_nistz256_point_double,.-ecp_nistz256_point_double | |
862 | ___ | |
863 | } | |
864 | ||
865 | ######################################################################## | |
866 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | |
867 | # const P256_POINT *in2); | |
# Generator for ecp_nistz256_point_add.  Everything between the ___
# markers is emitted as AArch64 assembly; the Perl scalars interpolated
# into it are register names or stack offsets.
#
# Frame: 80 bytes hold x29/x30 and x19-x26; below that 32*12 bytes are
# reserved for the twelve 256-bit temporaries named by the map() below.
# Z1sqr and Z2sqr are not extra slots: they share the Hsqr and Rsqr
# slots, which are first written (in .Ladd_proceed) after the last read
# of Z1sqr/Z2sqr.
#
# x21-x23 ($rp_real/$ap_real/$bp_real) keep the original out/in1/in2
# pointers, because $rp and $bp are repointed for the helper calls.
# $in1infty/$in2infty (x24/x25) are set with csetm to all-ones when the
# corresponding Z coordinate is non-zero, i.e. they hold "NOT at
# infinity".  $temp (x26) keeps the OR of the limbs of R = S2-S1 so the
# "R == 0" test can be made after H has been computed.
# $rp/$ap/$bp/$bi/$a*/$t*/$acc*/$poly* are declared outside this block.
#
# Flow of the emitted code:
#   1. compute S1, S2, R = S2-S1, U1, U2 and H = U2-U1 (see the p256_*
#      annotation on each helper call);
#   2. if H == 0 and neither input is at infinity: when R == 0 as well,
#      control leaves through .Ladd_double, which restores x23-x26, pops
#      32*(12-4) bytes and branches to .Ldouble_shortcut (defined outside
#      this block, presumably in ecp_nistz256_point_double - confirm);
#      when R != 0, 96 zero bytes are stored to out and the function
#      returns via .Ladd_done;
#   3. otherwise (.Ladd_proceed) res_x/res_y/res_z are computed on the
#      stack;
#   4. the result is copied to out with csel: in2 is taken if in1 is at
#      infinity, in1 if in2 is at infinity, res otherwise.  The Perl
#      for-loop emits this for offsets 0 and 32, preloading the next
#      coordinate each time; $i is left at 64, so the final here-doc
#      handles the last coordinate without preloads.
#
# NOTE(review): "$in1intfy"/"$in2intfy" in the emitted comments are
# misspelt and therefore interpolate as empty strings; this only affects
# comment text in the generated assembly.
868 | { | |
869 | my ($res_x,$res_y,$res_z, | |
870 | $H,$Hsqr,$R,$Rsqr,$Hcub, | |
871 | $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); | |
872 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | |
873 | # above map() describes stack layout with 12 temporary | |
874 | # 256-bit vectors on top. | |
875 | my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); | |
876 | ||
877 | $code.=<<___; | |
878 | .globl ecp_nistz256_point_add | |
879 | .type ecp_nistz256_point_add,%function | |
880 | .align 5 | |
881 | ecp_nistz256_point_add: | |
9a18aae5 | 882 | .inst 0xd503233f // paciasp |
e1613e7c AP |
883 | stp x29,x30,[sp,#-80]! |
884 | add x29,sp,#0 | |
885 | stp x19,x20,[sp,#16] | |
886 | stp x21,x22,[sp,#32] | |
887 | stp x23,x24,[sp,#48] | |
888 | stp x25,x26,[sp,#64] | |
889 | sub sp,sp,#32*12 | |
890 | ||
c74aea8d AP |
891 | ldp $a0,$a1,[$bp,#64] // in2_z |
892 | ldp $a2,$a3,[$bp,#64+16] | |
e1613e7c AP |
893 | mov $rp_real,$rp |
894 | mov $ap_real,$ap | |
895 | mov $bp_real,$bp | |
e1613e7c AP |
896 | ldr $poly1,.Lpoly+8 |
897 | ldr $poly3,.Lpoly+24 | |
c74aea8d AP |
898 | orr $t0,$a0,$a1 |
899 | orr $t2,$a2,$a3 | |
900 | orr $in2infty,$t0,$t2 | |
901 | cmp $in2infty,#0 | |
902 | csetm $in2infty,ne // !in2infty | |
e1613e7c AP |
903 | add $rp,sp,#$Z2sqr |
904 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); | |
905 | ||
c74aea8d | 906 | ldp $a0,$a1,[$ap_real,#64] // in1_z |
e1613e7c | 907 | ldp $a2,$a3,[$ap_real,#64+16] |
c74aea8d AP |
908 | orr $t0,$a0,$a1 |
909 | orr $t2,$a2,$a3 | |
910 | orr $in1infty,$t0,$t2 | |
911 | cmp $in1infty,#0 | |
912 | csetm $in1infty,ne // !in1infty | |
e1613e7c AP |
913 | add $rp,sp,#$Z1sqr |
914 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); | |
915 | ||
916 | ldr $bi,[$bp_real,#64] | |
917 | ldp $a0,$a1,[sp,#$Z2sqr] | |
918 | ldp $a2,$a3,[sp,#$Z2sqr+16] | |
919 | add $bp,$bp_real,#64 | |
920 | add $rp,sp,#$S1 | |
921 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); | |
922 | ||
923 | ldr $bi,[$ap_real,#64] | |
924 | ldp $a0,$a1,[sp,#$Z1sqr] | |
925 | ldp $a2,$a3,[sp,#$Z1sqr+16] | |
926 | add $bp,$ap_real,#64 | |
927 | add $rp,sp,#$S2 | |
928 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); | |
929 | ||
930 | ldr $bi,[$ap_real,#32] | |
931 | ldp $a0,$a1,[sp,#$S1] | |
932 | ldp $a2,$a3,[sp,#$S1+16] | |
933 | add $bp,$ap_real,#32 | |
934 | add $rp,sp,#$S1 | |
935 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); | |
936 | ||
937 | ldr $bi,[$bp_real,#32] | |
938 | ldp $a0,$a1,[sp,#$S2] | |
939 | ldp $a2,$a3,[sp,#$S2+16] | |
940 | add $bp,$bp_real,#32 | |
941 | add $rp,sp,#$S2 | |
942 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); | |
943 | ||
944 | add $bp,sp,#$S1 | |
945 | ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont | |
946 | ldp $a0,$a1,[$ap_real] | |
947 | ldp $a2,$a3,[$ap_real,#16] | |
948 | add $rp,sp,#$R | |
949 | bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); | |
950 | ||
951 | orr $acc0,$acc0,$acc1 // see if result is zero | |
952 | orr $acc2,$acc2,$acc3 | |
953 | orr $temp,$acc0,$acc2 | |
954 | ||
955 | add $bp,sp,#$Z2sqr | |
956 | add $rp,sp,#$U1 | |
957 | bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); | |
958 | ||
959 | ldr $bi,[sp,#$Z1sqr] | |
960 | ldp $a0,$a1,[$bp_real] | |
961 | ldp $a2,$a3,[$bp_real,#16] | |
962 | add $bp,sp,#$Z1sqr | |
963 | add $rp,sp,#$U2 | |
964 | bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); | |
965 | ||
966 | add $bp,sp,#$U1 | |
967 | ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont | |
968 | ldp $a2,$a3,[sp,#$R+16] | |
969 | add $rp,sp,#$H | |
970 | bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); | |
971 | ||
972 | orr $acc0,$acc0,$acc1 // see if result is zero | |
973 | orr $acc2,$acc2,$acc3 | |
974 | orr $acc0,$acc0,$acc2 | |
975 | tst $acc0,$acc0 | |
976 | b.ne .Ladd_proceed // is_equal(U1,U2)? | |
977 | ||
978 | tst $in1infty,$in2infty | |
979 | b.eq .Ladd_proceed // (in1infty || in2infty)? | |
980 | ||
981 | tst $temp,$temp | |
143ee099 | 982 | b.eq .Ladd_double // is_equal(S1,S2)? |
e1613e7c AP |
983 | |
984 | eor $a0,$a0,$a0 | |
985 | eor $a1,$a1,$a1 | |
986 | stp $a0,$a1,[$rp_real] | |
987 | stp $a0,$a1,[$rp_real,#16] | |
988 | stp $a0,$a1,[$rp_real,#32] | |
989 | stp $a0,$a1,[$rp_real,#48] | |
990 | stp $a0,$a1,[$rp_real,#64] | |
991 | stp $a0,$a1,[$rp_real,#80] | |
992 | b .Ladd_done | |
993 | ||
143ee099 AP |
994 | .align 4 |
995 | .Ladd_double: | |
996 | mov $ap,$ap_real | |
997 | mov $rp,$rp_real | |
998 | ldp x23,x24,[x29,#48] | |
999 | ldp x25,x26,[x29,#64] | |
1000 | add sp,sp,#32*(12-4) // difference in stack frames | |
1001 | b .Ldouble_shortcut | |
1002 | ||
e1613e7c AP |
1003 | .align 4 |
1004 | .Ladd_proceed: | |
1005 | add $rp,sp,#$Rsqr | |
1006 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); | |
1007 | ||
1008 | ldr $bi,[$ap_real,#64] | |
1009 | ldp $a0,$a1,[sp,#$H] | |
1010 | ldp $a2,$a3,[sp,#$H+16] | |
1011 | add $bp,$ap_real,#64 | |
1012 | add $rp,sp,#$res_z | |
1013 | bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); | |
1014 | ||
1015 | ldp $a0,$a1,[sp,#$H] | |
1016 | ldp $a2,$a3,[sp,#$H+16] | |
1017 | add $rp,sp,#$Hsqr | |
1018 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); | |
1019 | ||
1020 | ldr $bi,[$bp_real,#64] | |
1021 | ldp $a0,$a1,[sp,#$res_z] | |
1022 | ldp $a2,$a3,[sp,#$res_z+16] | |
1023 | add $bp,$bp_real,#64 | |
1024 | add $rp,sp,#$res_z | |
1025 | bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); | |
1026 | ||
1027 | ldr $bi,[sp,#$H] | |
1028 | ldp $a0,$a1,[sp,#$Hsqr] | |
1029 | ldp $a2,$a3,[sp,#$Hsqr+16] | |
1030 | add $bp,sp,#$H | |
1031 | add $rp,sp,#$Hcub | |
1032 | bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); | |
1033 | ||
1034 | ldr $bi,[sp,#$Hsqr] | |
1035 | ldp $a0,$a1,[sp,#$U1] | |
1036 | ldp $a2,$a3,[sp,#$U1+16] | |
1037 | add $bp,sp,#$Hsqr | |
1038 | add $rp,sp,#$U2 | |
1039 | bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); | |
1040 | ||
1041 | mov $t0,$acc0 | |
1042 | mov $t1,$acc1 | |
1043 | mov $t2,$acc2 | |
1044 | mov $t3,$acc3 | |
1045 | add $rp,sp,#$Hsqr | |
1046 | bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); | |
1047 | ||
1048 | add $bp,sp,#$Rsqr | |
1049 | add $rp,sp,#$res_x | |
1050 | bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); | |
1051 | ||
1052 | add $bp,sp,#$Hcub | |
1053 | bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); | |
1054 | ||
1055 | add $bp,sp,#$U2 | |
1056 | ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont | |
1057 | ldp $a0,$a1,[sp,#$S1] | |
1058 | ldp $a2,$a3,[sp,#$S1+16] | |
1059 | add $rp,sp,#$res_y | |
1060 | bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); | |
1061 | ||
1062 | add $bp,sp,#$Hcub | |
1063 | add $rp,sp,#$S2 | |
1064 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); | |
1065 | ||
1066 | ldr $bi,[sp,#$R] | |
1067 | ldp $a0,$a1,[sp,#$res_y] | |
1068 | ldp $a2,$a3,[sp,#$res_y+16] | |
1069 | add $bp,sp,#$R | |
1070 | add $rp,sp,#$res_y | |
1071 | bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); | |
1072 | ||
1073 | add $bp,sp,#$S2 | |
1074 | bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); | |
1075 | ||
1076 | ldp $a0,$a1,[sp,#$res_x] // res | |
1077 | ldp $a2,$a3,[sp,#$res_x+16] | |
1078 | ldp $t0,$t1,[$bp_real] // in2 | |
1079 | ldp $t2,$t3,[$bp_real,#16] | |
1080 | ___ | |
1081 | for($i=0;$i<64;$i+=32) { # conditional moves | |
1082 | $code.=<<___; | |
1083 | ldp $acc0,$acc1,[$ap_real,#$i] // in1 | |
1084 | cmp $in1infty,#0 // !$in1intfy, remember? | |
1085 | ldp $acc2,$acc3,[$ap_real,#$i+16] | |
1086 | csel $t0,$a0,$t0,ne | |
1087 | csel $t1,$a1,$t1,ne | |
1088 | ldp $a0,$a1,[sp,#$res_x+$i+32] // res | |
1089 | csel $t2,$a2,$t2,ne | |
1090 | csel $t3,$a3,$t3,ne | |
1091 | cmp $in2infty,#0 // !$in2intfy, remember? | |
1092 | ldp $a2,$a3,[sp,#$res_x+$i+48] | |
1093 | csel $acc0,$t0,$acc0,ne | |
1094 | csel $acc1,$t1,$acc1,ne | |
1095 | ldp $t0,$t1,[$bp_real,#$i+32] // in2 | |
1096 | csel $acc2,$t2,$acc2,ne | |
1097 | csel $acc3,$t3,$acc3,ne | |
1098 | ldp $t2,$t3,[$bp_real,#$i+48] | |
1099 | stp $acc0,$acc1,[$rp_real,#$i] | |
1100 | stp $acc2,$acc3,[$rp_real,#$i+16] | |
1101 | ___ | |
1102 | } | |
1103 | $code.=<<___; | |
1104 | ldp $acc0,$acc1,[$ap_real,#$i] // in1 | |
1105 | cmp $in1infty,#0 // !$in1intfy, remember? | |
1106 | ldp $acc2,$acc3,[$ap_real,#$i+16] | |
1107 | csel $t0,$a0,$t0,ne | |
1108 | csel $t1,$a1,$t1,ne | |
1109 | csel $t2,$a2,$t2,ne | |
1110 | csel $t3,$a3,$t3,ne | |
1111 | cmp $in2infty,#0 // !$in2intfy, remember? | |
1112 | csel $acc0,$t0,$acc0,ne | |
1113 | csel $acc1,$t1,$acc1,ne | |
1114 | csel $acc2,$t2,$acc2,ne | |
1115 | csel $acc3,$t3,$acc3,ne | |
1116 | stp $acc0,$acc1,[$rp_real,#$i] | |
1117 | stp $acc2,$acc3,[$rp_real,#$i+16] | |
1118 | ||
1119 | .Ladd_done: | |
9a18aae5 | 1120 | add sp,x29,#0 // destroy frame |
e1613e7c AP |
1121 | ldp x19,x20,[x29,#16] |
1122 | ldp x21,x22,[x29,#32] | |
1123 | ldp x23,x24,[x29,#48] | |
1124 | ldp x25,x26,[x29,#64] | |
1125 | ldp x29,x30,[sp],#80 | |
9a18aae5 | 1126 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
1127 | ret |
1128 | .size ecp_nistz256_point_add,.-ecp_nistz256_point_add | |
1129 | ___ | |
1130 | } | |
1131 | ||
1132 | ######################################################################## | |
1133 | # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, | |
1134 | # const P256_POINT_AFFINE *in2); | |
# Generator for ecp_nistz256_point_add_affine.  in1 is read as a full
# point (X at offset 0, Y at 32, Z at 64); in2 is read at offsets 0 (X)
# and 32 (Y) only.
#
# Frame: 80 bytes hold x29/x30 and x19-x26; below that 32*10 bytes are
# reserved for the ten 256-bit temporaries named by the map() below.
# Z1sqr shares the S2 slot.
#
# $in1infty (x24) is all-ones when in1's Z is non-zero.  $in2infty (x25)
# is all-ones when any limb of in2's X or Y is non-zero, i.e. an all-zero
# affine point is what this code treats as infinity.
# $rp/$ap/$bp/$bi/$a*/$t*/$acc*/$poly* are declared outside this block.
#
# Unlike the emitted point_add code, no H == 0 / doubling test is
# generated here: H and R are used as computed.
#
# The closing conditional-move sequence selects in2 if in1 is at
# infinity, in1 if in2 is at infinity and res otherwise.  The Perl
# for-loop emits it for offsets 0 and 32; after the first pass ($i == 0)
# $bp_real is repointed to .Lone_mont-64, so the "in2" value preloaded at
# offset 64 for the Z coordinate comes from .Lone_mont (defined outside
# this block; presumably 1 in Montgomery form - confirm).  $i is left at
# 64 for the final here-doc, which handles Z without preloads.
#
# NOTE(review): "$in1intfy"/"$in2intfy" in the emitted comments are
# misspelt and therefore interpolate as empty strings; this only affects
# comment text in the generated assembly.
1135 | { | |
1136 | my ($res_x,$res_y,$res_z, | |
1137 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); | |
1138 | my $Z1sqr = $S2; | |
1139 | # above map() describes stack layout with 10 temporary | |
1140 | # 256-bit vectors on top. | |
1141 | my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); | |
1142 | ||
1143 | $code.=<<___; | |
1144 | .globl ecp_nistz256_point_add_affine | |
1145 | .type ecp_nistz256_point_add_affine,%function | |
1146 | .align 5 | |
1147 | ecp_nistz256_point_add_affine: | |
9a18aae5 | 1148 | .inst 0xd503233f // paciasp |
e1613e7c AP |
1149 | stp x29,x30,[sp,#-80]! |
1150 | add x29,sp,#0 | |
1151 | stp x19,x20,[sp,#16] | |
1152 | stp x21,x22,[sp,#32] | |
1153 | stp x23,x24,[sp,#48] | |
1154 | stp x25,x26,[sp,#64] | |
1155 | sub sp,sp,#32*10 | |
1156 | ||
1157 | mov $rp_real,$rp | |
1158 | mov $ap_real,$ap | |
1159 | mov $bp_real,$bp | |
1160 | ldr $poly1,.Lpoly+8 | |
1161 | ldr $poly3,.Lpoly+24 | |
1162 | ||
c74aea8d AP |
1163 | ldp $a0,$a1,[$ap,#64] // in1_z |
1164 | ldp $a2,$a3,[$ap,#64+16] | |
1165 | orr $t0,$a0,$a1 | |
1166 | orr $t2,$a2,$a3 | |
1167 | orr $in1infty,$t0,$t2 | |
e1613e7c AP |
1168 | cmp $in1infty,#0 |
1169 | csetm $in1infty,ne // !in1infty | |
1170 | ||
c74aea8d AP |
1171 | ldp $acc0,$acc1,[$bp] // in2_x |
1172 | ldp $acc2,$acc3,[$bp,#16] | |
1173 | ldp $t0,$t1,[$bp,#32] // in2_y | |
e1613e7c | 1174 | ldp $t2,$t3,[$bp,#48] |
c74aea8d AP |
1175 | orr $acc0,$acc0,$acc1 |
1176 | orr $acc2,$acc2,$acc3 | |
e1613e7c AP |
1177 | orr $t0,$t0,$t1 |
1178 | orr $t2,$t2,$t3 | |
c74aea8d | 1179 | orr $acc0,$acc0,$acc2 |
e1613e7c | 1180 | orr $t0,$t0,$t2 |
c74aea8d | 1181 | orr $in2infty,$acc0,$t0 |
e1613e7c AP |
1182 | cmp $in2infty,#0 |
1183 | csetm $in2infty,ne // !in2infty | |
1184 | ||
e1613e7c AP |
1185 | add $rp,sp,#$Z1sqr |
1186 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); | |
1187 | ||
1188 | mov $a0,$acc0 | |
1189 | mov $a1,$acc1 | |
1190 | mov $a2,$acc2 | |
1191 | mov $a3,$acc3 | |
1192 | ldr $bi,[$bp_real] | |
1193 | add $bp,$bp_real,#0 | |
1194 | add $rp,sp,#$U2 | |
1195 | bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); | |
1196 | ||
1197 | add $bp,$ap_real,#0 | |
1198 | ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont | |
1199 | ldp $a0,$a1,[sp,#$Z1sqr] | |
1200 | ldp $a2,$a3,[sp,#$Z1sqr+16] | |
1201 | add $rp,sp,#$H | |
1202 | bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); | |
1203 | ||
1204 | add $bp,$ap_real,#64 | |
1205 | add $rp,sp,#$S2 | |
1206 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); | |
1207 | ||
1208 | ldr $bi,[$ap_real,#64] | |
1209 | ldp $a0,$a1,[sp,#$H] | |
1210 | ldp $a2,$a3,[sp,#$H+16] | |
1211 | add $bp,$ap_real,#64 | |
1212 | add $rp,sp,#$res_z | |
1213 | bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); | |
1214 | ||
1215 | ldr $bi,[$bp_real,#32] | |
1216 | ldp $a0,$a1,[sp,#$S2] | |
1217 | ldp $a2,$a3,[sp,#$S2+16] | |
1218 | add $bp,$bp_real,#32 | |
1219 | add $rp,sp,#$S2 | |
1220 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); | |
1221 | ||
1222 | add $bp,$ap_real,#32 | |
1223 | ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont | |
1224 | ldp $a2,$a3,[sp,#$H+16] | |
1225 | add $rp,sp,#$R | |
1226 | bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); | |
1227 | ||
1228 | add $rp,sp,#$Hsqr | |
1229 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); | |
1230 | ||
1231 | ldp $a0,$a1,[sp,#$R] | |
1232 | ldp $a2,$a3,[sp,#$R+16] | |
1233 | add $rp,sp,#$Rsqr | |
1234 | bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); | |
1235 | ||
1236 | ldr $bi,[sp,#$H] | |
1237 | ldp $a0,$a1,[sp,#$Hsqr] | |
1238 | ldp $a2,$a3,[sp,#$Hsqr+16] | |
1239 | add $bp,sp,#$H | |
1240 | add $rp,sp,#$Hcub | |
1241 | bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); | |
1242 | ||
1243 | ldr $bi,[$ap_real] | |
1244 | ldp $a0,$a1,[sp,#$Hsqr] | |
1245 | ldp $a2,$a3,[sp,#$Hsqr+16] | |
1246 | add $bp,$ap_real,#0 | |
1247 | add $rp,sp,#$U2 | |
1248 | bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); | |
1249 | ||
1250 | mov $t0,$acc0 | |
1251 | mov $t1,$acc1 | |
1252 | mov $t2,$acc2 | |
1253 | mov $t3,$acc3 | |
1254 | add $rp,sp,#$Hsqr | |
1255 | bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); | |
1256 | ||
1257 | add $bp,sp,#$Rsqr | |
1258 | add $rp,sp,#$res_x | |
1259 | bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); | |
1260 | ||
1261 | add $bp,sp,#$Hcub | |
1262 | bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); | |
1263 | ||
1264 | add $bp,sp,#$U2 | |
1265 | ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont | |
1266 | ldp $a0,$a1,[sp,#$Hcub] | |
1267 | ldp $a2,$a3,[sp,#$Hcub+16] | |
1268 | add $rp,sp,#$res_y | |
1269 | bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); | |
1270 | ||
1271 | add $bp,$ap_real,#32 | |
1272 | add $rp,sp,#$S2 | |
1273 | bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); | |
1274 | ||
1275 | ldr $bi,[sp,#$R] | |
1276 | ldp $a0,$a1,[sp,#$res_y] | |
1277 | ldp $a2,$a3,[sp,#$res_y+16] | |
1278 | add $bp,sp,#$R | |
1279 | add $rp,sp,#$res_y | |
1280 | bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); | |
1281 | ||
1282 | add $bp,sp,#$S2 | |
1283 | bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); | |
1284 | ||
1285 | ldp $a0,$a1,[sp,#$res_x] // res | |
1286 | ldp $a2,$a3,[sp,#$res_x+16] | |
1287 | ldp $t0,$t1,[$bp_real] // in2 | |
1288 | ldp $t2,$t3,[$bp_real,#16] | |
1289 | ___ | |
1290 | for($i=0;$i<64;$i+=32) { # conditional moves | |
1291 | $code.=<<___; | |
1292 | ldp $acc0,$acc1,[$ap_real,#$i] // in1 | |
1293 | cmp $in1infty,#0 // !$in1intfy, remember? | |
1294 | ldp $acc2,$acc3,[$ap_real,#$i+16] | |
1295 | csel $t0,$a0,$t0,ne | |
1296 | csel $t1,$a1,$t1,ne | |
1297 | ldp $a0,$a1,[sp,#$res_x+$i+32] // res | |
1298 | csel $t2,$a2,$t2,ne | |
1299 | csel $t3,$a3,$t3,ne | |
1300 | cmp $in2infty,#0 // !$in2intfy, remember? | |
1301 | ldp $a2,$a3,[sp,#$res_x+$i+48] | |
1302 | csel $acc0,$t0,$acc0,ne | |
1303 | csel $acc1,$t1,$acc1,ne | |
1304 | ldp $t0,$t1,[$bp_real,#$i+32] // in2 | |
1305 | csel $acc2,$t2,$acc2,ne | |
1306 | csel $acc3,$t3,$acc3,ne | |
1307 | ldp $t2,$t3,[$bp_real,#$i+48] | |
1308 | stp $acc0,$acc1,[$rp_real,#$i] | |
1309 | stp $acc2,$acc3,[$rp_real,#$i+16] | |
1310 | ___ | |
57758351 AP |
1311 | $code.=<<___ if ($i == 0); |
1312 | adr $bp_real,.Lone_mont-64 | |
1313 | ___ | |
e1613e7c AP |
1314 | } |
1315 | $code.=<<___; | |
1316 | ldp $acc0,$acc1,[$ap_real,#$i] // in1 | |
1317 | cmp $in1infty,#0 // !$in1intfy, remember? | |
1318 | ldp $acc2,$acc3,[$ap_real,#$i+16] | |
1319 | csel $t0,$a0,$t0,ne | |
1320 | csel $t1,$a1,$t1,ne | |
1321 | csel $t2,$a2,$t2,ne | |
1322 | csel $t3,$a3,$t3,ne | |
1323 | cmp $in2infty,#0 // !$in2intfy, remember? | |
1324 | csel $acc0,$t0,$acc0,ne | |
1325 | csel $acc1,$t1,$acc1,ne | |
1326 | csel $acc2,$t2,$acc2,ne | |
1327 | csel $acc3,$t3,$acc3,ne | |
1328 | stp $acc0,$acc1,[$rp_real,#$i] | |
1329 | stp $acc2,$acc3,[$rp_real,#$i+16] | |
1330 | ||
1331 | add sp,x29,#0 // destroy frame | |
1332 | ldp x19,x20,[x29,#16] | |
1333 | ldp x21,x22,[x29,#32] | |
1334 | ldp x23,x24,[x29,#48] | |
1335 | ldp x25,x26,[x29,#64] | |
1336 | ldp x29,x30,[sp],#80 | |
9a18aae5 | 1337 | .inst 0xd50323bf // autiasp |
e1613e7c AP |
1338 | ret |
1339 | .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine | |
1340 | ___ | |
ab4f2026 AP |
1341 | } |
# Generator for ecp_nistz256_ord_mul_mont and ecp_nistz256_ord_sqr_mont,
# the two Montgomery routines that work modulo the value stored at .Lord
# (defined outside this block; the function names suggest it is the
# P-256 group order).  "if (1)" opens a scope for the lexicals below.
#
# Register naming: $ord0/$ord1 reuse the $poly1/$poly3 registers,
# $ord2/$ord3/$ordk/$t4 are x21-x24 and $acc7 shares a register with $bi.
# $acc6 and the remaining names come from the enclosing scope, which is
# not part of this block.
#
# Both routines load four modulus limbs from .Lord+0..31 and a fifth
# 64-bit word from .Lord+32 into $ordk.  Each reduction round multiplies
# the lowest accumulator limb by $ordk to obtain the factor by which the
# modulus is scaled in that round (presumably $ordk is -1/modulus mod
# 2^64 - confirm against the .Lord table).
#
# ecp_nistz256_ord_mul_mont: the Perl loop (i = 1..3) emits, per pass,
# the reduction of the previous partial product interleaved with the
# multiplication by b[i]; the last reduction and a final conditional
# subtraction of the modulus follow the loop.
#
# ecp_nistz256_ord_sqr_mont: squares in place "rep" times, keeping the
# value in $a0-$a3 between passes and storing it once at the end.  The
# counter (third argument, in $bp) is decremented at the top of
# .Loop_ord_sqr and tested with cbnz at the bottom, so the body always
# runs at least once and rep == 0 wraps the counter instead of skipping
# the loop.  The four reduction rounds are unrolled by a Perl loop that
# swaps the names $t3/$t4 after every round, so each round computes the
# next factor into the register the following round reads it from; after
# four swaps the names refer to their original registers again.
#
# Neither routine contains a bl, and the epilogues reload only x29 (not
# x30) from the frame.
1342 | if (1) { | |
1343 | my ($ord0,$ord1) = ($poly1,$poly3); | |
1344 | my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); | |
1345 | my $acc7 = $bi; | |
1346 | ||
1347 | $code.=<<___; | |
1348 | //////////////////////////////////////////////////////////////////////// | |
1349 | // void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], | |
1350 | // uint64_t b[4]); | |
1351 | .globl ecp_nistz256_ord_mul_mont | |
1352 | .type ecp_nistz256_ord_mul_mont,%function | |
1353 | .align 4 | |
1354 | ecp_nistz256_ord_mul_mont: | |
1355 | stp x29,x30,[sp,#-64]! | |
1356 | add x29,sp,#0 | |
1357 | stp x19,x20,[sp,#16] | |
1358 | stp x21,x22,[sp,#32] | |
1359 | stp x23,x24,[sp,#48] | |
1360 | ||
1361 | adr $ordk,.Lord | |
1362 | ldr $bi,[$bp] // bp[0] | |
1363 | ldp $a0,$a1,[$ap] | |
1364 | ldp $a2,$a3,[$ap,#16] | |
1365 | ||
1366 | ldp $ord0,$ord1,[$ordk,#0] | |
1367 | ldp $ord2,$ord3,[$ordk,#16] | |
1368 | ldr $ordk,[$ordk,#32] | |
1369 | ||
1370 | mul $acc0,$a0,$bi // a[0]*b[0] | |
1371 | umulh $t0,$a0,$bi | |
1372 | ||
1373 | mul $acc1,$a1,$bi // a[1]*b[0] | |
1374 | umulh $t1,$a1,$bi | |
1375 | ||
1376 | mul $acc2,$a2,$bi // a[2]*b[0] | |
1377 | umulh $t2,$a2,$bi | |
1378 | ||
1379 | mul $acc3,$a3,$bi // a[3]*b[0] | |
1380 | umulh $acc4,$a3,$bi | |
1381 | ||
1382 | mul $t4,$acc0,$ordk | |
1383 | ||
1384 | adds $acc1,$acc1,$t0 // accumulate high parts of multiplication | |
1385 | adcs $acc2,$acc2,$t1 | |
1386 | adcs $acc3,$acc3,$t2 | |
1387 | adc $acc4,$acc4,xzr | |
1388 | mov $acc5,xzr | |
1389 | ___ | |
1390 | for ($i=1;$i<4;$i++) { | |
1391 | ################################################################ | |
1392 | # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz | |
1393 | # * abcdefgh | |
1394 | # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx | |
1395 | # | |
1396 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we | |
1397 | # rewrite above as: | |
1398 | # | |
1399 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx | |
1400 | # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 | |
1401 | # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh | |
1402 | $code.=<<___; | |
1403 | ldr $bi,[$bp,#8*$i] // b[i] | |
1404 | ||
1405 | lsl $t0,$t4,#32 | |
1406 | subs $acc2,$acc2,$t4 | |
1407 | lsr $t1,$t4,#32 | |
1408 | sbcs $acc3,$acc3,$t0 | |
1409 | sbcs $acc4,$acc4,$t1 | |
1410 | sbc $acc5,$acc5,xzr | |
1411 | ||
1412 | subs xzr,$acc0,#1 | |
1413 | umulh $t1,$ord0,$t4 | |
1414 | mul $t2,$ord1,$t4 | |
1415 | umulh $t3,$ord1,$t4 | |
1416 | ||
1417 | adcs $t2,$t2,$t1 | |
1418 | mul $t0,$a0,$bi | |
1419 | adc $t3,$t3,xzr | |
1420 | mul $t1,$a1,$bi | |
1421 | ||
1422 | adds $acc0,$acc1,$t2 | |
1423 | mul $t2,$a2,$bi | |
1424 | adcs $acc1,$acc2,$t3 | |
1425 | mul $t3,$a3,$bi | |
1426 | adcs $acc2,$acc3,$t4 | |
1427 | adcs $acc3,$acc4,$t4 | |
1428 | adc $acc4,$acc5,xzr | |
1429 | ||
1430 | adds $acc0,$acc0,$t0 // accumulate low parts | |
1431 | umulh $t0,$a0,$bi | |
1432 | adcs $acc1,$acc1,$t1 | |
1433 | umulh $t1,$a1,$bi | |
1434 | adcs $acc2,$acc2,$t2 | |
1435 | umulh $t2,$a2,$bi | |
1436 | adcs $acc3,$acc3,$t3 | |
1437 | umulh $t3,$a3,$bi | |
1438 | adc $acc4,$acc4,xzr | |
1439 | mul $t4,$acc0,$ordk | |
1440 | adds $acc1,$acc1,$t0 // accumulate high parts | |
1441 | adcs $acc2,$acc2,$t1 | |
1442 | adcs $acc3,$acc3,$t2 | |
1443 | adcs $acc4,$acc4,$t3 | |
1444 | adc $acc5,xzr,xzr | |
1445 | ___ | |
1446 | } | |
1447 | $code.=<<___; | |
1448 | lsl $t0,$t4,#32 // last reduction | |
1449 | subs $acc2,$acc2,$t4 | |
1450 | lsr $t1,$t4,#32 | |
1451 | sbcs $acc3,$acc3,$t0 | |
1452 | sbcs $acc4,$acc4,$t1 | |
1453 | sbc $acc5,$acc5,xzr | |
1454 | ||
1455 | subs xzr,$acc0,#1 | |
1456 | umulh $t1,$ord0,$t4 | |
1457 | mul $t2,$ord1,$t4 | |
1458 | umulh $t3,$ord1,$t4 | |
1459 | ||
1460 | adcs $t2,$t2,$t1 | |
1461 | adc $t3,$t3,xzr | |
1462 | ||
1463 | adds $acc0,$acc1,$t2 | |
1464 | adcs $acc1,$acc2,$t3 | |
1465 | adcs $acc2,$acc3,$t4 | |
1466 | adcs $acc3,$acc4,$t4 | |
1467 | adc $acc4,$acc5,xzr | |
1468 | ||
1469 | subs $t0,$acc0,$ord0 // ret -= modulus | |
1470 | sbcs $t1,$acc1,$ord1 | |
1471 | sbcs $t2,$acc2,$ord2 | |
1472 | sbcs $t3,$acc3,$ord3 | |
1473 | sbcs xzr,$acc4,xzr | |
1474 | ||
1475 | csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus | |
1476 | csel $acc1,$acc1,$t1,lo | |
1477 | csel $acc2,$acc2,$t2,lo | |
1478 | stp $acc0,$acc1,[$rp] | |
1479 | csel $acc3,$acc3,$t3,lo | |
1480 | stp $acc2,$acc3,[$rp,#16] | |
1481 | ||
1482 | ldp x19,x20,[sp,#16] | |
1483 | ldp x21,x22,[sp,#32] | |
1484 | ldp x23,x24,[sp,#48] | |
1485 | ldr x29,[sp],#64 | |
1486 | ret | |
1487 | .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont | |
1488 | ||
1489 | //////////////////////////////////////////////////////////////////////// | |
1490 | // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], | |
15972296 | 1491 | // uint64_t rep); |
ab4f2026 AP |
1492 | .globl ecp_nistz256_ord_sqr_mont |
1493 | .type ecp_nistz256_ord_sqr_mont,%function | |
1494 | .align 4 | |
1495 | ecp_nistz256_ord_sqr_mont: | |
1496 | stp x29,x30,[sp,#-64]! | |
1497 | add x29,sp,#0 | |
1498 | stp x19,x20,[sp,#16] | |
1499 | stp x21,x22,[sp,#32] | |
1500 | stp x23,x24,[sp,#48] | |
1501 | ||
1502 | adr $ordk,.Lord | |
1503 | ldp $a0,$a1,[$ap] | |
1504 | ldp $a2,$a3,[$ap,#16] | |
1505 | ||
1506 | ldp $ord0,$ord1,[$ordk,#0] | |
1507 | ldp $ord2,$ord3,[$ordk,#16] | |
1508 | ldr $ordk,[$ordk,#32] | |
1509 | b .Loop_ord_sqr | |
1510 | ||
1511 | .align 4 | |
1512 | .Loop_ord_sqr: | |
1513 | sub $bp,$bp,#1 | |
1514 | //////////////////////////////////////////////////////////////// | |
1515 | // | | | | | |a1*a0| | | |
1516 | // | | | | |a2*a0| | | | |
1517 | // | |a3*a2|a3*a0| | | | | |
1518 | // | | | |a2*a1| | | | | |
1519 | // | | |a3*a1| | | | | | |
1520 | // *| | | | | | | | 2| | |
1521 | // +|a3*a3|a2*a2|a1*a1|a0*a0| | |
1522 | // |--+--+--+--+--+--+--+--| | |
1523 | // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx | |
1524 | // | |
1525 | // "can't overflow" below mark carrying into high part of | |
1526 | // multiplication result, which can't overflow, because it | |
1527 | // can never be all ones. | |
1528 | ||
1529 | mul $acc1,$a1,$a0 // a[1]*a[0] | |
1530 | umulh $t1,$a1,$a0 | |
1531 | mul $acc2,$a2,$a0 // a[2]*a[0] | |
1532 | umulh $t2,$a2,$a0 | |
1533 | mul $acc3,$a3,$a0 // a[3]*a[0] | |
1534 | umulh $acc4,$a3,$a0 | |
1535 | ||
1536 | adds $acc2,$acc2,$t1 // accumulate high parts of multiplication | |
1537 | mul $t0,$a2,$a1 // a[2]*a[1] | |
1538 | umulh $t1,$a2,$a1 | |
1539 | adcs $acc3,$acc3,$t2 | |
1540 | mul $t2,$a3,$a1 // a[3]*a[1] | |
1541 | umulh $t3,$a3,$a1 | |
1542 | adc $acc4,$acc4,xzr // can't overflow | |
1543 | ||
1544 | mul $acc5,$a3,$a2 // a[3]*a[2] | |
1545 | umulh $acc6,$a3,$a2 | |
1546 | ||
1547 | adds $t1,$t1,$t2 // accumulate high parts of multiplication | |
1548 | mul $acc0,$a0,$a0 // a[0]*a[0] | |
1549 | adc $t2,$t3,xzr // can't overflow | |
1550 | ||
1551 | adds $acc3,$acc3,$t0 // accumulate low parts of multiplication | |
1552 | umulh $a0,$a0,$a0 | |
1553 | adcs $acc4,$acc4,$t1 | |
1554 | mul $t1,$a1,$a1 // a[1]*a[1] | |
1555 | adcs $acc5,$acc5,$t2 | |
1556 | umulh $a1,$a1,$a1 | |
1557 | adc $acc6,$acc6,xzr // can't overflow | |
1558 | ||
1559 | adds $acc1,$acc1,$acc1 // acc[1-6]*=2 | |
1560 | mul $t2,$a2,$a2 // a[2]*a[2] | |
1561 | adcs $acc2,$acc2,$acc2 | |
1562 | umulh $a2,$a2,$a2 | |
1563 | adcs $acc3,$acc3,$acc3 | |
1564 | mul $t3,$a3,$a3 // a[3]*a[3] | |
1565 | adcs $acc4,$acc4,$acc4 | |
1566 | umulh $a3,$a3,$a3 | |
1567 | adcs $acc5,$acc5,$acc5 | |
1568 | adcs $acc6,$acc6,$acc6 | |
1569 | adc $acc7,xzr,xzr | |
1570 | ||
1571 | adds $acc1,$acc1,$a0 // +a[i]*a[i] | |
1572 | mul $t4,$acc0,$ordk | |
1573 | adcs $acc2,$acc2,$t1 | |
1574 | adcs $acc3,$acc3,$a1 | |
1575 | adcs $acc4,$acc4,$t2 | |
1576 | adcs $acc5,$acc5,$a2 | |
1577 | adcs $acc6,$acc6,$t3 | |
1578 | adc $acc7,$acc7,$a3 | |
1579 | ___ | |
1580 | for($i=0; $i<4; $i++) { # reductions | |
1581 | $code.=<<___; | |
1582 | subs xzr,$acc0,#1 | |
1583 | umulh $t1,$ord0,$t4 | |
1584 | mul $t2,$ord1,$t4 | |
1585 | umulh $t3,$ord1,$t4 | |
1586 | ||
1587 | adcs $t2,$t2,$t1 | |
1588 | adc $t3,$t3,xzr | |
1589 | ||
1590 | adds $acc0,$acc1,$t2 | |
1591 | adcs $acc1,$acc2,$t3 | |
1592 | adcs $acc2,$acc3,$t4 | |
1593 | adc $acc3,xzr,$t4 // can't overflow | |
1594 | ___ | |
1595 | $code.=<<___ if ($i<3); | |
1596 | mul $t3,$acc0,$ordk | |
1597 | ___ | |
1598 | $code.=<<___; | |
1599 | lsl $t0,$t4,#32 | |
1600 | subs $acc1,$acc1,$t4 | |
1601 | lsr $t1,$t4,#32 | |
1602 | sbcs $acc2,$acc2,$t0 | |
1603 | sbc $acc3,$acc3,$t1 // can't borrow | |
1604 | ___ | |
1605 | ($t3,$t4) = ($t4,$t3); | |
1606 | } | |
1607 | $code.=<<___; | |
1608 | adds $acc0,$acc0,$acc4 // accumulate upper half | |
1609 | adcs $acc1,$acc1,$acc5 | |
1610 | adcs $acc2,$acc2,$acc6 | |
1611 | adcs $acc3,$acc3,$acc7 | |
1612 | adc $acc4,xzr,xzr | |
1613 | ||
1614 | subs $t0,$acc0,$ord0 // ret -= modulus | |
1615 | sbcs $t1,$acc1,$ord1 | |
1616 | sbcs $t2,$acc2,$ord2 | |
1617 | sbcs $t3,$acc3,$ord3 | |
1618 | sbcs xzr,$acc4,xzr | |
1619 | ||
1620 | csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus | |
1621 | csel $a1,$acc1,$t1,lo | |
1622 | csel $a2,$acc2,$t2,lo | |
1623 | csel $a3,$acc3,$t3,lo | |
1624 | ||
1625 | cbnz $bp,.Loop_ord_sqr | |
1626 | ||
1627 | stp $a0,$a1,[$rp] | |
1628 | stp $a2,$a3,[$rp,#16] | |
1629 | ||
1630 | ldp x19,x20,[sp,#16] | |
1631 | ldp x21,x22,[sp,#32] | |
1632 | ldp x23,x24,[sp,#48] | |
1633 | ldr x29,[sp],#64 | |
1634 | ret | |
1635 | .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont | |
1636 | ___ | |
e1613e7c AP |
1637 | } } |
1638 | ||
1639 | ######################################################################## | |
1640 | # scatter-gather subroutines | |
# Emits four table-access helpers in a single here-doc.  $out/$inp/$index
# are x0-x2, matching the argument order of the prototypes embedded
# below; $mask (x3) is declared but never interpolated - the assembly
# spells x3 literally.
#
# w5 layout, as written by scatter_w5: every 64-bit limb is split into
# two 32-bit words; for each coordinate the four low words go to rows 0-3
# and the four high words to rows 4-7, rows being 64 bytes apart, and X,
# Y, Z each take 8*64 bytes.  The column written is out + 4*index - 4,
# so index is 1-based.  gather_w5 mirrors this: it reads column index-1;
# for index == 0 it reads column 0 but returns all-zero limbs (the csel
# instructions reuse the flags of the initial cmp, which no instruction
# in between modifies).
#
# w7 layout: scatter_w7 stores the 64 input bytes one per 64-byte row at
# byte column "index", used as is.  gather_w7 reads byte column index-1,
# or column 0 with the result masked to zero when index == 0.
# NOTE(review): scatter_w7 does not apply the -1 bias that gather_w7
# does; presumably callers pass a 0-based index when scattering and a
# 1-based one (0 meaning "no entry") when gathering - confirm against
# the C callers.
#
# Both w7 loops run 64/8 = 8 times, handling 8 bytes per pass, and issue
# prfm prefetches 4096 bytes beyond the rows being accessed.
#
# All four routines push a 16-byte x29/x30 frame and reload only x29 on
# exit.
1641 | { | |
1642 | my ($out,$inp,$index,$mask)=map("x$_",(0..3)); | |
1643 | $code.=<<___; | |
1644 | // void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1, | |
1645 | // int x2); | |
1646 | .globl ecp_nistz256_scatter_w5 | |
1647 | .type ecp_nistz256_scatter_w5,%function | |
1648 | .align 4 | |
1649 | ecp_nistz256_scatter_w5: | |
1650 | stp x29,x30,[sp,#-16]! | |
1651 | add x29,sp,#0 | |
1652 | ||
1653 | add $out,$out,$index,lsl#2 | |
1654 | ||
1655 | ldp x4,x5,[$inp] // X | |
1656 | ldp x6,x7,[$inp,#16] | |
db42bb44 | 1657 | stur w4,[$out,#64*0-4] |
e1613e7c AP |
1658 | lsr x4,x4,#32 |
1659 | str w5,[$out,#64*1-4] | |
1660 | lsr x5,x5,#32 | |
1661 | str w6,[$out,#64*2-4] | |
1662 | lsr x6,x6,#32 | |
1663 | str w7,[$out,#64*3-4] | |
1664 | lsr x7,x7,#32 | |
1665 | str w4,[$out,#64*4-4] | |
1666 | str w5,[$out,#64*5-4] | |
1667 | str w6,[$out,#64*6-4] | |
1668 | str w7,[$out,#64*7-4] | |
1669 | add $out,$out,#64*8 | |
1670 | ||
1671 | ldp x4,x5,[$inp,#32] // Y | |
1672 | ldp x6,x7,[$inp,#48] | |
db42bb44 | 1673 | stur w4,[$out,#64*0-4] |
e1613e7c AP |
1674 | lsr x4,x4,#32 |
1675 | str w5,[$out,#64*1-4] | |
1676 | lsr x5,x5,#32 | |
1677 | str w6,[$out,#64*2-4] | |
1678 | lsr x6,x6,#32 | |
1679 | str w7,[$out,#64*3-4] | |
1680 | lsr x7,x7,#32 | |
1681 | str w4,[$out,#64*4-4] | |
1682 | str w5,[$out,#64*5-4] | |
1683 | str w6,[$out,#64*6-4] | |
1684 | str w7,[$out,#64*7-4] | |
1685 | add $out,$out,#64*8 | |
1686 | ||
1687 | ldp x4,x5,[$inp,#64] // Z | |
1688 | ldp x6,x7,[$inp,#80] | |
db42bb44 | 1689 | stur w4,[$out,#64*0-4] |
e1613e7c AP |
1690 | lsr x4,x4,#32 |
1691 | str w5,[$out,#64*1-4] | |
1692 | lsr x5,x5,#32 | |
1693 | str w6,[$out,#64*2-4] | |
1694 | lsr x6,x6,#32 | |
1695 | str w7,[$out,#64*3-4] | |
1696 | lsr x7,x7,#32 | |
1697 | str w4,[$out,#64*4-4] | |
1698 | str w5,[$out,#64*5-4] | |
1699 | str w6,[$out,#64*6-4] | |
1700 | str w7,[$out,#64*7-4] | |
1701 | ||
1702 | ldr x29,[sp],#16 | |
1703 | ret | |
1704 | .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 | |
1705 | ||
1706 | // void ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1, | |
1707 | // int x2); | |
1708 | .globl ecp_nistz256_gather_w5 | |
1709 | .type ecp_nistz256_gather_w5,%function | |
1710 | .align 4 | |
1711 | ecp_nistz256_gather_w5: | |
1712 | stp x29,x30,[sp,#-16]! | |
1713 | add x29,sp,#0 | |
1714 | ||
1715 | cmp $index,xzr | |
1716 | csetm x3,ne | |
1717 | add $index,$index,x3 | |
1718 | add $inp,$inp,$index,lsl#2 | |
1719 | ||
1720 | ldr w4,[$inp,#64*0] | |
1721 | ldr w5,[$inp,#64*1] | |
1722 | ldr w6,[$inp,#64*2] | |
1723 | ldr w7,[$inp,#64*3] | |
1724 | ldr w8,[$inp,#64*4] | |
1725 | ldr w9,[$inp,#64*5] | |
1726 | ldr w10,[$inp,#64*6] | |
1727 | ldr w11,[$inp,#64*7] | |
1728 | add $inp,$inp,#64*8 | |
1729 | orr x4,x4,x8,lsl#32 | |
1730 | orr x5,x5,x9,lsl#32 | |
1731 | orr x6,x6,x10,lsl#32 | |
1732 | orr x7,x7,x11,lsl#32 | |
1733 | csel x4,x4,xzr,ne | |
1734 | csel x5,x5,xzr,ne | |
1735 | csel x6,x6,xzr,ne | |
1736 | csel x7,x7,xzr,ne | |
1737 | stp x4,x5,[$out] // X | |
1738 | stp x6,x7,[$out,#16] | |
1739 | ||
1740 | ldr w4,[$inp,#64*0] | |
1741 | ldr w5,[$inp,#64*1] | |
1742 | ldr w6,[$inp,#64*2] | |
1743 | ldr w7,[$inp,#64*3] | |
1744 | ldr w8,[$inp,#64*4] | |
1745 | ldr w9,[$inp,#64*5] | |
1746 | ldr w10,[$inp,#64*6] | |
1747 | ldr w11,[$inp,#64*7] | |
1748 | add $inp,$inp,#64*8 | |
1749 | orr x4,x4,x8,lsl#32 | |
1750 | orr x5,x5,x9,lsl#32 | |
1751 | orr x6,x6,x10,lsl#32 | |
1752 | orr x7,x7,x11,lsl#32 | |
1753 | csel x4,x4,xzr,ne | |
1754 | csel x5,x5,xzr,ne | |
1755 | csel x6,x6,xzr,ne | |
1756 | csel x7,x7,xzr,ne | |
1757 | stp x4,x5,[$out,#32] // Y | |
1758 | stp x6,x7,[$out,#48] | |
1759 | ||
1760 | ldr w4,[$inp,#64*0] | |
1761 | ldr w5,[$inp,#64*1] | |
1762 | ldr w6,[$inp,#64*2] | |
1763 | ldr w7,[$inp,#64*3] | |
1764 | ldr w8,[$inp,#64*4] | |
1765 | ldr w9,[$inp,#64*5] | |
1766 | ldr w10,[$inp,#64*6] | |
1767 | ldr w11,[$inp,#64*7] | |
1768 | orr x4,x4,x8,lsl#32 | |
1769 | orr x5,x5,x9,lsl#32 | |
1770 | orr x6,x6,x10,lsl#32 | |
1771 | orr x7,x7,x11,lsl#32 | |
1772 | csel x4,x4,xzr,ne | |
1773 | csel x5,x5,xzr,ne | |
1774 | csel x6,x6,xzr,ne | |
1775 | csel x7,x7,xzr,ne | |
1776 | stp x4,x5,[$out,#64] // Z | |
1777 | stp x6,x7,[$out,#80] | |
1778 | ||
1779 | ldr x29,[sp],#16 | |
1780 | ret | |
1781 | .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 | |
1782 | ||
1783 | // void ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1, | |
1784 | // int x2); | |
1785 | .globl ecp_nistz256_scatter_w7 | |
1786 | .type ecp_nistz256_scatter_w7,%function | |
1787 | .align 4 | |
1788 | ecp_nistz256_scatter_w7: | |
1789 | stp x29,x30,[sp,#-16]! | |
1790 | add x29,sp,#0 | |
1791 | ||
1792 | add $out,$out,$index | |
1793 | mov $index,#64/8 | |
1794 | .Loop_scatter_w7: | |
1795 | ldr x3,[$inp],#8 | |
1796 | subs $index,$index,#1 | |
1797 | prfm pstl1strm,[$out,#4096+64*0] | |
1798 | prfm pstl1strm,[$out,#4096+64*1] | |
1799 | prfm pstl1strm,[$out,#4096+64*2] | |
1800 | prfm pstl1strm,[$out,#4096+64*3] | |
1801 | prfm pstl1strm,[$out,#4096+64*4] | |
1802 | prfm pstl1strm,[$out,#4096+64*5] | |
1803 | prfm pstl1strm,[$out,#4096+64*6] | |
1804 | prfm pstl1strm,[$out,#4096+64*7] | |
87a75b3e | 1805 | strb w3,[$out,#64*0] |
e1613e7c | 1806 | lsr x3,x3,#8 |
87a75b3e | 1807 | strb w3,[$out,#64*1] |
e1613e7c | 1808 | lsr x3,x3,#8 |
87a75b3e | 1809 | strb w3,[$out,#64*2] |
e1613e7c | 1810 | lsr x3,x3,#8 |
87a75b3e | 1811 | strb w3,[$out,#64*3] |
e1613e7c | 1812 | lsr x3,x3,#8 |
87a75b3e | 1813 | strb w3,[$out,#64*4] |
e1613e7c | 1814 | lsr x3,x3,#8 |
87a75b3e | 1815 | strb w3,[$out,#64*5] |
e1613e7c | 1816 | lsr x3,x3,#8 |
87a75b3e | 1817 | strb w3,[$out,#64*6] |
e1613e7c | 1818 | lsr x3,x3,#8 |
87a75b3e | 1819 | strb w3,[$out,#64*7] |
e1613e7c AP |
1820 | add $out,$out,#64*8 |
1821 | b.ne .Loop_scatter_w7 | |
1822 | ||
1823 | ldr x29,[sp],#16 | |
1824 | ret | |
1825 | .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 | |
1826 | ||
1827 | // void ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1, | |
1828 | // int x2); | |
1829 | .globl ecp_nistz256_gather_w7 | |
1830 | .type ecp_nistz256_gather_w7,%function | |
1831 | .align 4 | |
1832 | ecp_nistz256_gather_w7: | |
1833 | stp x29,x30,[sp,#-16]! | |
1834 | add x29,sp,#0 | |
1835 | ||
1836 | cmp $index,xzr | |
1837 | csetm x3,ne | |
1838 | add $index,$index,x3 | |
1839 | add $inp,$inp,$index | |
1840 | mov $index,#64/8 | |
1841 | nop | |
1842 | .Loop_gather_w7: | |
1843 | ldrb w4,[$inp,#64*0] | |
1844 | prfm pldl1strm,[$inp,#4096+64*0] | |
1845 | subs $index,$index,#1 | |
1846 | ldrb w5,[$inp,#64*1] | |
1847 | prfm pldl1strm,[$inp,#4096+64*1] | |
1848 | ldrb w6,[$inp,#64*2] | |
1849 | prfm pldl1strm,[$inp,#4096+64*2] | |
1850 | ldrb w7,[$inp,#64*3] | |
1851 | prfm pldl1strm,[$inp,#4096+64*3] | |
1852 | ldrb w8,[$inp,#64*4] | |
1853 | prfm pldl1strm,[$inp,#4096+64*4] | |
1854 | ldrb w9,[$inp,#64*5] | |
1855 | prfm pldl1strm,[$inp,#4096+64*5] | |
1856 | ldrb w10,[$inp,#64*6] | |
1857 | prfm pldl1strm,[$inp,#4096+64*6] | |
1858 | ldrb w11,[$inp,#64*7] | |
1859 | prfm pldl1strm,[$inp,#4096+64*7] | |
1860 | add $inp,$inp,#64*8 | |
1861 | orr x4,x4,x5,lsl#8 | |
1862 | orr x6,x6,x7,lsl#8 | |
1863 | orr x8,x8,x9,lsl#8 | |
1864 | orr x4,x4,x6,lsl#16 | |
1865 | orr x10,x10,x11,lsl#8 | |
1866 | orr x4,x4,x8,lsl#32 | |
1867 | orr x4,x4,x10,lsl#48 | |
1868 | and x4,x4,x3 | |
1869 | str x4,[$out],#8 | |
1870 | b.ne .Loop_gather_w7 | |
1871 | ||
1872 | ldr x29,[sp],#16 | |
1873 | ret | |
1874 | .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 | |
1875 | ___ | |
1876 | } | |
1877 | ||
# Emit the accumulated assembly text.  Each line of $code is scanned for
# `...` back-quoted spans; the text between the back-quotes is evaluated
# as a Perl expression and replaced by its value before the line is
# printed.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    print $_,"\n";
}
# Closing STDOUT forces the final flush.  Buffered write errors are only
# reported at that point, so a failed close must abort the generator
# instead of silently leaving truncated output behind.
close STDOUT or die "error closing STDOUT: $!";