1 | #! /usr/bin/env perl |
2 | # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. |
3 | # |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # ECP_NISTZ256 module for PPC64. | |
18 | # | |
19 | # August 2016. | |
20 | # | |
21 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in | |
22 | # http://eprint.iacr.org/2013/816. | |
23 | # | |
24 | # with/without -DECP_NISTZ256_ASM | |
25 | # POWER7 +260-530% | |
26 | # POWER8 +220-340% | |
27 | ||
28 | $flavour = shift; | |
29 | while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} | |
30 | ||
31 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
32 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
33 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
34 | die "can't locate ppc-xlate.pl"; | |
35 | ||
36 | open OUT,"| \"$^X\" $xlate $flavour $output"; | |
37 | *STDOUT=*OUT; | |
38 | ||
39 | my $sp="r1"; | |
40 | ||
41 | { | |
42 | my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3, | |
43 | $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) = | |
44 | map("r$_",(3..12,22..31)); | |
45 | ||
46 | my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont | |
47 | ||
48 | $code.=<<___; | |
49 | .machine "any" | |
50 | .text | |
51 | ___ | |
52 | ######################################################################## | |
53 | # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 | |
54 | # | |
55 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
56 | open TABLE,"<ecp_nistz256_table.c" or | |
57 | open TABLE,"<${dir}../ecp_nistz256_table.c" or | |
58 | die "failed to open ecp_nistz256_table.c:",$!; | |
59 | ||
60 | use integer; | |
61 | ||
62 | foreach(<TABLE>) { | |
63 | s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; | |
64 | } | |
65 | close TABLE; | |
66 | ||
67 | # See ecp_nistz256_table.c for an explanation of why it's 64*16*37. | |
68 | # 64*16*37-1 is because $#arr returns the last valid index of @arr, not | |
69 | # the number of elements. | |
70 | die "insane number of elements" if ($#arr != 64*16*37-1); | |
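# (For reference, the breakdown of that count as suggested by the loops
# below: each P256_POINT_AFFINE is 2 coordinates x 4 64-bit limbs, i.e.
# 8 TOBN() entries or 16 32-bit words; a sub-table holds 64 points and
# there are 37 sub-tables, one per 7-bit window of the 256-bit scalar,
# hence 64*16*37 words in @arr.)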
71 | ||
72 | $code.=<<___; | |
73 | .type ecp_nistz256_precomputed,\@object | |
74 | .globl ecp_nistz256_precomputed | |
75 | .align 12 | |
76 | ecp_nistz256_precomputed: | |
77 | ___ | |
78 | ######################################################################## | |
79 | # this conversion splits each P256_POINT_AFFINE into individual bytes | |
80 | # at 64-byte intervals, similar to | |
81 | # 1111222233334444 | |
82 | # 1234123412341234 | |
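# (The intent, as the layout suggests, is that row i of a sub-table holds
# byte i of all 64 points; gathering one point then touches every 64-byte
# row exactly once, so the access pattern does not depend on which point
# is selected.)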
83 | for(1..37) { | |
84 | @tbl = splice(@arr,0,64*16); | |
85 | for($i=0;$i<64;$i++) { | |
86 | undef @line; | |
87 | for($j=0;$j<64;$j++) { | |
88 | push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; | |
89 | } | |
90 | $code.=".byte\t"; | |
91 | $code.=join(',',map { sprintf "0x%02x",$_} @line); | |
92 | $code.="\n"; | |
93 | } | |
94 | } | |
95 | ||
96 | $code.=<<___; | |
97 | .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed | |
98 | .asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>" | |
99 | ||
100 | # void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], | |
101 | # const BN_ULONG x2[4]); | |
102 | .globl ecp_nistz256_mul_mont | |
103 | .align 5 | |
104 | ecp_nistz256_mul_mont: | |
105 | stdu $sp,-128($sp) | |
106 | mflr r0 | |
107 | std r22,48($sp) | |
108 | std r23,56($sp) | |
109 | std r24,64($sp) | |
110 | std r25,72($sp) | |
111 | std r26,80($sp) | |
112 | std r27,88($sp) | |
113 | std r28,96($sp) | |
114 | std r29,104($sp) | |
115 | std r30,112($sp) | |
116 | std r31,120($sp) | |
117 | ||
118 | ld $a0,0($ap) | |
119 | ld $bi,0($bp) | |
120 | ld $a1,8($ap) | |
121 | ld $a2,16($ap) | |
122 | ld $a3,24($ap) | |
123 | ||
124 | li $poly1,-1 | |
125 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
126 | li $poly3,1 | |
127 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
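# (poly1 and poly3 are the two non-trivial 64-bit limbs of the P-256
# prime p = 2^256-2^224+2^192+2^96-1; limb 0 is all-ones and limb 2 is
# zero, and those two are handled implicitly throughout.)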
128 | ||
129 | bl __ecp_nistz256_mul_mont | |
130 | ||
131 | mtlr r0 | |
132 | ld r22,48($sp) | |
133 | ld r23,56($sp) | |
134 | ld r24,64($sp) | |
135 | ld r25,72($sp) | |
136 | ld r26,80($sp) | |
137 | ld r27,88($sp) | |
138 | ld r28,96($sp) | |
139 | ld r29,104($sp) | |
140 | ld r30,112($sp) | |
141 | ld r31,120($sp) | |
142 | addi $sp,$sp,128 | |
143 | blr | |
144 | .long 0 | |
145 | .byte 0,12,4,0,0x80,10,3,0 | |
146 | .long 0 | |
147 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont | |
148 | ||
149 | # void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
150 | .globl ecp_nistz256_sqr_mont | |
151 | .align 4 | |
152 | ecp_nistz256_sqr_mont: | |
153 | stdu $sp,-128($sp) | |
154 | mflr r0 | |
155 | std r22,48($sp) | |
156 | std r23,56($sp) | |
157 | std r24,64($sp) | |
158 | std r25,72($sp) | |
159 | std r26,80($sp) | |
160 | std r27,88($sp) | |
161 | std r28,96($sp) | |
162 | std r29,104($sp) | |
163 | std r30,112($sp) | |
164 | std r31,120($sp) | |
165 | ||
166 | ld $a0,0($ap) | |
167 | ld $a1,8($ap) | |
168 | ld $a2,16($ap) | |
169 | ld $a3,24($ap) | |
170 | ||
171 | li $poly1,-1 | |
172 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
173 | li $poly3,1 | |
174 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
175 | ||
176 | bl __ecp_nistz256_sqr_mont | |
177 | ||
178 | mtlr r0 | |
179 | ld r22,48($sp) | |
180 | ld r23,56($sp) | |
181 | ld r24,64($sp) | |
182 | ld r25,72($sp) | |
183 | ld r26,80($sp) | |
184 | ld r27,88($sp) | |
185 | ld r28,96($sp) | |
186 | ld r29,104($sp) | |
187 | ld r30,112($sp) | |
188 | ld r31,120($sp) | |
189 | addi $sp,$sp,128 | |
190 | blr | |
191 | .long 0 | |
192 | .byte 0,12,4,0,0x80,10,2,0 | |
193 | .long 0 | |
194 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont | |
195 | ||
196 | # void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], | |
197 | # const BN_ULONG x2[4]); | |
198 | .globl ecp_nistz256_add | |
199 | .align 4 | |
200 | ecp_nistz256_add: | |
201 | stdu $sp,-128($sp) | |
202 | mflr r0 | |
203 | std r28,96($sp) | |
204 | std r29,104($sp) | |
205 | std r30,112($sp) | |
206 | std r31,120($sp) | |
207 | ||
208 | ld $acc0,0($ap) | |
209 | ld $t0, 0($bp) | |
210 | ld $acc1,8($ap) | |
211 | ld $t1, 8($bp) | |
212 | ld $acc2,16($ap) | |
213 | ld $t2, 16($bp) | |
214 | ld $acc3,24($ap) | |
215 | ld $t3, 24($bp) | |
216 | ||
217 | li $poly1,-1 | |
218 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
219 | li $poly3,1 | |
220 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
221 | ||
222 | bl __ecp_nistz256_add | |
223 | ||
224 | mtlr r0 | |
225 | ld r28,96($sp) | |
226 | ld r29,104($sp) | |
227 | ld r30,112($sp) | |
228 | ld r31,120($sp) | |
229 | addi $sp,$sp,128 | |
230 | blr | |
231 | .long 0 | |
232 | .byte 0,12,4,0,0x80,4,3,0 | |
233 | .long 0 | |
234 | .size ecp_nistz256_add,.-ecp_nistz256_add | |
235 | ||
236 | # void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
237 | .globl ecp_nistz256_div_by_2 | |
238 | .align 4 | |
239 | ecp_nistz256_div_by_2: | |
240 | stdu $sp,-128($sp) | |
241 | mflr r0 | |
242 | std r28,96($sp) | |
243 | std r29,104($sp) | |
244 | std r30,112($sp) | |
245 | std r31,120($sp) | |
246 | ||
247 | ld $acc0,0($ap) | |
248 | ld $acc1,8($ap) | |
249 | ld $acc2,16($ap) | |
250 | ld $acc3,24($ap) | |
251 | ||
252 | li $poly1,-1 | |
253 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
254 | li $poly3,1 | |
255 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
256 | ||
257 | bl __ecp_nistz256_div_by_2 | |
258 | ||
259 | mtlr r0 | |
260 | ld r28,96($sp) | |
261 | ld r29,104($sp) | |
262 | ld r30,112($sp) | |
263 | ld r31,120($sp) | |
264 | addi $sp,$sp,128 | |
265 | blr | |
266 | .long 0 | |
267 | .byte 0,12,4,0,0x80,4,2,0 | |
268 | .long 0 | |
269 | .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 | |
270 | ||
271 | # void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
272 | .globl ecp_nistz256_mul_by_2 | |
273 | .align 4 | |
274 | ecp_nistz256_mul_by_2: | |
275 | stdu $sp,-128($sp) | |
276 | mflr r0 | |
277 | std r28,96($sp) | |
278 | std r29,104($sp) | |
279 | std r30,112($sp) | |
280 | std r31,120($sp) | |
281 | ||
282 | ld $acc0,0($ap) | |
283 | ld $acc1,8($ap) | |
284 | ld $acc2,16($ap) | |
285 | ld $acc3,24($ap) | |
286 | ||
287 | mr $t0,$acc0 | |
288 | mr $t1,$acc1 | |
289 | mr $t2,$acc2 | |
290 | mr $t3,$acc3 | |
291 | ||
292 | li $poly1,-1 | |
293 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
294 | li $poly3,1 | |
295 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
296 | ||
297 | bl __ecp_nistz256_add # ret = a+a // 2*a | |
298 | ||
299 | mtlr r0 | |
300 | ld r28,96($sp) | |
301 | ld r29,104($sp) | |
302 | ld r30,112($sp) | |
303 | ld r31,120($sp) | |
304 | addi $sp,$sp,128 | |
305 | blr | |
306 | .long 0 | |
307 | .byte 0,12,4,0,0x80,4,3,0 | |
308 | .long 0 | |
309 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 | |
310 | ||
311 | # void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
312 | .globl ecp_nistz256_mul_by_3 | |
313 | .align 4 | |
314 | ecp_nistz256_mul_by_3: | |
315 | stdu $sp,-128($sp) | |
316 | mflr r0 | |
317 | std r28,96($sp) | |
318 | std r29,104($sp) | |
319 | std r30,112($sp) | |
320 | std r31,120($sp) | |
321 | ||
322 | ld $acc0,0($ap) | |
323 | ld $acc1,8($ap) | |
324 | ld $acc2,16($ap) | |
325 | ld $acc3,24($ap) | |
326 | ||
327 | mr $t0,$acc0 | |
328 | std $acc0,64($sp) | |
329 | mr $t1,$acc1 | |
330 | std $acc1,72($sp) | |
331 | mr $t2,$acc2 | |
332 | std $acc2,80($sp) | |
333 | mr $t3,$acc3 | |
334 | std $acc3,88($sp) | |
335 | ||
336 | li $poly1,-1 | |
337 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
338 | li $poly3,1 | |
339 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
340 | ||
341 | bl __ecp_nistz256_add # ret = a+a // 2*a | |
342 | ||
343 | ld $t0,64($sp) | |
344 | ld $t1,72($sp) | |
345 | ld $t2,80($sp) | |
346 | ld $t3,88($sp) | |
347 | ||
348 | bl __ecp_nistz256_add # ret += a // 2*a+a=3*a | |
349 | ||
350 | mtlr r0 | |
351 | ld r28,96($sp) | |
352 | ld r29,104($sp) | |
353 | ld r30,112($sp) | |
354 | ld r31,120($sp) | |
355 | addi $sp,$sp,128 | |
356 | blr | |
357 | .long 0 | |
358 | .byte 0,12,4,0,0x80,4,2,0 | |
359 | .long 0 | |
360 | .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 | |
361 | ||
362 | # void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], | |
363 | # const BN_ULONG x2[4]); | |
364 | .globl ecp_nistz256_sub | |
365 | .align 4 | |
366 | ecp_nistz256_sub: | |
367 | stdu $sp,-128($sp) | |
368 | mflr r0 | |
369 | std r28,96($sp) | |
370 | std r29,104($sp) | |
371 | std r30,112($sp) | |
372 | std r31,120($sp) | |
373 | ||
374 | ld $acc0,0($ap) | |
375 | ld $acc1,8($ap) | |
376 | ld $acc2,16($ap) | |
377 | ld $acc3,24($ap) | |
378 | ||
379 | li $poly1,-1 | |
380 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
381 | li $poly3,1 | |
382 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
383 | ||
384 | bl __ecp_nistz256_sub_from | |
385 | ||
386 | mtlr r0 | |
387 | ld r28,96($sp) | |
388 | ld r29,104($sp) | |
389 | ld r30,112($sp) | |
390 | ld r31,120($sp) | |
391 | addi $sp,$sp,128 | |
392 | blr | |
393 | .long 0 | |
394 | .byte 0,12,4,0,0x80,4,3,0 | |
395 | .long 0 | |
396 | .size ecp_nistz256_sub,.-ecp_nistz256_sub | |
397 | ||
398 | # void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); | |
399 | .globl ecp_nistz256_neg | |
400 | .align 4 | |
401 | ecp_nistz256_neg: | |
402 | stdu $sp,-128($sp) | |
403 | mflr r0 | |
404 | std r28,96($sp) | |
405 | std r29,104($sp) | |
406 | std r30,112($sp) | |
407 | std r31,120($sp) | |
408 | ||
409 | mr $bp,$ap | |
410 | li $acc0,0 | |
411 | li $acc1,0 | |
412 | li $acc2,0 | |
413 | li $acc3,0 | |
414 | ||
415 | li $poly1,-1 | |
416 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
417 | li $poly3,1 | |
418 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
419 | ||
420 | bl __ecp_nistz256_sub_from | |
421 | ||
422 | mtlr r0 | |
423 | ld r28,96($sp) | |
424 | ld r29,104($sp) | |
425 | ld r30,112($sp) | |
426 | ld r31,120($sp) | |
427 | addi $sp,$sp,128 | |
428 | blr | |
429 | .long 0 | |
430 | .byte 0,12,4,0,0x80,4,2,0 | |
431 | .long 0 | |
432 | .size ecp_nistz256_neg,.-ecp_nistz256_neg | |
433 | ||
434 | # note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded | |
435 | # to $a0-$a3 and b[0] to $bi | |
436 | .type __ecp_nistz256_mul_mont,\@function | |
437 | .align 4 | |
438 | __ecp_nistz256_mul_mont: | |
439 | mulld $acc0,$a0,$bi # a[0]*b[0] | |
440 | mulhdu $t0,$a0,$bi | |
441 | ||
442 | mulld $acc1,$a1,$bi # a[1]*b[0] | |
443 | mulhdu $t1,$a1,$bi | |
444 | ||
445 | mulld $acc2,$a2,$bi # a[2]*b[0] | |
446 | mulhdu $t2,$a2,$bi | |
447 | ||
448 | mulld $acc3,$a3,$bi # a[3]*b[0] | |
449 | mulhdu $t3,$a3,$bi | |
450 | ld $bi,8($bp) # b[1] | |
451 | ||
452 | addc $acc1,$acc1,$t0 # accumulate high parts of multiplication | |
453 | sldi $t0,$acc0,32 | |
454 | adde $acc2,$acc2,$t1 | |
455 | srdi $t1,$acc0,32 | |
456 | adde $acc3,$acc3,$t2 | |
457 | addze $acc4,$t3 | |
458 | li $acc5,0 | |
459 | ___ | |
460 | for($i=1;$i<4;$i++) { | |
461 | ################################################################ | |
462 | # A reduction iteration is normally performed by accumulating the | |
463 | # result of multiplying the modulus by the "magic" digit [and | |
464 | # omitting the least significant word, which is guaranteed to | |
465 | # be 0], but thanks to the special form of the modulus, with the "magic" | |
466 | # digit being equal to the least significant word, it can be | |
467 | # performed with additions and subtractions alone. Indeed: | |
468 | # | |
469 | # ffff0001.00000000.0000ffff.ffffffff | |
470 | # * abcdefgh | |
471 | # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh | |
472 | # | |
473 | # Now, observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we can | |
474 | # rewrite the above as: | |
475 | # | |
476 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh | |
477 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 | |
478 | # - 0000abcd.efgh0000.00000000.00000000.abcdefgh | |
479 | # | |
480 | # or marking redundant operations: | |
481 | # | |
482 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- | |
483 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- | |
484 | # - 0000abcd.efgh0000.--------.--------.-------- | |
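#
# A quick way to sanity-check this identity outside of the build (a
# sketch only, not part of the generator; it assumes Math::BigInt is
# available and uses an arbitrary 64-bit "magic" digit) is e.g.:
#
#   perl -MMath::BigInt -e '
#     my $p = Math::BigInt->from_hex("ffffffff00000001".
#                                    "0000000000000000".
#                                    "00000000ffffffff".
#                                    "ffffffffffffffff");
#     my $m = Math::BigInt->from_hex("abcdef0123456789");
#     my $want = $p * $m;
#     my $got  = ($m << 256) - ($m << 224) + ($m << 192) + ($m << 96) - $m;
#     print $want == $got ? "ok\n" : "mismatch\n";'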
485 | ||
486 | $code.=<<___; | |
487 | subfc $t2,$t0,$acc0 # "*0xffff0001" | |
488 | subfe $t3,$t1,$acc0 | |
489 | addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] | |
490 | adde $acc1,$acc2,$t1 |
491 | adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 |
492 | adde $acc3,$acc4,$t3 |
493 | addze $acc4,$acc5 |
494 | ||
495 | mulld $t0,$a0,$bi # lo(a[0]*b[i]) |
496 | mulld $t1,$a1,$bi # lo(a[1]*b[i]) | |
497 | mulld $t2,$a2,$bi # lo(a[2]*b[i]) | |
498 | mulld $t3,$a3,$bi # lo(a[3]*b[i]) | |
499 | addc $acc0,$acc0,$t0 # accumulate low parts of multiplication |
500 | mulhdu $t0,$a0,$bi # hi(a[0]*b[i]) | |
501 | adde $acc1,$acc1,$t1 | |
502 | mulhdu $t1,$a1,$bi # hi(a[1]*b[i]) | |
503 | adde $acc2,$acc2,$t2 | |
504 | mulhdu $t2,$a2,$bi # hi(a[2]*b[i]) | |
505 | adde $acc3,$acc3,$t3 | |
506 | mulhdu $t3,$a3,$bi # hi(a[3]*b[i]) | |
507 | addze $acc4,$acc4 | |
508 | ___ | |
509 | $code.=<<___ if ($i<3); | |
510 | ld $bi,8*($i+1)($bp) # b[$i+1] | |
511 | ___ | |
512 | $code.=<<___; | |
513 | addc $acc1,$acc1,$t0 # accumulate high parts of multiplication | |
514 | sldi $t0,$acc0,32 | |
515 | adde $acc2,$acc2,$t1 | |
516 | srdi $t1,$acc0,32 | |
517 | adde $acc3,$acc3,$t2 | |
518 | adde $acc4,$acc4,$t3 |
519 | li $acc5,0 |
520 | addze $acc5,$acc5 |
521 | ___ | |
522 | } | |
523 | $code.=<<___; | |
524 | # last reduction | |
525 | subfc $t2,$t0,$acc0 # "*0xffff0001" | |
526 | subfe $t3,$t1,$acc0 | |
527 | addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] | |
528 | adde $acc1,$acc2,$t1 | |
529 | adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 | |
530 | adde $acc3,$acc4,$t3 | |
531 | addze $acc4,$acc5 | |
532 | ||
533 | li $t2,0 | |
534 | addic $acc0,$acc0,1 # ret -= modulus | |
535 | subfe $acc1,$poly1,$acc1 | |
536 | subfe $acc2,$t2,$acc2 | |
537 | subfe $acc3,$poly3,$acc3 | |
538 | subfe $acc4,$t2,$acc4 | |
539 | ||
540 | addc $acc0,$acc0,$acc4 # ret += modulus if borrow | |
541 | and $t1,$poly1,$acc4 | |
542 | and $t3,$poly3,$acc4 | |
543 | adde $acc1,$acc1,$t1 | |
544 | addze $acc2,$acc2 | |
545 | adde $acc3,$acc3,$t3 | |
546 | ||
547 | std $acc0,0($rp) | |
548 | std $acc1,8($rp) | |
549 | std $acc2,16($rp) | |
550 | std $acc3,24($rp) | |
551 | ||
552 | blr | |
553 | .long 0 | |
554 | .byte 0,12,0x14,0,0,0,1,0 | |
555 | .long 0 | |
556 | .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont | |
557 | ||
558 | # note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded | |
559 | # to $a0-$a3 | |
560 | .type __ecp_nistz256_sqr_mont,\@function | |
561 | .align 4 | |
562 | __ecp_nistz256_sqr_mont: | |
563 | ################################################################ | |
564 | # | | | | | |a1*a0| | | |
565 | # | | | | |a2*a0| | | | |
566 | # | |a3*a2|a3*a0| | | | | |
567 | # | | | |a2*a1| | | | | |
568 | # | | |a3*a1| | | | | | |
569 | # *| | | | | | | | 2| | |
570 | # +|a3*a3|a2*a2|a1*a1|a0*a0| | |
571 | # |--+--+--+--+--+--+--+--| | |
572 | # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx | |
573 | # | |
574 | # The "can't overflow" notes below mark carries into the high part | |
575 | # of a multiplication result, which cannot overflow because that | |
576 | # high part can never be all ones. | |
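# In other words, (a3*2^192 + a2*2^128 + a1*2^64 + a0)^2 is the sum of
# ai*ai*2^(128*i) plus twice the sum of ai*aj*2^(64*(i+j)) for i<j,
# which is why the cross products in acc[1-6] are doubled before the
# squares a[i]*a[i] are folded in below.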
577 | ||
578 | mulld $acc1,$a1,$a0 # a[1]*a[0] | |
579 | mulhdu $t1,$a1,$a0 | |
580 | mulld $acc2,$a2,$a0 # a[2]*a[0] | |
581 | mulhdu $t2,$a2,$a0 | |
582 | mulld $acc3,$a3,$a0 # a[3]*a[0] | |
583 | mulhdu $acc4,$a3,$a0 | |
584 | ||
585 | addc $acc2,$acc2,$t1 # accumulate high parts of multiplication | |
586 | mulld $t0,$a2,$a1 # a[2]*a[1] | |
587 | mulhdu $t1,$a2,$a1 | |
588 | adde $acc3,$acc3,$t2 | |
589 | mulld $t2,$a3,$a1 # a[3]*a[1] | |
590 | mulhdu $t3,$a3,$a1 | |
591 | addze $acc4,$acc4 # can't overflow | |
592 | ||
593 | mulld $acc5,$a3,$a2 # a[3]*a[2] | |
594 | mulhdu $acc6,$a3,$a2 | |
595 | ||
596 | addc $t1,$t1,$t2 # accumulate high parts of multiplication | |
597 | addze $t2,$t3 # can't overflow |
598 | ||
599 | addc $acc3,$acc3,$t0 # accumulate low parts of multiplication | |
600 | adde $acc4,$acc4,$t1 |
601 | adde $acc5,$acc5,$t2 |
602 | addze $acc6,$acc6 # can't overflow |
603 | ||
604 | addc $acc1,$acc1,$acc1 # acc[1-6]*=2 | |
605 | adde $acc2,$acc2,$acc2 |
606 | adde $acc3,$acc3,$acc3 |
607 | adde $acc4,$acc4,$acc4 |
608 | adde $acc5,$acc5,$acc5 |
609 | adde $acc6,$acc6,$acc6 | |
610 | li $acc7,0 | |
611 | addze $acc7,$acc7 | |
612 | ||
613 | mulld $acc0,$a0,$a0 # a[0]*a[0] |
614 | mulhdu $a0,$a0,$a0 | |
615 | mulld $t1,$a1,$a1 # a[1]*a[1] | |
616 | mulhdu $a1,$a1,$a1 | |
617 | mulld $t2,$a2,$a2 # a[2]*a[2] | |
618 | mulhdu $a2,$a2,$a2 | |
619 | mulld $t3,$a3,$a3 # a[3]*a[3] | |
620 | mulhdu $a3,$a3,$a3 | |
621 | addc $acc1,$acc1,$a0 # +a[i]*a[i] |
622 | sldi $t0,$acc0,32 |
623 | adde $acc2,$acc2,$t1 |
624 | srdi $t1,$acc0,32 |
625 | adde $acc3,$acc3,$a1 |
626 | adde $acc4,$acc4,$t2 | |
627 | adde $acc5,$acc5,$a2 | |
628 | adde $acc6,$acc6,$t3 |
629 | adde $acc7,$acc7,$a3 |
630 | ___ | |
631 | for($i=0;$i<3;$i++) { # reductions, see commentary in | |
632 | # multiplication for details | |
633 | $code.=<<___; | |
634 | subfc $t2,$t0,$acc0 # "*0xffff0001" | |
635 | subfe $t3,$t1,$acc0 | |
636 | addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] | |
637 | sldi $t0,$acc0,32 |
638 | adde $acc1,$acc2,$t1 |
639 | srdi $t1,$acc0,32 |
640 | adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 |
641 | addze $acc3,$t3 # can't overflow |
642 | ___ | |
643 | } | |
644 | $code.=<<___; | |
645 | subfc $t2,$t0,$acc0 # "*0xffff0001" | |
646 | subfe $t3,$t1,$acc0 | |
647 | addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] | |
648 | adde $acc1,$acc2,$t1 | |
649 | adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 | |
650 | addze $acc3,$t3 # can't overflow |
651 | ||
652 | addc $acc0,$acc0,$acc4 # accumulate upper half | |
653 | adde $acc1,$acc1,$acc5 | |
654 | adde $acc2,$acc2,$acc6 | |
655 | adde $acc3,$acc3,$acc7 | |
656 | li $t2,0 |
657 | addze $acc4,$t2 |
658 | ||
659 | addic $acc0,$acc0,1 # ret -= modulus | |
660 | subfe $acc1,$poly1,$acc1 | |
661 | subfe $acc2,$t2,$acc2 | |
662 | subfe $acc3,$poly3,$acc3 | |
663 | subfe $acc4,$t2,$acc4 | |
664 | ||
665 | addc $acc0,$acc0,$acc4 # ret += modulus if borrow | |
666 | and $t1,$poly1,$acc4 | |
667 | and $t3,$poly3,$acc4 | |
668 | adde $acc1,$acc1,$t1 | |
669 | addze $acc2,$acc2 | |
670 | adde $acc3,$acc3,$t3 | |
671 | ||
672 | std $acc0,0($rp) | |
673 | std $acc1,8($rp) | |
674 | std $acc2,16($rp) | |
675 | std $acc3,24($rp) | |
676 | ||
677 | blr | |
678 | .long 0 | |
679 | .byte 0,12,0x14,0,0,0,1,0 | |
680 | .long 0 | |
681 | .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont | |
682 | ||
683 | # Note that __ecp_nistz256_add expects both input vectors pre-loaded to | |
684 | # $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple | |
685 | # contexts, e.g. in multiplication by 2 and 3... | |
686 | .type __ecp_nistz256_add,\@function | |
687 | .align 4 | |
688 | __ecp_nistz256_add: | |
689 | addc $acc0,$acc0,$t0 # ret = a+b | |
690 | adde $acc1,$acc1,$t1 | |
691 | adde $acc2,$acc2,$t2 | |
692 | li $t2,0 | |
693 | adde $acc3,$acc3,$t3 | |
694 | addze $t0,$t2 | |
695 | ||
696 | # if a+b >= modulus, subtract modulus | |
697 | # | |
698 | # But since comparison implies subtraction, we subtract | |
699 | # modulus and then add it back if subtraction borrowed. |
700 | |
701 | subic $acc0,$acc0,-1 | |
702 | subfe $acc1,$poly1,$acc1 | |
703 | subfe $acc2,$t2,$acc2 | |
704 | subfe $acc3,$poly3,$acc3 | |
705 | subfe $t0,$t2,$t0 | |
706 | ||
707 | addc $acc0,$acc0,$t0 | |
708 | and $t1,$poly1,$t0 | |
709 | and $t3,$poly3,$t0 | |
710 | adde $acc1,$acc1,$t1 | |
711 | addze $acc2,$acc2 | |
712 | adde $acc3,$acc3,$t3 | |
713 | ||
714 | std $acc0,0($rp) | |
715 | std $acc1,8($rp) | |
716 | std $acc2,16($rp) | |
717 | std $acc3,24($rp) | |
718 | ||
719 | blr | |
720 | .long 0 | |
721 | .byte 0,12,0x14,0,0,0,3,0 | |
722 | .long 0 | |
723 | .size __ecp_nistz256_add,.-__ecp_nistz256_add | |
724 | ||
725 | .type __ecp_nistz256_sub_from,\@function | |
726 | .align 4 | |
727 | __ecp_nistz256_sub_from: | |
728 | ld $t0,0($bp) | |
729 | ld $t1,8($bp) | |
730 | ld $t2,16($bp) | |
731 | ld $t3,24($bp) | |
732 | subfc $acc0,$t0,$acc0 # ret = a-b | |
733 | subfe $acc1,$t1,$acc1 | |
734 | subfe $acc2,$t2,$acc2 | |
735 | subfe $acc3,$t3,$acc3 | |
736 | subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0 | |
737 | ||
738 | # if a-b borrowed, add modulus | |
739 | ||
740 | addc $acc0,$acc0,$t0 # ret += modulus & t0 |
741 | and $t1,$poly1,$t0 | |
742 | and $t3,$poly3,$t0 | |
743 | adde $acc1,$acc1,$t1 | |
744 | addze $acc2,$acc2 | |
745 | adde $acc3,$acc3,$t3 | |
746 | ||
747 | std $acc0,0($rp) | |
748 | std $acc1,8($rp) | |
749 | std $acc2,16($rp) | |
750 | std $acc3,24($rp) | |
751 | ||
752 | blr | |
753 | .long 0 | |
754 | .byte 0,12,0x14,0,0,0,3,0 | |
755 | .long 0 | |
756 | .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from | |
757 | ||
758 | .type __ecp_nistz256_sub_morf,\@function | |
759 | .align 4 | |
760 | __ecp_nistz256_sub_morf: | |
761 | ld $t0,0($bp) | |
762 | ld $t1,8($bp) | |
763 | ld $t2,16($bp) | |
764 | ld $t3,24($bp) | |
765 | subfc $acc0,$acc0,$t0 # ret = b-a | |
766 | subfe $acc1,$acc1,$t1 | |
767 | subfe $acc2,$acc2,$t2 | |
768 | subfe $acc3,$acc3,$t3 | |
769 | subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0 | |
770 | ||
771 | # if b-a borrowed, add modulus | |
772 | ||
773 | addc $acc0,$acc0,$t0 # ret += modulus & t0 |
774 | and $t1,$poly1,$t0 | |
775 | and $t3,$poly3,$t0 | |
776 | adde $acc1,$acc1,$t1 | |
777 | addze $acc2,$acc2 | |
778 | adde $acc3,$acc3,$t3 | |
779 | ||
780 | std $acc0,0($rp) | |
781 | std $acc1,8($rp) | |
782 | std $acc2,16($rp) | |
783 | std $acc3,24($rp) | |
784 | ||
785 | blr | |
786 | .long 0 | |
787 | .byte 0,12,0x14,0,0,0,3,0 | |
788 | .long 0 | |
789 | .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf | |
790 | ||
791 | .type __ecp_nistz256_div_by_2,\@function | |
792 | .align 4 | |
793 | __ecp_nistz256_div_by_2: | |
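# (If a is odd, the odd modulus is added so the sum is even; the carry
# out of that addition is kept in ap and shifted back in as the top bit,
# so the result is (a + (a&1)*modulus)/2, i.e. a/2 mod p.)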
794 | andi. $t0,$acc0,1 | |
795 | addic $acc0,$acc0,-1 # a += modulus | |
796 | neg $t0,$t0 | |
797 | adde $acc1,$acc1,$poly1 | |
798 | not $t0,$t0 | |
799 | addze $acc2,$acc2 | |
800 | li $t2,0 | |
801 | adde $acc3,$acc3,$poly3 | |
802 | and $t1,$poly1,$t0 | |
803 | addze $ap,$t2 # ap = carry | |
804 | and $t3,$poly3,$t0 | |
805 | ||
806 | subfc $acc0,$t0,$acc0 # a -= modulus if a was even | |
807 | subfe $acc1,$t1,$acc1 | |
808 | subfe $acc2,$t2,$acc2 | |
809 | subfe $acc3,$t3,$acc3 | |
810 | subfe $ap, $t2,$ap | |
811 | ||
812 | srdi $acc0,$acc0,1 | |
813 | sldi $t0,$acc1,63 | |
814 | srdi $acc1,$acc1,1 | |
815 | sldi $t1,$acc2,63 | |
816 | srdi $acc2,$acc2,1 | |
817 | sldi $t2,$acc3,63 | |
818 | srdi $acc3,$acc3,1 | |
819 | sldi $t3,$ap,63 | |
820 | or $acc0,$acc0,$t0 | |
821 | or $acc1,$acc1,$t1 | |
822 | or $acc2,$acc2,$t2 | |
823 | or $acc3,$acc3,$t3 | |
824 | ||
825 | std $acc0,0($rp) | |
826 | std $acc1,8($rp) | |
827 | std $acc2,16($rp) | |
828 | std $acc3,24($rp) | |
829 | ||
830 | blr | |
831 | .long 0 | |
832 | .byte 0,12,0x14,0,0,0,1,0 | |
833 | .long 0 | |
834 | .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 | |
835 | ___ | |
836 | ######################################################################## | |
837 | # the following subroutines are "literal" implementations of those found in | |
838 | # ecp_nistz256.c | |
839 | # | |
840 | ######################################################################## | |
841 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); | |
842 | # | |
843 | if (1) { | |
844 | my $FRAME=64+32*4+12*8; | |
845 | my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3)); | |
846 | # above map() describes stack layout with 4 temporary | |
847 | # 256-bit vectors on top. | |
848 | my ($rp_real,$ap_real) = map("r$_",(20,21)); | |
849 | ||
850 | $code.=<<___; | |
851 | .globl ecp_nistz256_point_double | |
852 | .align 5 | |
853 | ecp_nistz256_point_double: | |
854 | stdu $sp,-$FRAME($sp) | |
855 | mflr r0 | |
856 | std r20,$FRAME-8*12($sp) | |
857 | std r21,$FRAME-8*11($sp) | |
858 | std r22,$FRAME-8*10($sp) | |
859 | std r23,$FRAME-8*9($sp) | |
860 | std r24,$FRAME-8*8($sp) | |
861 | std r25,$FRAME-8*7($sp) | |
862 | std r26,$FRAME-8*6($sp) | |
863 | std r27,$FRAME-8*5($sp) | |
864 | std r28,$FRAME-8*4($sp) | |
865 | std r29,$FRAME-8*3($sp) | |
866 | std r30,$FRAME-8*2($sp) | |
867 | std r31,$FRAME-8*1($sp) | |
868 | ||
869 | li $poly1,-1 | |
870 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
871 | li $poly3,1 | |
872 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
873 | .Ldouble_shortcut: | |
874 | ld $acc0,32($ap) | |
875 | ld $acc1,40($ap) | |
876 | ld $acc2,48($ap) | |
877 | ld $acc3,56($ap) | |
878 | mr $t0,$acc0 | |
879 | mr $t1,$acc1 | |
880 | mr $t2,$acc2 | |
881 | mr $t3,$acc3 | |
882 | ld $a0,64($ap) # forward load for p256_sqr_mont | |
883 | ld $a1,72($ap) | |
884 | ld $a2,80($ap) | |
885 | ld $a3,88($ap) | |
886 | mr $rp_real,$rp | |
887 | mr $ap_real,$ap | |
888 | addi $rp,$sp,$S | |
889 | bl __ecp_nistz256_add # p256_mul_by_2(S, in_y); | |
890 | ||
891 | addi $rp,$sp,$Zsqr | |
892 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z); | |
893 | ||
894 | ld $t0,0($ap_real) | |
895 | ld $t1,8($ap_real) | |
896 | ld $t2,16($ap_real) | |
897 | ld $t3,24($ap_real) | |
898 | mr $a0,$acc0 # put Zsqr aside for p256_sub | |
899 | mr $a1,$acc1 | |
900 | mr $a2,$acc2 | |
901 | mr $a3,$acc3 | |
902 | addi $rp,$sp,$M | |
903 | bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x); | |
904 | ||
905 | addi $bp,$ap_real,0 | |
906 | mr $acc0,$a0 # restore Zsqr | |
907 | mr $acc1,$a1 | |
908 | mr $acc2,$a2 | |
909 | mr $acc3,$a3 | |
910 | ld $a0,$S+0($sp) # forward load for p256_sqr_mont | |
911 | ld $a1,$S+8($sp) | |
912 | ld $a2,$S+16($sp) | |
913 | ld $a3,$S+24($sp) | |
914 | addi $rp,$sp,$Zsqr | |
915 | bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr); | |
916 | ||
917 | addi $rp,$sp,$S | |
918 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S); | |
919 | ||
920 | ld $bi,32($ap_real) | |
921 | ld $a0,64($ap_real) | |
922 | ld $a1,72($ap_real) | |
923 | ld $a2,80($ap_real) | |
924 | ld $a3,88($ap_real) | |
925 | addi $bp,$ap_real,32 | |
926 | addi $rp,$sp,$tmp0 | |
927 | bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y); | |
928 | ||
929 | mr $t0,$acc0 | |
930 | mr $t1,$acc1 | |
931 | mr $t2,$acc2 | |
932 | mr $t3,$acc3 | |
933 | ld $a0,$S+0($sp) # forward load for p256_sqr_mont | |
934 | ld $a1,$S+8($sp) | |
935 | ld $a2,$S+16($sp) | |
936 | ld $a3,$S+24($sp) | |
937 | addi $rp,$rp_real,64 | |
938 | bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0); | |
939 | ||
940 | addi $rp,$sp,$tmp0 | |
941 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S); | |
942 | ||
943 | ld $bi,$Zsqr($sp) # forward load for p256_mul_mont | |
944 | ld $a0,$M+0($sp) | |
945 | ld $a1,$M+8($sp) | |
946 | ld $a2,$M+16($sp) | |
947 | ld $a3,$M+24($sp) | |
948 | addi $rp,$rp_real,32 | |
949 | bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0); | |
950 | ||
951 | addi $bp,$sp,$Zsqr | |
952 | addi $rp,$sp,$M | |
953 | bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr); | |
954 | ||
955 | mr $t0,$acc0 # duplicate M | |
956 | mr $t1,$acc1 | |
957 | mr $t2,$acc2 | |
958 | mr $t3,$acc3 | |
959 | mr $a0,$acc0 # put M aside | |
960 | mr $a1,$acc1 | |
961 | mr $a2,$acc2 | |
962 | mr $a3,$acc3 | |
963 | addi $rp,$sp,$M | |
964 | bl __ecp_nistz256_add | |
965 | mr $t0,$a0 # restore M | |
966 | mr $t1,$a1 | |
967 | mr $t2,$a2 | |
968 | mr $t3,$a3 | |
969 | ld $bi,0($ap_real) # forward load for p256_mul_mont | |
970 | ld $a0,$S+0($sp) | |
971 | ld $a1,$S+8($sp) | |
972 | ld $a2,$S+16($sp) | |
973 | ld $a3,$S+24($sp) | |
974 | bl __ecp_nistz256_add # p256_mul_by_3(M, M); | |
975 | ||
976 | addi $bp,$ap_real,0 | |
977 | addi $rp,$sp,$S | |
978 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x); | |
979 | ||
980 | mr $t0,$acc0 | |
981 | mr $t1,$acc1 | |
982 | mr $t2,$acc2 | |
983 | mr $t3,$acc3 | |
984 | ld $a0,$M+0($sp) # forward load for p256_sqr_mont | |
985 | ld $a1,$M+8($sp) | |
986 | ld $a2,$M+16($sp) | |
987 | ld $a3,$M+24($sp) | |
988 | addi $rp,$sp,$tmp0 | |
989 | bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S); | |
990 | ||
991 | addi $rp,$rp_real,0 | |
992 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M); | |
993 | ||
994 | addi $bp,$sp,$tmp0 | |
995 | bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0); | |
996 | ||
997 | addi $bp,$sp,$S | |
998 | addi $rp,$sp,$S | |
999 | bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x); | |
1000 | ||
1001 | ld $bi,$M($sp) | |
1002 | mr $a0,$acc0 # copy S | |
1003 | mr $a1,$acc1 | |
1004 | mr $a2,$acc2 | |
1005 | mr $a3,$acc3 | |
1006 | addi $bp,$sp,$M | |
1007 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M); | |
1008 | ||
1009 | addi $bp,$rp_real,32 | |
1010 | addi $rp,$rp_real,32 | |
1011 | bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y); | |
1012 | ||
1013 | mtlr r0 | |
1014 | ld r20,$FRAME-8*12($sp) | |
1015 | ld r21,$FRAME-8*11($sp) | |
1016 | ld r22,$FRAME-8*10($sp) | |
1017 | ld r23,$FRAME-8*9($sp) | |
1018 | ld r24,$FRAME-8*8($sp) | |
1019 | ld r25,$FRAME-8*7($sp) | |
1020 | ld r26,$FRAME-8*6($sp) | |
1021 | ld r27,$FRAME-8*5($sp) | |
1022 | ld r28,$FRAME-8*4($sp) | |
1023 | ld r29,$FRAME-8*3($sp) | |
1024 | ld r30,$FRAME-8*2($sp) | |
1025 | ld r31,$FRAME-8*1($sp) | |
1026 | addi $sp,$sp,$FRAME | |
1027 | blr | |
1028 | .long 0 | |
1029 | .byte 0,12,4,0,0x80,12,2,0 | |
1030 | .long 0 | |
1031 | .size ecp_nistz256_point_double,.-ecp_nistz256_point_double | |
1032 | ___ | |
1033 | } | |
1034 | ||
1035 | ######################################################################## | |
1036 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, | |
1037 | # const P256_POINT *in2); | |
1038 | if (1) { | |
1039 | my $FRAME = 64 + 32*12 + 16*8; | |
1040 | my ($res_x,$res_y,$res_z, | |
1041 | $H,$Hsqr,$R,$Rsqr,$Hcub, | |
1042 | $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11)); | |
1043 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); | |
1044 | # above map() describes stack layout with 12 temporary | |
1045 | # 256-bit vectors on top. | |
1046 | my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21)); | |
1047 | ||
1048 | $code.=<<___; | |
1049 | .globl ecp_nistz256_point_add | |
1050 | .align 5 | |
1051 | ecp_nistz256_point_add: | |
1052 | stdu $sp,-$FRAME($sp) | |
1053 | mflr r0 | |
1054 | std r16,$FRAME-8*16($sp) | |
1055 | std r17,$FRAME-8*15($sp) | |
1056 | std r18,$FRAME-8*14($sp) | |
1057 | std r19,$FRAME-8*13($sp) | |
1058 | std r20,$FRAME-8*12($sp) | |
1059 | std r21,$FRAME-8*11($sp) | |
1060 | std r22,$FRAME-8*10($sp) | |
1061 | std r23,$FRAME-8*9($sp) | |
1062 | std r24,$FRAME-8*8($sp) | |
1063 | std r25,$FRAME-8*7($sp) | |
1064 | std r26,$FRAME-8*6($sp) | |
1065 | std r27,$FRAME-8*5($sp) | |
1066 | std r28,$FRAME-8*4($sp) | |
1067 | std r29,$FRAME-8*3($sp) | |
1068 | std r30,$FRAME-8*2($sp) | |
1069 | std r31,$FRAME-8*1($sp) | |
1070 | ||
1071 | li $poly1,-1 | |
1072 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
1073 | li $poly3,1 | |
1074 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
1075 | ||
1076 | ld $a0,64($bp) # in2_z | |
1077 | ld $a1,72($bp) | |
1078 | ld $a2,80($bp) | |
1079 | ld $a3,88($bp) | |
1080 | mr $rp_real,$rp | |
1081 | mr $ap_real,$ap | |
1082 | mr $bp_real,$bp | |
1083 | or $t0,$a0,$a1 | |
1084 | or $t2,$a2,$a3 | |
1085 | or $in2infty,$t0,$t2 | |
1086 | neg $t0,$in2infty | |
1087 | or $in2infty,$in2infty,$t0 | |
1088 | sradi $in2infty,$in2infty,63 # !in2infty | |
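# (in2_z | -in2_z has its sign bit set exactly when in2_z is non-zero,
# so after the arithmetic shift in2infty is an all-ones mask for a
# finite point and zero for the point at infinity; hence "!in2infty".)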
1089 | addi $rp,$sp,$Z2sqr | |
1090 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z); | |
1091 | ||
1092 | ld $a0,64($ap_real) # in1_z | |
1093 | ld $a1,72($ap_real) | |
1094 | ld $a2,80($ap_real) | |
1095 | ld $a3,88($ap_real) | |
1096 | or $t0,$a0,$a1 | |
1097 | or $t2,$a2,$a3 | |
1098 | or $in1infty,$t0,$t2 | |
1099 | neg $t0,$in1infty | |
1100 | or $in1infty,$in1infty,$t0 | |
1101 | sradi $in1infty,$in1infty,63 # !in1infty | |
1102 | addi $rp,$sp,$Z1sqr | |
1103 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z); | |
1104 | ||
1105 | ld $bi,64($bp_real) | |
1106 | ld $a0,$Z2sqr+0($sp) | |
1107 | ld $a1,$Z2sqr+8($sp) | |
1108 | ld $a2,$Z2sqr+16($sp) | |
1109 | ld $a3,$Z2sqr+24($sp) | |
1110 | addi $bp,$bp_real,64 | |
1111 | addi $rp,$sp,$S1 | |
1112 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z); | |
1113 | ||
1114 | ld $bi,64($ap_real) | |
1115 | ld $a0,$Z1sqr+0($sp) | |
1116 | ld $a1,$Z1sqr+8($sp) | |
1117 | ld $a2,$Z1sqr+16($sp) | |
1118 | ld $a3,$Z1sqr+24($sp) | |
1119 | addi $bp,$ap_real,64 | |
1120 | addi $rp,$sp,$S2 | |
1121 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z); | |
1122 | ||
1123 | ld $bi,32($ap_real) | |
1124 | ld $a0,$S1+0($sp) | |
1125 | ld $a1,$S1+8($sp) | |
1126 | ld $a2,$S1+16($sp) | |
1127 | ld $a3,$S1+24($sp) | |
1128 | addi $bp,$ap_real,32 | |
1129 | addi $rp,$sp,$S1 | |
1130 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y); | |
1131 | ||
1132 | ld $bi,32($bp_real) | |
1133 | ld $a0,$S2+0($sp) | |
1134 | ld $a1,$S2+8($sp) | |
1135 | ld $a2,$S2+16($sp) | |
1136 | ld $a3,$S2+24($sp) | |
1137 | addi $bp,$bp_real,32 | |
1138 | addi $rp,$sp,$S2 | |
1139 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y); | |
1140 | ||
1141 | addi $bp,$sp,$S1 | |
1142 | ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont | |
1143 | ld $a0,0($ap_real) | |
1144 | ld $a1,8($ap_real) | |
1145 | ld $a2,16($ap_real) | |
1146 | ld $a3,24($ap_real) | |
1147 | addi $rp,$sp,$R | |
1148 | bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1); | |
1149 | ||
1150 | or $acc0,$acc0,$acc1 # see if result is zero | |
1151 | or $acc2,$acc2,$acc3 | |
1152 | or $temp,$acc0,$acc2 | |
1153 | ||
1154 | addi $bp,$sp,$Z2sqr | |
1155 | addi $rp,$sp,$U1 | |
1156 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr); | |
1157 | ||
1158 | ld $bi,$Z1sqr($sp) | |
1159 | ld $a0,0($bp_real) | |
1160 | ld $a1,8($bp_real) | |
1161 | ld $a2,16($bp_real) | |
1162 | ld $a3,24($bp_real) | |
1163 | addi $bp,$sp,$Z1sqr | |
1164 | addi $rp,$sp,$U2 | |
1165 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr); | |
1166 | ||
1167 | addi $bp,$sp,$U1 | |
1168 | ld $a0,$R+0($sp) # forward load for p256_sqr_mont | |
1169 | ld $a1,$R+8($sp) | |
1170 | ld $a2,$R+16($sp) | |
1171 | ld $a3,$R+24($sp) | |
1172 | addi $rp,$sp,$H | |
1173 | bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1); | |
1174 | ||
1175 | or $acc0,$acc0,$acc1 # see if result is zero | |
1176 | or $acc2,$acc2,$acc3 | |
1177 | or. $acc0,$acc0,$acc2 | |
1178 | bne .Ladd_proceed # is_equal(U1,U2)? | |
1179 | ||
1180 | and. $t0,$in1infty,$in2infty | |
1181 | beq .Ladd_proceed # (in1infty || in2infty)? | |
1182 | ||
1183 | cmpldi $temp,0 | |
1184 | beq .Ladd_double # is_equal(S1,S2)? | |
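# (Corner case: U1==U2 with both inputs finite. If additionally S1==S2
# the two inputs are the same point, so branch into the doubling code;
# otherwise they are each other's negation and the result is the point
# at infinity, stored as all zeros below.)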
1185 | ||
1186 | xor $a0,$a0,$a0 | |
1187 | std $a0,0($rp_real) | |
1188 | std $a0,8($rp_real) | |
1189 | std $a0,16($rp_real) | |
1190 | std $a0,24($rp_real) | |
1191 | std $a0,32($rp_real) | |
1192 | std $a0,40($rp_real) | |
1193 | std $a0,48($rp_real) | |
1194 | std $a0,56($rp_real) | |
1195 | std $a0,64($rp_real) | |
1196 | std $a0,72($rp_real) | |
1197 | std $a0,80($rp_real) | |
1198 | std $a0,88($rp_real) | |
1199 | b .Ladd_done | |
1200 | ||
1201 | .align 4 | |
1202 | .Ladd_double: | |
1203 | ld $bp,0($sp) # back-link | |
1204 | mr $ap,$ap_real | |
1205 | mr $rp,$rp_real | |
1206 | ld r16,$FRAME-8*16($sp) | |
1207 | ld r17,$FRAME-8*15($sp) | |
1208 | ld r18,$FRAME-8*14($sp) | |
1209 | ld r19,$FRAME-8*13($sp) | |
1210 | stdu $bp,$FRAME-288($sp) # difference in stack frame sizes | |
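# (288 = 64+32*4+12*8 is point_double's frame size; the stack pointer is
# moved up by the difference so that a frame of exactly that size, with
# the back-link re-stored, is in place before falling into
# .Ldouble_shortcut.)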
1211 | b .Ldouble_shortcut | |
1212 | ||
1213 | .align 4 | |
1214 | .Ladd_proceed: | |
1215 | addi $rp,$sp,$Rsqr | |
1216 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R); | |
1217 | ||
1218 | ld $bi,64($ap_real) | |
1219 | ld $a0,$H+0($sp) | |
1220 | ld $a1,$H+8($sp) | |
1221 | ld $a2,$H+16($sp) | |
1222 | ld $a3,$H+24($sp) | |
1223 | addi $bp,$ap_real,64 | |
1224 | addi $rp,$sp,$res_z | |
1225 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z); | |
1226 | ||
1227 | ld $a0,$H+0($sp) | |
1228 | ld $a1,$H+8($sp) | |
1229 | ld $a2,$H+16($sp) | |
1230 | ld $a3,$H+24($sp) | |
1231 | addi $rp,$sp,$Hsqr | |
1232 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H); | |
1233 | ||
1234 | ld $bi,64($bp_real) | |
1235 | ld $a0,$res_z+0($sp) | |
1236 | ld $a1,$res_z+8($sp) | |
1237 | ld $a2,$res_z+16($sp) | |
1238 | ld $a3,$res_z+24($sp) | |
1239 | addi $bp,$bp_real,64 | |
1240 | addi $rp,$sp,$res_z | |
1241 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z); | |
1242 | ||
1243 | ld $bi,$H($sp) | |
1244 | ld $a0,$Hsqr+0($sp) | |
1245 | ld $a1,$Hsqr+8($sp) | |
1246 | ld $a2,$Hsqr+16($sp) | |
1247 | ld $a3,$Hsqr+24($sp) | |
1248 | addi $bp,$sp,$H | |
1249 | addi $rp,$sp,$Hcub | |
1250 | bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H); | |
1251 | ||
1252 | ld $bi,$Hsqr($sp) | |
1253 | ld $a0,$U1+0($sp) | |
1254 | ld $a1,$U1+8($sp) | |
1255 | ld $a2,$U1+16($sp) | |
1256 | ld $a3,$U1+24($sp) | |
1257 | addi $bp,$sp,$Hsqr | |
1258 | addi $rp,$sp,$U2 | |
1259 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr); | |
1260 | ||
1261 | mr $t0,$acc0 | |
1262 | mr $t1,$acc1 | |
1263 | mr $t2,$acc2 | |
1264 | mr $t3,$acc3 | |
1265 | addi $rp,$sp,$Hsqr | |
1266 | bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2); | |
1267 | ||
1268 | addi $bp,$sp,$Rsqr | |
1269 | addi $rp,$sp,$res_x | |
1270 | bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr); | |
1271 | ||
1272 | addi $bp,$sp,$Hcub | |
1273 | bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub); | |
1274 | ||
1275 | addi $bp,$sp,$U2 | |
1276 | ld $bi,$Hcub($sp) # forward load for p256_mul_mont | |
1277 | ld $a0,$S1+0($sp) | |
1278 | ld $a1,$S1+8($sp) | |
1279 | ld $a2,$S1+16($sp) | |
1280 | ld $a3,$S1+24($sp) | |
1281 | addi $rp,$sp,$res_y | |
1282 | bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x); | |
1283 | ||
1284 | addi $bp,$sp,$Hcub | |
1285 | addi $rp,$sp,$S2 | |
1286 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub); | |
1287 | ||
1288 | ld $bi,$R($sp) | |
1289 | ld $a0,$res_y+0($sp) | |
1290 | ld $a1,$res_y+8($sp) | |
1291 | ld $a2,$res_y+16($sp) | |
1292 | ld $a3,$res_y+24($sp) | |
1293 | addi $bp,$sp,$R | |
1294 | addi $rp,$sp,$res_y | |
1295 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R); | |
1296 | ||
1297 | addi $bp,$sp,$S2 | |
1298 | bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2); | |
1299 | ||
1300 | ld $t0,0($bp_real) # in2 | |
1301 | ld $t1,8($bp_real) | |
1302 | ld $t2,16($bp_real) | |
1303 | ld $t3,24($bp_real) | |
1304 | ld $a0,$res_x+0($sp) # res | |
1305 | ld $a1,$res_x+8($sp) | |
1306 | ld $a2,$res_x+16($sp) | |
1307 | ld $a3,$res_x+24($sp) | |
1308 | ___ | |
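# The and/andc/or sequences below perform a branch-free select:
# $in1infty/$in2infty are all-ones "not infinity" masks, so the output is
# in2 when in1 is the point at infinity, in1 when in2 is, and the freshly
# computed result otherwise.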
1309 | for($i=0;$i<64;$i+=32) { # conditional moves | |
1310 | $code.=<<___; | |
1311 | ld $acc0,$i+0($ap_real) # in1 | |
1312 | ld $acc1,$i+8($ap_real) | |
1313 | ld $acc2,$i+16($ap_real) | |
1314 | ld $acc3,$i+24($ap_real) | |
1315 | andc $t0,$t0,$in1infty | |
1316 | andc $t1,$t1,$in1infty | |
1317 | andc $t2,$t2,$in1infty | |
1318 | andc $t3,$t3,$in1infty | |
1319 | and $a0,$a0,$in1infty | |
1320 | and $a1,$a1,$in1infty | |
1321 | and $a2,$a2,$in1infty | |
1322 | and $a3,$a3,$in1infty | |
1323 | or $t0,$t0,$a0 | |
1324 | or $t1,$t1,$a1 | |
1325 | or $t2,$t2,$a2 | |
1326 | or $t3,$t3,$a3 | |
1327 | andc $acc0,$acc0,$in2infty | |
1328 | andc $acc1,$acc1,$in2infty | |
1329 | andc $acc2,$acc2,$in2infty | |
1330 | andc $acc3,$acc3,$in2infty | |
1331 | and $t0,$t0,$in2infty | |
1332 | and $t1,$t1,$in2infty | |
1333 | and $t2,$t2,$in2infty | |
1334 | and $t3,$t3,$in2infty | |
1335 | or $acc0,$acc0,$t0 | |
1336 | or $acc1,$acc1,$t1 | |
1337 | or $acc2,$acc2,$t2 | |
1338 | or $acc3,$acc3,$t3 | |
1339 | ||
1340 | ld $t0,$i+32($bp_real) # in2 | |
1341 | ld $t1,$i+40($bp_real) | |
1342 | ld $t2,$i+48($bp_real) | |
1343 | ld $t3,$i+56($bp_real) | |
1344 | ld $a0,$res_x+$i+32($sp) | |
1345 | ld $a1,$res_x+$i+40($sp) | |
1346 | ld $a2,$res_x+$i+48($sp) | |
1347 | ld $a3,$res_x+$i+56($sp) | |
1348 | std $acc0,$i+0($rp_real) | |
1349 | std $acc1,$i+8($rp_real) | |
1350 | std $acc2,$i+16($rp_real) | |
1351 | std $acc3,$i+24($rp_real) | |
1352 | ___ | |
1353 | } | |
1354 | $code.=<<___; | |
1355 | ld $acc0,$i+0($ap_real) # in1 | |
1356 | ld $acc1,$i+8($ap_real) | |
1357 | ld $acc2,$i+16($ap_real) | |
1358 | ld $acc3,$i+24($ap_real) | |
1359 | andc $t0,$t0,$in1infty | |
1360 | andc $t1,$t1,$in1infty | |
1361 | andc $t2,$t2,$in1infty | |
1362 | andc $t3,$t3,$in1infty | |
1363 | and $a0,$a0,$in1infty | |
1364 | and $a1,$a1,$in1infty | |
1365 | and $a2,$a2,$in1infty | |
1366 | and $a3,$a3,$in1infty | |
1367 | or $t0,$t0,$a0 | |
1368 | or $t1,$t1,$a1 | |
1369 | or $t2,$t2,$a2 | |
1370 | or $t3,$t3,$a3 | |
1371 | andc $acc0,$acc0,$in2infty | |
1372 | andc $acc1,$acc1,$in2infty | |
1373 | andc $acc2,$acc2,$in2infty | |
1374 | andc $acc3,$acc3,$in2infty | |
1375 | and $t0,$t0,$in2infty | |
1376 | and $t1,$t1,$in2infty | |
1377 | and $t2,$t2,$in2infty | |
1378 | and $t3,$t3,$in2infty | |
1379 | or $acc0,$acc0,$t0 | |
1380 | or $acc1,$acc1,$t1 | |
1381 | or $acc2,$acc2,$t2 | |
1382 | or $acc3,$acc3,$t3 | |
1383 | std $acc0,$i+0($rp_real) | |
1384 | std $acc1,$i+8($rp_real) | |
1385 | std $acc2,$i+16($rp_real) | |
1386 | std $acc3,$i+24($rp_real) | |
1387 | ||
1388 | .Ladd_done: | |
1389 | mtlr r0 | |
1390 | ld r16,$FRAME-8*16($sp) | |
1391 | ld r17,$FRAME-8*15($sp) | |
1392 | ld r18,$FRAME-8*14($sp) | |
1393 | ld r19,$FRAME-8*13($sp) | |
1394 | ld r20,$FRAME-8*12($sp) | |
1395 | ld r21,$FRAME-8*11($sp) | |
1396 | ld r22,$FRAME-8*10($sp) | |
1397 | ld r23,$FRAME-8*9($sp) | |
1398 | ld r24,$FRAME-8*8($sp) | |
1399 | ld r25,$FRAME-8*7($sp) | |
1400 | ld r26,$FRAME-8*6($sp) | |
1401 | ld r27,$FRAME-8*5($sp) | |
1402 | ld r28,$FRAME-8*4($sp) | |
1403 | ld r29,$FRAME-8*3($sp) | |
1404 | ld r30,$FRAME-8*2($sp) | |
1405 | ld r31,$FRAME-8*1($sp) | |
1406 | addi $sp,$sp,$FRAME | |
1407 | blr | |
1408 | .long 0 | |
1409 | .byte 0,12,4,0,0x80,16,3,0 | |
1410 | .long 0 | |
1411 | .size ecp_nistz256_point_add,.-ecp_nistz256_point_add | |
1412 | ___ | |
1413 | } | |
1414 | ||
1415 | ######################################################################## | |
1416 | # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, | |
1417 | # const P256_POINT_AFFINE *in2); | |
1418 | if (1) { | |
1419 | my $FRAME = 64 + 32*10 + 16*8; | |
1420 | my ($res_x,$res_y,$res_z, | |
1421 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9)); | |
1422 | my $Z1sqr = $S2; | |
1423 | # above map() describes stack layout with 10 temporary | |
1424 | # 256-bit vectors on top. | |
1425 | my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21)); | |
1426 | ||
1427 | $code.=<<___; | |
1428 | .globl ecp_nistz256_point_add_affine | |
1429 | .align 5 | |
1430 | ecp_nistz256_point_add_affine: | |
1431 | stdu $sp,-$FRAME($sp) | |
1432 | mflr r0 | |
1433 | std r16,$FRAME-8*16($sp) | |
1434 | std r17,$FRAME-8*15($sp) | |
1435 | std r18,$FRAME-8*14($sp) | |
1436 | std r19,$FRAME-8*13($sp) | |
1437 | std r20,$FRAME-8*12($sp) | |
1438 | std r21,$FRAME-8*11($sp) | |
1439 | std r22,$FRAME-8*10($sp) | |
1440 | std r23,$FRAME-8*9($sp) | |
1441 | std r24,$FRAME-8*8($sp) | |
1442 | std r25,$FRAME-8*7($sp) | |
1443 | std r26,$FRAME-8*6($sp) | |
1444 | std r27,$FRAME-8*5($sp) | |
1445 | std r28,$FRAME-8*4($sp) | |
1446 | std r29,$FRAME-8*3($sp) | |
1447 | std r30,$FRAME-8*2($sp) | |
1448 | std r31,$FRAME-8*1($sp) | |
1449 | ||
1450 | li $poly1,-1 | |
1451 | srdi $poly1,$poly1,32 # 0x00000000ffffffff | |
1452 | li $poly3,1 | |
1453 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001 | |
1454 | ||
1455 | mr $rp_real,$rp | |
1456 | mr $ap_real,$ap | |
1457 | mr $bp_real,$bp | |
1458 | ||
1459 | ld $a0,64($ap) # in1_z | |
1460 | ld $a1,72($ap) | |
1461 | ld $a2,80($ap) | |
1462 | ld $a3,88($ap) | |
1463 | or $t0,$a0,$a1 | |
1464 | or $t2,$a2,$a3 | |
1465 | or $in1infty,$t0,$t2 | |
1466 | neg $t0,$in1infty | |
1467 | or $in1infty,$in1infty,$t0 | |
1468 | sradi $in1infty,$in1infty,63 # !in1infty | |
1469 | ||
1470 | ld $acc0,0($bp) # in2_x | |
1471 | ld $acc1,8($bp) | |
1472 | ld $acc2,16($bp) | |
1473 | ld $acc3,24($bp) | |
1474 | ld $t0,32($bp) # in2_y | |
1475 | ld $t1,40($bp) | |
1476 | ld $t2,48($bp) | |
1477 | ld $t3,56($bp) | |
1478 | or $acc0,$acc0,$acc1 | |
1479 | or $acc2,$acc2,$acc3 | |
1480 | or $acc0,$acc0,$acc2 | |
1481 | or $t0,$t0,$t1 | |
1482 | or $t2,$t2,$t3 | |
1483 | or $t0,$t0,$t2 | |
1484 | or $in2infty,$acc0,$t0 | |
1485 | neg $t0,$in2infty | |
1486 | or $in2infty,$in2infty,$t0 | |
1487 | sradi $in2infty,$in2infty,63 # !in2infty | |
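# (the affine in2 carries no z coordinate; it is taken to be the point
# at infinity exactly when both its x and y are zero, hence the test on
# all eight limbs above)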
1488 | ||
1489 | addi $rp,$sp,$Z1sqr | |
1490 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z); | |
1491 | ||
1492 | mr $a0,$acc0 | |
1493 | mr $a1,$acc1 | |
1494 | mr $a2,$acc2 | |
1495 | mr $a3,$acc3 | |
1496 | ld $bi,0($bp_real) | |
1497 | addi $bp,$bp_real,0 | |
1498 | addi $rp,$sp,$U2 | |
1499 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x); | |
1500 | ||
1501 | addi $bp,$ap_real,0 | |
1502 | ld $bi,64($ap_real) # forward load for p256_mul_mont | |
1503 | ld $a0,$Z1sqr+0($sp) | |
1504 | ld $a1,$Z1sqr+8($sp) | |
1505 | ld $a2,$Z1sqr+16($sp) | |
1506 | ld $a3,$Z1sqr+24($sp) | |
1507 | addi $rp,$sp,$H | |
1508 | bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x); | |
1509 | ||
1510 | addi $bp,$ap_real,64 | |
1511 | addi $rp,$sp,$S2 | |
1512 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z); | |
1513 | ||
1514 | ld $bi,64($ap_real) | |
1515 | ld $a0,$H+0($sp) | |
1516 | ld $a1,$H+8($sp) | |
1517 | ld $a2,$H+16($sp) | |
1518 | ld $a3,$H+24($sp) | |
1519 | addi $bp,$ap_real,64 | |
1520 | addi $rp,$sp,$res_z | |
1521 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z); | |
1522 | ||
1523 | ld $bi,32($bp_real) | |
1524 | ld $a0,$S2+0($sp) | |
1525 | ld $a1,$S2+8($sp) | |
1526 | ld $a2,$S2+16($sp) | |
1527 | ld $a3,$S2+24($sp) | |
1528 | addi $bp,$bp_real,32 | |
1529 | addi $rp,$sp,$S2 | |
1530 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y); | |
1531 | ||
1532 | addi $bp,$ap_real,32 | |
1533 | ld $a0,$H+0($sp) # forward load for p256_sqr_mont | |
1534 | ld $a1,$H+8($sp) | |
1535 | ld $a2,$H+16($sp) | |
1536 | ld $a3,$H+24($sp) | |
1537 | addi $rp,$sp,$R | |
1538 | bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y); | |
1539 | ||
1540 | addi $rp,$sp,$Hsqr | |
1541 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H); | |
1542 | ||
1543 | ld $a0,$R+0($sp) | |
1544 | ld $a1,$R+8($sp) | |
1545 | ld $a2,$R+16($sp) | |
1546 | ld $a3,$R+24($sp) | |
1547 | addi $rp,$sp,$Rsqr | |
1548 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R); | |
1549 | ||
1550 | ld $bi,$H($sp) | |
1551 | ld $a0,$Hsqr+0($sp) | |
1552 | ld $a1,$Hsqr+8($sp) | |
1553 | ld $a2,$Hsqr+16($sp) | |
1554 | ld $a3,$Hsqr+24($sp) | |
1555 | addi $bp,$sp,$H | |
1556 | addi $rp,$sp,$Hcub | |
1557 | bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H); | |
1558 | ||
1559 | ld $bi,0($ap_real) | |
1560 | ld $a0,$Hsqr+0($sp) | |
1561 | ld $a1,$Hsqr+8($sp) | |
1562 | ld $a2,$Hsqr+16($sp) | |
1563 | ld $a3,$Hsqr+24($sp) | |
1564 | addi $bp,$ap_real,0 | |
1565 | addi $rp,$sp,$U2 | |
1566 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr); | |
1567 | ||
1568 | mr $t0,$acc0 | |
1569 | mr $t1,$acc1 | |
1570 | mr $t2,$acc2 | |
1571 | mr $t3,$acc3 | |
1572 | addi $rp,$sp,$Hsqr | |
1573 | bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2); | |
1574 | ||
1575 | addi $bp,$sp,$Rsqr | |
1576 | addi $rp,$sp,$res_x | |
1577 | bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr); | |
1578 | ||
1579 | addi $bp,$sp,$Hcub | |
1580 | bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub); | |
1581 | ||
1582 | addi $bp,$sp,$U2 | |
1583 | ld $bi,32($ap_real) # forward load for p256_mul_mont | |
1584 | ld $a0,$Hcub+0($sp) | |
1585 | ld $a1,$Hcub+8($sp) | |
1586 | ld $a2,$Hcub+16($sp) | |
1587 | ld $a3,$Hcub+24($sp) | |
1588 | addi $rp,$sp,$res_y | |
1589 | bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x); | |
1590 | ||
1591 | addi $bp,$ap_real,32 | |
1592 | addi $rp,$sp,$S2 | |
1593 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub); | |
1594 | ||
1595 | ld $bi,$R($sp) | |
1596 | ld $a0,$res_y+0($sp) | |
1597 | ld $a1,$res_y+8($sp) | |
1598 | ld $a2,$res_y+16($sp) | |
1599 | ld $a3,$res_y+24($sp) | |
1600 | addi $bp,$sp,$R | |
1601 | addi $rp,$sp,$res_y | |
1602 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R); | |
1603 | ||
1604 | addi $bp,$sp,$S2 | |
1605 | bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2); | |
1606 | ||
1607 | ld $t0,0($bp_real) # in2 | |
1608 | ld $t1,8($bp_real) | |
1609 | ld $t2,16($bp_real) | |
1610 | ld $t3,24($bp_real) | |
1611 | ld $a0,$res_x+0($sp) # res | |
1612 | ld $a1,$res_x+8($sp) | |
1613 | ld $a2,$res_x+16($sp) | |
1614 | ld $a3,$res_x+24($sp) | |
1615 | ___ | |
1616 | for($i=0;$i<64;$i+=32) { # conditional moves | |
1617 | $code.=<<___; | |
1618 | ld $acc0,$i+0($ap_real) # in1 | |
1619 | ld $acc1,$i+8($ap_real) | |
1620 | ld $acc2,$i+16($ap_real) | |
1621 | ld $acc3,$i+24($ap_real) | |
1622 | andc $t0,$t0,$in1infty | |
1623 | andc $t1,$t1,$in1infty | |
1624 | andc $t2,$t2,$in1infty | |
1625 | andc $t3,$t3,$in1infty | |
1626 | and $a0,$a0,$in1infty | |
1627 | and $a1,$a1,$in1infty | |
1628 | and $a2,$a2,$in1infty | |
1629 | and $a3,$a3,$in1infty | |
1630 | or $t0,$t0,$a0 | |
1631 | or $t1,$t1,$a1 | |
1632 | or $t2,$t2,$a2 | |
1633 | or $t3,$t3,$a3 | |
1634 | andc $acc0,$acc0,$in2infty | |
1635 | andc $acc1,$acc1,$in2infty | |
1636 | andc $acc2,$acc2,$in2infty | |
1637 | andc $acc3,$acc3,$in2infty | |
1638 | and $t0,$t0,$in2infty | |
1639 | and $t1,$t1,$in2infty | |
1640 | and $t2,$t2,$in2infty | |
1641 | and $t3,$t3,$in2infty | |
1642 | or $acc0,$acc0,$t0 | |
1643 | or $acc1,$acc1,$t1 | |
1644 | or $acc2,$acc2,$t2 | |
1645 | or $acc3,$acc3,$t3 | |
1646 | ___ | |
1647 | $code.=<<___ if ($i==0); | |
1648 | ld $t0,32($bp_real) # in2 | |
1649 | ld $t1,40($bp_real) | |
1650 | ld $t2,48($bp_real) | |
1651 | ld $t3,56($bp_real) | |
1652 | ___ | |
1653 | $code.=<<___ if ($i==32); | |
1654 | li $t0,1 # Lone_mont | |
1655 | not $t1,$poly1 | |
1656 | li $t2,-1 | |
1657 | not $t3,$poly3 | |
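# (presumably ONE in the Montgomery domain, i.e. 2^256 mod P; the affine
# point in2 has an implicit Z=1, so this value is substituted for its Z
# coordinate in the selection below)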
1658 | ___ | |
1659 | $code.=<<___; | |
1660 | ld $a0,$res_x+$i+32($sp) | |
1661 | ld $a1,$res_x+$i+40($sp) | |
1662 | ld $a2,$res_x+$i+48($sp) | |
1663 | ld $a3,$res_x+$i+56($sp) | |
1664 | std $acc0,$i+0($rp_real) | |
1665 | std $acc1,$i+8($rp_real) | |
1666 | std $acc2,$i+16($rp_real) | |
1667 | std $acc3,$i+24($rp_real) | |
1668 | ___ | |
1669 | } | |
1670 | $code.=<<___; | |
1671 | ld $acc0,$i+0($ap_real) # in1 | |
1672 | ld $acc1,$i+8($ap_real) | |
1673 | ld $acc2,$i+16($ap_real) | |
1674 | ld $acc3,$i+24($ap_real) | |
1675 | andc $t0,$t0,$in1infty | |
1676 | andc $t1,$t1,$in1infty | |
1677 | andc $t2,$t2,$in1infty | |
1678 | andc $t3,$t3,$in1infty | |
1679 | and $a0,$a0,$in1infty | |
1680 | and $a1,$a1,$in1infty | |
1681 | and $a2,$a2,$in1infty | |
1682 | and $a3,$a3,$in1infty | |
1683 | or $t0,$t0,$a0 | |
1684 | or $t1,$t1,$a1 | |
1685 | or $t2,$t2,$a2 | |
1686 | or $t3,$t3,$a3 | |
1687 | andc $acc0,$acc0,$in2infty | |
1688 | andc $acc1,$acc1,$in2infty | |
1689 | andc $acc2,$acc2,$in2infty | |
1690 | andc $acc3,$acc3,$in2infty | |
1691 | and $t0,$t0,$in2infty | |
1692 | and $t1,$t1,$in2infty | |
1693 | and $t2,$t2,$in2infty | |
1694 | and $t3,$t3,$in2infty | |
1695 | or $acc0,$acc0,$t0 | |
1696 | or $acc1,$acc1,$t1 | |
1697 | or $acc2,$acc2,$t2 | |
1698 | or $acc3,$acc3,$t3 | |
1699 | std $acc0,$i+0($rp_real) | |
1700 | std $acc1,$i+8($rp_real) | |
1701 | std $acc2,$i+16($rp_real) | |
1702 | std $acc3,$i+24($rp_real) | |
1703 | ||
1704 | mtlr r0 | |
1705 | ld r16,$FRAME-8*16($sp) | |
1706 | ld r17,$FRAME-8*15($sp) | |
1707 | ld r18,$FRAME-8*14($sp) | |
1708 | ld r19,$FRAME-8*13($sp) | |
1709 | ld r20,$FRAME-8*12($sp) | |
1710 | ld r21,$FRAME-8*11($sp) | |
1711 | ld r22,$FRAME-8*10($sp) | |
1712 | ld r23,$FRAME-8*9($sp) | |
1713 | ld r24,$FRAME-8*8($sp) | |
1714 | ld r25,$FRAME-8*7($sp) | |
1715 | ld r26,$FRAME-8*6($sp) | |
1716 | ld r27,$FRAME-8*5($sp) | |
1717 | ld r28,$FRAME-8*4($sp) | |
1718 | ld r29,$FRAME-8*3($sp) | |
1719 | ld r30,$FRAME-8*2($sp) | |
1720 | ld r31,$FRAME-8*1($sp) | |
1721 | addi $sp,$sp,$FRAME | |
1722 | blr | |
1723 | .long 0 | |
1724 | .byte 0,12,4,0,0x80,16,3,0 | |
1725 | .long 0 | |
1726 | .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine | |
1727 | ___ | |
1728 | } | |
1729 | if (1) { | |
1730 | my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21)); | |
1731 | my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0"); | |
1732 | ||
1733 | $code.=<<___; | |
1734 | ######################################################################## | |
1735 | # void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], | |
1736 | # uint64_t b[4]); | |
1737 | .globl ecp_nistz256_ord_mul_mont | |
1738 | .align 5 | |
1739 | ecp_nistz256_ord_mul_mont: | |
1740 | stdu $sp,-160($sp) | |
1741 | std r18,48($sp) | |
1742 | std r19,56($sp) | |
1743 | std r20,64($sp) | |
1744 | std r21,72($sp) | |
1745 | std r22,80($sp) | |
1746 | std r23,88($sp) | |
1747 | std r24,96($sp) | |
1748 | std r25,104($sp) | |
1749 | std r26,112($sp) | |
1750 | std r27,120($sp) | |
1751 | std r28,128($sp) | |
1752 | std r29,136($sp) | |
1753 | std r30,144($sp) | |
1754 | std r31,152($sp) | |
1755 | ||
1756 | ld $a0,0($ap) | |
1757 | ld $bi,0($bp) | |
1758 | ld $a1,8($ap) | |
1759 | ld $a2,16($ap) | |
1760 | ld $a3,24($ap) | |
1761 | ||
1762 | lis $ordk,0xccd1 | |
1763 | lis $ord0,0xf3b9 | |
1764 | lis $ord1,0xbce6 | |
1765 | ori $ordk,$ordk,0xc8aa | |
1766 | ori $ord0,$ord0,0xcac2 | |
1767 | ori $ord1,$ord1,0xfaad | |
1768 | sldi $ordk,$ordk,32 | |
1769 | sldi $ord0,$ord0,32 | |
1770 | sldi $ord1,$ord1,32 | |
1771 | oris $ordk,$ordk,0xee00 | |
1772 | oris $ord0,$ord0,0xfc63 | |
1773 | oris $ord1,$ord1,0xa717 | |
1774 | ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f | |
1775 | ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551 | |
1776 | ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84 | |
1777 | li $ord2,-1 # 0xffffffffffffffff | |
1778 | sldi $ord3,$ord2,32 # 0xffffffff00000000 | |
1779 | li $zr,0 | |
1780 | ||
1781 | mulld $acc0,$a0,$bi # a[0]*b[0] | |
1782 | mulhdu $t0,$a0,$bi | |
1783 | ||
1784 | mulld $acc1,$a1,$bi # a[1]*b[0] | |
1785 | mulhdu $t1,$a1,$bi | |
1786 | ||
1787 | mulld $acc2,$a2,$bi # a[2]*b[0] | |
1788 | mulhdu $t2,$a2,$bi | |
1789 | ||
1790 | mulld $acc3,$a3,$bi # a[3]*b[0] | |
1791 | mulhdu $acc4,$a3,$bi | |
1792 | ||
1793 | mulld $t4,$acc0,$ordk | |
1794 | ||
1795 | addc $acc1,$acc1,$t0 # accumulate high parts of multiplication | |
1796 | adde $acc2,$acc2,$t1 | |
1797 | adde $acc3,$acc3,$t2 | |
1798 | addze $acc4,$acc4 | |
1799 | li $acc5,0 | |
1800 | ___ | |
1801 | for ($i=1;$i<4;$i++) { | |
1802 | ################################################################ | |
1803 | # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz | |
1804 | # * abcdefgh | |
1805 | # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx | |
1806 | # | |
1807 | # Now, observing that ff..ff*x = (2^n-1)*x = 2^n*x - x, we can | |
1808 | # rewrite the above as: | |
1809 | # | |
1810 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx | |
1811 | # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 | |
1812 | # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh | |
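# For intuition (illustrative only, not used by this module): each step folds
# in t4 = acc0*$ordk mod 2^64, where $ordk is assumed to be -1/n mod 2^64 for
# the group order n, so that acc + t4*n clears the lowest 64-bit limb.  A
# minimal stand-alone Perl sketch of that claim, using core Math::BigInt and a
# hypothetical low limb value, could look like:
#
#   use Math::BigInt;
#   my $n    = Math::BigInt->from_hex("0xffffffff00000000ffffffffffffffff"
#                                      ."bce6faada7179e84f3b9cac2fc632551");
#   my $k    = Math::BigInt->from_hex("0xccd1c8aaee00bc4f"); # assumed -1/n mod 2^64
#   my $r64  = Math::BigInt->new(1)->blsft(64);              # 2^64
#   my $acc0 = Math::BigInt->from_hex("0x0123456789abcdef"); # arbitrary low limb
#   my $t4   = ($acc0 * $k) % $r64;
#   print "low limb clears\n" if (($acc0 + $t4 * $n) % $r64)->is_zero();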
1813 | $code.=<<___; | |
1814 | ld $bi,8*$i($bp) # b[i] | |
1815 | ||
1816 | sldi $t0,$t4,32 | |
1817 | subfc $acc2,$t4,$acc2 | |
1818 | srdi $t1,$t4,32 | |
1819 | subfe $acc3,$t0,$acc3 | |
1820 | subfe $acc4,$t1,$acc4 | |
1821 | subfe $acc5,$zr,$acc5 | |
1822 | ||
1823 | addic $t0,$acc0,-1 # discarded | |
1824 | mulhdu $t1,$ord0,$t4 | |
1825 | mulld $t2,$ord1,$t4 | |
1826 | mulhdu $t3,$ord1,$t4 | |
1827 | ||
1828 | adde $t2,$t2,$t1 | |
1829 | mulld $t0,$a0,$bi | |
1830 | addze $t3,$t3 | |
1831 | mulld $t1,$a1,$bi | |
1832 | ||
1833 | addc $acc0,$acc1,$t2 | |
1834 | mulld $t2,$a2,$bi | |
1835 | adde $acc1,$acc2,$t3 | |
1836 | mulld $t3,$a3,$bi | |
1837 | adde $acc2,$acc3,$t4 | |
1838 | adde $acc3,$acc4,$t4 | |
1839 | addze $acc4,$acc5 | |
1840 | ||
1841 | addc $acc0,$acc0,$t0 # accumulate low parts | |
1842 | mulhdu $t0,$a0,$bi | |
1843 | adde $acc1,$acc1,$t1 | |
1844 | mulhdu $t1,$a1,$bi | |
1845 | adde $acc2,$acc2,$t2 | |
1846 | mulhdu $t2,$a2,$bi | |
1847 | adde $acc3,$acc3,$t3 | |
1848 | mulhdu $t3,$a3,$bi | |
1849 | addze $acc4,$acc4 | |
1850 | mulld $t4,$acc0,$ordk | |
1851 | addc $acc1,$acc1,$t0 # accumulate high parts | |
1852 | adde $acc2,$acc2,$t1 | |
1853 | adde $acc3,$acc3,$t2 | |
1854 | adde $acc4,$acc4,$t3 | |
1855 | addze $acc5,$zr | |
1856 | ___ | |
1857 | } | |
1858 | $code.=<<___; | |
1859 | sldi $t0,$t4,32 # last reduction | |
1860 | subfc $acc2,$t4,$acc2 | |
1861 | srdi $t1,$t4,32 | |
1862 | subfe $acc3,$t0,$acc3 | |
1863 | subfe $acc4,$t1,$acc4 | |
1864 | subfe $acc5,$zr,$acc5 | |
1865 | ||
1866 | addic $t0,$acc0,-1 # discarded | |
1867 | mulhdu $t1,$ord0,$t4 | |
1868 | mulld $t2,$ord1,$t4 | |
1869 | mulhdu $t3,$ord1,$t4 | |
1870 | ||
1871 | adde $t2,$t2,$t1 | |
1872 | addze $t3,$t3 | |
1873 | ||
1874 | addc $acc0,$acc1,$t2 | |
1875 | adde $acc1,$acc2,$t3 | |
1876 | adde $acc2,$acc3,$t4 | |
1877 | adde $acc3,$acc4,$t4 | |
1878 | addze $acc4,$acc5 | |
1879 | ||
1880 | subfc $acc0,$ord0,$acc0 # ret -= modulus | |
1881 | subfe $acc1,$ord1,$acc1 | |
1882 | subfe $acc2,$ord2,$acc2 | |
1883 | subfe $acc3,$ord3,$acc3 | |
1884 | subfe $acc4,$zr,$acc4 | |
1885 | ||
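# After the trial subtraction $acc4 is all-ones if a borrow occurred (the
# value was below the modulus) and zero otherwise, so it serves directly as
# the mask for the conditional add-back; since $ord2 is all-ones,
# $ord2&$acc4 equals $acc4 and needs no explicit "and" for that limb.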
1886 | and $t0,$ord0,$acc4 | |
1887 | and $t1,$ord1,$acc4 | |
1888 | addc $acc0,$acc0,$t0 # ret += modulus if borrow | |
1889 | and $t3,$ord3,$acc4 | |
1890 | adde $acc1,$acc1,$t1 | |
1891 | adde $acc2,$acc2,$acc4 | |
1892 | adde $acc3,$acc3,$t3 | |
1893 | ||
1894 | std $acc0,0($rp) | |
1895 | std $acc1,8($rp) | |
1896 | std $acc2,16($rp) | |
1897 | std $acc3,24($rp) | |
1898 | ||
1899 | ld r18,48($sp) | |
1900 | ld r19,56($sp) | |
1901 | ld r20,64($sp) | |
1902 | ld r21,72($sp) | |
1903 | ld r22,80($sp) | |
1904 | ld r23,88($sp) | |
1905 | ld r24,96($sp) | |
1906 | ld r25,104($sp) | |
1907 | ld r26,112($sp) | |
1908 | ld r27,120($sp) | |
1909 | ld r28,128($sp) | |
1910 | ld r29,136($sp) | |
1911 | ld r30,144($sp) | |
1912 | ld r31,152($sp) | |
1913 | addi $sp,$sp,160 | |
1914 | blr | |
1915 | .long 0 | |
1916 | .byte 0,12,4,0,0x80,14,3,0 | |
1917 | .long 0 | |
1918 | .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont | |
1919 | ||
1920 | ################################################################################ | |
1921 | # void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], | |
1922 | # int rep); | |
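# Descriptive note (added commentary): this routine performs |rep| consecutive
# Montgomery squarings modulo the P-256 group order, with the repeat count
# moved into CTR and the loop closed by bdnz; only the final result is
# written to res.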
1923 | .globl ecp_nistz256_ord_sqr_mont | |
1924 | .align 5 | |
1925 | ecp_nistz256_ord_sqr_mont: | |
1926 | stdu $sp,-160($sp) | |
1927 | std r18,48($sp) | |
1928 | std r19,56($sp) | |
1929 | std r20,64($sp) | |
1930 | std r21,72($sp) | |
1931 | std r22,80($sp) | |
1932 | std r23,88($sp) | |
1933 | std r24,96($sp) | |
1934 | std r25,104($sp) | |
1935 | std r26,112($sp) | |
1936 | std r27,120($sp) | |
1937 | std r28,128($sp) | |
1938 | std r29,136($sp) | |
1939 | std r30,144($sp) | |
1940 | std r31,152($sp) | |
1941 | ||
1942 | mtctr $bp | |
1943 | ||
1944 | ld $a0,0($ap) | |
1945 | ld $a1,8($ap) | |
1946 | ld $a2,16($ap) | |
1947 | ld $a3,24($ap) | |
1948 | ||
1949 | lis $ordk,0xccd1 | |
1950 | lis $ord0,0xf3b9 | |
1951 | lis $ord1,0xbce6 | |
1952 | ori $ordk,$ordk,0xc8aa | |
1953 | ori $ord0,$ord0,0xcac2 | |
1954 | ori $ord1,$ord1,0xfaad | |
1955 | sldi $ordk,$ordk,32 | |
1956 | sldi $ord0,$ord0,32 | |
1957 | sldi $ord1,$ord1,32 | |
1958 | oris $ordk,$ordk,0xee00 | |
1959 | oris $ord0,$ord0,0xfc63 | |
1960 | oris $ord1,$ord1,0xa717 | |
1961 | ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f | |
1962 | ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551 | |
1963 | ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84 | |
1964 | li $ord2,-1 # 0xffffffffffffffff | |
1965 | sldi $ord3,$ord2,32 # 0xffffffff00000000 | |
1966 | li $zr,0 | |
1967 | b .Loop_ord_sqr | |
1968 | ||
1969 | .align 5 | |
1970 | .Loop_ord_sqr: | |
1971 | ################################################################ | |
1972 | # | | | | | |a1*a0| | | |
1973 | # | | | | |a2*a0| | | | |
1974 | # | |a3*a2|a3*a0| | | | | |
1975 | # | | | |a2*a1| | | | | |
1976 | # | | |a3*a1| | | | | | |
1977 | # *| | | | | | | | 2| | |
1978 | # +|a3*a3|a2*a2|a1*a1|a0*a0| | |
1979 | # |--+--+--+--+--+--+--+--| | |
1980 | # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes the corresponding $accx register | |
1981 | # | |
1982 | # "can't overflow" below mark carrying into high part of | |
1983 | # multiplication result, which can't overflow, because it | |
1984 | # can never be all ones. | |
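# (Illustrative bound: the high half of a 64x64-bit product is at most
# 0xffffffff_fffffffe, because (2^64-1)*(2^64-1) = 2^128 - 2^65 + 1,
# so adding a single carry bit to it cannot wrap.)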
1985 | ||
1986 | mulld $acc1,$a1,$a0 # a[1]*a[0] | |
1987 | mulhdu $t1,$a1,$a0 | |
1988 | mulld $acc2,$a2,$a0 # a[2]*a[0] | |
1989 | mulhdu $t2,$a2,$a0 | |
1990 | mulld $acc3,$a3,$a0 # a[3]*a[0] | |
1991 | mulhdu $acc4,$a3,$a0 | |
1992 | ||
1993 | addc $acc2,$acc2,$t1 # accumulate high parts of multiplication | |
1994 | mulld $t0,$a2,$a1 # a[2]*a[1] | |
1995 | mulhdu $t1,$a2,$a1 | |
1996 | adde $acc3,$acc3,$t2 | |
1997 | mulld $t2,$a3,$a1 # a[3]*a[1] | |
1998 | mulhdu $t3,$a3,$a1 | |
1999 | addze $acc4,$acc4 # can't overflow | |
2000 | ||
2001 | mulld $acc5,$a3,$a2 # a[3]*a[2] | |
2002 | mulhdu $acc6,$a3,$a2 | |
2003 | ||
2004 | addc $t1,$t1,$t2 # accumulate high parts of multiplication | |
2005 | mulld $acc0,$a0,$a0 # a[0]*a[0] | |
2006 | addze $t2,$t3 # can't overflow | |
2007 | ||
2008 | addc $acc3,$acc3,$t0 # accumulate low parts of multiplication | |
2009 | mulhdu $a0,$a0,$a0 | |
2010 | adde $acc4,$acc4,$t1 | |
2011 | mulld $t1,$a1,$a1 # a[1]*a[1] | |
2012 | adde $acc5,$acc5,$t2 | |
2013 | mulhdu $a1,$a1,$a1 | |
2014 | addze $acc6,$acc6 # can't overflow | |
2015 | ||
2016 | addc $acc1,$acc1,$acc1 # acc[1-6]*=2 | |
2017 | mulld $t2,$a2,$a2 # a[2]*a[2] | |
2018 | adde $acc2,$acc2,$acc2 | |
2019 | mulhdu $a2,$a2,$a2 | |
2020 | adde $acc3,$acc3,$acc3 | |
2021 | mulld $t3,$a3,$a3 # a[3]*a[3] | |
2022 | adde $acc4,$acc4,$acc4 | |
2023 | mulhdu $a3,$a3,$a3 | |
2024 | adde $acc5,$acc5,$acc5 | |
2025 | adde $acc6,$acc6,$acc6 | |
2026 | addze $acc7,$zr | |
2027 | ||
2028 | addc $acc1,$acc1,$a0 # +a[i]*a[i] | |
2029 | mulld $t4,$acc0,$ordk | |
2030 | adde $acc2,$acc2,$t1 | |
2031 | adde $acc3,$acc3,$a1 | |
2032 | adde $acc4,$acc4,$t2 | |
2033 | adde $acc5,$acc5,$a2 | |
2034 | adde $acc6,$acc6,$t3 | |
2035 | adde $acc7,$acc7,$a3 | |
2036 | ___ | |
2037 | for($i=0; $i<4; $i++) { # reductions | |
2038 | $code.=<<___; | |
2039 | addic $t0,$acc0,-1 # discarded | |
2040 | mulhdu $t1,$ord0,$t4 | |
2041 | mulld $t2,$ord1,$t4 | |
2042 | mulhdu $t3,$ord1,$t4 | |
2043 | ||
2044 | adde $t2,$t2,$t1 | |
2045 | addze $t3,$t3 | |
2046 | ||
2047 | addc $acc0,$acc1,$t2 | |
2048 | adde $acc1,$acc2,$t3 | |
2049 | adde $acc2,$acc3,$t4 | |
2050 | adde $acc3,$zr,$t4 # can't overflow | |
2051 | ___ | |
2052 | $code.=<<___ if ($i<3); | |
2053 | mulld $t3,$acc0,$ordk | |
2054 | ___ | |
2055 | $code.=<<___; | |
2056 | sldi $t0,$t4,32 | |
2057 | subfc $acc1,$t4,$acc1 | |
2058 | srdi $t1,$t4,32 | |
2059 | subfe $acc2,$t0,$acc2 | |
2060 | subfe $acc3,$t1,$acc3 # can't borrow | |
2061 | ___ | |
2062 | ($t3,$t4) = ($t4,$t3); | |
2063 | } | |
2064 | $code.=<<___; | |
2065 | addc $acc0,$acc0,$acc4 # accumulate upper half | |
2066 | adde $acc1,$acc1,$acc5 | |
2067 | adde $acc2,$acc2,$acc6 | |
2068 | adde $acc3,$acc3,$acc7 | |
2069 | addze $acc4,$zr | |
2070 | ||
2071 | subfc $acc0,$ord0,$acc0 # ret -= modulus | |
2072 | subfe $acc1,$ord1,$acc1 | |
2073 | subfe $acc2,$ord2,$acc2 | |
2074 | subfe $acc3,$ord3,$acc3 | |
2075 | subfe $acc4,$zr,$acc4 | |
2076 | ||
2077 | and $t0,$ord0,$acc4 | |
2078 | and $t1,$ord1,$acc4 | |
2079 | addc $a0,$acc0,$t0 # ret += modulus if borrow | |
2080 | and $t3,$ord3,$acc4 | |
2081 | adde $a1,$acc1,$t1 | |
2082 | adde $a2,$acc2,$acc4 | |
2083 | adde $a3,$acc3,$t3 | |
2084 | ||
2085 | bdnz .Loop_ord_sqr | |
2086 | ||
2087 | std $a0,0($rp) | |
2088 | std $a1,8($rp) | |
2089 | std $a2,16($rp) | |
2090 | std $a3,24($rp) | |
2091 | ||
2092 | ld r18,48($sp) | |
2093 | ld r19,56($sp) | |
2094 | ld r20,64($sp) | |
2095 | ld r21,72($sp) | |
2096 | ld r22,80($sp) | |
2097 | ld r23,88($sp) | |
2098 | ld r24,96($sp) | |
2099 | ld r25,104($sp) | |
2100 | ld r26,112($sp) | |
2101 | ld r27,120($sp) | |
2102 | ld r28,128($sp) | |
2103 | ld r29,136($sp) | |
2104 | ld r30,144($sp) | |
2105 | ld r31,152($sp) | |
2106 | addi $sp,$sp,160 | |
2107 | blr | |
2108 | .long 0 | |
2109 | .byte 0,12,4,0,0x80,14,3,0 | |
2110 | .long 0 | |
2111 | .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont | |
2112 | ___ | |
2113 | } } | |
2114 | ||
2115 | ######################################################################## | |
2116 | # scatter-gather subroutines | |
2117 | { | |
2118 | my ($out,$inp,$index,$mask)=map("r$_",(3..7)); | |
2119 | $code.=<<___; | |
2120 | ######################################################################## | |
2121 | # void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp, | |
2122 | # int index); | |
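#
# Layout note (added commentary): each 64-bit limb of the point is stored as
# two 32-bit words in separate 64-byte rows of the table, with entry |index|
# (1-based) occupying byte column (index-1)*4 of every row; presumably so
# that, with 64-byte cache lines, a gather touches the same set of lines
# regardless of which entry is selected.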
2123 | .globl ecp_nistz256_scatter_w5 | |
2124 | .align 4 | |
2125 | ecp_nistz256_scatter_w5: | |
2126 | slwi $index,$index,2 | |
2127 | add $out,$out,$index | |
2128 | ||
2129 | ld r8, 0($inp) # X | |
2130 | ld r9, 8($inp) | |
2131 | ld r10,16($inp) | |
2132 | ld r11,24($inp) | |
2133 | ||
2134 | stw r8, 64*0-4($out) | |
2135 | srdi r8, r8, 32 | |
2136 | stw r9, 64*1-4($out) | |
2137 | srdi r9, r9, 32 | |
2138 | stw r10,64*2-4($out) | |
2139 | srdi r10,r10,32 | |
2140 | stw r11,64*3-4($out) | |
2141 | srdi r11,r11,32 | |
2142 | stw r8, 64*4-4($out) | |
2143 | stw r9, 64*5-4($out) | |
2144 | stw r10,64*6-4($out) | |
2145 | stw r11,64*7-4($out) | |
2146 | addi $out,$out,64*8 | |
2147 | ||
2148 | ld r8, 32($inp) # Y | |
2149 | ld r9, 40($inp) | |
2150 | ld r10,48($inp) | |
2151 | ld r11,56($inp) | |
2152 | ||
2153 | stw r8, 64*0-4($out) | |
2154 | srdi r8, r8, 32 | |
2155 | stw r9, 64*1-4($out) | |
2156 | srdi r9, r9, 32 | |
2157 | stw r10,64*2-4($out) | |
2158 | srdi r10,r10,32 | |
2159 | stw r11,64*3-4($out) | |
2160 | srdi r11,r11,32 | |
2161 | stw r8, 64*4-4($out) | |
2162 | stw r9, 64*5-4($out) | |
2163 | stw r10,64*6-4($out) | |
2164 | stw r11,64*7-4($out) | |
2165 | addi $out,$out,64*8 | |
2166 | ||
2167 | ld r8, 64($inp) # Z | |
2168 | ld r9, 72($inp) | |
2169 | ld r10,80($inp) | |
2170 | ld r11,88($inp) | |
2171 | ||
2172 | stw r8, 64*0-4($out) | |
2173 | srdi r8, r8, 32 | |
2174 | stw r9, 64*1-4($out) | |
2175 | srdi r9, r9, 32 | |
2176 | stw r10,64*2-4($out) | |
2177 | srdi r10,r10,32 | |
2178 | stw r11,64*3-4($out) | |
2179 | srdi r11,r11,32 | |
2180 | stw r8, 64*4-4($out) | |
2181 | stw r9, 64*5-4($out) | |
2182 | stw r10,64*6-4($out) | |
2183 | stw r11,64*7-4($out) | |
2184 | ||
2185 | blr | |
2186 | .long 0 | |
2187 | .byte 0,12,0x14,0,0,0,3,0 | |
2188 | .long 0 | |
2189 | .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 | |
2190 | ||
2191 | ######################################################################## | |
2192 | # void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp, | |
2193 | # int index); | |
2194 | .globl ecp_nistz256_gather_w5 | |
2195 | .align 4 | |
2196 | ecp_nistz256_gather_w5: | |
2197 | neg r0,$index | |
2198 | sradi r0,r0,63 | |
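# r0 becomes an all-ones mask when index>0 and zero when index==0, and the
# add below turns the 1-based index into a 0-based column offset; the loads
# further down are and-ed with the mask, so index 0 yields an all-zero point.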
2199 | ||
2200 | add $index,$index,r0 | |
2201 | slwi $index,$index,2 | |
2202 | add $inp,$inp,$index | |
2203 | ||
2204 | lwz r5, 64*0($inp) | |
2205 | lwz r6, 64*1($inp) | |
2206 | lwz r7, 64*2($inp) | |
2207 | lwz r8, 64*3($inp) | |
2208 | lwz r9, 64*4($inp) | |
2209 | lwz r10,64*5($inp) | |
2210 | lwz r11,64*6($inp) | |
2211 | lwz r12,64*7($inp) | |
2212 | addi $inp,$inp,64*8 | |
2213 | sldi r9, r9, 32 | |
2214 | sldi r10,r10,32 | |
2215 | sldi r11,r11,32 | |
2216 | sldi r12,r12,32 | |
2217 | or r5,r5,r9 | |
2218 | or r6,r6,r10 | |
2219 | or r7,r7,r11 | |
2220 | or r8,r8,r12 | |
2221 | and r5,r5,r0 | |
2222 | and r6,r6,r0 | |
2223 | and r7,r7,r0 | |
2224 | and r8,r8,r0 | |
2225 | std r5,0($out) # X | |
2226 | std r6,8($out) | |
2227 | std r7,16($out) | |
2228 | std r8,24($out) | |
2229 | ||
2230 | lwz r5, 64*0($inp) | |
2231 | lwz r6, 64*1($inp) | |
2232 | lwz r7, 64*2($inp) | |
2233 | lwz r8, 64*3($inp) | |
2234 | lwz r9, 64*4($inp) | |
2235 | lwz r10,64*5($inp) | |
2236 | lwz r11,64*6($inp) | |
2237 | lwz r12,64*7($inp) | |
2238 | addi $inp,$inp,64*8 | |
2239 | sldi r9, r9, 32 | |
2240 | sldi r10,r10,32 | |
2241 | sldi r11,r11,32 | |
2242 | sldi r12,r12,32 | |
2243 | or r5,r5,r9 | |
2244 | or r6,r6,r10 | |
2245 | or r7,r7,r11 | |
2246 | or r8,r8,r12 | |
2247 | and r5,r5,r0 | |
2248 | and r6,r6,r0 | |
2249 | and r7,r7,r0 | |
2250 | and r8,r8,r0 | |
2251 | std r5,32($out) # Y | |
2252 | std r6,40($out) | |
2253 | std r7,48($out) | |
2254 | std r8,56($out) | |
2255 | ||
2256 | lwz r5, 64*0($inp) | |
2257 | lwz r6, 64*1($inp) | |
2258 | lwz r7, 64*2($inp) | |
2259 | lwz r8, 64*3($inp) | |
2260 | lwz r9, 64*4($inp) | |
2261 | lwz r10,64*5($inp) | |
2262 | lwz r11,64*6($inp) | |
2263 | lwz r12,64*7($inp) | |
2264 | sldi r9, r9, 32 | |
2265 | sldi r10,r10,32 | |
2266 | sldi r11,r11,32 | |
2267 | sldi r12,r12,32 | |
2268 | or r5,r5,r9 | |
2269 | or r6,r6,r10 | |
2270 | or r7,r7,r11 | |
2271 | or r8,r8,r12 | |
2272 | and r5,r5,r0 | |
2273 | and r6,r6,r0 | |
2274 | and r7,r7,r0 | |
2275 | and r8,r8,r0 | |
2276 | std r5,64($out) # Z | |
2277 | std r6,72($out) | |
2278 | std r7,80($out) | |
2279 | std r8,88($out) | |
2280 | ||
2281 | blr | |
2282 | .long 0 | |
2283 | .byte 0,12,0x14,0,0,0,3,0 | |
2284 | .long 0 | |
2285 | .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 | |
2286 | ||
2287 | ######################################################################## | |
2288 | # void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp, | |
2289 | # int index); | |
2290 | .globl ecp_nistz256_scatter_w7 | |
2291 | .align 4 | |
2292 | ecp_nistz256_scatter_w7: | |
2293 | li r0,8 | |
2294 | mtctr r0 | |
2295 | add $out,$out,$index | |
2296 | subi $inp,$inp,8 | |
2297 | ||
2298 | .Loop_scatter_w7: | |
2299 | ldu r0,8($inp) | |
87a75b3e | 2300 | stb r0,64*0($out) |
d8f432aa | 2301 | srdi r0,r0,8 |
87a75b3e | 2302 | stb r0,64*1($out) |
d8f432aa | 2303 | srdi r0,r0,8 |
87a75b3e | 2304 | stb r0,64*2($out) |
d8f432aa | 2305 | srdi r0,r0,8 |
87a75b3e | 2306 | stb r0,64*3($out) |
d8f432aa | 2307 | srdi r0,r0,8 |
87a75b3e | 2308 | stb r0,64*4($out) |
d8f432aa | 2309 | srdi r0,r0,8 |
87a75b3e | 2310 | stb r0,64*5($out) |
d8f432aa | 2311 | srdi r0,r0,8 |
87a75b3e | 2312 | stb r0,64*6($out) |
d8f432aa | 2313 | srdi r0,r0,8 |
87a75b3e | 2314 | stb r0,64*7($out) |
d8f432aa AP |
2315 | addi $out,$out,64*8 |
2316 | bdnz .Loop_scatter_w7 | |
2317 | ||
2318 | blr | |
2319 | .long 0 | |
2320 | .byte 0,12,0x14,0,0,0,3,0 | |
2321 | .long 0 | |
2322 | .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 | |
2323 | ||
2324 | ######################################################################## | |
2325 | # void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp, | |
2326 | # int index); | |
2327 | .globl ecp_nistz256_gather_w7 | |
2328 | .align 4 | |
2329 | ecp_nistz256_gather_w7: | |
2330 | li r0,8 | |
2331 | mtctr r0 | |
2332 | neg r0,$index | |
2333 | sradi r0,r0,63 | |
2334 | ||
2335 | add $index,$index,r0 | |
2336 | add $inp,$inp,$index | |
2337 | subi $out,$out,8 | |
2338 | ||
2339 | .Loop_gather_w7: | |
2340 | lbz r5, 64*0($inp) | |
2341 | lbz r6, 64*1($inp) | |
2342 | lbz r7, 64*2($inp) | |
2343 | lbz r8, 64*3($inp) | |
2344 | lbz r9, 64*4($inp) | |
2345 | lbz r10,64*5($inp) | |
2346 | lbz r11,64*6($inp) | |
2347 | lbz r12,64*7($inp) | |
2348 | addi $inp,$inp,64*8 | |
2349 | ||
2350 | sldi r6, r6, 8 | |
2351 | sldi r7, r7, 16 | |
2352 | sldi r8, r8, 24 | |
2353 | sldi r9, r9, 32 | |
2354 | sldi r10,r10,40 | |
2355 | sldi r11,r11,48 | |
2356 | sldi r12,r12,56 | |
2357 | ||
2358 | or r5,r5,r6 | |
2359 | or r7,r7,r8 | |
2360 | or r9,r9,r10 | |
2361 | or r11,r11,r12 | |
2362 | or r5,r5,r7 | |
2363 | or r9,r9,r11 | |
2364 | or r5,r5,r9 | |
2365 | and r5,r5,r0 | |
2366 | stdu r5,8($out) | |
2367 | bdnz .Loop_gather_w7 | |
2368 | ||
2369 | blr | |
2370 | .long 0 | |
2371 | .byte 0,12,0x14,0,0,0,3,0 | |
2372 | .long 0 | |
2373 | .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 | |
2374 | ___ | |
2375 | } | |
2376 | ||
2377 | foreach (split("\n",$code)) { | |
2378 | s/\`([^\`]*)\`/eval $1/ge; | |
2379 | ||
2380 | print $_,"\n"; | |
2381 | } | |
2382 | close STDOUT; # enforce flush |