1 #! /usr/bin/env perl
2 # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # ECP_NISTZ256 module for PPC64.
18 #
19 # August 2016.
20 #
21 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22 # http://eprint.iacr.org/2013/816.
23 #
24 # with/without -DECP_NISTZ256_ASM
25 # POWER7 +260-530%
26 # POWER8 +220-340%
27
28 # $output is the last argument if it looks like a file (it has an extension)
29 # $flavour is the first argument if it doesn't look like a file
30 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
31 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
32
33 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
34 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
35 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
36 die "can't locate ppc-xlate.pl";
37
38 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
39 or die "can't call $xlate: $!";
40 *STDOUT=*OUT;
41
42 my $sp="r1";
43
44 {
45 my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
46 $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
47 map("r$_",(3..12,22..31));
48
49 my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont
50
51 $code.=<<___;
52 .machine "any"
53 .text
54 ___
55 ########################################################################
56 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
57 #
58 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59 open TABLE,"<ecp_nistz256_table.c" or
60 open TABLE,"<${dir}../ecp_nistz256_table.c" or
61 die "failed to open ecp_nistz256_table.c:",$!;
62
63 use integer;
64
65 foreach(<TABLE>) {
66 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
67 }
68 close TABLE;
69
70 # See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
71 # 64*16*37-1 is used because $#arr returns the last valid index of
72 # @arr, not the number of elements.
73 die "insane number of elements" if ($#arr != 64*16*37-1);
74
75 $code.=<<___;
76 .type ecp_nistz256_precomputed,\@object
77 .globl ecp_nistz256_precomputed
78 .align 12
79 ecp_nistz256_precomputed:
80 ___
81 ########################################################################
82 # this conversion scatters each P256_POINT_AFFINE into individual bytes
83 # at a 64-byte interval, transposing e.g.
84 # 1111222233334444
85 # 1234123412341234
86 for(1..37) {
87 @tbl = splice(@arr,0,64*16);
88 for($i=0;$i<64;$i++) {
89 undef @line;
90 for($j=0;$j<64;$j++) {
91 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
92 }
93 $code.=".byte\t";
94 $code.=join(',',map { sprintf "0x%02x",$_} @line);
95 $code.="\n";
96 }
97 }
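# A round-trip sketch of the transposition above (illustrative only, never
# called; assumes 64-bit perl): byte $i of a point is byte $i%4 of word
# $i/4 and lands in output row $i, so ecp_nistz_gather_w7 can fetch one
# point with a fixed 64-byte stride.
sub _w7_transpose_demo {
	my @words = map { $_ * 0x04030201 } (1..16);	# one toy point, 16x 32-bit words
	my @bytes;
	my @back = (0) x 16;
	for my $i (0..63) {				# scatter, as in the loop above
		push @bytes, ($words[$i/4]>>(($i%4)*8))&0xff;
	}
	for my $w (0..15) {				# inverse: reassemble the words
		$back[$w] |= $bytes[$w*4+$_]<<(8*$_) for (0..3);
	}
	return "@back" eq "@words";			# round trip holds
}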
98
99 $code.=<<___;
100 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
101 .asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
102
103 # void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
104 # const BN_ULONG x2[4]);
105 .globl ecp_nistz256_mul_mont
106 .align 5
107 ecp_nistz256_mul_mont:
108 stdu $sp,-128($sp)
109 mflr r0
110 std r22,48($sp)
111 std r23,56($sp)
112 std r24,64($sp)
113 std r25,72($sp)
114 std r26,80($sp)
115 std r27,88($sp)
116 std r28,96($sp)
117 std r29,104($sp)
118 std r30,112($sp)
119 std r31,120($sp)
120
121 ld $a0,0($ap)
122 ld $bi,0($bp)
123 ld $a1,8($ap)
124 ld $a2,16($ap)
125 ld $a3,24($ap)
126
127 li $poly1,-1
128 srdi $poly1,$poly1,32 # 0x00000000ffffffff
129 li $poly3,1
130 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
131
132 bl __ecp_nistz256_mul_mont
133
134 mtlr r0
135 ld r22,48($sp)
136 ld r23,56($sp)
137 ld r24,64($sp)
138 ld r25,72($sp)
139 ld r26,80($sp)
140 ld r27,88($sp)
141 ld r28,96($sp)
142 ld r29,104($sp)
143 ld r30,112($sp)
144 ld r31,120($sp)
145 addi $sp,$sp,128
146 blr
147 .long 0
148 .byte 0,12,4,0,0x80,10,3,0
149 .long 0
150 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
151
152 # void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
153 .globl ecp_nistz256_sqr_mont
154 .align 4
155 ecp_nistz256_sqr_mont:
156 stdu $sp,-128($sp)
157 mflr r0
158 std r22,48($sp)
159 std r23,56($sp)
160 std r24,64($sp)
161 std r25,72($sp)
162 std r26,80($sp)
163 std r27,88($sp)
164 std r28,96($sp)
165 std r29,104($sp)
166 std r30,112($sp)
167 std r31,120($sp)
168
169 ld $a0,0($ap)
170 ld $a1,8($ap)
171 ld $a2,16($ap)
172 ld $a3,24($ap)
173
174 li $poly1,-1
175 srdi $poly1,$poly1,32 # 0x00000000ffffffff
176 li $poly3,1
177 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
178
179 bl __ecp_nistz256_sqr_mont
180
181 mtlr r0
182 ld r22,48($sp)
183 ld r23,56($sp)
184 ld r24,64($sp)
185 ld r25,72($sp)
186 ld r26,80($sp)
187 ld r27,88($sp)
188 ld r28,96($sp)
189 ld r29,104($sp)
190 ld r30,112($sp)
191 ld r31,120($sp)
192 addi $sp,$sp,128
193 blr
194 .long 0
195 .byte 0,12,4,0,0x80,10,2,0
196 .long 0
197 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
198
199 # void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
200 # const BN_ULONG x2[4]);
201 .globl ecp_nistz256_add
202 .align 4
203 ecp_nistz256_add:
204 stdu $sp,-128($sp)
205 mflr r0
206 std r28,96($sp)
207 std r29,104($sp)
208 std r30,112($sp)
209 std r31,120($sp)
210
211 ld $acc0,0($ap)
212 ld $t0, 0($bp)
213 ld $acc1,8($ap)
214 ld $t1, 8($bp)
215 ld $acc2,16($ap)
216 ld $t2, 16($bp)
217 ld $acc3,24($ap)
218 ld $t3, 24($bp)
219
220 li $poly1,-1
221 srdi $poly1,$poly1,32 # 0x00000000ffffffff
222 li $poly3,1
223 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
224
225 bl __ecp_nistz256_add
226
227 mtlr r0
228 ld r28,96($sp)
229 ld r29,104($sp)
230 ld r30,112($sp)
231 ld r31,120($sp)
232 addi $sp,$sp,128
233 blr
234 .long 0
235 .byte 0,12,4,0,0x80,4,3,0
236 .long 0
237 .size ecp_nistz256_add,.-ecp_nistz256_add
238
239 # void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
240 .globl ecp_nistz256_div_by_2
241 .align 4
242 ecp_nistz256_div_by_2:
243 stdu $sp,-128($sp)
244 mflr r0
245 std r28,96($sp)
246 std r29,104($sp)
247 std r30,112($sp)
248 std r31,120($sp)
249
250 ld $acc0,0($ap)
251 ld $acc1,8($ap)
252 ld $acc2,16($ap)
253 ld $acc3,24($ap)
254
255 li $poly1,-1
256 srdi $poly1,$poly1,32 # 0x00000000ffffffff
257 li $poly3,1
258 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
259
260 bl __ecp_nistz256_div_by_2
261
262 mtlr r0
263 ld r28,96($sp)
264 ld r29,104($sp)
265 ld r30,112($sp)
266 ld r31,120($sp)
267 addi $sp,$sp,128
268 blr
269 .long 0
270 .byte 0,12,4,0,0x80,4,2,0
271 .long 0
272 .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
273
274 # void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
275 .globl ecp_nistz256_mul_by_2
276 .align 4
277 ecp_nistz256_mul_by_2:
278 stdu $sp,-128($sp)
279 mflr r0
280 std r28,96($sp)
281 std r29,104($sp)
282 std r30,112($sp)
283 std r31,120($sp)
284
285 ld $acc0,0($ap)
286 ld $acc1,8($ap)
287 ld $acc2,16($ap)
288 ld $acc3,24($ap)
289
290 mr $t0,$acc0
291 mr $t1,$acc1
292 mr $t2,$acc2
293 mr $t3,$acc3
294
295 li $poly1,-1
296 srdi $poly1,$poly1,32 # 0x00000000ffffffff
297 li $poly3,1
298 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
299
300 bl __ecp_nistz256_add # ret = a+a // 2*a
301
302 mtlr r0
303 ld r28,96($sp)
304 ld r29,104($sp)
305 ld r30,112($sp)
306 ld r31,120($sp)
307 addi $sp,$sp,128
308 blr
309 .long 0
310 .byte 0,12,4,0,0x80,4,3,0
311 .long 0
312 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
313
314 # void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
315 .globl ecp_nistz256_mul_by_3
316 .align 4
317 ecp_nistz256_mul_by_3:
318 stdu $sp,-128($sp)
319 mflr r0
320 std r28,96($sp)
321 std r29,104($sp)
322 std r30,112($sp)
323 std r31,120($sp)
324
325 ld $acc0,0($ap)
326 ld $acc1,8($ap)
327 ld $acc2,16($ap)
328 ld $acc3,24($ap)
329
330 mr $t0,$acc0
331 std $acc0,64($sp)
332 mr $t1,$acc1
333 std $acc1,72($sp)
334 mr $t2,$acc2
335 std $acc2,80($sp)
336 mr $t3,$acc3
337 std $acc3,88($sp)
338
339 li $poly1,-1
340 srdi $poly1,$poly1,32 # 0x00000000ffffffff
341 li $poly3,1
342 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
343
344 bl __ecp_nistz256_add # ret = a+a // 2*a
345
346 ld $t0,64($sp)
347 ld $t1,72($sp)
348 ld $t2,80($sp)
349 ld $t3,88($sp)
350
351 bl __ecp_nistz256_add # ret += a // 2*a+a=3*a
352
353 mtlr r0
354 ld r28,96($sp)
355 ld r29,104($sp)
356 ld r30,112($sp)
357 ld r31,120($sp)
358 addi $sp,$sp,128
359 blr
360 .long 0
361 .byte 0,12,4,0,0x80,4,2,0
362 .long 0
363 .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
364
365 # void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
366 # const BN_ULONG x2[4]);
367 .globl ecp_nistz256_sub
368 .align 4
369 ecp_nistz256_sub:
370 stdu $sp,-128($sp)
371 mflr r0
372 std r28,96($sp)
373 std r29,104($sp)
374 std r30,112($sp)
375 std r31,120($sp)
376
377 ld $acc0,0($ap)
378 ld $acc1,8($ap)
379 ld $acc2,16($ap)
380 ld $acc3,24($ap)
381
382 li $poly1,-1
383 srdi $poly1,$poly1,32 # 0x00000000ffffffff
384 li $poly3,1
385 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
386
387 bl __ecp_nistz256_sub_from
388
389 mtlr r0
390 ld r28,96($sp)
391 ld r29,104($sp)
392 ld r30,112($sp)
393 ld r31,120($sp)
394 addi $sp,$sp,128
395 blr
396 .long 0
397 .byte 0,12,4,0,0x80,4,3,0
398 .long 0
399 .size ecp_nistz256_sub,.-ecp_nistz256_sub
400
401 # void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
402 .globl ecp_nistz256_neg
403 .align 4
404 ecp_nistz256_neg:
405 stdu $sp,-128($sp)
406 mflr r0
407 std r28,96($sp)
408 std r29,104($sp)
409 std r30,112($sp)
410 std r31,120($sp)
411
412 mr $bp,$ap
413 li $acc0,0
414 li $acc1,0
415 li $acc2,0
416 li $acc3,0
417
418 li $poly1,-1
419 srdi $poly1,$poly1,32 # 0x00000000ffffffff
420 li $poly3,1
421 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
422
423 bl __ecp_nistz256_sub_from
424
425 mtlr r0
426 ld r28,96($sp)
427 ld r29,104($sp)
428 ld r30,112($sp)
429 ld r31,120($sp)
430 addi $sp,$sp,128
431 blr
432 .long 0
433 .byte 0,12,4,0,0x80,4,2,0
434 .long 0
435 .size ecp_nistz256_neg,.-ecp_nistz256_neg
436
437 # note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
438 # to $a0-$a3, and b[0] to $bi
439 .type __ecp_nistz256_mul_mont,\@function
440 .align 4
441 __ecp_nistz256_mul_mont:
442 mulld $acc0,$a0,$bi # a[0]*b[0]
443 mulhdu $t0,$a0,$bi
444
445 mulld $acc1,$a1,$bi # a[1]*b[0]
446 mulhdu $t1,$a1,$bi
447
448 mulld $acc2,$a2,$bi # a[2]*b[0]
449 mulhdu $t2,$a2,$bi
450
451 mulld $acc3,$a3,$bi # a[3]*b[0]
452 mulhdu $t3,$a3,$bi
453 ld $bi,8($bp) # b[1]
454
455 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
456 sldi $t0,$acc0,32
457 adde $acc2,$acc2,$t1
458 srdi $t1,$acc0,32
459 adde $acc3,$acc3,$t2
460 addze $acc4,$t3
461 li $acc5,0
462 ___
463 for($i=1;$i<4;$i++) {
464 ################################################################
465 # A reduction iteration is normally performed by accumulating the
466 # result of multiplying the modulus by a "magic" digit [and
467 # omitting the least significant word, which is guaranteed to
468 # be 0], but thanks to the special form of the modulus, and the
469 # "magic" digit being equal to the least significant word, it can
470 # be performed with additions and subtractions alone. Indeed:
471 #
472 # ffff0001.00000000.0000ffff.ffffffff
473 # * abcdefgh
474 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
475 #
476 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
477 # rewrite the above as:
478 #
479 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
480 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
481 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
482 #
483 # or marking redundant operations:
484 #
485 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
486 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
487 # - 0000abcd.efgh0000.--------.--------.--------
488
489 $code.=<<___;
490 subfc $t2,$t0,$acc0 # "*0xffff0001"
491 subfe $t3,$t1,$acc0
492 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
493 adde $acc1,$acc2,$t1
494 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
495 adde $acc3,$acc4,$t3
496 addze $acc4,$acc5
497
498 mulld $t0,$a0,$bi # lo(a[0]*b[i])
499 mulld $t1,$a1,$bi # lo(a[1]*b[i])
500 mulld $t2,$a2,$bi # lo(a[2]*b[i])
501 mulld $t3,$a3,$bi # lo(a[3]*b[i])
502 addc $acc0,$acc0,$t0 # accumulate low parts of multiplication
503 mulhdu $t0,$a0,$bi # hi(a[0]*b[i])
504 adde $acc1,$acc1,$t1
505 mulhdu $t1,$a1,$bi # hi(a[1]*b[i])
506 adde $acc2,$acc2,$t2
507 mulhdu $t2,$a2,$bi # hi(a[2]*b[i])
508 adde $acc3,$acc3,$t3
509 mulhdu $t3,$a3,$bi # hi(a[3]*b[i])
510 addze $acc4,$acc4
511 ___
512 $code.=<<___ if ($i<3);
513 ld $bi,8*($i+1)($bp) # b[$i+1]
514 ___
515 $code.=<<___;
516 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
517 sldi $t0,$acc0,32
518 adde $acc2,$acc2,$t1
519 srdi $t1,$acc0,32
520 adde $acc3,$acc3,$t2
521 adde $acc4,$acc4,$t3
522 li $acc5,0
523 addze $acc5,$acc5
524 ___
525 }
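# A minimal exact-arithmetic check of the identity described above
# (illustrative only, never called; assumes core Math::BigInt): adding
# acc[0]*modulus is the same as adding acc[0]<<96 plus
# acc[0]*0xffffffff00000001<<192 and dropping acc[0] itself.
sub _p256_reduction_identity_demo {
	use Math::BigInt;
	my $one = Math::BigInt->new(1);
	my $p = ($one<<256) - ($one<<224) + ($one<<192) + ($one<<96) - 1;
	my $x = Math::BigInt->new("0xabcdef0123456789");	# any 64-bit "magic" digit
	my $rhs = (($x * Math::BigInt->new("0xffffffff00000001")) << 192)
		+ ($x << 96) - $x;
	return ($x * $p) == $rhs;				# always true
}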
526 $code.=<<___;
527 # last reduction
528 subfc $t2,$t0,$acc0 # "*0xffff0001"
529 subfe $t3,$t1,$acc0
530 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
531 adde $acc1,$acc2,$t1
532 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
533 adde $acc3,$acc4,$t3
534 addze $acc4,$acc5
535
536 li $t2,0
537 addic $acc0,$acc0,1 # ret -= modulus
538 subfe $acc1,$poly1,$acc1
539 subfe $acc2,$t2,$acc2
540 subfe $acc3,$poly3,$acc3
541 subfe $acc4,$t2,$acc4
542
543 addc $acc0,$acc0,$acc4 # ret += modulus if borrow
544 and $t1,$poly1,$acc4
545 and $t3,$poly3,$acc4
546 adde $acc1,$acc1,$t1
547 addze $acc2,$acc2
548 adde $acc3,$acc3,$t3
549
550 std $acc0,0($rp)
551 std $acc1,8($rp)
552 std $acc2,16($rp)
553 std $acc3,24($rp)
554
555 blr
556 .long 0
557 .byte 0,12,0x14,0,0,0,1,0
558 .long 0
559 .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
560
561 # note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
562 # to $a0-$a3
563 .type __ecp_nistz256_sqr_mont,\@function
564 .align 4
565 __ecp_nistz256_sqr_mont:
566 ################################################################
567 # | | | | | |a1*a0| |
568 # | | | | |a2*a0| | |
569 # | |a3*a2|a3*a0| | | |
570 # | | | |a2*a1| | | |
571 # | | |a3*a1| | | | |
572 # *| | | | | | | | 2|
573 # +|a3*a3|a2*a2|a1*a1|a0*a0|
574 # |--+--+--+--+--+--+--+--|
575 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
576 #
577 # "can't overflow" below mark carrying into high part of
578 # multiplication result, which can't overflow, because it
579 # can never be all ones.
580
581 mulld $acc1,$a1,$a0 # a[1]*a[0]
582 mulhdu $t1,$a1,$a0
583 mulld $acc2,$a2,$a0 # a[2]*a[0]
584 mulhdu $t2,$a2,$a0
585 mulld $acc3,$a3,$a0 # a[3]*a[0]
586 mulhdu $acc4,$a3,$a0
587
588 addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
589 mulld $t0,$a2,$a1 # a[2]*a[1]
590 mulhdu $t1,$a2,$a1
591 adde $acc3,$acc3,$t2
592 mulld $t2,$a3,$a1 # a[3]*a[1]
593 mulhdu $t3,$a3,$a1
594 addze $acc4,$acc4 # can't overflow
595
596 mulld $acc5,$a3,$a2 # a[3]*a[2]
597 mulhdu $acc6,$a3,$a2
598
599 addc $t1,$t1,$t2 # accumulate high parts of multiplication
600 addze $t2,$t3 # can't overflow
601
602 addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
603 adde $acc4,$acc4,$t1
604 adde $acc5,$acc5,$t2
605 addze $acc6,$acc6 # can't overflow
606
607 addc $acc1,$acc1,$acc1 # acc[1-6]*=2
608 adde $acc2,$acc2,$acc2
609 adde $acc3,$acc3,$acc3
610 adde $acc4,$acc4,$acc4
611 adde $acc5,$acc5,$acc5
612 adde $acc6,$acc6,$acc6
613 li $acc7,0
614 addze $acc7,$acc7
615
616 mulld $acc0,$a0,$a0 # a[0]*a[0]
617 mulhdu $a0,$a0,$a0
618 mulld $t1,$a1,$a1 # a[1]*a[1]
619 mulhdu $a1,$a1,$a1
620 mulld $t2,$a2,$a2 # a[2]*a[2]
621 mulhdu $a2,$a2,$a2
622 mulld $t3,$a3,$a3 # a[3]*a[3]
623 mulhdu $a3,$a3,$a3
624 addc $acc1,$acc1,$a0 # +a[i]*a[i]
625 sldi $t0,$acc0,32
626 adde $acc2,$acc2,$t1
627 srdi $t1,$acc0,32
628 adde $acc3,$acc3,$a1
629 adde $acc4,$acc4,$t2
630 adde $acc5,$acc5,$a2
631 adde $acc6,$acc6,$t3
632 adde $acc7,$acc7,$a3
633 ___
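# A sketch of the doubled-cross-product squaring used above (illustrative
# only, never called; assumes core Math::BigInt): off-diagonal products
# are summed once and doubled, then the diagonal squares are added, which
# is exactly the acc[1-6]*=2 step followed by +a[i]*a[i].
sub _sqr_via_doubling_demo {
	use Math::BigInt;
	my @a = map { Math::BigInt->new($_) } (3, 5, 7, 11);	# toy limbs, base 2^64
	my $A = Math::BigInt->new(0);
	$A += $a[$_] << (64*$_) for (0..3);			# the full 256-bit value
	my ($cross, $diag) = (Math::BigInt->new(0), Math::BigInt->new(0));
	for my $i (0..3) {
		$diag += ($a[$i] * $a[$i]) << (128*$i);
		for my $j ($i+1..3) {
			$cross += ($a[$i] * $a[$j]) << (64*($i+$j));
		}
	}
	return ($A * $A) == ($cross * 2 + $diag);		# always true
}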
634 for($i=0;$i<3;$i++) { # reductions, see commentary in
635 # multiplication for details
636 $code.=<<___;
637 subfc $t2,$t0,$acc0 # "*0xffff0001"
638 subfe $t3,$t1,$acc0
639 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
640 sldi $t0,$acc0,32
641 adde $acc1,$acc2,$t1
642 srdi $t1,$acc0,32
643 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
644 addze $acc3,$t3 # can't overflow
645 ___
646 }
647 $code.=<<___;
648 subfc $t2,$t0,$acc0 # "*0xffff0001"
649 subfe $t3,$t1,$acc0
650 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
651 adde $acc1,$acc2,$t1
652 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
653 addze $acc3,$t3 # can't overflow
654
655 addc $acc0,$acc0,$acc4 # accumulate upper half
656 adde $acc1,$acc1,$acc5
657 adde $acc2,$acc2,$acc6
658 adde $acc3,$acc3,$acc7
659 li $t2,0
660 addze $acc4,$t2
661
662 addic $acc0,$acc0,1 # ret -= modulus
663 subfe $acc1,$poly1,$acc1
664 subfe $acc2,$t2,$acc2
665 subfe $acc3,$poly3,$acc3
666 subfe $acc4,$t2,$acc4
667
668 addc $acc0,$acc0,$acc4 # ret += modulus if borrow
669 and $t1,$poly1,$acc4
670 and $t3,$poly3,$acc4
671 adde $acc1,$acc1,$t1
672 addze $acc2,$acc2
673 adde $acc3,$acc3,$t3
674
675 std $acc0,0($rp)
676 std $acc1,8($rp)
677 std $acc2,16($rp)
678 std $acc3,24($rp)
679
680 blr
681 .long 0
682 .byte 0,12,0x14,0,0,0,1,0
683 .long 0
684 .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
685
686 # Note that __ecp_nistz256_add expects both input vectors pre-loaded to
687 # $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
688 # contexts, e.g. in multiplication by 2 and 3...
689 .type __ecp_nistz256_add,\@function
690 .align 4
691 __ecp_nistz256_add:
692 addc $acc0,$acc0,$t0 # ret = a+b
693 adde $acc1,$acc1,$t1
694 adde $acc2,$acc2,$t2
695 li $t2,0
696 adde $acc3,$acc3,$t3
697 addze $t0,$t2
698
699 # if a+b >= modulus, subtract modulus
700 #
701 # But since comparison implies subtraction, we subtract the
702 # modulus and then add it back if the subtraction borrowed.
703
704 subic $acc0,$acc0,-1
705 subfe $acc1,$poly1,$acc1
706 subfe $acc2,$t2,$acc2
707 subfe $acc3,$poly3,$acc3
708 subfe $t0,$t2,$t0
709
710 addc $acc0,$acc0,$t0
711 and $t1,$poly1,$t0
712 and $t3,$poly3,$t0
713 adde $acc1,$acc1,$t1
714 addze $acc2,$acc2
715 adde $acc3,$acc3,$t3
716
717 std $acc0,0($rp)
718 std $acc1,8($rp)
719 std $acc2,16($rp)
720 std $acc3,24($rp)
721
722 blr
723 .long 0
724 .byte 0,12,0x14,0,0,0,3,0
725 .long 0
726 .size __ecp_nistz256_add,.-__ecp_nistz256_add
727
728 .type __ecp_nistz256_sub_from,\@function
729 .align 4
730 __ecp_nistz256_sub_from:
731 ld $t0,0($bp)
732 ld $t1,8($bp)
733 ld $t2,16($bp)
734 ld $t3,24($bp)
735 subfc $acc0,$t0,$acc0 # ret = a-b
736 subfe $acc1,$t1,$acc1
737 subfe $acc2,$t2,$acc2
738 subfe $acc3,$t3,$acc3
739 subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
740
741 # if a-b borrowed, add modulus
742
743 	addc	$acc0,$acc0,$t0		# ret += modulus & t0
744 and $t1,$poly1,$t0
745 and $t3,$poly3,$t0
746 adde $acc1,$acc1,$t1
747 addze $acc2,$acc2
748 adde $acc3,$acc3,$t3
749
750 std $acc0,0($rp)
751 std $acc1,8($rp)
752 std $acc2,16($rp)
753 std $acc3,24($rp)
754
755 blr
756 .long 0
757 .byte 0,12,0x14,0,0,0,3,0
758 .long 0
759 .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
760
761 .type __ecp_nistz256_sub_morf,\@function
762 .align 4
763 __ecp_nistz256_sub_morf:
764 ld $t0,0($bp)
765 ld $t1,8($bp)
766 ld $t2,16($bp)
767 ld $t3,24($bp)
768 subfc $acc0,$acc0,$t0 # ret = b-a
769 subfe $acc1,$acc1,$t1
770 subfe $acc2,$acc2,$t2
771 subfe $acc3,$acc3,$t3
772 subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
773
774 # if b-a borrowed, add modulus
775
776 	addc	$acc0,$acc0,$t0		# ret += modulus & t0
777 and $t1,$poly1,$t0
778 and $t3,$poly3,$t0
779 adde $acc1,$acc1,$t1
780 addze $acc2,$acc2
781 adde $acc3,$acc3,$t3
782
783 std $acc0,0($rp)
784 std $acc1,8($rp)
785 std $acc2,16($rp)
786 std $acc3,24($rp)
787
788 blr
789 .long 0
790 .byte 0,12,0x14,0,0,0,3,0
791 .long 0
792 .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
793
794 .type __ecp_nistz256_div_by_2,\@function
795 .align 4
796 __ecp_nistz256_div_by_2:
797 andi. $t0,$acc0,1
798 addic $acc0,$acc0,-1 # a += modulus
799 neg $t0,$t0
800 adde $acc1,$acc1,$poly1
801 not $t0,$t0
802 addze $acc2,$acc2
803 li $t2,0
804 adde $acc3,$acc3,$poly3
805 and $t1,$poly1,$t0
806 addze $ap,$t2 # ap = carry
807 and $t3,$poly3,$t0
808
809 subfc $acc0,$t0,$acc0 # a -= modulus if a was even
810 subfe $acc1,$t1,$acc1
811 subfe $acc2,$t2,$acc2
812 subfe $acc3,$t3,$acc3
813 subfe $ap, $t2,$ap
814
815 srdi $acc0,$acc0,1
816 sldi $t0,$acc1,63
817 srdi $acc1,$acc1,1
818 sldi $t1,$acc2,63
819 srdi $acc2,$acc2,1
820 sldi $t2,$acc3,63
821 srdi $acc3,$acc3,1
822 sldi $t3,$ap,63
823 or $acc0,$acc0,$t0
824 or $acc1,$acc1,$t1
825 or $acc2,$acc2,$t2
826 or $acc3,$acc3,$t3
827
828 std $acc0,0($rp)
829 std $acc1,8($rp)
830 std $acc2,16($rp)
831 std $acc3,24($rp)
832
833 blr
834 .long 0
835 .byte 0,12,0x14,0,0,0,1,0
836 .long 0
837 .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
838 ___
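# A sketch of the subtract-then-fix-up pattern used by __ecp_nistz256_add,
# _sub_from, _sub_morf and _div_by_2 above (illustrative only, never
# called; assumes core Math::BigInt): instead of comparing against the
# modulus, subtract it and add it back if the subtraction borrowed.
sub _mod_add_demo {
	use Math::BigInt;
	my $one = Math::BigInt->new(1);
	my $p = ($one<<256) - ($one<<224) + ($one<<192) + ($one<<96) - 1;
	my ($a, $b) = ($p - 3, $p - 5);			# toy inputs, both < p
	my $s = $a + $b;				# at most 257 bits
	$s -= $p;					# unconditional subtract
	$s += $p if $s < 0;				# put back on borrow
	return $s == (($a + $b) % $p);			# always true
}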
839 ########################################################################
840 # the following subroutines are "literal" implementations of those
841 # found in ecp_nistz256.c
842 #
843 ########################################################################
844 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
845 #
846 if (1) {
847 my $FRAME=64+32*4+12*8;
848 my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
849 # above map() describes stack layout with 4 temporary
850 # 256-bit vectors on top.
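# ($S=64, $M=96, $Zsqr=128, $tmp0=160, as byte offsets from $sp)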
851 my ($rp_real,$ap_real) = map("r$_",(20,21));
852
853 $code.=<<___;
854 .globl ecp_nistz256_point_double
855 .align 5
856 ecp_nistz256_point_double:
857 stdu $sp,-$FRAME($sp)
858 mflr r0
859 std r20,$FRAME-8*12($sp)
860 std r21,$FRAME-8*11($sp)
861 std r22,$FRAME-8*10($sp)
862 std r23,$FRAME-8*9($sp)
863 std r24,$FRAME-8*8($sp)
864 std r25,$FRAME-8*7($sp)
865 std r26,$FRAME-8*6($sp)
866 std r27,$FRAME-8*5($sp)
867 std r28,$FRAME-8*4($sp)
868 std r29,$FRAME-8*3($sp)
869 std r30,$FRAME-8*2($sp)
870 std r31,$FRAME-8*1($sp)
871
872 li $poly1,-1
873 srdi $poly1,$poly1,32 # 0x00000000ffffffff
874 li $poly3,1
875 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
876 .Ldouble_shortcut:
877 ld $acc0,32($ap)
878 ld $acc1,40($ap)
879 ld $acc2,48($ap)
880 ld $acc3,56($ap)
881 mr $t0,$acc0
882 mr $t1,$acc1
883 mr $t2,$acc2
884 mr $t3,$acc3
885 ld $a0,64($ap) # forward load for p256_sqr_mont
886 ld $a1,72($ap)
887 ld $a2,80($ap)
888 ld $a3,88($ap)
889 mr $rp_real,$rp
890 mr $ap_real,$ap
891 addi $rp,$sp,$S
892 bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
893
894 addi $rp,$sp,$Zsqr
895 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
896
897 ld $t0,0($ap_real)
898 ld $t1,8($ap_real)
899 ld $t2,16($ap_real)
900 ld $t3,24($ap_real)
901 mr $a0,$acc0 # put Zsqr aside for p256_sub
902 mr $a1,$acc1
903 mr $a2,$acc2
904 mr $a3,$acc3
905 addi $rp,$sp,$M
906 bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
907
908 addi $bp,$ap_real,0
909 mr $acc0,$a0 # restore Zsqr
910 mr $acc1,$a1
911 mr $acc2,$a2
912 mr $acc3,$a3
913 ld $a0,$S+0($sp) # forward load for p256_sqr_mont
914 ld $a1,$S+8($sp)
915 ld $a2,$S+16($sp)
916 ld $a3,$S+24($sp)
917 addi $rp,$sp,$Zsqr
918 bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
919
920 addi $rp,$sp,$S
921 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
922
923 ld $bi,32($ap_real)
924 ld $a0,64($ap_real)
925 ld $a1,72($ap_real)
926 ld $a2,80($ap_real)
927 ld $a3,88($ap_real)
928 addi $bp,$ap_real,32
929 addi $rp,$sp,$tmp0
930 bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
931
932 mr $t0,$acc0
933 mr $t1,$acc1
934 mr $t2,$acc2
935 mr $t3,$acc3
936 ld $a0,$S+0($sp) # forward load for p256_sqr_mont
937 ld $a1,$S+8($sp)
938 ld $a2,$S+16($sp)
939 ld $a3,$S+24($sp)
940 addi $rp,$rp_real,64
941 bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
942
943 addi $rp,$sp,$tmp0
944 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
945
946 ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
947 ld $a0,$M+0($sp)
948 ld $a1,$M+8($sp)
949 ld $a2,$M+16($sp)
950 ld $a3,$M+24($sp)
951 addi $rp,$rp_real,32
952 bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
953
954 addi $bp,$sp,$Zsqr
955 addi $rp,$sp,$M
956 bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
957
958 mr $t0,$acc0 # duplicate M
959 mr $t1,$acc1
960 mr $t2,$acc2
961 mr $t3,$acc3
962 mr $a0,$acc0 # put M aside
963 mr $a1,$acc1
964 mr $a2,$acc2
965 mr $a3,$acc3
966 addi $rp,$sp,$M
967 bl __ecp_nistz256_add
968 mr $t0,$a0 # restore M
969 mr $t1,$a1
970 mr $t2,$a2
971 mr $t3,$a3
972 ld $bi,0($ap_real) # forward load for p256_mul_mont
973 ld $a0,$S+0($sp)
974 ld $a1,$S+8($sp)
975 ld $a2,$S+16($sp)
976 ld $a3,$S+24($sp)
977 bl __ecp_nistz256_add # p256_mul_by_3(M, M);
978
979 addi $bp,$ap_real,0
980 addi $rp,$sp,$S
981 bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
982
983 mr $t0,$acc0
984 mr $t1,$acc1
985 mr $t2,$acc2
986 mr $t3,$acc3
987 ld $a0,$M+0($sp) # forward load for p256_sqr_mont
988 ld $a1,$M+8($sp)
989 ld $a2,$M+16($sp)
990 ld $a3,$M+24($sp)
991 addi $rp,$sp,$tmp0
992 bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
993
994 addi $rp,$rp_real,0
995 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
996
997 addi $bp,$sp,$tmp0
998 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
999
1000 addi $bp,$sp,$S
1001 addi $rp,$sp,$S
1002 bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
1003
1004 ld $bi,$M($sp)
1005 mr $a0,$acc0 # copy S
1006 mr $a1,$acc1
1007 mr $a2,$acc2
1008 mr $a3,$acc3
1009 addi $bp,$sp,$M
1010 bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
1011
1012 addi $bp,$rp_real,32
1013 addi $rp,$rp_real,32
1014 bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
1015
1016 mtlr r0
1017 ld r20,$FRAME-8*12($sp)
1018 ld r21,$FRAME-8*11($sp)
1019 ld r22,$FRAME-8*10($sp)
1020 ld r23,$FRAME-8*9($sp)
1021 ld r24,$FRAME-8*8($sp)
1022 ld r25,$FRAME-8*7($sp)
1023 ld r26,$FRAME-8*6($sp)
1024 ld r27,$FRAME-8*5($sp)
1025 ld r28,$FRAME-8*4($sp)
1026 ld r29,$FRAME-8*3($sp)
1027 ld r30,$FRAME-8*2($sp)
1028 ld r31,$FRAME-8*1($sp)
1029 addi $sp,$sp,$FRAME
1030 blr
1031 .long 0
1032 .byte 0,12,4,0,0x80,12,2,0
1033 .long 0
1034 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
1035 ___
1036 }
1037
1038 ########################################################################
1039 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1040 # const P256_POINT *in2);
1041 if (1) {
1042 my $FRAME = 64 + 32*12 + 16*8;
1043 my ($res_x,$res_y,$res_z,
1044 $H,$Hsqr,$R,$Rsqr,$Hcub,
1045 $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
1046 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1047 # above map() describes stack layout with 12 temporary
1048 # 256-bit vectors on top.
1049 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1050
1051 $code.=<<___;
1052 .globl ecp_nistz256_point_add
1053 .align 5
1054 ecp_nistz256_point_add:
1055 stdu $sp,-$FRAME($sp)
1056 mflr r0
1057 std r16,$FRAME-8*16($sp)
1058 std r17,$FRAME-8*15($sp)
1059 std r18,$FRAME-8*14($sp)
1060 std r19,$FRAME-8*13($sp)
1061 std r20,$FRAME-8*12($sp)
1062 std r21,$FRAME-8*11($sp)
1063 std r22,$FRAME-8*10($sp)
1064 std r23,$FRAME-8*9($sp)
1065 std r24,$FRAME-8*8($sp)
1066 std r25,$FRAME-8*7($sp)
1067 std r26,$FRAME-8*6($sp)
1068 std r27,$FRAME-8*5($sp)
1069 std r28,$FRAME-8*4($sp)
1070 std r29,$FRAME-8*3($sp)
1071 std r30,$FRAME-8*2($sp)
1072 std r31,$FRAME-8*1($sp)
1073
1074 li $poly1,-1
1075 srdi $poly1,$poly1,32 # 0x00000000ffffffff
1076 li $poly3,1
1077 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
1078
1079 ld $a0,64($bp) # in2_z
1080 ld $a1,72($bp)
1081 ld $a2,80($bp)
1082 ld $a3,88($bp)
1083 mr $rp_real,$rp
1084 mr $ap_real,$ap
1085 mr $bp_real,$bp
1086 or $t0,$a0,$a1
1087 or $t2,$a2,$a3
1088 or $in2infty,$t0,$t2
1089 neg $t0,$in2infty
1090 or $in2infty,$in2infty,$t0
1091 sradi $in2infty,$in2infty,63 # !in2infty
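	# $in2infty is all ones if in2_z was non-zero, i.e. if in2 is
	# *not* the point at infinity; it serves as a select mask below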
1092 addi $rp,$sp,$Z2sqr
1093 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z);
1094
1095 ld $a0,64($ap_real) # in1_z
1096 ld $a1,72($ap_real)
1097 ld $a2,80($ap_real)
1098 ld $a3,88($ap_real)
1099 or $t0,$a0,$a1
1100 or $t2,$a2,$a3
1101 or $in1infty,$t0,$t2
1102 neg $t0,$in1infty
1103 or $in1infty,$in1infty,$t0
1104 sradi $in1infty,$in1infty,63 # !in1infty
1105 addi $rp,$sp,$Z1sqr
1106 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
1107
1108 ld $bi,64($bp_real)
1109 ld $a0,$Z2sqr+0($sp)
1110 ld $a1,$Z2sqr+8($sp)
1111 ld $a2,$Z2sqr+16($sp)
1112 ld $a3,$Z2sqr+24($sp)
1113 addi $bp,$bp_real,64
1114 addi $rp,$sp,$S1
1115 bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z);
1116
1117 ld $bi,64($ap_real)
1118 ld $a0,$Z1sqr+0($sp)
1119 ld $a1,$Z1sqr+8($sp)
1120 ld $a2,$Z1sqr+16($sp)
1121 ld $a3,$Z1sqr+24($sp)
1122 addi $bp,$ap_real,64
1123 addi $rp,$sp,$S2
1124 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
1125
1126 ld $bi,32($ap_real)
1127 ld $a0,$S1+0($sp)
1128 ld $a1,$S1+8($sp)
1129 ld $a2,$S1+16($sp)
1130 ld $a3,$S1+24($sp)
1131 addi $bp,$ap_real,32
1132 addi $rp,$sp,$S1
1133 bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y);
1134
1135 ld $bi,32($bp_real)
1136 ld $a0,$S2+0($sp)
1137 ld $a1,$S2+8($sp)
1138 ld $a2,$S2+16($sp)
1139 ld $a3,$S2+24($sp)
1140 addi $bp,$bp_real,32
1141 addi $rp,$sp,$S2
1142 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
1143
1144 addi $bp,$sp,$S1
1145 ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont
1146 ld $a0,0($ap_real)
1147 ld $a1,8($ap_real)
1148 ld $a2,16($ap_real)
1149 ld $a3,24($ap_real)
1150 addi $rp,$sp,$R
1151 bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1);
1152
1153 or $acc0,$acc0,$acc1 # see if result is zero
1154 or $acc2,$acc2,$acc3
1155 or $temp,$acc0,$acc2
1156
1157 addi $bp,$sp,$Z2sqr
1158 addi $rp,$sp,$U1
1159 bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr);
1160
1161 ld $bi,$Z1sqr($sp)
1162 ld $a0,0($bp_real)
1163 ld $a1,8($bp_real)
1164 ld $a2,16($bp_real)
1165 ld $a3,24($bp_real)
1166 addi $bp,$sp,$Z1sqr
1167 addi $rp,$sp,$U2
1168 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr);
1169
1170 addi $bp,$sp,$U1
1171 ld $a0,$R+0($sp) # forward load for p256_sqr_mont
1172 ld $a1,$R+8($sp)
1173 ld $a2,$R+16($sp)
1174 ld $a3,$R+24($sp)
1175 addi $rp,$sp,$H
1176 bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1);
1177
1178 or $acc0,$acc0,$acc1 # see if result is zero
1179 or $acc2,$acc2,$acc3
1180 or. $acc0,$acc0,$acc2
1181 bne .Ladd_proceed # is_equal(U1,U2)?
1182
1183 and. $t0,$in1infty,$in2infty
1184 beq .Ladd_proceed # (in1infty || in2infty)?
1185
1186 cmpldi $temp,0
1187 beq .Ladd_double # is_equal(S1,S2)?
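	# neither input is at infinity and the x coordinates match: the
	# points are either equal (branch taken above) or each other's
	# negation, in which case the all-zero point is returned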
1188
1189 xor $a0,$a0,$a0
1190 std $a0,0($rp_real)
1191 std $a0,8($rp_real)
1192 std $a0,16($rp_real)
1193 std $a0,24($rp_real)
1194 std $a0,32($rp_real)
1195 std $a0,40($rp_real)
1196 std $a0,48($rp_real)
1197 std $a0,56($rp_real)
1198 std $a0,64($rp_real)
1199 std $a0,72($rp_real)
1200 std $a0,80($rp_real)
1201 std $a0,88($rp_real)
1202 b .Ladd_done
1203
1204 .align 4
1205 .Ladd_double:
1206 ld $bp,0($sp) # back-link
1207 mr $ap,$ap_real
1208 mr $rp,$rp_real
1209 ld r16,$FRAME-8*16($sp)
1210 ld r17,$FRAME-8*15($sp)
1211 ld r18,$FRAME-8*14($sp)
1212 ld r19,$FRAME-8*13($sp)
1213 stdu $bp,$FRAME-288($sp) # difference in stack frame sizes
1214 b .Ldouble_shortcut
1215
1216 .align 4
1217 .Ladd_proceed:
1218 addi $rp,$sp,$Rsqr
1219 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
1220
1221 ld $bi,64($ap_real)
1222 ld $a0,$H+0($sp)
1223 ld $a1,$H+8($sp)
1224 ld $a2,$H+16($sp)
1225 ld $a3,$H+24($sp)
1226 addi $bp,$ap_real,64
1227 addi $rp,$sp,$res_z
1228 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
1229
1230 ld $a0,$H+0($sp)
1231 ld $a1,$H+8($sp)
1232 ld $a2,$H+16($sp)
1233 ld $a3,$H+24($sp)
1234 addi $rp,$sp,$Hsqr
1235 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
1236
1237 ld $bi,64($bp_real)
1238 ld $a0,$res_z+0($sp)
1239 ld $a1,$res_z+8($sp)
1240 ld $a2,$res_z+16($sp)
1241 ld $a3,$res_z+24($sp)
1242 addi $bp,$bp_real,64
1243 addi $rp,$sp,$res_z
1244 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z);
1245
1246 ld $bi,$H($sp)
1247 ld $a0,$Hsqr+0($sp)
1248 ld $a1,$Hsqr+8($sp)
1249 ld $a2,$Hsqr+16($sp)
1250 ld $a3,$Hsqr+24($sp)
1251 addi $bp,$sp,$H
1252 addi $rp,$sp,$Hcub
1253 bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
1254
1255 ld $bi,$Hsqr($sp)
1256 ld $a0,$U1+0($sp)
1257 ld $a1,$U1+8($sp)
1258 ld $a2,$U1+16($sp)
1259 ld $a3,$U1+24($sp)
1260 addi $bp,$sp,$Hsqr
1261 addi $rp,$sp,$U2
1262 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr);
1263
1264 mr $t0,$acc0
1265 mr $t1,$acc1
1266 mr $t2,$acc2
1267 mr $t3,$acc3
1268 addi $rp,$sp,$Hsqr
1269 bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
1270
1271 addi $bp,$sp,$Rsqr
1272 addi $rp,$sp,$res_x
1273 bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
1274
1275 addi $bp,$sp,$Hcub
1276 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
1277
1278 addi $bp,$sp,$U2
1279 ld $bi,$Hcub($sp) # forward load for p256_mul_mont
1280 ld $a0,$S1+0($sp)
1281 ld $a1,$S1+8($sp)
1282 ld $a2,$S1+16($sp)
1283 ld $a3,$S1+24($sp)
1284 addi $rp,$sp,$res_y
1285 bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
1286
1287 addi $bp,$sp,$Hcub
1288 addi $rp,$sp,$S2
1289 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub);
1290
1291 ld $bi,$R($sp)
1292 ld $a0,$res_y+0($sp)
1293 ld $a1,$res_y+8($sp)
1294 ld $a2,$res_y+16($sp)
1295 ld $a3,$res_y+24($sp)
1296 addi $bp,$sp,$R
1297 addi $rp,$sp,$res_y
1298 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
1299
1300 addi $bp,$sp,$S2
1301 bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
1302
1303 ld $t0,0($bp_real) # in2
1304 ld $t1,8($bp_real)
1305 ld $t2,16($bp_real)
1306 ld $t3,24($bp_real)
1307 ld $a0,$res_x+0($sp) # res
1308 ld $a1,$res_x+8($sp)
1309 ld $a2,$res_x+16($sp)
1310 ld $a3,$res_x+24($sp)
1311 ___
1312 for($i=0;$i<64;$i+=32) { # conditional moves
1313 $code.=<<___;
1314 ld $acc0,$i+0($ap_real) # in1
1315 ld $acc1,$i+8($ap_real)
1316 ld $acc2,$i+16($ap_real)
1317 ld $acc3,$i+24($ap_real)
1318 andc $t0,$t0,$in1infty
1319 andc $t1,$t1,$in1infty
1320 andc $t2,$t2,$in1infty
1321 andc $t3,$t3,$in1infty
1322 and $a0,$a0,$in1infty
1323 and $a1,$a1,$in1infty
1324 and $a2,$a2,$in1infty
1325 and $a3,$a3,$in1infty
1326 or $t0,$t0,$a0
1327 or $t1,$t1,$a1
1328 or $t2,$t2,$a2
1329 or $t3,$t3,$a3
1330 andc $acc0,$acc0,$in2infty
1331 andc $acc1,$acc1,$in2infty
1332 andc $acc2,$acc2,$in2infty
1333 andc $acc3,$acc3,$in2infty
1334 and $t0,$t0,$in2infty
1335 and $t1,$t1,$in2infty
1336 and $t2,$t2,$in2infty
1337 and $t3,$t3,$in2infty
1338 or $acc0,$acc0,$t0
1339 or $acc1,$acc1,$t1
1340 or $acc2,$acc2,$t2
1341 or $acc3,$acc3,$t3
1342
1343 ld $t0,$i+32($bp_real) # in2
1344 ld $t1,$i+40($bp_real)
1345 ld $t2,$i+48($bp_real)
1346 ld $t3,$i+56($bp_real)
1347 ld $a0,$res_x+$i+32($sp)
1348 ld $a1,$res_x+$i+40($sp)
1349 ld $a2,$res_x+$i+48($sp)
1350 ld $a3,$res_x+$i+56($sp)
1351 std $acc0,$i+0($rp_real)
1352 std $acc1,$i+8($rp_real)
1353 std $acc2,$i+16($rp_real)
1354 std $acc3,$i+24($rp_real)
1355 ___
1356 }
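# The masked moves above are a constant-time select. A sketch of the same
# mask arithmetic (illustrative only, never called; the masks are 0 or ~0
# exactly like $in1infty/$in2infty, which are all ones for *finite* inputs):
sub _cmove_demo {
	my ($res, $in1, $in2, $in1fin, $in2fin) = @_;
	my $t = ($in2 & ~$in1fin) | ($res & $in1fin);	# in1 at infinity ? in2 : res
	return ($in1 & ~$in2fin) | ($t & $in2fin);	# in2 at infinity ? in1 : t
}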
1357 $code.=<<___;
1358 ld $acc0,$i+0($ap_real) # in1
1359 ld $acc1,$i+8($ap_real)
1360 ld $acc2,$i+16($ap_real)
1361 ld $acc3,$i+24($ap_real)
1362 andc $t0,$t0,$in1infty
1363 andc $t1,$t1,$in1infty
1364 andc $t2,$t2,$in1infty
1365 andc $t3,$t3,$in1infty
1366 and $a0,$a0,$in1infty
1367 and $a1,$a1,$in1infty
1368 and $a2,$a2,$in1infty
1369 and $a3,$a3,$in1infty
1370 or $t0,$t0,$a0
1371 or $t1,$t1,$a1
1372 or $t2,$t2,$a2
1373 or $t3,$t3,$a3
1374 andc $acc0,$acc0,$in2infty
1375 andc $acc1,$acc1,$in2infty
1376 andc $acc2,$acc2,$in2infty
1377 andc $acc3,$acc3,$in2infty
1378 and $t0,$t0,$in2infty
1379 and $t1,$t1,$in2infty
1380 and $t2,$t2,$in2infty
1381 and $t3,$t3,$in2infty
1382 or $acc0,$acc0,$t0
1383 or $acc1,$acc1,$t1
1384 or $acc2,$acc2,$t2
1385 or $acc3,$acc3,$t3
1386 std $acc0,$i+0($rp_real)
1387 std $acc1,$i+8($rp_real)
1388 std $acc2,$i+16($rp_real)
1389 std $acc3,$i+24($rp_real)
1390
1391 .Ladd_done:
1392 mtlr r0
1393 ld r16,$FRAME-8*16($sp)
1394 ld r17,$FRAME-8*15($sp)
1395 ld r18,$FRAME-8*14($sp)
1396 ld r19,$FRAME-8*13($sp)
1397 ld r20,$FRAME-8*12($sp)
1398 ld r21,$FRAME-8*11($sp)
1399 ld r22,$FRAME-8*10($sp)
1400 ld r23,$FRAME-8*9($sp)
1401 ld r24,$FRAME-8*8($sp)
1402 ld r25,$FRAME-8*7($sp)
1403 ld r26,$FRAME-8*6($sp)
1404 ld r27,$FRAME-8*5($sp)
1405 ld r28,$FRAME-8*4($sp)
1406 ld r29,$FRAME-8*3($sp)
1407 ld r30,$FRAME-8*2($sp)
1408 ld r31,$FRAME-8*1($sp)
1409 addi $sp,$sp,$FRAME
1410 blr
1411 .long 0
1412 .byte 0,12,4,0,0x80,16,3,0
1413 .long 0
1414 .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1415 ___
1416 }
1417
1418 ########################################################################
1419 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1420 # const P256_POINT_AFFINE *in2);
1421 if (1) {
1422 my $FRAME = 64 + 32*10 + 16*8;
1423 my ($res_x,$res_y,$res_z,
1424 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
1425 my $Z1sqr = $S2;
1426 # above map() describes stack layout with 10 temporary
1427 # 256-bit vectors on top.
1428 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1429
1430 $code.=<<___;
1431 .globl ecp_nistz256_point_add_affine
1432 .align 5
1433 ecp_nistz256_point_add_affine:
1434 stdu $sp,-$FRAME($sp)
1435 mflr r0
1436 std r16,$FRAME-8*16($sp)
1437 std r17,$FRAME-8*15($sp)
1438 std r18,$FRAME-8*14($sp)
1439 std r19,$FRAME-8*13($sp)
1440 std r20,$FRAME-8*12($sp)
1441 std r21,$FRAME-8*11($sp)
1442 std r22,$FRAME-8*10($sp)
1443 std r23,$FRAME-8*9($sp)
1444 std r24,$FRAME-8*8($sp)
1445 std r25,$FRAME-8*7($sp)
1446 std r26,$FRAME-8*6($sp)
1447 std r27,$FRAME-8*5($sp)
1448 std r28,$FRAME-8*4($sp)
1449 std r29,$FRAME-8*3($sp)
1450 std r30,$FRAME-8*2($sp)
1451 std r31,$FRAME-8*1($sp)
1452
1453 li $poly1,-1
1454 srdi $poly1,$poly1,32 # 0x00000000ffffffff
1455 li $poly3,1
1456 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
1457
1458 mr $rp_real,$rp
1459 mr $ap_real,$ap
1460 mr $bp_real,$bp
1461
1462 ld $a0,64($ap) # in1_z
1463 ld $a1,72($ap)
1464 ld $a2,80($ap)
1465 ld $a3,88($ap)
1466 or $t0,$a0,$a1
1467 or $t2,$a2,$a3
1468 or $in1infty,$t0,$t2
1469 neg $t0,$in1infty
1470 or $in1infty,$in1infty,$t0
1471 sradi $in1infty,$in1infty,63 # !in1infty
1472
1473 ld $acc0,0($bp) # in2_x
1474 ld $acc1,8($bp)
1475 ld $acc2,16($bp)
1476 ld $acc3,24($bp)
1477 ld $t0,32($bp) # in2_y
1478 ld $t1,40($bp)
1479 ld $t2,48($bp)
1480 ld $t3,56($bp)
1481 or $acc0,$acc0,$acc1
1482 or $acc2,$acc2,$acc3
1483 or $acc0,$acc0,$acc2
1484 or $t0,$t0,$t1
1485 or $t2,$t2,$t3
1486 or $t0,$t0,$t2
1487 or $in2infty,$acc0,$t0
1488 neg $t0,$in2infty
1489 or $in2infty,$in2infty,$t0
1490 sradi $in2infty,$in2infty,63 # !in2infty
1491
1492 addi $rp,$sp,$Z1sqr
1493 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
1494
1495 mr $a0,$acc0
1496 mr $a1,$acc1
1497 mr $a2,$acc2
1498 mr $a3,$acc3
1499 ld $bi,0($bp_real)
1500 addi $bp,$bp_real,0
1501 addi $rp,$sp,$U2
1502 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x);
1503
1504 addi $bp,$ap_real,0
1505 ld $bi,64($ap_real) # forward load for p256_mul_mont
1506 ld $a0,$Z1sqr+0($sp)
1507 ld $a1,$Z1sqr+8($sp)
1508 ld $a2,$Z1sqr+16($sp)
1509 ld $a3,$Z1sqr+24($sp)
1510 addi $rp,$sp,$H
1511 bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x);
1512
1513 addi $bp,$ap_real,64
1514 addi $rp,$sp,$S2
1515 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
1516
1517 ld $bi,64($ap_real)
1518 ld $a0,$H+0($sp)
1519 ld $a1,$H+8($sp)
1520 ld $a2,$H+16($sp)
1521 ld $a3,$H+24($sp)
1522 addi $bp,$ap_real,64
1523 addi $rp,$sp,$res_z
1524 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
1525
1526 ld $bi,32($bp_real)
1527 ld $a0,$S2+0($sp)
1528 ld $a1,$S2+8($sp)
1529 ld $a2,$S2+16($sp)
1530 ld $a3,$S2+24($sp)
1531 addi $bp,$bp_real,32
1532 addi $rp,$sp,$S2
1533 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
1534
1535 addi $bp,$ap_real,32
1536 ld $a0,$H+0($sp) # forward load for p256_sqr_mont
1537 ld $a1,$H+8($sp)
1538 ld $a2,$H+16($sp)
1539 ld $a3,$H+24($sp)
1540 addi $rp,$sp,$R
1541 bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y);
1542
1543 addi $rp,$sp,$Hsqr
1544 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
1545
1546 ld $a0,$R+0($sp)
1547 ld $a1,$R+8($sp)
1548 ld $a2,$R+16($sp)
1549 ld $a3,$R+24($sp)
1550 addi $rp,$sp,$Rsqr
1551 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
1552
1553 ld $bi,$H($sp)
1554 ld $a0,$Hsqr+0($sp)
1555 ld $a1,$Hsqr+8($sp)
1556 ld $a2,$Hsqr+16($sp)
1557 ld $a3,$Hsqr+24($sp)
1558 addi $bp,$sp,$H
1559 addi $rp,$sp,$Hcub
1560 bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
1561
1562 ld $bi,0($ap_real)
1563 ld $a0,$Hsqr+0($sp)
1564 ld $a1,$Hsqr+8($sp)
1565 ld $a2,$Hsqr+16($sp)
1566 ld $a3,$Hsqr+24($sp)
1567 addi $bp,$ap_real,0
1568 addi $rp,$sp,$U2
1569 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr);
1570
1571 mr $t0,$acc0
1572 mr $t1,$acc1
1573 mr $t2,$acc2
1574 mr $t3,$acc3
1575 addi $rp,$sp,$Hsqr
1576 bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
1577
1578 addi $bp,$sp,$Rsqr
1579 addi $rp,$sp,$res_x
1580 bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
1581
1582 addi $bp,$sp,$Hcub
1583 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
1584
1585 addi $bp,$sp,$U2
1586 ld $bi,32($ap_real) # forward load for p256_mul_mont
1587 ld $a0,$Hcub+0($sp)
1588 ld $a1,$Hcub+8($sp)
1589 ld $a2,$Hcub+16($sp)
1590 ld $a3,$Hcub+24($sp)
1591 addi $rp,$sp,$res_y
1592 bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
1593
1594 addi $bp,$ap_real,32
1595 addi $rp,$sp,$S2
1596 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub);
1597
1598 ld $bi,$R($sp)
1599 ld $a0,$res_y+0($sp)
1600 ld $a1,$res_y+8($sp)
1601 ld $a2,$res_y+16($sp)
1602 ld $a3,$res_y+24($sp)
1603 addi $bp,$sp,$R
1604 addi $rp,$sp,$res_y
1605 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
1606
1607 addi $bp,$sp,$S2
1608 bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
1609
1610 ld $t0,0($bp_real) # in2
1611 ld $t1,8($bp_real)
1612 ld $t2,16($bp_real)
1613 ld $t3,24($bp_real)
1614 ld $a0,$res_x+0($sp) # res
1615 ld $a1,$res_x+8($sp)
1616 ld $a2,$res_x+16($sp)
1617 ld $a3,$res_x+24($sp)
1618 ___
1619 for($i=0;$i<64;$i+=32) { # conditional moves
1620 $code.=<<___;
1621 ld $acc0,$i+0($ap_real) # in1
1622 ld $acc1,$i+8($ap_real)
1623 ld $acc2,$i+16($ap_real)
1624 ld $acc3,$i+24($ap_real)
1625 andc $t0,$t0,$in1infty
1626 andc $t1,$t1,$in1infty
1627 andc $t2,$t2,$in1infty
1628 andc $t3,$t3,$in1infty
1629 and $a0,$a0,$in1infty
1630 and $a1,$a1,$in1infty
1631 and $a2,$a2,$in1infty
1632 and $a3,$a3,$in1infty
1633 or $t0,$t0,$a0
1634 or $t1,$t1,$a1
1635 or $t2,$t2,$a2
1636 or $t3,$t3,$a3
1637 andc $acc0,$acc0,$in2infty
1638 andc $acc1,$acc1,$in2infty
1639 andc $acc2,$acc2,$in2infty
1640 andc $acc3,$acc3,$in2infty
1641 and $t0,$t0,$in2infty
1642 and $t1,$t1,$in2infty
1643 and $t2,$t2,$in2infty
1644 and $t3,$t3,$in2infty
1645 or $acc0,$acc0,$t0
1646 or $acc1,$acc1,$t1
1647 or $acc2,$acc2,$t2
1648 or $acc3,$acc3,$t3
1649 ___
1650 $code.=<<___ if ($i==0);
1651 ld $t0,32($bp_real) # in2
1652 ld $t1,40($bp_real)
1653 ld $t2,48($bp_real)
1654 ld $t3,56($bp_real)
1655 ___
1656 $code.=<<___ if ($i==32);
1657 li $t0,1 # Lone_mont
1658 not $t1,$poly1
1659 li $t2,-1
1660 not $t3,$poly3
1661 ___
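# ($i==32 covers the z coordinate: in2 is affine, so its implicit Z is 1,
# substituted in Montgomery form, i.e. 2^256 mod p with limbs 1,
# 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe as above)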
1662 $code.=<<___;
1663 ld $a0,$res_x+$i+32($sp)
1664 ld $a1,$res_x+$i+40($sp)
1665 ld $a2,$res_x+$i+48($sp)
1666 ld $a3,$res_x+$i+56($sp)
1667 std $acc0,$i+0($rp_real)
1668 std $acc1,$i+8($rp_real)
1669 std $acc2,$i+16($rp_real)
1670 std $acc3,$i+24($rp_real)
1671 ___
1672 }
1673 $code.=<<___;
1674 ld $acc0,$i+0($ap_real) # in1
1675 ld $acc1,$i+8($ap_real)
1676 ld $acc2,$i+16($ap_real)
1677 ld $acc3,$i+24($ap_real)
1678 andc $t0,$t0,$in1infty
1679 andc $t1,$t1,$in1infty
1680 andc $t2,$t2,$in1infty
1681 andc $t3,$t3,$in1infty
1682 and $a0,$a0,$in1infty
1683 and $a1,$a1,$in1infty
1684 and $a2,$a2,$in1infty
1685 and $a3,$a3,$in1infty
1686 or $t0,$t0,$a0
1687 or $t1,$t1,$a1
1688 or $t2,$t2,$a2
1689 or $t3,$t3,$a3
1690 andc $acc0,$acc0,$in2infty
1691 andc $acc1,$acc1,$in2infty
1692 andc $acc2,$acc2,$in2infty
1693 andc $acc3,$acc3,$in2infty
1694 and $t0,$t0,$in2infty
1695 and $t1,$t1,$in2infty
1696 and $t2,$t2,$in2infty
1697 and $t3,$t3,$in2infty
1698 or $acc0,$acc0,$t0
1699 or $acc1,$acc1,$t1
1700 or $acc2,$acc2,$t2
1701 or $acc3,$acc3,$t3
1702 std $acc0,$i+0($rp_real)
1703 std $acc1,$i+8($rp_real)
1704 std $acc2,$i+16($rp_real)
1705 std $acc3,$i+24($rp_real)
1706
1707 mtlr r0
1708 ld r16,$FRAME-8*16($sp)
1709 ld r17,$FRAME-8*15($sp)
1710 ld r18,$FRAME-8*14($sp)
1711 ld r19,$FRAME-8*13($sp)
1712 ld r20,$FRAME-8*12($sp)
1713 ld r21,$FRAME-8*11($sp)
1714 ld r22,$FRAME-8*10($sp)
1715 ld r23,$FRAME-8*9($sp)
1716 ld r24,$FRAME-8*8($sp)
1717 ld r25,$FRAME-8*7($sp)
1718 ld r26,$FRAME-8*6($sp)
1719 ld r27,$FRAME-8*5($sp)
1720 ld r28,$FRAME-8*4($sp)
1721 ld r29,$FRAME-8*3($sp)
1722 ld r30,$FRAME-8*2($sp)
1723 ld r31,$FRAME-8*1($sp)
1724 addi $sp,$sp,$FRAME
1725 blr
1726 .long 0
1727 .byte 0,12,4,0,0x80,16,3,0
1728 .long 0
1729 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1730 ___
1731 }
1732 if (1) {
1733 my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
1734 my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
1735
1736 $code.=<<___;
1737 ########################################################################
1738 # void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1739 # uint64_t b[4]);
1740 .globl ecp_nistz256_ord_mul_mont
1741 .align 5
1742 ecp_nistz256_ord_mul_mont:
1743 stdu $sp,-160($sp)
1744 std r18,48($sp)
1745 std r19,56($sp)
1746 std r20,64($sp)
1747 std r21,72($sp)
1748 std r22,80($sp)
1749 std r23,88($sp)
1750 std r24,96($sp)
1751 std r25,104($sp)
1752 std r26,112($sp)
1753 std r27,120($sp)
1754 std r28,128($sp)
1755 std r29,136($sp)
1756 std r30,144($sp)
1757 std r31,152($sp)
1758
1759 ld $a0,0($ap)
1760 ld $bi,0($bp)
1761 ld $a1,8($ap)
1762 ld $a2,16($ap)
1763 ld $a3,24($ap)
1764
1765 lis $ordk,0xccd1
1766 lis $ord0,0xf3b9
1767 lis $ord1,0xbce6
1768 ori $ordk,$ordk,0xc8aa
1769 ori $ord0,$ord0,0xcac2
1770 ori $ord1,$ord1,0xfaad
1771 sldi $ordk,$ordk,32
1772 sldi $ord0,$ord0,32
1773 sldi $ord1,$ord1,32
1774 oris $ordk,$ordk,0xee00
1775 oris $ord0,$ord0,0xfc63
1776 oris $ord1,$ord1,0xa717
1777 ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
1778 ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
1779 ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
1780 li $ord2,-1 # 0xffffffffffffffff
1781 sldi $ord3,$ord2,32 # 0xffffffff00000000
1782 li $zr,0
1783
1784 mulld $acc0,$a0,$bi # a[0]*b[0]
1785 mulhdu $t0,$a0,$bi
1786
1787 mulld $acc1,$a1,$bi # a[1]*b[0]
1788 mulhdu $t1,$a1,$bi
1789
1790 mulld $acc2,$a2,$bi # a[2]*b[0]
1791 mulhdu $t2,$a2,$bi
1792
1793 mulld $acc3,$a3,$bi # a[3]*b[0]
1794 mulhdu $acc4,$a3,$bi
1795
1796 mulld $t4,$acc0,$ordk
1797
1798 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
1799 adde $acc2,$acc2,$t1
1800 adde $acc3,$acc3,$t2
1801 addze $acc4,$acc4
1802 li $acc5,0
1803 ___
1804 for ($i=1;$i<4;$i++) {
1805 ################################################################
1806 # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1807 # * abcdefgh
1808 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1809 #
1810 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1811 # rewrite the above as:
1812 #
1813 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1814 # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1815 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1816 $code.=<<___;
1817 ld $bi,8*$i($bp) # b[i]
1818
1819 sldi $t0,$t4,32
1820 subfc $acc2,$t4,$acc2
1821 srdi $t1,$t4,32
1822 subfe $acc3,$t0,$acc3
1823 subfe $acc4,$t1,$acc4
1824 subfe $acc5,$zr,$acc5
1825
1826 addic $t0,$acc0,-1 # discarded
1827 mulhdu $t1,$ord0,$t4
1828 mulld $t2,$ord1,$t4
1829 mulhdu $t3,$ord1,$t4
1830
1831 adde $t2,$t2,$t1
1832 mulld $t0,$a0,$bi
1833 addze $t3,$t3
1834 mulld $t1,$a1,$bi
1835
1836 addc $acc0,$acc1,$t2
1837 mulld $t2,$a2,$bi
1838 adde $acc1,$acc2,$t3
1839 mulld $t3,$a3,$bi
1840 adde $acc2,$acc3,$t4
1841 adde $acc3,$acc4,$t4
1842 addze $acc4,$acc5
1843
1844 addc $acc0,$acc0,$t0 # accumulate low parts
1845 mulhdu $t0,$a0,$bi
1846 adde $acc1,$acc1,$t1
1847 mulhdu $t1,$a1,$bi
1848 adde $acc2,$acc2,$t2
1849 mulhdu $t2,$a2,$bi
1850 adde $acc3,$acc3,$t3
1851 mulhdu $t3,$a3,$bi
1852 addze $acc4,$acc4
1853 mulld $t4,$acc0,$ordk
1854 addc $acc1,$acc1,$t0 # accumulate high parts
1855 adde $acc2,$acc2,$t1
1856 adde $acc3,$acc3,$t2
1857 adde $acc4,$acc4,$t3
1858 addze $acc5,$zr
1859 ___
1860 }
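# The reduction above relies on $ordk being -ord^-1 mod 2^64, so that
# acc[0]+t4*ord clears the bottom limb. A minimal check (illustrative
# only, never called; assumes core Math::BigInt):
sub _ordk_demo {
	use Math::BigInt;
	my $n = Math::BigInt->new("0xffffffff00000000ffffffffffffffff"
				 ."bce6faada7179e84f3b9cac2fc632551");	# group order
	my $k = Math::BigInt->new("0xccd1c8aaee00bc4f");		# $ordk
	my $r = Math::BigInt->new(1) << 64;
	return (($n * $k) % $r) == $r - 1;	# i.e. n*k == -1 mod 2^64
}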
1861 $code.=<<___;
1862 sldi $t0,$t4,32 # last reduction
1863 subfc $acc2,$t4,$acc2
1864 srdi $t1,$t4,32
1865 subfe $acc3,$t0,$acc3
1866 subfe $acc4,$t1,$acc4
1867 subfe $acc5,$zr,$acc5
1868
1869 addic $t0,$acc0,-1 # discarded
1870 mulhdu $t1,$ord0,$t4
1871 mulld $t2,$ord1,$t4
1872 mulhdu $t3,$ord1,$t4
1873
1874 adde $t2,$t2,$t1
1875 addze $t3,$t3
1876
1877 addc $acc0,$acc1,$t2
1878 adde $acc1,$acc2,$t3
1879 adde $acc2,$acc3,$t4
1880 adde $acc3,$acc4,$t4
1881 addze $acc4,$acc5
1882
1883 subfc $acc0,$ord0,$acc0 # ret -= modulus
1884 subfe $acc1,$ord1,$acc1
1885 subfe $acc2,$ord2,$acc2
1886 subfe $acc3,$ord3,$acc3
1887 subfe $acc4,$zr,$acc4
1888
1889 and $t0,$ord0,$acc4
1890 and $t1,$ord1,$acc4
1891 addc $acc0,$acc0,$t0 # ret += modulus if borrow
1892 and $t3,$ord3,$acc4
1893 adde $acc1,$acc1,$t1
1894 adde $acc2,$acc2,$acc4
1895 adde $acc3,$acc3,$t3
1896
1897 std $acc0,0($rp)
1898 std $acc1,8($rp)
1899 std $acc2,16($rp)
1900 std $acc3,24($rp)
1901
1902 ld r18,48($sp)
1903 ld r19,56($sp)
1904 ld r20,64($sp)
1905 ld r21,72($sp)
1906 ld r22,80($sp)
1907 ld r23,88($sp)
1908 ld r24,96($sp)
1909 ld r25,104($sp)
1910 ld r26,112($sp)
1911 ld r27,120($sp)
1912 ld r28,128($sp)
1913 ld r29,136($sp)
1914 ld r30,144($sp)
1915 ld r31,152($sp)
1916 addi $sp,$sp,160
1917 blr
1918 .long 0
1919 .byte 0,12,4,0,0x80,14,3,0
1920 .long 0
1921 .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1922
1923 ################################################################################
1924 # void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1925 # uint64_t rep);
1926 .globl ecp_nistz256_ord_sqr_mont
1927 .align 5
1928 ecp_nistz256_ord_sqr_mont:
1929 stdu $sp,-160($sp)
1930 std r18,48($sp)
1931 std r19,56($sp)
1932 std r20,64($sp)
1933 std r21,72($sp)
1934 std r22,80($sp)
1935 std r23,88($sp)
1936 std r24,96($sp)
1937 std r25,104($sp)
1938 std r26,112($sp)
1939 std r27,120($sp)
1940 std r28,128($sp)
1941 std r29,136($sp)
1942 std r30,144($sp)
1943 std r31,152($sp)
1944
1945 mtctr $bp
1946
1947 ld $a0,0($ap)
1948 ld $a1,8($ap)
1949 ld $a2,16($ap)
1950 ld $a3,24($ap)
1951
1952 lis $ordk,0xccd1
1953 lis $ord0,0xf3b9
1954 lis $ord1,0xbce6
1955 ori $ordk,$ordk,0xc8aa
1956 ori $ord0,$ord0,0xcac2
1957 ori $ord1,$ord1,0xfaad
1958 sldi $ordk,$ordk,32
1959 sldi $ord0,$ord0,32
1960 sldi $ord1,$ord1,32
1961 oris $ordk,$ordk,0xee00
1962 oris $ord0,$ord0,0xfc63
1963 oris $ord1,$ord1,0xa717
1964 ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
1965 ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
1966 ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
1967 li $ord2,-1 # 0xffffffffffffffff
1968 sldi $ord3,$ord2,32 # 0xffffffff00000000
1969 li $zr,0
1970 b .Loop_ord_sqr
1971
1972 .align 5
1973 .Loop_ord_sqr:
1974 ################################################################
1975 # | | | | | |a1*a0| |
1976 # | | | | |a2*a0| | |
1977 # | |a3*a2|a3*a0| | | |
1978 # | | | |a2*a1| | | |
1979 # | | |a3*a1| | | | |
1980 # *| | | | | | | | 2|
1981 # +|a3*a3|a2*a2|a1*a1|a0*a0|
1982 # |--+--+--+--+--+--+--+--|
1983 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1984 #
1985 # "can't overflow" below mark carrying into high part of
1986 # multiplication result, which can't overflow, because it
1987 # can never be all ones.
1988
1989 mulld $acc1,$a1,$a0 # a[1]*a[0]
1990 mulhdu $t1,$a1,$a0
1991 mulld $acc2,$a2,$a0 # a[2]*a[0]
1992 mulhdu $t2,$a2,$a0
1993 mulld $acc3,$a3,$a0 # a[3]*a[0]
1994 mulhdu $acc4,$a3,$a0
1995
1996 addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
1997 mulld $t0,$a2,$a1 # a[2]*a[1]
1998 mulhdu $t1,$a2,$a1
1999 adde $acc3,$acc3,$t2
2000 mulld $t2,$a3,$a1 # a[3]*a[1]
2001 mulhdu $t3,$a3,$a1
2002 addze $acc4,$acc4 # can't overflow
2003
2004 mulld $acc5,$a3,$a2 # a[3]*a[2]
2005 mulhdu $acc6,$a3,$a2
2006
2007 addc $t1,$t1,$t2 # accumulate high parts of multiplication
2008 mulld $acc0,$a0,$a0 # a[0]*a[0]
2009 addze $t2,$t3 # can't overflow
2010
2011 addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
2012 mulhdu $a0,$a0,$a0
2013 adde $acc4,$acc4,$t1
2014 mulld $t1,$a1,$a1 # a[1]*a[1]
2015 adde $acc5,$acc5,$t2
2016 mulhdu $a1,$a1,$a1
2017 addze $acc6,$acc6 # can't overflow
2018
2019 addc $acc1,$acc1,$acc1 # acc[1-6]*=2
2020 mulld $t2,$a2,$a2 # a[2]*a[2]
2021 adde $acc2,$acc2,$acc2
2022 mulhdu $a2,$a2,$a2
2023 adde $acc3,$acc3,$acc3
2024 mulld $t3,$a3,$a3 # a[3]*a[3]
2025 adde $acc4,$acc4,$acc4
2026 mulhdu $a3,$a3,$a3
2027 adde $acc5,$acc5,$acc5
2028 adde $acc6,$acc6,$acc6
2029 addze $acc7,$zr
2030
2031 addc $acc1,$acc1,$a0 # +a[i]*a[i]
2032 mulld $t4,$acc0,$ordk
2033 adde $acc2,$acc2,$t1
2034 adde $acc3,$acc3,$a1
2035 adde $acc4,$acc4,$t2
2036 adde $acc5,$acc5,$a2
2037 adde $acc6,$acc6,$t3
2038 adde $acc7,$acc7,$a3
2039 ___
2040 for($i=0; $i<4; $i++) { # reductions
2041 $code.=<<___;
2042 addic $t0,$acc0,-1 # discarded
2043 mulhdu $t1,$ord0,$t4
2044 mulld $t2,$ord1,$t4
2045 mulhdu $t3,$ord1,$t4
2046
2047 adde $t2,$t2,$t1
2048 addze $t3,$t3
2049
2050 addc $acc0,$acc1,$t2
2051 adde $acc1,$acc2,$t3
2052 adde $acc2,$acc3,$t4
2053 adde $acc3,$zr,$t4 # can't overflow
2054 ___
2055 $code.=<<___ if ($i<3);
2056 mulld $t3,$acc0,$ordk
2057 ___
2058 $code.=<<___;
2059 sldi $t0,$t4,32
2060 subfc $acc1,$t4,$acc1
2061 srdi $t1,$t4,32
2062 subfe $acc2,$t0,$acc2
2063 subfe $acc3,$t1,$acc3 # can't borrow
2064 ___
2065 ($t3,$t4) = ($t4,$t3);
2066 }
2067 $code.=<<___;
2068 addc $acc0,$acc0,$acc4 # accumulate upper half
2069 adde $acc1,$acc1,$acc5
2070 adde $acc2,$acc2,$acc6
2071 adde $acc3,$acc3,$acc7
2072 addze $acc4,$zr
2073
2074 subfc $acc0,$ord0,$acc0 # ret -= modulus
2075 subfe $acc1,$ord1,$acc1
2076 subfe $acc2,$ord2,$acc2
2077 subfe $acc3,$ord3,$acc3
2078 subfe $acc4,$zr,$acc4
2079
2080 and $t0,$ord0,$acc4
2081 and $t1,$ord1,$acc4
2082 addc $a0,$acc0,$t0 # ret += modulus if borrow
2083 and $t3,$ord3,$acc4
2084 adde $a1,$acc1,$t1
2085 adde $a2,$acc2,$acc4
2086 adde $a3,$acc3,$t3
2087
2088 bdnz .Loop_ord_sqr
2089
2090 std $a0,0($rp)
2091 std $a1,8($rp)
2092 std $a2,16($rp)
2093 std $a3,24($rp)
2094
2095 ld r18,48($sp)
2096 ld r19,56($sp)
2097 ld r20,64($sp)
2098 ld r21,72($sp)
2099 ld r22,80($sp)
2100 ld r23,88($sp)
2101 ld r24,96($sp)
2102 ld r25,104($sp)
2103 ld r26,112($sp)
2104 ld r27,120($sp)
2105 ld r28,128($sp)
2106 ld r29,136($sp)
2107 ld r30,144($sp)
2108 ld r31,152($sp)
2109 addi $sp,$sp,160
2110 blr
2111 .long 0
2112 .byte 0,12,4,0,0x80,14,3,0
2113 .long 0
2114 .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
2115 ___
2116 } }
2117
2118 ########################################################################
2119 # scatter-gather subroutines
2120 {
2121 my ($out,$inp,$index,$mask)=map("r$_",(3..7));
2122 $code.=<<___;
2123 ########################################################################
2124 # void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
2125 # int index);
2126 .globl ecp_nistz256_scatter_w5
2127 .align 4
2128 ecp_nistz256_scatter_w5:
2129 slwi $index,$index,2
2130 add $out,$out,$index
2131
2132 ld r8, 0($inp) # X
2133 ld r9, 8($inp)
2134 ld r10,16($inp)
2135 ld r11,24($inp)
2136
2137 stw r8, 64*0-4($out)
2138 srdi r8, r8, 32
2139 stw r9, 64*1-4($out)
2140 srdi r9, r9, 32
2141 stw r10,64*2-4($out)
2142 srdi r10,r10,32
2143 stw r11,64*3-4($out)
2144 srdi r11,r11,32
2145 stw r8, 64*4-4($out)
2146 stw r9, 64*5-4($out)
2147 stw r10,64*6-4($out)
2148 stw r11,64*7-4($out)
2149 addi $out,$out,64*8
2150
2151 ld r8, 32($inp) # Y
2152 ld r9, 40($inp)
2153 ld r10,48($inp)
2154 ld r11,56($inp)
2155
2156 stw r8, 64*0-4($out)
2157 srdi r8, r8, 32
2158 stw r9, 64*1-4($out)
2159 srdi r9, r9, 32
2160 stw r10,64*2-4($out)
2161 srdi r10,r10,32
2162 stw r11,64*3-4($out)
2163 srdi r11,r11,32
2164 stw r8, 64*4-4($out)
2165 stw r9, 64*5-4($out)
2166 stw r10,64*6-4($out)
2167 stw r11,64*7-4($out)
2168 addi $out,$out,64*8
2169
2170 ld r8, 64($inp) # Z
2171 ld r9, 72($inp)
2172 ld r10,80($inp)
2173 ld r11,88($inp)
2174
2175 stw r8, 64*0-4($out)
2176 srdi r8, r8, 32
2177 stw r9, 64*1-4($out)
2178 srdi r9, r9, 32
2179 stw r10,64*2-4($out)
2180 srdi r10,r10,32
2181 stw r11,64*3-4($out)
2182 srdi r11,r11,32
2183 stw r8, 64*4-4($out)
2184 stw r9, 64*5-4($out)
2185 stw r10,64*6-4($out)
2186 stw r11,64*7-4($out)
2187
2188 blr
2189 .long 0
2190 .byte 0,12,0x14,0,0,0,3,0
2191 .long 0
2192 .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2193
2194 ########################################################################
2195 # void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
2196 # int index);
2197 .globl ecp_nistz256_gather_w5
2198 .align 4
2199 ecp_nistz256_gather_w5:
2200 neg r0,$index
2201 sradi r0,r0,63
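	# r0 = index ? ~0 : 0; the index is biased down by one and r0 is
	# used as a mask, so index 0 yields the all-zero point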
2202
2203 add $index,$index,r0
2204 slwi $index,$index,2
2205 add $inp,$inp,$index
2206
2207 lwz r5, 64*0($inp)
2208 lwz r6, 64*1($inp)
2209 lwz r7, 64*2($inp)
2210 lwz r8, 64*3($inp)
2211 lwz r9, 64*4($inp)
2212 lwz r10,64*5($inp)
2213 lwz r11,64*6($inp)
2214 lwz r12,64*7($inp)
2215 addi $inp,$inp,64*8
2216 sldi r9, r9, 32
2217 sldi r10,r10,32
2218 sldi r11,r11,32
2219 sldi r12,r12,32
2220 or r5,r5,r9
2221 or r6,r6,r10
2222 or r7,r7,r11
2223 or r8,r8,r12
2224 and r5,r5,r0
2225 and r6,r6,r0
2226 and r7,r7,r0
2227 and r8,r8,r0
2228 std r5,0($out) # X
2229 std r6,8($out)
2230 std r7,16($out)
2231 std r8,24($out)
2232
2233 lwz r5, 64*0($inp)
2234 lwz r6, 64*1($inp)
2235 lwz r7, 64*2($inp)
2236 lwz r8, 64*3($inp)
2237 lwz r9, 64*4($inp)
2238 lwz r10,64*5($inp)
2239 lwz r11,64*6($inp)
2240 lwz r12,64*7($inp)
2241 addi $inp,$inp,64*8
2242 sldi r9, r9, 32
2243 sldi r10,r10,32
2244 sldi r11,r11,32
2245 sldi r12,r12,32
2246 or r5,r5,r9
2247 or r6,r6,r10
2248 or r7,r7,r11
2249 or r8,r8,r12
2250 and r5,r5,r0
2251 and r6,r6,r0
2252 and r7,r7,r0
2253 and r8,r8,r0
2254 std r5,32($out) # Y
2255 std r6,40($out)
2256 std r7,48($out)
2257 std r8,56($out)
2258
2259 lwz r5, 64*0($inp)
2260 lwz r6, 64*1($inp)
2261 lwz r7, 64*2($inp)
2262 lwz r8, 64*3($inp)
2263 lwz r9, 64*4($inp)
2264 lwz r10,64*5($inp)
2265 lwz r11,64*6($inp)
2266 lwz r12,64*7($inp)
2267 sldi r9, r9, 32
2268 sldi r10,r10,32
2269 sldi r11,r11,32
2270 sldi r12,r12,32
2271 or r5,r5,r9
2272 or r6,r6,r10
2273 or r7,r7,r11
2274 or r8,r8,r12
2275 and r5,r5,r0
2276 and r6,r6,r0
2277 and r7,r7,r0
2278 and r8,r8,r0
2279 std r5,64($out) # Z
2280 std r6,72($out)
2281 std r7,80($out)
2282 std r8,88($out)
2283
2284 blr
2285 .long 0
2286 .byte 0,12,0x14,0,0,0,3,0
2287 .long 0
2288 .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2289
2290 ########################################################################
2291 # void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
2292 # int index);
2293 .globl ecp_nistz256_scatter_w7
2294 .align 4
2295 ecp_nistz256_scatter_w7:
2296 li r0,8
2297 mtctr r0
2298 add $out,$out,$index
2299 subi $inp,$inp,8
2300
2301 .Loop_scatter_w7:
2302 ldu r0,8($inp)
2303 stb r0,64*0($out)
2304 srdi r0,r0,8
2305 stb r0,64*1($out)
2306 srdi r0,r0,8
2307 stb r0,64*2($out)
2308 srdi r0,r0,8
2309 stb r0,64*3($out)
2310 srdi r0,r0,8
2311 stb r0,64*4($out)
2312 srdi r0,r0,8
2313 stb r0,64*5($out)
2314 srdi r0,r0,8
2315 stb r0,64*6($out)
2316 srdi r0,r0,8
2317 stb r0,64*7($out)
2318 addi $out,$out,64*8
2319 bdnz .Loop_scatter_w7
2320
2321 blr
2322 .long 0
2323 .byte 0,12,0x14,0,0,0,3,0
2324 .long 0
2325 .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2326
2327 ########################################################################
2328 # void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
2329 # int index);
2330 .globl ecp_nistz256_gather_w7
2331 .align 4
2332 ecp_nistz256_gather_w7:
2333 li r0,8
2334 mtctr r0
2335 neg r0,$index
2336 sradi r0,r0,63
2337
2338 add $index,$index,r0
2339 add $inp,$inp,$index
2340 subi $out,$out,8
2341
2342 .Loop_gather_w7:
2343 lbz r5, 64*0($inp)
2344 lbz r6, 64*1($inp)
2345 lbz r7, 64*2($inp)
2346 lbz r8, 64*3($inp)
2347 lbz r9, 64*4($inp)
2348 lbz r10,64*5($inp)
2349 lbz r11,64*6($inp)
2350 lbz r12,64*7($inp)
2351 addi $inp,$inp,64*8
2352
2353 sldi r6, r6, 8
2354 sldi r7, r7, 16
2355 sldi r8, r8, 24
2356 sldi r9, r9, 32
2357 sldi r10,r10,40
2358 sldi r11,r11,48
2359 sldi r12,r12,56
2360
2361 or r5,r5,r6
2362 or r7,r7,r8
2363 or r9,r9,r10
2364 or r11,r11,r12
2365 or r5,r5,r7
2366 or r9,r9,r11
2367 or r5,r5,r9
2368 and r5,r5,r0
2369 stdu r5,8($out)
2370 bdnz .Loop_gather_w7
2371
2372 blr
2373 .long 0
2374 .byte 0,12,0x14,0,0,0,3,0
2375 .long 0
2376 .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
2377 ___
2378 }
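# A round-trip sketch of the w5 layout used above (illustrative only,
# never called; assumes 64-bit perl): scatter splits each 64-bit limb
# into 32-bit halves stored 64 bytes (16 cells) apart, which is what
# gather_w5 reassembles with sldi/or.
sub _w5_roundtrip_demo {
	my @limbs = (0x0123456789abcdef, 0x0f1e2d3c4b5a6978,
		     0x1122334455667788, 0x0011223344556677);	# toy coordinate
	my $index = 5;					# table index, 1..16
	my (@cells, @back);				# @cells: 32-bit units, 16 per row
	for my $i (0..3) {				# scatter one coordinate
		$cells[ $i   *16 + $index-1] =  $limbs[$i]      & 0xffffffff;
		$cells[($i+4)*16 + $index-1] = ($limbs[$i]>>32) & 0xffffffff;
	}
	for my $i (0..3) {				# gather it back
		$back[$i] = $cells[$i*16 + $index-1]
			  | ($cells[($i+4)*16 + $index-1] << 32);
	}
	return "@back" eq "@limbs";			# round trip holds
}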
2379
2380 foreach (split("\n",$code)) {
2381 s/\`([^\`]*)\`/eval $1/ge;
2382
2383 print $_,"\n";
2384 }
2385 close STDOUT or die "error closing STDOUT: $!"; # enforce flush