#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for PPC64.
#
# August 2016.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
# with/without -DECP_NISTZ256_ASM
# POWER7	+260-530%
# POWER8	+220-340%

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $sp="r1";

{
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
    map("r$_",(3..12,22..31));

my ($acc6,$acc7)=($bp,$bi);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
.machine	"any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c" or
open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# The comparison is against 64*16*37-1 because $#arr returns the last
# valid index of @arr, not the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.type	ecp_nistz256_precomputed,\@object
.globl	ecp_nistz256_precomputed
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# This conversion scatters each P256_POINT_AFFINE into individual bytes
# placed at 64-byte intervals, i.e. it transposes
# 1111222233334444
# into
# 1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}

$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz	"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

# void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
#				      const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.align	5
ecp_nistz256_mul_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_mul_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

# void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.align	4
ecp_nistz256_sqr_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sqr_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

# void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
#				 const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.align	4
ecp_nistz256_add:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$t0, 0($bp)
	ld	$acc1,8($ap)
	ld	$t1, 8($bp)
	ld	$acc2,16($ap)
	ld	$t2, 16($bp)
	ld	$acc3,24($ap)
	ld	$t3, 24($bp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_add,.-ecp_nistz256_add

# void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.align	4
ecp_nistz256_div_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_div_by_2

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

# void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.align	4
ecp_nistz256_mul_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

# void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.align	4
ecp_nistz256_mul_by_3:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	std	$acc0,64($sp)
	mr	$t1,$acc1
	std	$acc1,72($sp)
	mr	$t2,$acc2
	std	$acc2,80($sp)
	mr	$t3,$acc3
	std	$acc3,88($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	ld	$t0,64($sp)
	ld	$t1,72($sp)
	ld	$t2,80($sp)
	ld	$t3,88($sp)

	bl	__ecp_nistz256_add	# ret += a	// 2*a+a=3*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

# void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
#				 const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.align	4
ecp_nistz256_sub:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

# void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.align	4
ecp_nistz256_neg:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	mr	$bp,$ap
	li	$acc0,0
	li	$acc1,0
	li	$acc2,0
	li	$acc3,0

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

# Note that __ecp_nistz256_mul_mont expects the a[0-3] input pre-loaded
# in $a0-$a3 and b[0] in $bi.
.type	__ecp_nistz256_mul_mont,\@function
.align	4
__ecp_nistz256_mul_mont:
	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
	ld	$bi,8($bp)		# b[1]

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	addze	$acc4,$t3
	li	$acc5,0
___
for($i=1;$i<4;$i++) {
	################################################################
	# A reduction iteration is normally performed by accumulating
	# the result of multiplying the modulus by a "magic" digit [and
	# omitting the least significant word, which is guaranteed to
	# be 0], but thanks to the special form of the modulus, and the
	# "magic" digit being equal to the least significant word, it
	# can be performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	mulld	$t0,$a0,$bi		# lo(a[0]*b[i])
	mulld	$t1,$a1,$bi		# lo(a[1]*b[i])
	mulld	$t2,$a2,$bi		# lo(a[2]*b[i])
	mulld	$t3,$a3,$bi		# lo(a[3]*b[i])
	addc	$acc0,$acc0,$t0		# accumulate low parts of multiplication
	mulhdu	$t0,$a0,$bi		# hi(a[0]*b[i])
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi		# hi(a[1]*b[i])
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi		# hi(a[2]*b[i])
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi		# hi(a[3]*b[i])
	addze	$acc4,$acc4
___
$code.=<<___ if ($i<3);
	ld	$bi,8*($i+1)($bp)	# b[$i+1]
___
$code.=<<___;
	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	li	$acc5,0
	addze	$acc5,$acc5
___
}
$code.=<<___;
	# last reduction
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

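	# At this point the result fits in 257 bits ($acc4 holds bit 256).
	# Final step: subtract the modulus once and add it back if the
	# subtraction borrowed. Subtracting the all-ones low word of the
	# modulus is done as "addic +1", since x-(2^64-1) = x+1-2^64 and
	# the PPC carry bit plays the role of not-borrow.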
	li	$t2,0
	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

# Note that __ecp_nistz256_sqr_mont expects the a[0-3] input pre-loaded
# in $a0-$a3.
.type	__ecp_nistz256_sqr_mont,\@function
.align	4
__ecp_nistz256_sqr_mont:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# The "can't overflow" remarks below mark carries into the high
	# part of a multiplication result, which can't overflow because
	# the high part can never be all ones.
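	# (Concretely: for 64-bit a and b, mulhdu(a,b) <= 2^64-2, since
	# (2^64-1)^2 = 2^128-2^65+1 has high word 2^64-2, so adding one
	# carry bit to it cannot wrap.)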

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	adde	$acc4,$acc4,$t1
	adde	$acc5,$acc5,$t2
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	adde	$acc2,$acc2,$acc2
	adde	$acc3,$acc3,$acc3
	adde	$acc4,$acc4,$acc4
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	li	$acc7,0
	addze	$acc7,$acc7

	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	mulhdu	$a0,$a0,$a0
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	mulhdu	$a1,$a1,$a1
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	mulhdu	$a2,$a2,$a2
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	mulhdu	$a3,$a3,$a3
	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	sldi	$t0,$acc0,32
	adde	$acc1,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow
___
}
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow

	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	li	$t2,0
	addze	$acc4,$t2

	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $a0-$a3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,\@function
.align	4
__ecp_nistz256_add:
	addc	$acc0,$acc0,$t0		# ret = a+b
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	li	$t2,0
	adde	$acc3,$acc3,$t3
	addze	$t0,$t2

	# if a+b >= modulus, subtract modulus
	#
	# But since comparison implies subtraction, we subtract
	# modulus and then add it back if subtraction borrowed.

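	# ($t0 ends up 0 if the subtraction below does not borrow and
	# all-ones if it does, and is then used as a mask to add the
	# modulus back branchlessly; subtracting the all-ones low word
	# of the modulus is done as "subic -1", i.e. adding 1 mod 2^64.)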
	subic	$acc0,$acc0,-1
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$t0,$t2,$t0

	addc	$acc0,$acc0,$t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,\@function
.align	4
__ecp_nistz256_sub_from:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$t0,$acc0		# ret = a-b
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if a-b borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret += modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,\@function
.align	4
__ecp_nistz256_sub_morf:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$acc0,$t0		# ret = b-a
	subfe	$acc1,$acc1,$t1
	subfe	$acc2,$acc2,$t2
	subfe	$acc3,$acc3,$t3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if b-a borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret += modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,\@function
.align	4
__ecp_nistz256_div_by_2:
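	# Halve a modulo the (odd) modulus: unconditionally add the
	# modulus, subtract it back if a was even ($t0 is an all-ones
	# mask in that case), then shift the 257-bit result right one
	# bit, with $ap temporarily holding bit 256. No branches.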
	andi.	$t0,$acc0,1
	addic	$acc0,$acc0,-1		# a += modulus
	neg	$t0,$t0
	adde	$acc1,$acc1,$poly1
	not	$t0,$t0
	addze	$acc2,$acc2
	li	$t2,0
	adde	$acc3,$acc3,$poly3
	and	$t1,$poly1,$t0
	addze	$ap,$t2			# ap = carry
	and	$t3,$poly3,$t0

	subfc	$acc0,$t0,$acc0		# a -= modulus if a was even
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$ap, $t2,$ap

	srdi	$acc0,$acc0,1
	sldi	$t0,$acc1,63
	srdi	$acc1,$acc1,1
	sldi	$t1,$acc2,63
	srdi	$acc2,$acc2,1
	sldi	$t2,$acc3,63
	srdi	$acc3,$acc3,1
	sldi	$t3,$ap,63
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c.
#
########################################################################
# void	ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
if (1) {
my $FRAME=64+32*4+12*8;
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("r$_",(20,21));

$code.=<<___;
.globl	ecp_nistz256_point_double
.align	5
ecp_nistz256_point_double:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
.Ldouble_shortcut:
	ld	$acc0,32($ap)
	ld	$acc1,40($ap)
	ld	$acc2,48($ap)
	ld	$acc3,56($ap)
	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,64($ap)		# forward load for p256_sqr_mont
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_add	# p256_mul_by_2(S, in_y);

	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Zsqr, in_z);

	ld	$t0,0($ap_real)
	ld	$t1,8($ap_real)
	ld	$t2,16($ap_real)
	ld	$t3,24($ap_real)
	mr	$a0,$acc0		# put Zsqr aside for p256_sub
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add	# p256_add(M, Zsqr, in_x);

	addi	$bp,$ap_real,0
	mr	$acc0,$a0		# restore Zsqr
	mr	$acc1,$a1
	mr	$acc2,$a2
	mr	$acc3,$a3
	ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sub_morf	# p256_sub(Zsqr, in_x, Zsqr);

	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(S, S);

	ld	$bi,32($ap_real)
	ld	$a0,64($ap_real)
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(tmp0, in_z, in_y);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	addi	$rp,$rp_real,64
	bl	__ecp_nistz256_add	# p256_mul_by_2(res_z, tmp0);

	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(tmp0, S);

	ld	$bi,$Zsqr($sp)		# forward load for p256_mul_mont
	ld	$a0,$M+0($sp)
	ld	$a1,$M+8($sp)
	ld	$a2,$M+16($sp)
	ld	$a3,$M+24($sp)
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_div_by_2	# p256_div_by_2(res_y, tmp0);

	addi	$bp,$sp,$Zsqr
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(M, M, Zsqr);

	mr	$t0,$acc0		# duplicate M
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	mr	$a0,$acc0		# put M aside
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add
	mr	$t0,$a0			# restore M
	mr	$t1,$a1
	mr	$t2,$a2
	mr	$t3,$a3
	ld	$bi,0($ap_real)		# forward load for p256_mul_mont
	ld	$a0,$S+0($sp)
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	bl	__ecp_nistz256_add	# p256_mul_by_3(M, M);

	addi	$bp,$ap_real,0
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, in_x);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,$M+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$M+8($sp)
	ld	$a2,$M+16($sp)
	ld	$a3,$M+24($sp)
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_add	# p256_mul_by_2(tmp0, S);

	addi	$rp,$rp_real,0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(res_x, M);

	addi	$bp,$sp,$tmp0
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, tmp0);

	addi	$bp,$sp,$S
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sub_morf	# p256_sub(S, S, res_x);

	ld	$bi,$M($sp)
	mr	$a0,$acc0		# copy S
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$bp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, M);

	addi	$bp,$rp_real,32
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, S, res_y);

	mtlr	r0
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,2,0
	.long	0
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void	ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#				       const P256_POINT *in2);
if (1) {
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add
.align	5
ecp_nistz256_point_add:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	ld	$a0,64($bp)		# in2_z
	ld	$a1,72($bp)
	ld	$a2,80($bp)
	ld	$a3,88($bp)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in2infty,$t0,$t2
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty
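	# ($in2infty is computed branchlessly: x|-x has its top bit set
	# for any x != 0, so after sradi the register is 0 if in2_z was
	# zero and all-ones otherwise, i.e. a "point is not at infinity"
	# mask)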
	addi	$rp,$sp,$Z2sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z2sqr, in2_z);

	ld	$a0,64($ap_real)	# in1_z
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty
	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	ld	$bi,64($bp_real)
	ld	$a0,$Z2sqr+0($sp)
	ld	$a1,$Z2sqr+8($sp)
	ld	$a2,$Z2sqr+16($sp)
	ld	$a3,$Z2sqr+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, Z2sqr, in2_z);

	ld	$bi,64($ap_real)
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,32($ap_real)
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, S1, in1_y);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$sp,$S1
	ld	$bi,$Z2sqr($sp)		# forward load for p256_mul_mont
	ld	$a0,0($ap_real)
	ld	$a1,8($ap_real)
	ld	$a2,16($ap_real)
	ld	$a3,24($ap_real)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, S1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or	$temp,$acc0,$acc2

	addi	$bp,$sp,$Z2sqr
	addi	$rp,$sp,$U1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U1, in1_x, Z2sqr);

	ld	$bi,$Z1sqr($sp)
	ld	$a0,0($bp_real)
	ld	$a1,8($bp_real)
	ld	$a2,16($bp_real)
	ld	$a3,24($bp_real)
	addi	$bp,$sp,$Z1sqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in2_x, Z1sqr);

	addi	$bp,$sp,$U1
	ld	$a0,$R+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, U1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or.	$acc0,$acc0,$acc2
	bne	.Ladd_proceed		# is_equal(U1,U2)?

	and.	$t0,$in1infty,$in2infty
	beq	.Ladd_proceed		# (in1infty || in2infty)?

	cmpldi	$temp,0
	beq	.Ladd_double		# is_equal(S1,S2)?

	xor	$a0,$a0,$a0
	std	$a0,0($rp_real)
	std	$a0,8($rp_real)
	std	$a0,16($rp_real)
	std	$a0,24($rp_real)
	std	$a0,32($rp_real)
	std	$a0,40($rp_real)
	std	$a0,48($rp_real)
	std	$a0,56($rp_real)
	std	$a0,64($rp_real)
	std	$a0,72($rp_real)
	std	$a0,80($rp_real)
	std	$a0,88($rp_real)
	b	.Ladd_done

.align	4
.Ladd_double:
	ld	$bp,0($sp)		# back-link
	mr	$ap,$ap_real
	mr	$rp,$rp_real
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	stdu	$bp,$FRAME-288($sp)	# difference in stack frame sizes
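	# (the remaining 288-byte frame matches ecp_nistz256_point_double's
	# $FRAME = 64+32*4+12*8, so execution can continue at
	# .Ldouble_shortcut with point_double's stack layout)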
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$bi,64($bp_real)
	ld	$a0,$res_z+0($sp)
	ld	$a1,$res_z+8($sp)
	ld	$a2,$res_z+16($sp)
	ld	$a3,$res_z+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, res_z, in2_z);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,$Hsqr($sp)
	ld	$a0,$U1+0($sp)
	ld	$a1,$U1+8($sp)
	ld	$a2,$U1+16($sp)
	ld	$a3,$U1+24($sp)
	addi	$bp,$sp,$Hsqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, U1, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,$Hcub($sp)		# forward load for p256_mul_mont
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$sp,$Hcub
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S1, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	ld	$t0,$i+32($bp_real)	# in2
	ld	$t1,$i+40($bp_real)
	ld	$t2,$i+48($bp_real)
	ld	$t3,$i+56($bp_real)
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

.Ladd_done:
	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void	ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#					      const P256_POINT_AFFINE *in2);
if (1) {
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	5
ecp_nistz256_point_add_affine:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp

	ld	$a0,64($ap)		# in1_z
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty

	ld	$acc0,0($bp)		# in2_x
	ld	$acc1,8($bp)
	ld	$acc2,16($bp)
	ld	$acc3,24($bp)
	ld	$t0,32($bp)		# in2_y
	ld	$t1,40($bp)
	ld	$t2,48($bp)
	ld	$t3,56($bp)
	or	$acc0,$acc0,$acc1
	or	$acc2,$acc2,$acc3
	or	$acc0,$acc0,$acc2
	or	$t0,$t0,$t1
	or	$t2,$t2,$t3
	or	$t0,$t0,$t2
	or	$in2infty,$acc0,$t0
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty

	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	mr	$a0,$acc0
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	ld	$bi,0($bp_real)
	addi	$bp,$bp_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);

	addi	$bp,$ap_real,0
	ld	$bi,64($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);

	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$ap_real,32
	ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);

	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$a0,$R+0($sp)
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,0($ap_real)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$ap_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,32($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Hcub+0($sp)
	ld	$a1,$Hcub+8($sp)
	ld	$a2,$Hcub+16($sp)
	ld	$a3,$Hcub+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
$code.=<<___ if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
$code.=<<___ if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
___
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

$code.=<<___;
########################################################################
# void	ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#					  uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.align	5
ecp_nistz256_ord_mul_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$acc4,$a3,$bi

	mulld	$t4,$acc0,$ordk

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	addze	$acc4,$acc4
	li	$acc5,0
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
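	#
	# ($t4 = acc[0]*$ordk is the usual Montgomery step: $ordk =
	# 0xccd1c8aaee00bc4f is -1/ord mod 2^64, so adding t4*ord makes
	# the least significant word of the accumulator zero, and that
	# word is then discarded.)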
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	mulld	$t0,$a0,$bi
	addze	$t3,$t3
	mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk
	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void	ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#					  uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# The "can't overflow" remarks below mark carries into the high
	# part of a multiplication result, which can't overflow because
	# the high part can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___ if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#				int index);
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
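	# Each 256-bit coordinate is stored as 32-bit words strided 64
	# bytes apart: low halves of the four limbs in the first four
	# 64-byte columns, high halves in the next four. This is the
	# layout ecp_nistz256_gather_w5 below reassembles.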
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63
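	# (r0 is all-ones if index is nonzero and 0 otherwise; a zero
	# index masks the loads below to return an all-zero point, which
	# the callers treat as the point at infinity)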

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#				int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT;	# enforce flush