#! /usr/bin/env perl
# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for PPC64.
#
# July 2018.
#
# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
# faster on PPC970/G5. POWER8 on the other hand seems to trip on its
# own shoelaces when handling longer carry chains. As base 2^51 has
# just single-carry pairs, it's 25% faster than base 2^64. Since
# PPC970 is pretty old, the base 2^64 implementation is not engaged.
# Comparison to compiler-generated code is complicated by the fact
# that not all compilers support 128-bit integers. When a compiler
# doesn't, like xlc, this module delivers more than 2x improvement,
# and when it does, improvements from 12% to 30% were measured...
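#
# In short: the base 2^64 code keeps a GF(2^255-19) element in four
# 64-bit limbs and folds the upper half of a 512-bit product back
# with the multiplier 38, since 2^256 = 2*19 = 38 (mod p); the base
# 2^51 code keeps five 51-bit limbs in 64-bit registers, which leaves
# carry headroom in the 128-bit limb products and makes 19 the
# wrap-around factor, since 2^255 = 19 (mod p).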

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $sp = "r1";
my ($rp,$ap,$bp) = map("r$_",3..5);

####################################################### base 2^64
if (0) {	# base 2^64 code is not engaged, see notes above
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;

$code.=<<___;
.text

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function
.align	5
x25519_fe64_mul:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi
	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi
	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi
	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
___
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
my $acc4 = $i==1? $zero : @acc[4];

$code.=<<___;
	ld	$bi,`8*$i`($bp)
	addc	@acc[1],@acc[1],$t0	# accumulate high parts
	mulld	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulld	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulld	$t2,$a2,$bi
	adde	@acc[4],$acc4,$t3
	mulld	$t3,$a3,$bi
	addc	@acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulhdu	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulhdu	$t2,$a2,$bi
	adde	@acc[4],@acc[4],$t3
	mulhdu	$t3,$a3,$bi
	adde	@acc[5],$zero,$zero
___
}
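# Reduction sketch: the 512-bit product sits in acc0..acc7; since
# 2^256 = 38 (mod 2^255-19), the upper limbs are folded back below as
# acc[0..3] += 38*acc[4..7], and the last carry is folded once more
# with the same factor.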
$code.=<<___;
	li	$bi,38
	addc	$acc4,$acc4,$t0
	mulld	$t0,$acc4,$bi
	adde	$acc5,$acc5,$t1
	mulld	$t1,$acc5,$bi
	adde	$acc6,$acc6,$t2
	mulld	$t2,$acc6,$bi
	adde	$acc7,$acc7,$t3
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	adde	$acc4,$zero,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

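	# Branch-free final fold: subfe with identical operands turns
	# the carry bit into ~mask, and andc then yields 38 if the
	# addition above carried, 0 otherwise.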
	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function
.align	5
x25519_fe64_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |a2*a0|  |  |
	#  |a3*a2|a3*a0|  |  |  |
	#  |  |  |a2*a1|  |  |  |
	#  |  |a3*a1|  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	# |--+--+--+--+--+--+--+--|
	# |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
203 # "can't overflow" below mark carrying into high part of
204 # multiplication result, which can't overflow, because it
205 # can never be all ones.
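	#
	# In formula form, with a = a0 + a1*2^64 + a2*2^128 + a3*2^192:
	#
	#   a^2 = sum(a[i]^2 * 2^(128*i))
	#       + 2 * sum(a[i]*a[j] * 2^(64*(i+j))), i < j
	#
	# hence the doubling of acc1-acc6 before the squares are added.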
206
207 mulld $acc1,$a1,$a0 # a[1]*a[0]
208 mulhdu $t1,$a1,$a0
209 mulld $acc2,$a2,$a0 # a[2]*a[0]
210 mulhdu $t2,$a2,$a0
211 mulld $acc3,$a3,$a0 # a[3]*a[0]
212 mulhdu $acc4,$a3,$a0
213
214 addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
215 mulld $t0,$a2,$a1 # a[2]*a[1]
216 mulhdu $t1,$a2,$a1
217 adde $acc3,$acc3,$t2
218 mulld $t2,$a3,$a1 # a[3]*a[1]
219 mulhdu $t3,$a3,$a1
220 addze $acc4,$acc4 # can't overflow
221
222 mulld $acc5,$a3,$a2 # a[3]*a[2]
223 mulhdu $acc6,$a3,$a2
224
225 addc $t1,$t1,$t2 # accumulate high parts of multiplication
226 mulld $acc0,$a0,$a0 # a[0]*a[0]
227 addze $t2,$t3 # can't overflow
228
229 addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
230 mulhdu $a0,$a0,$a0
231 adde $acc4,$acc4,$t1
232 mulld $t1,$a1,$a1 # a[1]*a[1]
233 adde $acc5,$acc5,$t2
234 mulhdu $a1,$a1,$a1
235 addze $acc6,$acc6 # can't overflow
236
237 addc $acc1,$acc1,$acc1 # acc[1-6]*=2
238 mulld $t2,$a2,$a2 # a[2]*a[2]
239 adde $acc2,$acc2,$acc2
240 mulhdu $a2,$a2,$a2
241 adde $acc3,$acc3,$acc3
242 mulld $t3,$a3,$a3 # a[3]*a[3]
243 adde $acc4,$acc4,$acc4
244 mulhdu $a3,$a3,$a3
245 adde $acc5,$acc5,$acc5
246 adde $acc6,$acc6,$acc6
247 addze $acc7,$zero
248
249 addc $acc1,$acc1,$a0 # +a[i]*a[i]
250 li $bi,38
251 adde $acc2,$acc2,$t1
252 adde $acc3,$acc3,$a1
253 adde $acc4,$acc4,$t2
254 adde $acc5,$acc5,$a2
255 adde $acc6,$acc6,$t3
256 adde $acc7,$acc7,$a3
257
258 mulld $t0,$acc4,$bi
259 mulld $t1,$acc5,$bi
260 mulld $t2,$acc6,$bi
261 mulld $t3,$acc7,$bi
262
263 addc $acc0,$acc0,$t0
264 mulhdu $t0,$acc4,$bi
265 adde $acc1,$acc1,$t1
266 mulhdu $t1,$acc5,$bi
267 adde $acc2,$acc2,$t2
268 mulhdu $t2,$acc6,$bi
269 adde $acc3,$acc3,$t3
270 mulhdu $t3,$acc7,$bi
271 addze $acc4,$zero
272
273 addc $acc1,$acc1,$t0
274 adde $acc2,$acc2,$t1
275 adde $acc3,$acc3,$t2
276 adde $acc4,$acc4,$t3
277
278 mulld $acc4,$acc4,$bi
279
280 addc $acc0,$acc0,$acc4
281 addze $acc1,$acc1
282 addze $acc2,$acc2
283 addze $acc3,$acc3
284
285 subfe $acc4,$acc4,$acc4 # carry -> ~mask
286 std $acc1,8($rp)
287 andc $acc4,$bi,$acc4
288 std $acc2,16($rp)
289 add $acc0,$acc0,$acc4
290 std $acc3,24($rp)
291 std $acc0,0($rp)
292
293 ld r22,`$FRAME-8*10`($sp)
294 ld r23,`$FRAME-8*9`($sp)
295 ld r24,`$FRAME-8*8`($sp)
296 ld r25,`$FRAME-8*7`($sp)
297 ld r26,`$FRAME-8*6`($sp)
298 ld r27,`$FRAME-8*5`($sp)
299 ld r28,`$FRAME-8*4`($sp)
300 ld r29,`$FRAME-8*3`($sp)
301 ld r30,`$FRAME-8*2`($sp)
302 ld r31,`$FRAME-8*1`($sp)
303 addi $sp,$sp,$FRAME
304 blr
305 .long 0
306 .byte 0,12,4,0,0x80,10,2,0
307 .long 0
308 .size x25519_fe64_sqr,.-x25519_fe64_sqr
309
310 .globl x25519_fe64_mul121666
311 .type x25519_fe64_mul121666,\@function
312 .align 5
313 x25519_fe64_mul121666:
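	# 121666 = (A+2)/4 for the curve constant A = 486662; lis/ori
	# below build it as 65536 + 56130.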
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$t0,0($ap)
	ld	$t1,8($ap)
	ld	$bp,16($ap)
	ld	$ap,24($ap)

	mulld	$a0,$t0,$bi
	mulhdu	$t0,$t0,$bi
	mulld	$a1,$t1,$bi
	mulhdu	$t1,$t1,$bi
	mulld	$a2,$bp,$bi
	mulhdu	$bp,$bp,$bi
	mulld	$a3,$ap,$bi
	mulhdu	$ap,$ap,$bi

	addc	$a1,$a1,$t0
	adde	$a2,$a2,$t1
	adde	$a3,$a3,$bp
	addze	$ap,$ap

	mulli	$ap,$ap,38

	addc	$a0,$a0,$ap
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function
.align	5
x25519_fe64_add:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	addc	$a0,$a0,$t0
	adde	$a1,$a1,$t1
	adde	$a2,$a2,$bi
	adde	$a3,$a3,$bp

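	# A carry out of the top limb stands for 2^256 = 38 (mod p),
	# so it is folded back as +38; the second fold below catches
	# the carry this addition may itself produce.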
	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	andc	$t1,$t0,$t1

	addc	$a0,$a0,$t1
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function
.align	5
x25519_fe64_sub:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	subfc	$a0,$t0,$a0
	subfe	$a1,$t1,$a1
	subfe	$a2,$bi,$a2
	subfe	$a3,$bp,$a3

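	# A borrow out of the top limb stands for -2^256 = -38 (mod p),
	# so it is compensated by subtracting 38; the second fold below
	# catches the borrow this subtraction may itself produce.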
	li	$t0,38
	subfe	$t1,$t1,$t1		# borrow -> mask
	xor	$zero,$zero,$zero
	and	$t1,$t0,$t1

	subfc	$a0,$t1,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

	subfe	$t1,$t1,$t1		# borrow -> mask
	std	$a1,8($rp)
	and	$t0,$t0,$t1
	std	$a2,16($rp)
	subf	$a0,$t0,$a0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function
.align	5
x25519_fe64_tobytes:
	ld	$a3,24($ap)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)

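	# Canonical reduction ("freeze"): add 19, or 38 if bit 255 is
	# set, which pushes any value >= p across 2^255; bit 255 is then
	# discarded, and the 19 is taken back only if the sum stayed
	# below 2^255, i.e. only if the value was already fully reduced.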
	sradi	$t0,$a3,63		# most significant bit -> mask
	li	$t1,19
	and	$t0,$t0,$t1
	sldi	$a3,$a3,1
	add	$t0,$t0,$t1		# compare to modulus in the same go
	srdi	$a3,$a3,1		# most significant bit cleared

	addc	$a0,$a0,$t0
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	xor	$zero,$zero,$zero
	sradi	$t0,$a3,63		# most significant bit -> mask
	sldi	$a3,$a3,1
	andc	$t0,$t1,$t0
	srdi	$a3,$a3,1		# most significant bit cleared

	subi	$rp,$rp,1
	subfc	$a0,$t0,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

___
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	stbu	$t0,1($rp)
___
}
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}
####################################################### base 2^51
{
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;

$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function
.align	5
x25519_fe51_mul:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

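	# Schoolbook multiplication in base 2^51: each h[k] accumulates
	# the products a[i]*b[j] with i+j == k (mod 5) as a 128-bit
	# lo/hi pair, with a factor of 19 on terms that wrap past
	# limb 4, since 2^255 = 19 (mod p).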
	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
	mulhdu	$h0hi,$a0,$bi

	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
	mulhdu	$h1hi,$a1,$bi

	mulld	$h4lo,$a4,$bi		# a[4]*b[0]
	mulhdu	$h4hi,$a4,$bi
	ld	$ap,8($bp)
	mulli	$a4,$a4,19

	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
	mulhdu	$h2hi,$a2,$bi

	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
	mulhdu	$h3hi,$a3,$bi
___
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,@a[4],$bi
	mulhdu	$t1,@a[4],$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,@a[0],$bi
	mulhdu	$t1,@a[0],$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,@a[3],$bi
	mulhdu	$t1,@a[3],$bi
	ld	$ap,`8*($i+1)`($bp)
	mulli	@a[3],@a[3],19
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

	mulld	$t0,@a[1],$bi
	mulhdu	$t1,@a[1],$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,@a[2],$bi
	mulhdu	$t1,@a[2],$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));
}
($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,$a1,$bi
	mulhdu	$t1,$a1,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulld	$t0,$a0,$bi
	mulhdu	$t1,$a0,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

.Lfe51_reduce:
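	# Carry propagation: each limb is trimmed to 51 bits by adding
	# its excess, h[i]>>51, into the next limb; the excess of the
	# top limb wraps around into limb 0 multiplied by 19.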
	li	$mask,-1
	srdi	$mask,$mask,13		# 0x7ffffffffffff

	srdi	$t0,$h2lo,51
	and	$a2,$h2lo,$mask
	insrdi	$t0,$h2hi,51,0		# h2>>51
	srdi	$t1,$h0lo,51
	and	$a0,$h0lo,$mask
	insrdi	$t1,$h0hi,51,0		# h0>>51
	addc	$h3lo,$h3lo,$t0
	addze	$h3hi,$h3hi
	addc	$h1lo,$h1lo,$t1
	addze	$h1hi,$h1hi

	srdi	$t0,$h3lo,51
	and	$a3,$h3lo,$mask
	insrdi	$t0,$h3hi,51,0		# h3>>51
	srdi	$t1,$h1lo,51
	and	$a1,$h1lo,$mask
	insrdi	$t1,$h1hi,51,0		# h1>>51
	addc	$h4lo,$h4lo,$t0
	addze	$h4hi,$h4hi
	add	$a2,$a2,$t1

	srdi	$t0,$h4lo,51
	and	$a4,$h4lo,$mask
	insrdi	$t0,$h4hi,51,0
	mulli	$t0,$t0,19		# (h4 >> 51) * 19

	add	$a0,$a0,$t0

	srdi	$t1,$a2,51
	and	$a2,$a2,$mask
	add	$a3,$a3,$t1

	srdi	$t0,$a0,51
	and	$a0,$a0,$mask
	add	$a1,$a1,$t0

	std	$a2,16($rp)
	std	$a3,24($rp)
	std	$a4,32($rp)
	std	$a0,0($rp)
	std	$a1,8($rp)

	ld	r21,`$FRAME-8*11`($sp)
	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,11,3,0
	.long	0
.size	x25519_fe51_mul,.-x25519_fe51_mul
___
{
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
$code.=<<___;
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function
.align	5
x25519_fe51_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

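	# Squaring reuses the multiplication pattern: cross products
	# a[i]*a[j], i < j, are counted twice by using pre-doubled
	# limbs, and terms that wrap past limb 4 pick up the factor 19
	# via pre-scaled copies of a[3] and a[4].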
	add	$bi,$a0,$a0		# a[0]*2
	mulli	$t1,$a4,19		# a[4]*19

	mulld	$h0lo,$a0,$a0
	mulhdu	$h0hi,$a0,$a0
	mulld	$h1lo,$a1,$bi
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi
	mulhdu	$h4hi,$a4,$bi
	add	$bi,$a1,$a1		# a[1]*2
___
($a4,$t1) = ($t1,$a4);
$code.=<<___;
	mulld	$t0,$t1,$a4
	mulhdu	$t1,$t1,$a4
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulli	$bp,$a3,19		# a[3]*19

	mulld	$t0,$a1,$a1
	mulhdu	$t1,$a1,$a1
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	add	$bi,$a3,$a3		# a[3]*2
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
___
($a3,$t1) = ($bp,$a3);
$code.=<<___;
	mulld	$t0,$t1,$a3
	mulhdu	$t1,$t1,$a3
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,$bi,$a4
	mulhdu	$t1,$bi,$a4
	add	$bi,$a2,$a2		# a[2]*2
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a2,$a2
	mulhdu	$t1,$a2,$a2
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
___
}
$code.=<<___;
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function
.align	5
x25519_fe51_mul121666:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*121666
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*121666
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi		# a[2]*121666
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*121666
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*121666
	mulhdu	$h4hi,$a4,$bi

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;