2 # Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # X25519 lower-level primitives for PPC64.
20 # Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
21 # faster on PPC970/G5. POWER8, on the other hand, seems to trip on its own
22 # shoelaces when handling longer carry chains. As base 2^51 has just
23 # single-carry pairs, it's 25% faster than base 2^64 there. Since PPC970 is
24 # pretty old, the base 2^64 implementation is not engaged. Comparison to
25 # compiler-generated code is complicated by the fact that not all
26 # compilers support 128-bit integers. When the compiler doesn't, like xlc,
27 # this module delivers more than a 2x improvement, and when it does,
28 # an improvement of 12% to 30% was measured...
# Consume command-line arguments until one looks like an output file name
# (word characters, optionally with hyphens, followed by ".ext" at the end)
# or the argument list is exhausted; the last value shifted stays in $output.
31 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Capture the directory part of this script's own path ($0), including the
# trailing "/" or "\" separator, into $dir.
33 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
# Locate the ppc-xlate.pl translator: first next to this script, then under
# ../../perlasm/ relative to it; abort if neither candidate is a plain file.
34 ( $xlate="${dir}ppc-xlate.pl" and -f
$xlate ) or
35 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f
$xlate) or
36 die "can't locate ppc-xlate.pl";
# Open a pipe to the running Perl interpreter ($^X) executing the translator,
# with $flavour and $output passed as its arguments.
# NOTE(review): $flavour is not assigned in the lines visible here, and the
# return value of open is not checked - confirm both against the full file.
38 open OUT
,"| \"$^X\" $xlate $flavour $output";
# Symbolic names for registers r3, r4 and r5.
# NOTE(review): the names suggest result pointer, first-operand pointer and
# second-operand pointer (the first three arguments) - confirm against the
# routines' C prototypes.
42 my ($rp,$ap,$bp) = map("r$_",3..5);
44 ####################################################### base 2^64
# Register assignment for the base 2^64 code:
#   $bi        -> r6
#   $a0..$a3   -> r7..r10
#   $t0,$t1    -> r11,r12
#   $t2,$t3    -> r22,r23
#   $acc0..7   -> r24..r31
# r22..r31 are the registers the fe64_mul/fe64_sqr code saves with std and
# reloads with ld around its body.
46 my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
47 $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
48 map("r$_",(6..12,22..31));
55 .globl x25519_fe64_mul
56 .type x25519_fe64_mul
,\
@function
60 std r22
,`$FRAME-8*10`($sp)
61 std r23
,`$FRAME-8*9`($sp)
62 std r24
,`$FRAME-8*8`($sp)
63 std r25
,`$FRAME-8*7`($sp)
64 std r26
,`$FRAME-8*6`($sp)
65 std r27
,`$FRAME-8*5`($sp)
66 std r28
,`$FRAME-8*4`($sp)
67 std r29
,`$FRAME-8*3`($sp)
68 std r30
,`$FRAME-8*2`($sp)
69 std r31
,`$FRAME-8*1`($sp)
78 mulld
$acc0,$a0,$bi # a[0]*b[0]
80 mulld
$acc1,$a1,$bi # a[1]*b[0]
82 mulld
$acc2,$a2,$bi # a[2]*b[0]
84 mulld
$acc3,$a3,$bi # a[3]*b[0]
# Emit three more multiply-accumulate passes ($i = 1..3), presumably one per
# remaining word of b (the code before the loop is annotated a[k]*b[0]).
# @acc starts as the full accumulator list and loses its first name after
# every pass, so @acc[k] denotes a different register on each iteration.
87 for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
88 my $i=1; $i<4; shift(@acc), $i++) {
# Loop-local $acc4 shadows the outer one: on the first pass it is $zero
# instead of @acc[4]. It is the source operand of the
# "adde @acc[4],$acc4,$t3" emitted below.
89 my $acc4 = $i==1?
$zero : @acc[4];
93 addc
@acc[1],@acc[1],$t0 # accumulate high parts
95 adde
@acc[2],@acc[2],$t1
97 adde
@acc[3],@acc[3],$t2
99 adde
@acc[4],$acc4,$t3
101 addc
@acc[1],@acc[1],$t0 # accumulate low parts
103 adde
@acc[2],@acc[2],$t1
105 adde
@acc[3],@acc[3],$t2
107 adde
@acc[4],@acc[4],$t3
109 adde
@acc[5],$zero,$zero
131 adde
$acc4,$zero,$zero
138 mulld
$acc4,$acc4,$bi
140 addc
$acc0,$acc0,$acc4
145 subfe
$acc4,$acc4,$acc4 # carry -> ~mask
149 add
$acc0,$acc0,$acc4
153 ld r22
,`$FRAME-8*10`($sp)
154 ld r23
,`$FRAME-8*9`($sp)
155 ld r24
,`$FRAME-8*8`($sp)
156 ld r25
,`$FRAME-8*7`($sp)
157 ld r26
,`$FRAME-8*6`($sp)
158 ld r27
,`$FRAME-8*5`($sp)
159 ld r28
,`$FRAME-8*4`($sp)
160 ld r29
,`$FRAME-8*3`($sp)
161 ld r30
,`$FRAME-8*2`($sp)
162 ld r31
,`$FRAME-8*1`($sp)
166 .byte
0,12,4,0,0x80,10,3,0
168 .size x25519_fe64_mul
,.-x25519_fe64_mul
170 .globl x25519_fe64_sqr
171 .type x25519_fe64_sqr
,\
@function
174 stdu
$sp,-$FRAME($sp)
175 std r22
,`$FRAME-8*10`($sp)
176 std r23
,`$FRAME-8*9`($sp)
177 std r24
,`$FRAME-8*8`($sp)
178 std r25
,`$FRAME-8*7`($sp)
179 std r26
,`$FRAME-8*6`($sp)
180 std r27
,`$FRAME-8*5`($sp)
181 std r28
,`$FRAME-8*4`($sp)
182 std r29
,`$FRAME-8*3`($sp)
183 std r30
,`$FRAME-8*2`($sp)
184 std r31
,`$FRAME-8*1`($sp)
187 xor $zero,$zero,$zero
192 ################################
193 # | | | | | |a1*a0| |
194 # | | | | |a2*a0| | |
195 # | |a3*a2|a3*a0| | | |
196 # | | | |a2*a1| | | |
197 # | | |a3*a1| | | | |
198 # *| | | | | | | | 2|
199 # +|a3*a3|a2*a2|a1*a1|a0*a0|
200 # |--+--+--+--+--+--+--+--|
201 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
203 # "can't overflow" below mark carrying into high part of
204 # multiplication result, which can't overflow, because it
205 # can never be all ones.
207 mulld
$acc1,$a1,$a0 # a[1]*a[0]
209 mulld
$acc2,$a2,$a0 # a[2]*a[0]
211 mulld
$acc3,$a3,$a0 # a[3]*a[0]
214 addc
$acc2,$acc2,$t1 # accumulate high parts of multiplication
215 mulld
$t0,$a2,$a1 # a[2]*a[1]
218 mulld
$t2,$a3,$a1 # a[3]*a[1]
220 addze
$acc4,$acc4 # can't overflow
222 mulld
$acc5,$a3,$a2 # a[3]*a[2]
225 addc
$t1,$t1,$t2 # accumulate high parts of multiplication
226 mulld
$acc0,$a0,$a0 # a[0]*a[0]
227 addze
$t2,$t3 # can't overflow
229 addc
$acc3,$acc3,$t0 # accumulate low parts of multiplication
232 mulld
$t1,$a1,$a1 # a[1]*a[1]
235 addze
$acc6,$acc6 # can't overflow
237 addc
$acc1,$acc1,$acc1 # acc[1-6]*=2
238 mulld
$t2,$a2,$a2 # a[2]*a[2]
239 adde
$acc2,$acc2,$acc2
241 adde
$acc3,$acc3,$acc3
242 mulld
$t3,$a3,$a3 # a[3]*a[3]
243 adde
$acc4,$acc4,$acc4
245 adde
$acc5,$acc5,$acc5
246 adde
$acc6,$acc6,$acc6
249 addc
$acc1,$acc1,$a0 # +a[i]*a[i]
278 mulld
$acc4,$acc4,$bi
280 addc
$acc0,$acc0,$acc4
285 subfe
$acc4,$acc4,$acc4 # carry -> ~mask
289 add
$acc0,$acc0,$acc4
293 ld r22
,`$FRAME-8*10`($sp)
294 ld r23
,`$FRAME-8*9`($sp)
295 ld r24
,`$FRAME-8*8`($sp)
296 ld r25
,`$FRAME-8*7`($sp)
297 ld r26
,`$FRAME-8*6`($sp)
298 ld r27
,`$FRAME-8*5`($sp)
299 ld r28
,`$FRAME-8*4`($sp)
300 ld r29
,`$FRAME-8*3`($sp)
301 ld r30
,`$FRAME-8*2`($sp)
302 ld r31
,`$FRAME-8*1`($sp)
306 .byte
0,12,4,0,0x80,10,2,0
308 .size x25519_fe64_sqr
,.-x25519_fe64_sqr
310 .globl x25519_fe64_mul121666
311 .type x25519_fe64_mul121666
,\
@function
313 x25519_fe64_mul121666
:
315 ori
$bi,$bi,`121666-65536`
343 subfe
$t1,$t1,$t1 # carry -> ~mask
353 .byte
0,12,0x14,0,0,0,2,0
355 .size x25519_fe64_mul121666
,.-x25519_fe64_mul121666
357 .globl x25519_fe64_add
358 .type x25519_fe64_add
,\
@function
376 subfe
$t1,$t1,$t1 # carry -> ~mask
384 subfe
$t1,$t1,$t1 # carry -> ~mask
394 .byte
0,12,0x14,0,0,0,3,0
396 .size x25519_fe64_add
,.-x25519_fe64_add
398 .globl x25519_fe64_sub
399 .type x25519_fe64_sub
,\
@function
417 subfe
$t1,$t1,$t1 # borrow -> mask
418 xor $zero,$zero,$zero
426 subfe
$t1,$t1,$t1 # borrow -> mask
436 .byte
0,12,0x14,0,0,0,3,0
438 .size x25519_fe64_sub
,.-x25519_fe64_sub
440 .globl x25519_fe64_tobytes
441 .type x25519_fe64_tobytes
,\
@function
449 sradi
$t0,$a3,63 # most significant bit -> mask
453 add
$t0,$t0,$t1 # compare to modulus in the same go
454 srdi
$a3,$a3,1 # most significant bit cleared
461 xor $zero,$zero,$zero
462 sradi
$t0,$a3,63 # most significant bit -> mask
465 srdi
$a3,$a3,1 # most significant bit cleared
474 for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
496 .byte
0,12,0x14,0,0,0,2,0
498 .size x25519_fe64_tobytes
,.-x25519_fe64_tobytes
501 ####################################################### base 2^51
# Register assignment for the base 2^51 code:
#   $bi          -> r6
#   $a0..$a4     -> r7..r11
#   $t0          -> r12
#   $t1          -> r21
#   $h0lo..$h4hi -> r22..r31 (a lo/hi register pair per h0..h4)
# r21..r31 are the registers the fe51 routines save with std and reload
# with ld around their bodies.
503 my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
504 $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
505 map("r$_",(6..12,21..31));
512 .globl x25519_fe51_mul
513 .type x25519_fe51_mul
,\
@function
516 stdu
$sp,-$FRAME($sp)
517 std r21
,`$FRAME-8*11`($sp)
518 std r22
,`$FRAME-8*10`($sp)
519 std r23
,`$FRAME-8*9`($sp)
520 std r24
,`$FRAME-8*8`($sp)
521 std r25
,`$FRAME-8*7`($sp)
522 std r26
,`$FRAME-8*6`($sp)
523 std r27
,`$FRAME-8*5`($sp)
524 std r28
,`$FRAME-8*4`($sp)
525 std r29
,`$FRAME-8*3`($sp)
526 std r30
,`$FRAME-8*2`($sp)
527 std r31
,`$FRAME-8*1`($sp)
536 mulld
$h0lo,$a0,$bi # a[0]*b[0]
539 mulld
$h1lo,$a1,$bi # a[1]*b[0]
542 mulld
$h4lo,$a4,$bi # a[4]*b[0]
547 mulld
$h2lo,$a2,$bi # a[2]*b[0]
550 mulld
$h3lo,$a3,$bi # a[3]*b[0]
# Emit three passes ($i = 1..3); @a holds the five limb register names of a.
553 for(my @a=($a0,$a1,$a2,$a3,$a4),
554 my $i=1; $i<4; $i++) {
# Exchange the register names held in $ap and $bi, so the code emitted in
# this pass uses the two registers with their roles swapped relative to the
# previous pass (the "ld $ap,...($bp)" below loads into whichever register
# $ap currently names).
555 ($ap,$bi) = ($bi,$ap);
569 ld
$ap,`8*($i+1)`($bp)
586 ($ap,$bi) = ($bi,$ap);
615 srdi
$mask,$mask,13 # 0x7ffffffffffff
619 insrdi
$t0,$h2hi,51,0 # h2>>51
622 insrdi
$t1,$h0hi,51,0 # h0>>51
630 insrdi
$t0,$h3hi,51,0 # h3>>51
633 insrdi
$t1,$h1hi,51,0 # h1>>51
640 insrdi
$t0,$h4hi,51,0
641 mulli
$t0,$t0,19 # (h4 >> 51) * 19
659 ld r21
,`$FRAME-8*11`($sp)
660 ld r22
,`$FRAME-8*10`($sp)
661 ld r23
,`$FRAME-8*9`($sp)
662 ld r24
,`$FRAME-8*8`($sp)
663 ld r25
,`$FRAME-8*7`($sp)
664 ld r26
,`$FRAME-8*6`($sp)
665 ld r27
,`$FRAME-8*5`($sp)
666 ld r28
,`$FRAME-8*4`($sp)
667 ld r29
,`$FRAME-8*3`($sp)
668 ld r30
,`$FRAME-8*2`($sp)
669 ld r31
,`$FRAME-8*1`($sp)
673 .byte
0,12,4,0,0x80,11,3,0
675 .size x25519_fe51_mul
,.-x25519_fe51_mul
# Declare fresh lexicals for the limb and temporary names, initialised from
# the current ones; the name swaps performed further down in the squaring
# code then rebind these copies.
# NOTE(review): the enclosing scope delimiters are not visible here - confirm
# that the copies are confined to the fe51_sqr generator.
678 my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
680 .globl x25519_fe51_sqr
681 .type x25519_fe51_sqr
,\
@function
684 stdu
$sp,-$FRAME($sp)
685 std r21
,`$FRAME-8*11`($sp)
686 std r22
,`$FRAME-8*10`($sp)
687 std r23
,`$FRAME-8*9`($sp)
688 std r24
,`$FRAME-8*8`($sp)
689 std r25
,`$FRAME-8*7`($sp)
690 std r26
,`$FRAME-8*6`($sp)
691 std r27
,`$FRAME-8*5`($sp)
692 std r28
,`$FRAME-8*4`($sp)
693 std r29
,`$FRAME-8*3`($sp)
694 std r30
,`$FRAME-8*2`($sp)
695 std r31
,`$FRAME-8*1`($sp)
703 add
$bi,$a0,$a0 # a[0]*2
704 mulli
$t1,$a4,19 # a[4]*19
716 add
$bi,$a1,$a1 # a[1]*2
# Swap the names: from here on $a4 denotes the register that received
# a[4]*19 (the "mulli $t1,$a4,19" above) and $t1 the one holding a[4].
718 ($a4,$t1) = ($t1,$a4);
725 mulli
$bp,$a3,19 # a[3]*19
741 add
$bi,$a3,$a3 # a[3]*2
# Rebind the names: from here on $a3 denotes the $bp register, which received
# a[3]*19 (the "mulli $bp,$a3,19" above), and $t1 the one holding a[3].
745 ($a3,$t1) = ($bp,$a3);
753 add
$bi,$a2,$a2 # a[2]*2
772 .byte
0,12,4,0,0x80,11,2,0
774 .size x25519_fe51_sqr
,.-x25519_fe51_sqr
778 .globl x25519_fe51_mul121666
779 .type x25519_fe51_mul121666
,\
@function
781 x25519_fe51_mul121666
:
782 stdu
$sp,-$FRAME($sp)
783 std r21
,`$FRAME-8*11`($sp)
784 std r22
,`$FRAME-8*10`($sp)
785 std r23
,`$FRAME-8*9`($sp)
786 std r24
,`$FRAME-8*8`($sp)
787 std r25
,`$FRAME-8*7`($sp)
788 std r26
,`$FRAME-8*6`($sp)
789 std r27
,`$FRAME-8*5`($sp)
790 std r28
,`$FRAME-8*4`($sp)
791 std r29
,`$FRAME-8*3`($sp)
792 std r30
,`$FRAME-8*2`($sp)
793 std r31
,`$FRAME-8*1`($sp)
796 ori
$bi,$bi,`121666-65536`
803 mulld
$h0lo,$a0,$bi # a[0]*121666
805 mulld
$h1lo,$a1,$bi # a[1]*121666
807 mulld
$h2lo,$a2,$bi # a[2]*121666
809 mulld
$h3lo,$a3,$bi # a[3]*121666
811 mulld
$h4lo,$a4,$bi # a[4]*121666
816 .byte
0,12,4,0,0x80,11,2,0
818 .size x25519_fe51_mul121666
,.-x25519_fe51_mul121666
# Post-process the generated text: every `...` span in $code is replaced by
# the result of evaluating its contents as a Perl expression (/e), for all
# occurrences (/g). This is what turns operands written as `$FRAME-8*10`
# or `121666-65536` into plain numbers.
822 $code =~ s/\`([^\`]*)\`/eval $1/gem;