]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/poly1305/asm/poly1305-ppc.pl
Doc nits cleanup, round 2
[thirdparty/openssl.git] / crypto / poly1305 / asm / poly1305-ppc.pl
CommitLineData
9e58d119
AP
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements Poly1305 hash for PowerPC.
11#
12# June 2015
13#
14# Numbers are cycles per processed byte with poly1305_blocks alone,
15# and improvement coefficients relative to gcc-generated code.
16#
17# -m32 -m64
18#
19# Freescale e300 14.8/+80% -
4b8736a2
AP
20# PPC74x0 7.60/+60% -
21# PPC970 7.00/+114% 3.51/+205%
22# POWER7 3.75/+260% 1.93/+100%
23# POWER8 - 2.03/+200%
9e58d119
AP
24#
25# Do we need floating-point implementation for PPC? Results presented
26# in poly1305_ieee754.c are tricky to compare to, because they are for
27# compiler-generated code. On the other hand it's known that floating-
28# point performance can be dominated by FPU latency, which means that
29# there is a limit even for ideally optimized (and even vectorized) code.
30# And this limit is estimated to be higher than above -m64 results. Or
31# in other words floating-point implementation can be meaningful to
32# consider only in 32-bit application context. We probably have to
33# recognize that 32-bit builds are getting less popular on high-end
34# systems and therefore tend to target embedded ones, which might not
35# even have FPU...
36#
37# On a side note, Power ISA 2.07 enables vector base 2^26 implementation,
38# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
39
# First command-line argument: the target "flavour" (e.g. linux64le).
# It selects the word size and the matching load/store/compare mnemonics.
$flavour = shift;

if ($flavour =~ /64/) {
    $SIZE_T = 8;
    $LRSAVE = 2*$SIZE_T;
    $UCMP   = "cmpld";
    $STU    = "stdu";
    $POP    = "ld";
    $PUSH   = "std";
} elsif ($flavour =~ /32/) {
    $SIZE_T = 4;
    $LRSAVE = $SIZE_T;
    $UCMP   = "cmplw";
    $STU    = "stwu";
    $POP    = "lwz";
    $PUSH   = "stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour
# i.e.: linux64le
# (non-zero, namely $SIZE_T, for flavours ending in "le"; 0 otherwise)
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Locate the ppc-xlate.pl translator: first next to this script, then in
# ../../perlasm/ relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Second command-line argument: passed through to the translator as its
# last argument.  Everything printed to STDOUT from here on is piped
# through ppc-xlate.pl.
#
# The failure check uses low-precedence "or": with "||" the test would
# bind to the last string operand instead of to open()'s return value,
# and a failed open would go unnoticed.
my $output = shift;
open STDOUT, "| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Stack frame size used by the functions that save non-volatile registers.
$FRAME=24*$SIZE_T;

# Symbolic register names shared by both implementations.
$sp="r1";
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($mac,$nonce)=($inp,$len);	# poly1305_emit reuses r4/r5 under other names
my $mask = "r0";

$code=<<___;
.machine "any"
.text
___
80 if ($flavour =~ /64/) {
81###############################################################################
82# base 2^64 implementation
83
# Symbolic register names for the base 2^64 code, in list order:
#   $h0-$h2 = r7-r9     (hash value words, see "load hash value")
#   $d0-$d2 = r10-r12   (product accumulators in poly1305_blocks)
#   $r0,$r1 = r27,r28   (key words, see "load key")
#   $s1     = r29       (r1 + r1>>2)
#   $t0,$t1 = r30,r31   (input words / multiplication temporaries)
# r27-r31 are the registers poly1305_blocks stores to its stack frame on
# entry and reloads before returning.
84my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));
85
86$code.=<<___;
87.globl .poly1305_init_int
88.align 4
89.poly1305_init_int:
90 xor r0,r0,r0
91 std r0,0($ctx) # zero hash value
92 std r0,8($ctx)
93 std r0,16($ctx)
94
95 $UCMP $inp,r0
96 beq- Lno_key
97___
98$code.=<<___ if ($LITTLE_ENDIAN);
99 ld $d0,0($inp) # load key material
100 ld $d1,8($inp)
101___
102$code.=<<___ if (!$LITTLE_ENDIAN);
103 li $h0,4
104 lwbrx $d0,0,$inp # load key material
105 li $d1,8
106 lwbrx $h0,$h0,$inp
107 li $h1,12
108 lwbrx $d1,$d1,$inp
109 lwbrx $h1,$h1,$inp
110 insrdi $d0,$h0,32,0
111 insrdi $d1,$h1,32,0
112___
113$code.=<<___;
114 lis $h1,0xfff # 0x0fff0000
115 ori $h1,$h1,0xfffc # 0x0ffffffc
116 insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc
117 ori $h0,$h1,3 # 0x0ffffffc0fffffff
118
119 and $d0,$d0,$h0
120 and $d1,$d1,$h1
121
122 std $d0,32($ctx) # store key
123 std $d1,40($ctx)
124
125Lno_key:
126 xor r3,r3,r3
127 blr
128 .long 0
129 .byte 0,12,0x14,0,0,0,2,0
130.size .poly1305_init_int,.-.poly1305_init_int
131
132.globl .poly1305_blocks
133.align 4
134.poly1305_blocks:
135 srdi. $len,$len,4
136 beq- Labort
137
138 $STU $sp,-$FRAME($sp)
139 mflr r0
140 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
141 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
142 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
143 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
144 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
145 $PUSH r0,`$FRAME+$LRSAVE`($sp)
146
147 ld $r0,32($ctx) # load key
148 ld $r1,40($ctx)
149
150 ld $h0,0($ctx) # load hash value
151 ld $h1,8($ctx)
152 ld $h2,16($ctx)
153
154 srdi $s1,$r1,2
155 mtctr $len
156 add $s1,$s1,$r1 # s1 = r1 + r1>>2
157 li $mask,3
158 b Loop
159
160.align 4
161Loop:
162___
163$code.=<<___ if ($LITTLE_ENDIAN);
164 ld $t0,0($inp) # load input
165 ld $t1,8($inp)
166___
167$code.=<<___ if (!$LITTLE_ENDIAN);
168 li $d0,4
169 lwbrx $t0,0,$inp # load input
170 li $t1,8
171 lwbrx $d0,$d0,$inp
172 li $d1,12
173 lwbrx $t1,$t1,$inp
174 lwbrx $d1,$d1,$inp
175 insrdi $t0,$d0,32,0
176 insrdi $t1,$d1,32,0
177___
178$code.=<<___;
179 addi $inp,$inp,16
180
181 addc $h0,$h0,$t0 # accumulate input
182 adde $h1,$h1,$t1
183
184 mulld $d0,$h0,$r0 # h0*r0
185 mulhdu $d1,$h0,$r0
186 adde $h2,$h2,$padbit
187
188 mulld $t0,$h1,$s1 # h1*5*r1
189 mulhdu $t1,$h1,$s1
190 addc $d0,$d0,$t0
191 adde $d1,$d1,$t1
192
193 mulld $t0,$h0,$r1 # h0*r1
194 mulhdu $d2,$h0,$r1
195 addc $d1,$d1,$t0
196 addze $d2,$d2
197
198 mulld $t0,$h1,$r0 # h1*r0
199 mulhdu $t1,$h1,$r0
200 addc $d1,$d1,$t0
201 adde $d2,$d2,$t1
202
203 mulld $t0,$h2,$s1 # h2*5*r1
204 mulld $t1,$h2,$r0 # h2*r0
205 addc $d1,$d1,$t0
206 adde $d2,$d2,$t1
207
208 andc $t0,$d2,$mask # final reduction step
209 and $h2,$d2,$mask
210 srdi $t1,$t0,2
211 add $t0,$t0,$t1
212 addc $h0,$d0,$t0
213 addze $h1,$d1
4b8736a2 214 addze $h2,$h2
9e58d119
AP
215
216 bdnz Loop
217
218 std $h0,0($ctx) # store hash value
219 std $h1,8($ctx)
220 std $h2,16($ctx)
221
222 $POP r27,`$FRAME-$SIZE_T*5`($sp)
223 $POP r28,`$FRAME-$SIZE_T*4`($sp)
224 $POP r29,`$FRAME-$SIZE_T*3`($sp)
225 $POP r30,`$FRAME-$SIZE_T*2`($sp)
226 $POP r31,`$FRAME-$SIZE_T*1`($sp)
227 addi $sp,$sp,$FRAME
228Labort:
229 blr
230 .long 0
231 .byte 0,12,4,1,0x80,5,4,0
232.size .poly1305_blocks,.-.poly1305_blocks
233
234.globl .poly1305_emit
235.align 4
236.poly1305_emit:
237 ld $h0,0($ctx) # load hash
238 ld $h1,8($ctx)
239 ld $h2,16($ctx)
240 ld $padbit,0($nonce) # load nonce
241 ld $nonce,8($nonce)
242
243 addic $d0,$h0,5 # compare to modulus
244 addze $d1,$h1
245 addze $d2,$h2
246
247 srdi $mask,$d2,2 # did it carry/borrow?
248 neg $mask,$mask
249
250 andc $h0,$h0,$mask
251 and $d0,$d0,$mask
252 andc $h1,$h1,$mask
253 and $d1,$d1,$mask
254 or $h0,$h0,$d0
255 or $h1,$h1,$d1
256___
257$code.=<<___ if (!$LITTLE_ENDIAN);
258 rotldi $padbit,$padbit,32 # flip nonce words
259 rotldi $nonce,$nonce,32
260___
261$code.=<<___;
262 addc $h0,$h0,$padbit # accumulate nonce
263 adde $h1,$h1,$nonce
264___
265$code.=<<___ if ($LITTLE_ENDIAN);
266 std $h0,0($mac) # write result
267 std $h1,8($mac)
268___
269$code.=<<___ if (!$LITTLE_ENDIAN);
270 extrdi r0,$h0,32,0
271 li $d0,4
272 stwbrx $h0,0,$mac # write result
273 extrdi $h0,$h1,32,0
274 li $d1,8
275 stwbrx r0,$d0,$mac
276 li $d2,12
277 stwbrx $h1,$d1,$mac
278 stwbrx $h0,$d2,$mac
279___
280$code.=<<___;
281 blr
282 .long 0
283 .byte 0,12,0x14,0,0,0,3,0
284.size .poly1305_emit,.-.poly1305_emit
285___
286 } else {
287###############################################################################
288# base 2^32 implementation
289
# Symbolic register names for the base 2^32 code, in list order:
#   $h0-$h4 = r7-r11    (hash value words)
#   $r0     = r12       (first key word)
#   $r1-$r3 = r14-r16   (remaining key words)
#   $s1-$s3 = r17-r19   (si = ri + ri>>2)
#   $t0-$t3 = r20-r23   (multiplication temporaries)
#   $D0-$D3 = r24-r27   (high halves of the column sums)
#   $d0-$d3 = r28-r31   (input words, then low halves of the column sums)
# r13 is not in the list (7..12,14..31).  r14-r31 are the registers
# poly1305_blocks stores to its stack frame on entry.
290my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
291 $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
292 ) = map("r$_",(7..12,14..31));
293
294$code.=<<___;
295.globl .poly1305_init_int
296.align 4
297.poly1305_init_int:
298 xor r0,r0,r0
299 stw r0,0($ctx) # zero hash value
300 stw r0,4($ctx)
301 stw r0,8($ctx)
302 stw r0,12($ctx)
303 stw r0,16($ctx)
304
305 $UCMP $inp,r0
306 beq- Lno_key
307___
# Little-endian flavours: the four 32-bit key words can be loaded directly.
# The mnemonic is "lwz", the same one every other plain word load in this
# file uses; a bare "lw" is not a PowerPC instruction and would not
# assemble.
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
314$code.=<<___ if (!$LITTLE_ENDIAN);
315 li $h1,4
316 lwbrx $h0,0,$inp # load key material
317 li $h2,8
318 lwbrx $h1,$h1,$inp
319 li $h3,12
320 lwbrx $h2,$h2,$inp
321 lwbrx $h3,$h3,$inp
322___
323$code.=<<___;
324 lis $mask,0xf000 # 0xf0000000
325 li $r0,-4
326 andc $r0,$r0,$mask # 0x0ffffffc
327
328 andc $h0,$h0,$mask
329 and $h1,$h1,$r0
330 and $h2,$h2,$r0
331 and $h3,$h3,$r0
332
333 stw $h0,32($ctx) # store key
334 stw $h1,36($ctx)
335 stw $h2,40($ctx)
336 stw $h3,44($ctx)
337
338Lno_key:
339 xor r3,r3,r3
340 blr
341 .long 0
342 .byte 0,12,0x14,0,0,0,2,0
343.size .poly1305_init_int,.-.poly1305_init_int
344
345.globl .poly1305_blocks
346.align 4
347.poly1305_blocks:
348 srwi. $len,$len,4
349 beq- Labort
350
351 $STU $sp,-$FRAME($sp)
352 mflr r0
353 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
354 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
355 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
356 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
357 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
358 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
359 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
360 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
361 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
362 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
363 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
364 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
365 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
366 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
367 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
368 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
369 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
370 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
371 $PUSH r0,`$FRAME+$LRSAVE`($sp)
372
373 lwz $r0,32($ctx) # load key
374 lwz $r1,36($ctx)
375 lwz $r2,40($ctx)
376 lwz $r3,44($ctx)
377
378 lwz $h0,0($ctx) # load hash value
379 lwz $h1,4($ctx)
380 lwz $h2,8($ctx)
381 lwz $h3,12($ctx)
382 lwz $h4,16($ctx)
383
384 srwi $s1,$r1,2
385 srwi $s2,$r2,2
386 srwi $s3,$r3,2
387 add $s1,$s1,$r1 # si = ri + ri>>2
388 add $s2,$s2,$r2
389 add $s3,$s3,$r3
390 mtctr $len
391 li $mask,3
392 b Loop
393
394.align 4
395Loop:
396___
397$code.=<<___ if ($LITTLE_ENDIAN);
398 lwz $d0,0($inp) # load input
399 lwz $d1,4($inp)
400 lwz $d2,8($inp)
401 lwz $d3,12($inp)
402___
403$code.=<<___ if (!$LITTLE_ENDIAN);
404 li $d1,4
405 lwbrx $d0,0,$inp # load input
406 li $d2,8
407 lwbrx $d1,$d1,$inp
408 li $d3,12
409 lwbrx $d2,$d2,$inp
410 lwbrx $d3,$d3,$inp
411___
412$code.=<<___;
413 addi $inp,$inp,16
414
415 addc $h0,$h0,$d0 # accumulate input
416 adde $h1,$h1,$d1
417 adde $h2,$h2,$d2
418
419 mullw $d0,$h0,$r0 # h0*r0
420 mulhwu $D0,$h0,$r0
421
422 mullw $d1,$h0,$r1 # h0*r1
423 mulhwu $D1,$h0,$r1
424
425 mullw $d2,$h0,$r2 # h0*r2
426 mulhwu $D2,$h0,$r2
427
428 adde $h3,$h3,$d3
429 adde $h4,$h4,$padbit
430
431 mullw $d3,$h0,$r3 # h0*r3
432 mulhwu $D3,$h0,$r3
433
434 mullw $t0,$h1,$s3 # h1*s3
435 mulhwu $t1,$h1,$s3
436
437 mullw $t2,$h1,$r0 # h1*r0
438 mulhwu $t3,$h1,$r0
439 addc $d0,$d0,$t0
440 adde $D0,$D0,$t1
441
442 mullw $t0,$h1,$r1 # h1*r1
443 mulhwu $t1,$h1,$r1
444 addc $d1,$d1,$t2
445 adde $D1,$D1,$t3
446
447 mullw $t2,$h1,$r2 # h1*r2
448 mulhwu $t3,$h1,$r2
449 addc $d2,$d2,$t0
450 adde $D2,$D2,$t1
451
452 mullw $t0,$h2,$s2 # h2*s2
453 mulhwu $t1,$h2,$s2
454 addc $d3,$d3,$t2
455 adde $D3,$D3,$t3
456
457 mullw $t2,$h2,$s3 # h2*s3
458 mulhwu $t3,$h2,$s3
459 addc $d0,$d0,$t0
460 adde $D0,$D0,$t1
461
462 mullw $t0,$h2,$r0 # h2*r0
463 mulhwu $t1,$h2,$r0
464 addc $d1,$d1,$t2
465 adde $D1,$D1,$t3
466
467 mullw $t2,$h2,$r1 # h2*r1
468 mulhwu $t3,$h2,$r1
469 addc $d2,$d2,$t0
470 adde $D2,$D2,$t1
471
472 mullw $t0,$h3,$s1 # h3*s1
473 mulhwu $t1,$h3,$s1
474 addc $d3,$d3,$t2
475 adde $D3,$D3,$t3
476
477 mullw $t2,$h3,$s2 # h3*s2
478 mulhwu $t3,$h3,$s2
479 addc $d0,$d0,$t0
480 adde $D0,$D0,$t1
481
482 mullw $t0,$h3,$s3 # h3*s3
483 mulhwu $t1,$h3,$s3
484 addc $d1,$d1,$t2
485 adde $D1,$D1,$t3
486
487 mullw $t2,$h3,$r0 # h3*r0
488 mulhwu $t3,$h3,$r0
489 addc $d2,$d2,$t0
490 adde $D2,$D2,$t1
491
492 mullw $t0,$h4,$s1 # h4*s1
493 addc $d3,$d3,$t2
494 adde $D3,$D3,$t3
495 addc $d1,$d1,$t0
496
497 mullw $t1,$h4,$s2 # h4*s2
498 addze $D1,$D1
499 addc $d2,$d2,$t1
500 addze $D2,$D2
501
502 mullw $t2,$h4,$s3 # h4*s3
503 addc $d3,$d3,$t2
504 addze $D3,$D3
505
506 mullw $h4,$h4,$r0 # h4*r0
507
508 addc $h1,$d1,$D0
509 adde $h2,$d2,$D1
510 adde $h3,$d3,$D2
511 adde $h4,$h4,$D3
512
513 andc $D0,$h4,$mask # final reduction step
514 and $h4,$h4,$mask
515 srwi $D1,$D0,2
516 add $D0,$D0,$D1
517 addc $h0,$d0,$D0
518 addze $h1,$h1
519 addze $h2,$h2
520 addze $h3,$h3
4b8736a2 521 addze $h4,$h4
9e58d119
AP
522
523 bdnz Loop
524
525 stw $h0,0($ctx) # store hash value
526 stw $h1,4($ctx)
527 stw $h2,8($ctx)
528 stw $h3,12($ctx)
529 stw $h4,16($ctx)
530
531 $POP r14,`$FRAME-$SIZE_T*18`($sp)
532 $POP r15,`$FRAME-$SIZE_T*17`($sp)
533 $POP r16,`$FRAME-$SIZE_T*16`($sp)
534 $POP r17,`$FRAME-$SIZE_T*15`($sp)
535 $POP r18,`$FRAME-$SIZE_T*14`($sp)
536 $POP r19,`$FRAME-$SIZE_T*13`($sp)
537 $POP r20,`$FRAME-$SIZE_T*12`($sp)
538 $POP r21,`$FRAME-$SIZE_T*11`($sp)
539 $POP r22,`$FRAME-$SIZE_T*10`($sp)
540 $POP r23,`$FRAME-$SIZE_T*9`($sp)
541 $POP r24,`$FRAME-$SIZE_T*8`($sp)
542 $POP r25,`$FRAME-$SIZE_T*7`($sp)
543 $POP r26,`$FRAME-$SIZE_T*6`($sp)
544 $POP r27,`$FRAME-$SIZE_T*5`($sp)
545 $POP r28,`$FRAME-$SIZE_T*4`($sp)
546 $POP r29,`$FRAME-$SIZE_T*3`($sp)
547 $POP r30,`$FRAME-$SIZE_T*2`($sp)
548 $POP r31,`$FRAME-$SIZE_T*1`($sp)
549 addi $sp,$sp,$FRAME
550Labort:
551 blr
552 .long 0
553 .byte 0,12,4,1,0x80,18,4,0
554.size .poly1305_blocks,.-.poly1305_blocks
555
556.globl .poly1305_emit
557.align 4
558.poly1305_emit:
559 $STU $sp,-$FRAME($sp)
560 mflr r0
561 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
562 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
563 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
564 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
565 $PUSH r0,`$FRAME+$LRSAVE`($sp)
566
567 lwz $h0,0($ctx) # load hash
568 lwz $h1,4($ctx)
569 lwz $h2,8($ctx)
570 lwz $h3,12($ctx)
571 lwz $h4,16($ctx)
572
573 addic $d0,$h0,5 # compare to modulus
574 addze $d1,$h1
575 addze $d2,$h2
576 addze $d3,$h3
577 addze $mask,$h4
578
579 srwi $mask,$mask,2 # did it carry/borrow?
580 neg $mask,$mask
581
582 andc $h0,$h0,$mask
583 and $d0,$d0,$mask
584 andc $h1,$h1,$mask
585 and $d1,$d1,$mask
586 or $h0,$h0,$d0
587 lwz $d0,0($nonce) # load nonce
588 andc $h2,$h2,$mask
589 and $d2,$d2,$mask
590 or $h1,$h1,$d1
591 lwz $d1,4($nonce)
592 andc $h3,$h3,$mask
593 and $d3,$d3,$mask
594 or $h2,$h2,$d2
595 lwz $d2,8($nonce)
596 or $h3,$h3,$d3
597 lwz $d3,12($nonce)
598
599 addc $h0,$h0,$d0 # accumulate nonce
600 adde $h1,$h1,$d1
601 adde $h2,$h2,$d2
602 adde $h3,$h3,$d3
603___
604$code.=<<___ if ($LITTLE_ENDIAN);
605 stw $h0,0($mac) # write result
606 stw $h1,4($mac)
607 stw $h2,8($mac)
608 stw $h3,12($mac)
609___
610$code.=<<___ if (!$LITTLE_ENDIAN);
611 li $d1,4
612 stwbrx $h0,0,$mac # write result
613 li $d2,8
614 stwbrx $h1,$d1,$mac
615 li $d3,12
616 stwbrx $h2,$d2,$mac
617 stwbrx $h3,$d3,$mac
618___
619$code.=<<___;
620 $POP r28,`$FRAME-$SIZE_T*4`($sp)
621 $POP r29,`$FRAME-$SIZE_T*3`($sp)
622 $POP r30,`$FRAME-$SIZE_T*2`($sp)
623 $POP r31,`$FRAME-$SIZE_T*1`($sp)
624 addi $sp,$sp,$FRAME
625 blr
626 .long 0
627 .byte 0,12,4,1,0x80,4,3,0
628.size .poly1305_emit,.-.poly1305_emit
629___
630 }
# Identification string appended after whichever implementation was emitted.
$code.=<<___;
.asciz	"Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `...` span in the generated text (for example the
# `$FRAME-$SIZE_T*5` stack offsets) with the value of the enclosed Perl
# expression, then emit the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# STDOUT is the pipe into ppc-xlate.pl opened near the top of the script.
# Buffered write errors and a failing translator only become visible when
# the pipe is closed, so the result of close must be checked.
close STDOUT or die "error closing STDOUT: $!";