]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/poly1305/asm/poly1305-ppc.pl
Removes CTLOG_new_null from the CT public API
[thirdparty/openssl.git] / crypto / poly1305 / asm / poly1305-ppc.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9e58d119
AP
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements Poly1305 hash for PowerPC.
18#
19# June 2015
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone,
22# and improvement coefficients relative to gcc-generated code.
23#
24# -m32 -m64
25#
26# Freescale e300 14.8/+80% -
4b8736a2
AP
27# PPC74x0 7.60/+60% -
28# PPC970 7.00/+114% 3.51/+205%
29# POWER7 3.75/+260% 1.93/+100%
30# POWER8 - 2.03/+200%
9e58d119
AP
31#
32# Do we need floating-point implementation for PPC? Results presented
33# in poly1305_ieee754.c are tricky to compare to, because they are for
34# compiler-generated code. On the other hand it's known that floating-
35# point performance can be dominated by FPU latency, which means that
36# there is limit even for ideally optimized (and even vectorized) code.
37# And this limit is estimated to be higher than above -m64 results. Or
38# in other words floating-point implementation can be meaningful to
39# consider only in 32-bit application context. We probably have to
40# recognize that 32-bit builds are getting less popular on high-end
41# systems and therefore tend to target embedded ones, which might not
42# even have FPU...
43#
44# On side note, Power ISA 2.07 enables vector base 2^26 implementation,
45# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
46
# The first command-line argument is the target "flavour" (e.g. linux64le).
# It selects the word size and the load/store/compare mnemonics that the
# assembly templates below interpolate.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Look for the ppc-xlate.pl translator next to this script first, then in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed to STDOUT through the translator; the next
# command-line argument is passed on to it after the flavour.
# Low-precedence "or" is required here: with "||" the die would attach to
# the (always true) command string and an open failure would go unnoticed.
open STDOUT,"| $^X $xlate $flavour ".shift(@ARGV) or die "can't call $xlate: $!";

# Stack frame size used by the function prologues/epilogues below.
$FRAME=24*$SIZE_T;

$sp="r1";
# The four argument registers r3..r6.
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
# poly1305_emit reuses the second and third argument registers.
my ($mac,$nonce)=($inp,$len);
my $mask = "r0";
81my $mask = "r0";
82
83$code=<<___;
84.machine "any"
85.text
86___
87 if ($flavour =~ /64/) {
88###############################################################################
89# base 2^64 implementation
90
91my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));
92
93$code.=<<___;
94.globl .poly1305_init_int
95.align 4
96.poly1305_init_int:
97 xor r0,r0,r0
98 std r0,0($ctx) # zero hash value
99 std r0,8($ctx)
100 std r0,16($ctx)
101
102 $UCMP $inp,r0
103 beq- Lno_key
104___
105$code.=<<___ if ($LITTLE_ENDIAN);
106 ld $d0,0($inp) # load key material
107 ld $d1,8($inp)
108___
109$code.=<<___ if (!$LITTLE_ENDIAN);
110 li $h0,4
111 lwbrx $d0,0,$inp # load key material
112 li $d1,8
113 lwbrx $h0,$h0,$inp
114 li $h1,12
115 lwbrx $d1,$d1,$inp
116 lwbrx $h1,$h1,$inp
117 insrdi $d0,$h0,32,0
118 insrdi $d1,$h1,32,0
119___
120$code.=<<___;
121 lis $h1,0xfff # 0x0fff0000
122 ori $h1,$h1,0xfffc # 0x0ffffffc
123 insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc
124 ori $h0,$h1,3 # 0x0ffffffc0fffffff
125
126 and $d0,$d0,$h0
127 and $d1,$d1,$h1
128
129 std $d0,32($ctx) # store key
130 std $d1,40($ctx)
131
132Lno_key:
133 xor r3,r3,r3
134 blr
135 .long 0
136 .byte 0,12,0x14,0,0,0,2,0
137.size .poly1305_init_int,.-.poly1305_init_int
138
139.globl .poly1305_blocks
140.align 4
141.poly1305_blocks:
142 srdi. $len,$len,4
143 beq- Labort
144
145 $STU $sp,-$FRAME($sp)
146 mflr r0
147 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
148 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
149 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
150 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
151 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
152 $PUSH r0,`$FRAME+$LRSAVE`($sp)
153
154 ld $r0,32($ctx) # load key
155 ld $r1,40($ctx)
156
157 ld $h0,0($ctx) # load hash value
158 ld $h1,8($ctx)
159 ld $h2,16($ctx)
160
161 srdi $s1,$r1,2
162 mtctr $len
163 add $s1,$s1,$r1 # s1 = r1 + r1>>2
164 li $mask,3
165 b Loop
166
167.align 4
168Loop:
169___
170$code.=<<___ if ($LITTLE_ENDIAN);
171 ld $t0,0($inp) # load input
172 ld $t1,8($inp)
173___
174$code.=<<___ if (!$LITTLE_ENDIAN);
175 li $d0,4
176 lwbrx $t0,0,$inp # load input
177 li $t1,8
178 lwbrx $d0,$d0,$inp
179 li $d1,12
180 lwbrx $t1,$t1,$inp
181 lwbrx $d1,$d1,$inp
182 insrdi $t0,$d0,32,0
183 insrdi $t1,$d1,32,0
184___
185$code.=<<___;
186 addi $inp,$inp,16
187
188 addc $h0,$h0,$t0 # accumulate input
189 adde $h1,$h1,$t1
190
191 mulld $d0,$h0,$r0 # h0*r0
192 mulhdu $d1,$h0,$r0
193 adde $h2,$h2,$padbit
194
195 mulld $t0,$h1,$s1 # h1*5*r1
196 mulhdu $t1,$h1,$s1
197 addc $d0,$d0,$t0
198 adde $d1,$d1,$t1
199
200 mulld $t0,$h0,$r1 # h0*r1
201 mulhdu $d2,$h0,$r1
202 addc $d1,$d1,$t0
203 addze $d2,$d2
204
205 mulld $t0,$h1,$r0 # h1*r0
206 mulhdu $t1,$h1,$r0
207 addc $d1,$d1,$t0
208 adde $d2,$d2,$t1
209
210 mulld $t0,$h2,$s1 # h2*5*r1
211 mulld $t1,$h2,$r0 # h2*r0
212 addc $d1,$d1,$t0
213 adde $d2,$d2,$t1
214
215 andc $t0,$d2,$mask # final reduction step
216 and $h2,$d2,$mask
217 srdi $t1,$t0,2
218 add $t0,$t0,$t1
219 addc $h0,$d0,$t0
220 addze $h1,$d1
4b8736a2 221 addze $h2,$h2
9e58d119
AP
222
223 bdnz Loop
224
225 std $h0,0($ctx) # store hash value
226 std $h1,8($ctx)
227 std $h2,16($ctx)
228
229 $POP r27,`$FRAME-$SIZE_T*5`($sp)
230 $POP r28,`$FRAME-$SIZE_T*4`($sp)
231 $POP r29,`$FRAME-$SIZE_T*3`($sp)
232 $POP r30,`$FRAME-$SIZE_T*2`($sp)
233 $POP r31,`$FRAME-$SIZE_T*1`($sp)
234 addi $sp,$sp,$FRAME
235Labort:
236 blr
237 .long 0
238 .byte 0,12,4,1,0x80,5,4,0
239.size .poly1305_blocks,.-.poly1305_blocks
240
241.globl .poly1305_emit
242.align 4
243.poly1305_emit:
244 ld $h0,0($ctx) # load hash
245 ld $h1,8($ctx)
246 ld $h2,16($ctx)
247 ld $padbit,0($nonce) # load nonce
248 ld $nonce,8($nonce)
249
250 addic $d0,$h0,5 # compare to modulus
251 addze $d1,$h1
252 addze $d2,$h2
253
254 srdi $mask,$d2,2 # did it carry/borrow?
255 neg $mask,$mask
256
257 andc $h0,$h0,$mask
258 and $d0,$d0,$mask
259 andc $h1,$h1,$mask
260 and $d1,$d1,$mask
261 or $h0,$h0,$d0
262 or $h1,$h1,$d1
263___
264$code.=<<___ if (!$LITTLE_ENDIAN);
265 rotldi $padbit,$padbit,32 # flip nonce words
266 rotldi $nonce,$nonce,32
267___
268$code.=<<___;
269 addc $h0,$h0,$padbit # accumulate nonce
270 adde $h1,$h1,$nonce
271___
272$code.=<<___ if ($LITTLE_ENDIAN);
273 std $h0,0($mac) # write result
274 std $h1,8($mac)
275___
276$code.=<<___ if (!$LITTLE_ENDIAN);
277 extrdi r0,$h0,32,0
278 li $d0,4
279 stwbrx $h0,0,$mac # write result
280 extrdi $h0,$h1,32,0
281 li $d1,8
282 stwbrx r0,$d0,$mac
283 li $d2,12
284 stwbrx $h1,$d1,$mac
285 stwbrx $h0,$d2,$mac
286___
287$code.=<<___;
288 blr
289 .long 0
290 .byte 0,12,0x14,0,0,0,3,0
291.size .poly1305_emit,.-.poly1305_emit
292___
293 } else {
294###############################################################################
295# base 2^32 implementation
296
297my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
298 $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
299 ) = map("r$_",(7..12,14..31));
300
301$code.=<<___;
302.globl .poly1305_init_int
303.align 4
304.poly1305_init_int:
305 xor r0,r0,r0
306 stw r0,0($ctx) # zero hash value
307 stw r0,4($ctx)
308 stw r0,8($ctx)
309 stw r0,12($ctx)
310 stw r0,16($ctx)
311
312 $UCMP $inp,r0
313 beq- Lno_key
314___
# Little-endian 32-bit flavours: the key words can be loaded directly.
# The PowerPC load-word mnemonic is "lwz" (load word and zero), the same
# instruction the input and key loads in poly1305_blocks use; "lw" is not
# a PowerPC mnemonic.
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
321$code.=<<___ if (!$LITTLE_ENDIAN);
322 li $h1,4
323 lwbrx $h0,0,$inp # load key material
324 li $h2,8
325 lwbrx $h1,$h1,$inp
326 li $h3,12
327 lwbrx $h2,$h2,$inp
328 lwbrx $h3,$h3,$inp
329___
330$code.=<<___;
331 lis $mask,0xf000 # 0xf0000000
332 li $r0,-4
333 andc $r0,$r0,$mask # 0x0ffffffc
334
335 andc $h0,$h0,$mask
336 and $h1,$h1,$r0
337 and $h2,$h2,$r0
338 and $h3,$h3,$r0
339
340 stw $h0,32($ctx) # store key
341 stw $h1,36($ctx)
342 stw $h2,40($ctx)
343 stw $h3,44($ctx)
344
345Lno_key:
346 xor r3,r3,r3
347 blr
348 .long 0
349 .byte 0,12,0x14,0,0,0,2,0
350.size .poly1305_init_int,.-.poly1305_init_int
351
352.globl .poly1305_blocks
353.align 4
354.poly1305_blocks:
355 srwi. $len,$len,4
356 beq- Labort
357
358 $STU $sp,-$FRAME($sp)
359 mflr r0
360 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
361 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
362 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
363 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
364 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
365 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
366 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
367 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
368 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
369 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
370 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
371 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
372 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
373 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
374 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
375 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
376 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
377 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
378 $PUSH r0,`$FRAME+$LRSAVE`($sp)
379
380 lwz $r0,32($ctx) # load key
381 lwz $r1,36($ctx)
382 lwz $r2,40($ctx)
383 lwz $r3,44($ctx)
384
385 lwz $h0,0($ctx) # load hash value
386 lwz $h1,4($ctx)
387 lwz $h2,8($ctx)
388 lwz $h3,12($ctx)
389 lwz $h4,16($ctx)
390
391 srwi $s1,$r1,2
392 srwi $s2,$r2,2
393 srwi $s3,$r3,2
394 add $s1,$s1,$r1 # si = ri + ri>>2
395 add $s2,$s2,$r2
396 add $s3,$s3,$r3
397 mtctr $len
398 li $mask,3
399 b Loop
400
401.align 4
402Loop:
403___
404$code.=<<___ if ($LITTLE_ENDIAN);
405 lwz $d0,0($inp) # load input
406 lwz $d1,4($inp)
407 lwz $d2,8($inp)
408 lwz $d3,12($inp)
409___
410$code.=<<___ if (!$LITTLE_ENDIAN);
411 li $d1,4
412 lwbrx $d0,0,$inp # load input
413 li $d2,8
414 lwbrx $d1,$d1,$inp
415 li $d3,12
416 lwbrx $d2,$d2,$inp
417 lwbrx $d3,$d3,$inp
418___
419$code.=<<___;
420 addi $inp,$inp,16
421
422 addc $h0,$h0,$d0 # accumulate input
423 adde $h1,$h1,$d1
424 adde $h2,$h2,$d2
425
426 mullw $d0,$h0,$r0 # h0*r0
427 mulhwu $D0,$h0,$r0
428
429 mullw $d1,$h0,$r1 # h0*r1
430 mulhwu $D1,$h0,$r1
431
432 mullw $d2,$h0,$r2 # h0*r2
433 mulhwu $D2,$h0,$r2
434
435 adde $h3,$h3,$d3
436 adde $h4,$h4,$padbit
437
438 mullw $d3,$h0,$r3 # h0*r3
439 mulhwu $D3,$h0,$r3
440
441 mullw $t0,$h1,$s3 # h1*s3
442 mulhwu $t1,$h1,$s3
443
444 mullw $t2,$h1,$r0 # h1*r0
445 mulhwu $t3,$h1,$r0
446 addc $d0,$d0,$t0
447 adde $D0,$D0,$t1
448
449 mullw $t0,$h1,$r1 # h1*r1
450 mulhwu $t1,$h1,$r1
451 addc $d1,$d1,$t2
452 adde $D1,$D1,$t3
453
454 mullw $t2,$h1,$r2 # h1*r2
455 mulhwu $t3,$h1,$r2
456 addc $d2,$d2,$t0
457 adde $D2,$D2,$t1
458
459 mullw $t0,$h2,$s2 # h2*s2
460 mulhwu $t1,$h2,$s2
461 addc $d3,$d3,$t2
462 adde $D3,$D3,$t3
463
464 mullw $t2,$h2,$s3 # h2*s3
465 mulhwu $t3,$h2,$s3
466 addc $d0,$d0,$t0
467 adde $D0,$D0,$t1
468
469 mullw $t0,$h2,$r0 # h2*r0
470 mulhwu $t1,$h2,$r0
471 addc $d1,$d1,$t2
472 adde $D1,$D1,$t3
473
474 mullw $t2,$h2,$r1 # h2*r1
475 mulhwu $t3,$h2,$r1
476 addc $d2,$d2,$t0
477 adde $D2,$D2,$t1
478
479 mullw $t0,$h3,$s1 # h3*s1
480 mulhwu $t1,$h3,$s1
481 addc $d3,$d3,$t2
482 adde $D3,$D3,$t3
483
484 mullw $t2,$h3,$s2 # h3*s2
485 mulhwu $t3,$h3,$s2
486 addc $d0,$d0,$t0
487 adde $D0,$D0,$t1
488
489 mullw $t0,$h3,$s3 # h3*s3
490 mulhwu $t1,$h3,$s3
491 addc $d1,$d1,$t2
492 adde $D1,$D1,$t3
493
494 mullw $t2,$h3,$r0 # h3*r0
495 mulhwu $t3,$h3,$r0
496 addc $d2,$d2,$t0
497 adde $D2,$D2,$t1
498
499 mullw $t0,$h4,$s1 # h4*s1
500 addc $d3,$d3,$t2
501 adde $D3,$D3,$t3
502 addc $d1,$d1,$t0
503
504 mullw $t1,$h4,$s2 # h4*s2
505 addze $D1,$D1
506 addc $d2,$d2,$t1
507 addze $D2,$D2
508
509 mullw $t2,$h4,$s3 # h4*s3
510 addc $d3,$d3,$t2
511 addze $D3,$D3
512
513 mullw $h4,$h4,$r0 # h4*r0
514
515 addc $h1,$d1,$D0
516 adde $h2,$d2,$D1
517 adde $h3,$d3,$D2
518 adde $h4,$h4,$D3
519
520 andc $D0,$h4,$mask # final reduction step
521 and $h4,$h4,$mask
522 srwi $D1,$D0,2
523 add $D0,$D0,$D1
524 addc $h0,$d0,$D0
525 addze $h1,$h1
526 addze $h2,$h2
527 addze $h3,$h3
4b8736a2 528 addze $h4,$h4
9e58d119
AP
529
530 bdnz Loop
531
532 stw $h0,0($ctx) # store hash value
533 stw $h1,4($ctx)
534 stw $h2,8($ctx)
535 stw $h3,12($ctx)
536 stw $h4,16($ctx)
537
538 $POP r14,`$FRAME-$SIZE_T*18`($sp)
539 $POP r15,`$FRAME-$SIZE_T*17`($sp)
540 $POP r16,`$FRAME-$SIZE_T*16`($sp)
541 $POP r17,`$FRAME-$SIZE_T*15`($sp)
542 $POP r18,`$FRAME-$SIZE_T*14`($sp)
543 $POP r19,`$FRAME-$SIZE_T*13`($sp)
544 $POP r20,`$FRAME-$SIZE_T*12`($sp)
545 $POP r21,`$FRAME-$SIZE_T*11`($sp)
546 $POP r22,`$FRAME-$SIZE_T*10`($sp)
547 $POP r23,`$FRAME-$SIZE_T*9`($sp)
548 $POP r24,`$FRAME-$SIZE_T*8`($sp)
549 $POP r25,`$FRAME-$SIZE_T*7`($sp)
550 $POP r26,`$FRAME-$SIZE_T*6`($sp)
551 $POP r27,`$FRAME-$SIZE_T*5`($sp)
552 $POP r28,`$FRAME-$SIZE_T*4`($sp)
553 $POP r29,`$FRAME-$SIZE_T*3`($sp)
554 $POP r30,`$FRAME-$SIZE_T*2`($sp)
555 $POP r31,`$FRAME-$SIZE_T*1`($sp)
556 addi $sp,$sp,$FRAME
557Labort:
558 blr
559 .long 0
560 .byte 0,12,4,1,0x80,18,4,0
561.size .poly1305_blocks,.-.poly1305_blocks
562
563.globl .poly1305_emit
564.align 4
565.poly1305_emit:
566 $STU $sp,-$FRAME($sp)
567 mflr r0
568 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
569 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
570 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
571 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
572 $PUSH r0,`$FRAME+$LRSAVE`($sp)
573
574 lwz $h0,0($ctx) # load hash
575 lwz $h1,4($ctx)
576 lwz $h2,8($ctx)
577 lwz $h3,12($ctx)
578 lwz $h4,16($ctx)
579
580 addic $d0,$h0,5 # compare to modulus
581 addze $d1,$h1
582 addze $d2,$h2
583 addze $d3,$h3
584 addze $mask,$h4
585
586 srwi $mask,$mask,2 # did it carry/borrow?
587 neg $mask,$mask
588
589 andc $h0,$h0,$mask
590 and $d0,$d0,$mask
591 andc $h1,$h1,$mask
592 and $d1,$d1,$mask
593 or $h0,$h0,$d0
594 lwz $d0,0($nonce) # load nonce
595 andc $h2,$h2,$mask
596 and $d2,$d2,$mask
597 or $h1,$h1,$d1
598 lwz $d1,4($nonce)
599 andc $h3,$h3,$mask
600 and $d3,$d3,$mask
601 or $h2,$h2,$d2
602 lwz $d2,8($nonce)
603 or $h3,$h3,$d3
604 lwz $d3,12($nonce)
605
606 addc $h0,$h0,$d0 # accumulate nonce
607 adde $h1,$h1,$d1
608 adde $h2,$h2,$d2
609 adde $h3,$h3,$d3
610___
611$code.=<<___ if ($LITTLE_ENDIAN);
612 stw $h0,0($mac) # write result
613 stw $h1,4($mac)
614 stw $h2,8($mac)
615 stw $h3,12($mac)
616___
617$code.=<<___ if (!$LITTLE_ENDIAN);
618 li $d1,4
619 stwbrx $h0,0,$mac # write result
620 li $d2,8
621 stwbrx $h1,$d1,$mac
622 li $d3,12
623 stwbrx $h2,$d2,$mac
624 stwbrx $h3,$d3,$mac
625___
626$code.=<<___;
627 $POP r28,`$FRAME-$SIZE_T*4`($sp)
628 $POP r29,`$FRAME-$SIZE_T*3`($sp)
629 $POP r30,`$FRAME-$SIZE_T*2`($sp)
630 $POP r31,`$FRAME-$SIZE_T*1`($sp)
631 addi $sp,$sp,$FRAME
632 blr
633 .long 0
634 .byte 0,12,4,1,0x80,4,3,0
635.size .poly1305_emit,.-.poly1305_emit
636___
637 }
# Identification string appended to the generated assembly.
$code.=<<___;
.asciz "Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate every `...` expression embedded in the templates (the frame
# offset arithmetic) and substitute its value.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to ppc-xlate.pl: buffered write errors and a failing
# translator only become visible when the pipe is closed, so the result of
# close must be checked to avoid silently producing a truncated output file.
close STDOUT or die "error closing STDOUT: $!";