#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC FPU.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300	9.78/+30%
# PPC74x0		6.92/+50%
# PPC970		6.03/+80%
# POWER7		3.50/+30%
# POWER8		3.75/+10%

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
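
# A typical invocation looks like the following; the flavour names are the
# ones understood by ppc-xlate.pl (e.g. linux32, linux64, linux64le) and are
# shown here purely as an illustration:
#
#	perl poly1305-ppcfp.pl linux64le poly1305-ppcfp.s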

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

$LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";
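
# Key and input words are loaded with lwbrx on big-endian flavours and plain
# lwzx on little-endian ones, so the register always holds the value of the
# little-endian Poly1305 byte stream. These 32-bit words are later stored
# into the low half of a double's mantissa, which sits at byte offset 4 on
# big-endian and offset 0 on little-endian; that is what the recurring
# `(4^$LITTLE_ENDIAN)` term in the load/store offsets below selects.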

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$LOCALS=6*$SIZE_T;
$FRAME=$LOCALS+6*8+18*8;
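# $FRAME: minimal linkage area ($LOCALS), six 8-byte local slots used below
# for the input "template" and the saved fpscr, and 18*8 bytes to spill the
# callee-saved FPU registers f14-f31 in poly1305_blocks_fpu.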

my $sp="r1";

my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));

my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
# borrowings
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);
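
# The 130-bit hash value and the key are kept in base 2^32: four limbs
# scaled by 2^0, 2^32, 2^64 and 2^96. Each limb is in turn held as a pair
# of doubles, "lo" and "hi", split at 16 bits, so that the products formed
# by the fmul/fmadd chains below fit comfortably within the 53-bit
# double-precision mantissa. With fpscr set to round-toward-zero, adding
# and then subtracting one of the 2^(52+k) constants truncates a value to a
# multiple of 2^k; that is how limbs are split and carries are extracted
# throughout this module.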

$code.=<<___;
.machine	"any"
.text

.globl	.poly1305_init_fpu
.align	6
.poly1305_init_fpu:
	$STU	$sp,-$LOCALS($sp)	# minimal frame
	mflr	$padbit
	$PUSH	$padbit,`$LOCALS+$LRSAVE`($sp)

	bl	LPICmeup

	xor	r0,r0,r0
	mtlr	$padbit			# restore lr

	lfd	$two0,8*0($len)		# load constants
	lfd	$two32,8*1($len)
	lfd	$two64,8*2($len)
	lfd	$two96,8*3($len)
	lfd	$two130,8*4($len)
	lfd	$five_two130,8*5($len)

	stfd	$two0,8*0($ctx)		# initial hash value, biased 0
	stfd	$two32,8*1($ctx)
	stfd	$two64,8*2($ctx)
	stfd	$two96,8*3($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key

	lfd	$h3lo,8*13($len)	# new fpscr
	mffs	$h3hi			# old fpscr

	stfd	$two0,8*4($ctx)		# key "template"
	stfd	$two32,8*5($ctx)
	stfd	$two64,8*6($ctx)
	stfd	$two96,8*7($ctx)

	li	$in1,4
	li	$in2,8
	li	$in3,12
	$LWXLE	$in0,0,$inp		# load key
	$LWXLE	$in1,$in1,$inp
	$LWXLE	$in2,$in2,$inp
	$LWXLE	$in3,$in3,$inp

	lis	$i1,0xf000		# 0xf0000000
	ori	$i2,$i1,3		# 0xf0000003
	andc	$in0,$in0,$i1		# &=0x0fffffff
	andc	$in1,$in1,$i2		# &=0x0ffffffc
	andc	$in2,$in2,$i2
	andc	$in3,$in3,$i2

	stw	$in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx)	# fill "template"
	stw	$in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
	stw	$in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
	stw	$in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)

	mtfsf	255,$h3lo		# fpscr
	stfd	$two0,8*18($ctx)	# copy constants to context
	stfd	$two32,8*19($ctx)
	stfd	$two64,8*20($ctx)
	stfd	$two96,8*21($ctx)
	stfd	$two130,8*22($ctx)
	stfd	$five_two130,8*23($ctx)

	lfd	$h0lo,8*4($ctx)		# load [biased] key
	lfd	$h1lo,8*5($ctx)
	lfd	$h2lo,8*6($ctx)
	lfd	$h3lo,8*7($ctx)

	fsub	$h0lo,$h0lo,$two0	# r0
	fsub	$h1lo,$h1lo,$two32	# r1
	fsub	$h2lo,$h2lo,$two64	# r2
	fsub	$h3lo,$h3lo,$two96	# r3

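	# Split each 32-bit key limb into 16-bit "lo" and "hi" halves: with
	# the truncating fpscr installed above, (x + 2^(52+16+k)) - 2^(52+16+k)
	# rounds x down to a multiple of 2^(16+k), and subtracting that from x
	# leaves the low half. s1..s3 are r1..r3 multiplied by 5/2^130, ready
	# for the modular folding of high-order products (2^130 = 5 mod 2^130-5).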
	lfd	$two0,8*6($len)		# more constants
	lfd	$two32,8*7($len)
	lfd	$two64,8*8($len)
	lfd	$two96,8*9($len)

	fmul	$h1hi,$h1lo,$five_two130	# s1
	fmul	$h2hi,$h2lo,$five_two130	# s2
	stfd	$h3hi,8*15($ctx)	# borrow slot for original fpscr
	fmul	$h3hi,$h3lo,$five_two130	# s3

	fadd	$h0hi,$h0lo,$two0
	stfd	$h1hi,8*12($ctx)	# put aside for now
	fadd	$h1hi,$h1lo,$two32
	stfd	$h2hi,8*13($ctx)
	fadd	$h2hi,$h2lo,$two64
	stfd	$h3hi,8*14($ctx)
	fadd	$h3hi,$h3lo,$two96

	fsub	$h0hi,$h0hi,$two0
	fsub	$h1hi,$h1hi,$two32
	fsub	$h2hi,$h2hi,$two64
	fsub	$h3hi,$h3hi,$two96

	lfd	$two0,8*10($len)	# more constants
	lfd	$two32,8*11($len)
	lfd	$two64,8*12($len)

	fsub	$h0lo,$h0lo,$h0hi
	fsub	$h1lo,$h1lo,$h1hi
	fsub	$h2lo,$h2lo,$h2hi
	fsub	$h3lo,$h3lo,$h3hi

	stfd	$h0hi,8*5($ctx)		# r0hi
	stfd	$h1hi,8*7($ctx)		# r1hi
	stfd	$h2hi,8*9($ctx)		# r2hi
	stfd	$h3hi,8*11($ctx)	# r3hi

	stfd	$h0lo,8*4($ctx)		# r0lo
	stfd	$h1lo,8*6($ctx)		# r1lo
	stfd	$h2lo,8*8($ctx)		# r2lo
	stfd	$h3lo,8*10($ctx)	# r3lo

	lfd	$h1lo,8*12($ctx)	# s1
	lfd	$h2lo,8*13($ctx)	# s2
	lfd	$h3lo,8*14($ctx)	# s3
	lfd	$h0lo,8*15($ctx)	# pull original fpscr

	fadd	$h1hi,$h1lo,$two0
	fadd	$h2hi,$h2lo,$two32
	fadd	$h3hi,$h3lo,$two64

	fsub	$h1hi,$h1hi,$two0
	fsub	$h2hi,$h2hi,$two32
	fsub	$h3hi,$h3hi,$two64

	fsub	$h1lo,$h1lo,$h1hi
	fsub	$h2lo,$h2lo,$h2hi
	fsub	$h3lo,$h3lo,$h3hi

	stfd	$h1hi,8*13($ctx)	# s1hi
	stfd	$h2hi,8*15($ctx)	# s2hi
	stfd	$h3hi,8*17($ctx)	# s3hi

	stfd	$h1lo,8*12($ctx)	# s1lo
	stfd	$h2lo,8*14($ctx)	# s2lo
	stfd	$h3lo,8*16($ctx)	# s3lo

	mtfsf	255,$h0lo		# restore fpscr
Lno_key:
	xor	r3,r3,r3
	addi	$sp,$sp,$LOCALS
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,2,0
.size	.poly1305_init_fpu,.-.poly1305_init_fpu

.globl	.poly1305_blocks_fpu
.align	4
.poly1305_blocks_fpu:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	stfd	f14,`$FRAME-8*18`($sp)
	stfd	f15,`$FRAME-8*17`($sp)
	stfd	f16,`$FRAME-8*16`($sp)
	stfd	f17,`$FRAME-8*15`($sp)
	stfd	f18,`$FRAME-8*14`($sp)
	stfd	f19,`$FRAME-8*13`($sp)
	stfd	f20,`$FRAME-8*12`($sp)
	stfd	f21,`$FRAME-8*11`($sp)
	stfd	f22,`$FRAME-8*10`($sp)
	stfd	f23,`$FRAME-8*9`($sp)
	stfd	f24,`$FRAME-8*8`($sp)
	stfd	f25,`$FRAME-8*7`($sp)
	stfd	f26,`$FRAME-8*6`($sp)
	stfd	f27,`$FRAME-8*5`($sp)
	stfd	f28,`$FRAME-8*4`($sp)
	stfd	f29,`$FRAME-8*3`($sp)
	stfd	f30,`$FRAME-8*2`($sp)
	stfd	f31,`$FRAME-8*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	xor	r0,r0,r0
	li	$in3,1
	mtctr	$len
	neg	$len,$len
	stw	r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
	stw	$in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)

	lfd	$two0,8*18($ctx)	# load constants
	lfd	$two32,8*19($ctx)
	lfd	$two64,8*20($ctx)
	lfd	$two96,8*21($ctx)
	lfd	$two130,8*22($ctx)
	lfd	$five_two130,8*23($ctx)

	lfd	$h0lo,8*0($ctx)		# load [biased] hash value
	lfd	$h1lo,8*1($ctx)
	lfd	$h2lo,8*2($ctx)
	lfd	$h3lo,8*3($ctx)

	stfd	$two0,`$LOCALS+8*0`($sp)	# input "template"
	oris	$in3,$padbit,`(1023+52+96)<<4`
	stfd	$two32,`$LOCALS+8*1`($sp)
	stfd	$two64,`$LOCALS+8*2`($sp)
	stw	$in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)

	li	$i1,4
	li	$i2,8
	li	$i3,12
	$LWXLE	$in0,0,$inp		# load input
	$LWXLE	$in1,$i1,$inp
	$LWXLE	$in2,$i2,$inp
	$LWXLE	$in3,$i3,$inp
	addi	$inp,$inp,16

	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)	# fill "template"
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

	mffs	$x0			# original fpscr
	lfd	$x1,`$LOCALS+8*4`($sp)	# new fpscr
	lfd	$r0lo,8*4($ctx)		# load key
	lfd	$r0hi,8*5($ctx)
	lfd	$r1lo,8*6($ctx)
	lfd	$r1hi,8*7($ctx)
	lfd	$r2lo,8*8($ctx)
	lfd	$r2hi,8*9($ctx)
	lfd	$r3lo,8*10($ctx)
	lfd	$r3hi,8*11($ctx)
	lfd	$s1lo,8*12($ctx)
	lfd	$s1hi,8*13($ctx)
	lfd	$s2lo,8*14($ctx)
	lfd	$s2hi,8*15($ctx)
	lfd	$s3lo,8*16($ctx)
	lfd	$s3hi,8*17($ctx)

	stfd	$x0,`$LOCALS+8*4`($sp)	# save original fpscr
	mtfsf	255,$x1

	addic	$len,$len,1
	addze	r0,r0
	slwi.	r0,r0,4
	sub	$inp,$inp,r0		# conditional rewind

	lfd	$x0,`$LOCALS+8*0`($sp)
	lfd	$x1,`$LOCALS+8*1`($sp)
	lfd	$x2,`$LOCALS+8*2`($sp)
	lfd	$x3,`$LOCALS+8*3`($sp)

	fsub	$h0lo,$h0lo,$two0	# de-bias hash value
	$LWXLE	$in0,0,$inp		# modulo-scheduled input load
	fsub	$h1lo,$h1lo,$two32
	$LWXLE	$in1,$i1,$inp
	fsub	$h2lo,$h2lo,$two64
	$LWXLE	$in2,$i2,$inp
	fsub	$h3lo,$h3lo,$two96
	$LWXLE	$in3,$i3,$inp

	fsub	$x0,$x0,$two0		# de-bias input
	addi	$inp,$inp,16
	fsub	$x1,$x1,$two32
	fsub	$x2,$x2,$two64
	fsub	$x3,$x3,$two96

	fadd	$x0,$x0,$h0lo		# accumulate input
	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x1,$x1,$h1lo
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x2,$x2,$h2lo
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x3,$x3,$h3lo
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

	b	Lentry

.align	4
Loop:
	fsub	$y0,$y0,$two0		# de-bias input
	addic	$len,$len,1
	fsub	$y1,$y1,$two32
	addze	r0,r0
	fsub	$y2,$y2,$two64
	slwi.	r0,r0,4
	fsub	$y3,$y3,$two96
	sub	$inp,$inp,r0		# conditional rewind

	fadd	$h0lo,$h0lo,$y0		# accumulate input
	fadd	$h0hi,$h0hi,$y1
	fadd	$h2lo,$h2lo,$y2
	fadd	$h2hi,$h2hi,$y3

	######################################### base 2^48 -> base 2^32
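	# At this point each limb pair has absorbed several products and can
	# exceed 32 bits of its base position. The fadd/fsub pairs against
	# two32/two64/two96/two130 below extract each limb's overflow (again
	# relying on truncating rounding), propagate it into the next limb,
	# and fold the part above 2^130 back into limb 0 scaled by 5/2^130,
	# i.e. reduce modulo 2^130-5.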
	fadd	$c1lo,$h1lo,$two64
	$LWXLE	$in0,0,$inp		# modulo-scheduled input load
	fadd	$c1hi,$h1hi,$two64
	$LWXLE	$in1,$i1,$inp
	fadd	$c3lo,$h3lo,$two130
	$LWXLE	$in2,$i2,$inp
	fadd	$c3hi,$h3hi,$two130
	$LWXLE	$in3,$i3,$inp
	fadd	$c0lo,$h0lo,$two32
	addi	$inp,$inp,16
	fadd	$c0hi,$h0hi,$two32
	fadd	$c2lo,$h2lo,$two96
	fadd	$c2hi,$h2hi,$two96

	fsub	$c1lo,$c1lo,$two64
	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)	# fill "template"
	fsub	$c1hi,$c1hi,$two64
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c3lo,$c3lo,$two130
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c3hi,$c3hi,$two130
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c0lo,$c0lo,$two32
	fsub	$c0hi,$c0hi,$two32
	fsub	$c2lo,$c2lo,$two96
	fsub	$c2hi,$c2hi,$two96

	fsub	$h1lo,$h1lo,$c1lo
	fsub	$h1hi,$h1hi,$c1hi
	fsub	$h3lo,$h3lo,$c3lo
	fsub	$h3hi,$h3hi,$c3hi
	fsub	$h2lo,$h2lo,$c2lo
	fsub	$h2hi,$h2hi,$c2hi
	fsub	$h0lo,$h0lo,$c0lo
	fsub	$h0hi,$h0hi,$c0hi

	fadd	$h1lo,$h1lo,$c0lo
	fadd	$h1hi,$h1hi,$c0hi
	fadd	$h3lo,$h3lo,$c2lo
	fadd	$h3hi,$h3hi,$c2hi
	fadd	$h2lo,$h2lo,$c1lo
	fadd	$h2hi,$h2hi,$c1hi
	fmadd	$h0lo,$c3lo,$five_two130,$h0lo
	fmadd	$h0hi,$c3hi,$five_two130,$h0hi

	fadd	$x1,$h1lo,$h1hi
	lfd	$s1lo,8*12($ctx)	# reload constants
	fadd	$x3,$h3lo,$h3hi
	lfd	$s1hi,8*13($ctx)
	fadd	$x2,$h2lo,$h2hi
	lfd	$r3lo,8*10($ctx)
	fadd	$x0,$h0lo,$h0hi
	lfd	$r3hi,8*11($ctx)
Lentry:
	fmul	$h0lo,$s3lo,$x1
	fmul	$h0hi,$s3hi,$x1
	fmul	$h2lo,$r1lo,$x1
	fmul	$h2hi,$r1hi,$x1
	fmul	$h1lo,$r0lo,$x1
	fmul	$h1hi,$r0hi,$x1
	fmul	$h3lo,$r2lo,$x1
	fmul	$h3hi,$r2hi,$x1

	fmadd	$h0lo,$s1lo,$x3,$h0lo
	fmadd	$h0hi,$s1hi,$x3,$h0hi
	fmadd	$h2lo,$s3lo,$x3,$h2lo
	fmadd	$h2hi,$s3hi,$x3,$h2hi
	fmadd	$h1lo,$s2lo,$x3,$h1lo
	fmadd	$h1hi,$s2hi,$x3,$h1hi
	fmadd	$h3lo,$r0lo,$x3,$h3lo
	fmadd	$h3hi,$r0hi,$x3,$h3hi

	fmadd	$h0lo,$s2lo,$x2,$h0lo
	fmadd	$h0hi,$s2hi,$x2,$h0hi
	fmadd	$h2lo,$r0lo,$x2,$h2lo
	fmadd	$h2hi,$r0hi,$x2,$h2hi
	fmadd	$h1lo,$s3lo,$x2,$h1lo
	fmadd	$h1hi,$s3hi,$x2,$h1hi
	fmadd	$h3lo,$r1lo,$x2,$h3lo
	fmadd	$h3hi,$r1hi,$x2,$h3hi

	fmadd	$h0lo,$r0lo,$x0,$h0lo
	lfd	$y0,`$LOCALS+8*0`($sp)	# load [biased] input
	fmadd	$h0hi,$r0hi,$x0,$h0hi
	lfd	$y1,`$LOCALS+8*1`($sp)
	fmadd	$h2lo,$r2lo,$x0,$h2lo
	lfd	$y2,`$LOCALS+8*2`($sp)
	fmadd	$h2hi,$r2hi,$x0,$h2hi
	lfd	$y3,`$LOCALS+8*3`($sp)
	fmadd	$h1lo,$r1lo,$x0,$h1lo
	fmadd	$h1hi,$r1hi,$x0,$h1hi
	fmadd	$h3lo,$r3lo,$x0,$h3lo
	fmadd	$h3hi,$r3hi,$x0,$h3hi

	bdnz	Loop

	######################################### base 2^48 -> base 2^32
	fadd	$c0lo,$h0lo,$two32
	fadd	$c0hi,$h0hi,$two32
	fadd	$c2lo,$h2lo,$two96
	fadd	$c2hi,$h2hi,$two96
	fadd	$c1lo,$h1lo,$two64
	fadd	$c1hi,$h1hi,$two64
	fadd	$c3lo,$h3lo,$two130
	fadd	$c3hi,$h3hi,$two130

	fsub	$c0lo,$c0lo,$two32
	fsub	$c0hi,$c0hi,$two32
	fsub	$c2lo,$c2lo,$two96
	fsub	$c2hi,$c2hi,$two96
	fsub	$c1lo,$c1lo,$two64
	fsub	$c1hi,$c1hi,$two64
	fsub	$c3lo,$c3lo,$two130
	fsub	$c3hi,$c3hi,$two130

	fsub	$h1lo,$h1lo,$c1lo
	fsub	$h1hi,$h1hi,$c1hi
	fsub	$h3lo,$h3lo,$c3lo
	fsub	$h3hi,$h3hi,$c3hi
	fsub	$h2lo,$h2lo,$c2lo
	fsub	$h2hi,$h2hi,$c2hi
	fsub	$h0lo,$h0lo,$c0lo
	fsub	$h0hi,$h0hi,$c0hi

	fadd	$h1lo,$h1lo,$c0lo
	fadd	$h1hi,$h1hi,$c0hi
	fadd	$h3lo,$h3lo,$c2lo
	fadd	$h3hi,$h3hi,$c2hi
	fadd	$h2lo,$h2lo,$c1lo
	fadd	$h2hi,$h2hi,$c1hi
	fmadd	$h0lo,$c3lo,$five_two130,$h0lo
	fmadd	$h0hi,$c3hi,$five_two130,$h0hi

	fadd	$x1,$h1lo,$h1hi
	fadd	$x3,$h3lo,$h3hi
	fadd	$x2,$h2lo,$h2hi
	fadd	$x0,$h0lo,$h0hi

	lfd	$h0lo,`$LOCALS+8*4`($sp)	# pull saved fpscr
	fadd	$x1,$x1,$two32		# bias
	fadd	$x3,$x3,$two96
	fadd	$x2,$x2,$two64
	fadd	$x0,$x0,$two0

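	# The hash is written back re-biased (2^(52+32k) added to limb k) and
	# only partially reduced; the final reduction is left to
	# poly1305_emit_fpu.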
	stfd	$x1,8*1($ctx)		# store [biased] hash value
	stfd	$x3,8*3($ctx)
	stfd	$x2,8*2($ctx)
	stfd	$x0,8*0($ctx)

	mtfsf	255,$h0lo		# restore original fpscr
	lfd	f14,`$FRAME-8*18`($sp)
	lfd	f15,`$FRAME-8*17`($sp)
	lfd	f16,`$FRAME-8*16`($sp)
	lfd	f17,`$FRAME-8*15`($sp)
	lfd	f18,`$FRAME-8*14`($sp)
	lfd	f19,`$FRAME-8*13`($sp)
	lfd	f20,`$FRAME-8*12`($sp)
	lfd	f21,`$FRAME-8*11`($sp)
	lfd	f22,`$FRAME-8*10`($sp)
	lfd	f23,`$FRAME-8*9`($sp)
	lfd	f24,`$FRAME-8*8`($sp)
	lfd	f25,`$FRAME-8*7`($sp)
	lfd	f26,`$FRAME-8*6`($sp)
	lfd	f27,`$FRAME-8*5`($sp)
	lfd	f28,`$FRAME-8*4`($sp)
	lfd	f29,`$FRAME-8*3`($sp)
	lfd	f30,`$FRAME-8*2`($sp)
	lfd	f31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,4,0
.size	.poly1305_blocks_fpu,.-.poly1305_blocks_fpu
___
{
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..11,28..31));
my $mask = "r0";
my $FRAME = (6+4)*$SIZE_T;
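# Minimal frame (six pointer-sized slots) plus room to save r28-r31.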

$code.=<<___;
.globl	.poly1305_emit_fpu
.align	4
.poly1305_emit_fpu:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx)	# load hash
	lwz	$h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)

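	# Each double stored by poly1305_blocks_fpu carries the limb in its
	# low 32-bit word (h0..h3) and the bias exponent plus any limb
	# overflow in its high word (d0..d3). Mask off the exponent bits and
	# treat what is left as carries into the next limb.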
	lis	$mask,0xfff0
	andc	$d0,$d0,$mask		# mask exponent
	andc	$d1,$d1,$mask
	andc	$d2,$d2,$mask
	andc	$d3,$d3,$mask		# can be partially reduced...
	li	$mask,3

	srwi	$padbit,$d3,2		# ... so reduce
	and	$h4,$d3,$mask
	andc	$d3,$d3,$mask
	add	$d3,$d3,$padbit
___
	if ($SIZE_T==4) {
$code.=<<___;
	addc	$h0,$h0,$d3
	adde	$h1,$h1,$d0
	adde	$h2,$h2,$d1
	adde	$h3,$h3,$d2
	addze	$h4,$h4

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2
	addze	$d3,$h3
	addze	$mask,$h4

	srwi	$mask,$mask,2		# did it carry/borrow?
	neg	$mask,$mask
	srawi	$mask,$mask,31		# mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	lwz	$d0,0($nonce)		# load nonce
	andc	$h2,$h2,$mask
	and	$d2,$d2,$mask
	or	$h1,$h1,$d1
	lwz	$d1,4($nonce)
	andc	$h3,$h3,$mask
	and	$d3,$d3,$mask
	or	$h2,$h2,$d2
	lwz	$d2,8($nonce)
	or	$h3,$h3,$d3
	lwz	$d3,12($nonce)

	addc	$h0,$h0,$d0		# accumulate nonce
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2
	adde	$h3,$h3,$d3
___
	} else {
$code.=<<___;
	add	$h0,$h0,$d3
	add	$h1,$h1,$d0
	add	$h2,$h2,$d1
	add	$h3,$h3,$d2

	srdi	$d0,$h0,32
	add	$h1,$h1,$d0
	srdi	$d1,$h1,32
	add	$h2,$h2,$d1
	srdi	$d2,$h2,32
	add	$h3,$h3,$d2
	srdi	$d3,$h3,32
	add	$h4,$h4,$d3

	insrdi	$h0,$h1,32,0
	insrdi	$h2,$h3,32,0

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h2
	addze	$d2,$h4

	srdi	$mask,$d2,2		# did it carry/borrow?
	neg	$mask,$mask
	sradi	$mask,$mask,63		# mask
	ld	$d2,0($nonce)		# load nonce
	ld	$d3,8($nonce)

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h2,$h2,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	or	$h2,$h2,$d1
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	rotldi	$d2,$d2,32		# flip nonce words
	rotldi	$d3,$d3,32
___
$code.=<<___;
	addc	$h0,$h0,$d2		# accumulate nonce
	adde	$h2,$h2,$d3

	srdi	$h1,$h0,32
	srdi	$h3,$h2,32
___
	}
$code.=<<___ if ($LITTLE_ENDIAN);
	stw	$h0,0($mac)		# write result
	stw	$h1,4($mac)
	stw	$h2,8($mac)
	stw	$h3,12($mac)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$d1,4
	stwbrx	$h0,0,$mac		# write result
	li	$d2,8
	stwbrx	$h1,$d1,$mac
	li	$d3,12
	stwbrx	$h2,$d2,$mac
	stwbrx	$h3,$d3,$mac
___
$code.=<<___;
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,3,0
.size	.poly1305_emit_fpu,.-.poly1305_emit_fpu
___
}
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$len	# vvvvvv "distance" between . and 1st data entry
	addi	$len,$len,`64-8`	# borrow $len
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`

.quad	0x4330000000000000	# 2^(52+0)
.quad	0x4530000000000000	# 2^(52+32)
.quad	0x4730000000000000	# 2^(52+64)
.quad	0x4930000000000000	# 2^(52+96)
.quad	0x4b50000000000000	# 2^(52+130)

.quad	0x37f4000000000000	# 5/2^130

.quad	0x4430000000000000	# 2^(52+16+0)
.quad	0x4630000000000000	# 2^(52+16+32)
.quad	0x4830000000000000	# 2^(52+16+64)
.quad	0x4a30000000000000	# 2^(52+16+96)
.quad	0x3e30000000000000	# 2^(52+16+0-96)
.quad	0x4030000000000000	# 2^(52+16+32-96)
.quad	0x4230000000000000	# 2^(52+16+64-96)

.quad	0x0000000000000001	# fpscr: truncate, no exceptions
.asciz	"Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";