#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
# project. The module is dual licensed under OpenSSL and CRYPTOGAMS
# licenses depending on where you obtain it. For further details see
# https://github.com/dot-asm/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
#			-m32		-m64
#
# Freescale e300	14.8/+80%	-
# PPC74x0		7.60/+60%	-
# PPC970		7.00/+114%	3.51/+205%
# POWER7		3.75/+260%	1.93/+100%
# POWER8		-		2.03/+200%
# POWER9		-		2.00/+150%
#
# Do we need a floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
# compiler-generated code. On the other hand it's known that floating-
# point performance can be dominated by FPU latency, which means that
# there is a limit even for ideally optimized (and even vectorized)
# code. And this limit is estimated to be higher than the -m64 results
# above. In other words, a floating-point implementation is meaningful
# to consider only in a 32-bit application context. We probably have to
# recognize that 32-bit builds are getting less popular on high-end
# systems and therefore tend to target embedded ones, which might not
# even have an FPU...
#
# On a side note, Power ISA 2.07 enables a vector base 2^26
# implementation, and POWER8 might have the capacity to break the
# 1.0 cycle per byte barrier...
#
# January 2019
#
# ... Unfortunately not:-( The estimate was a projection of the ARM
# result, but ARM has a vector multiply-and-add instruction, while
# PowerISA does not, at least not one usable in this context.
# Improvement is ~40% over the -m64 results above, at ~1.43 cycles
# per byte on little-endian systems.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour, e.g. linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";
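
# Typical invocation (a sketch; the exact flavour list is defined by
# ppc-xlate.pl and the build system, so treat these command lines as
# examples rather than the only supported forms):
#
#   perl poly1305-ppc.pl linux64le poly1305-ppc.s
#   perl poly1305-ppc.pl linux32 poly1305-ppc.s
#
# The flavour picks ABI word size and endianness above, and the emitted
# code is piped through ppc-xlate.pl into the named output file.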

$FRAME=24*$SIZE_T;

$sp="r1";
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($mac,$nonce)=($inp,$len);
my $mask = "r0";

$code=<<___;
.machine	"any"
.text
___
if ($flavour =~ /64/) {
###############################################################################
# base 2^64 implementation

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));

$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	std	r0,0($ctx)		# zero hash value
	std	r0,8($ctx)
	std	r0,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	$UCMP	$inp,r0
	beq-	Lno_key
___
$code.=<<___ if ($LITTLE_ENDIAN);
	ld	$d0,0($inp)		# load key material
	ld	$d1,8($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$h0,4
	lwbrx	$d0,0,$inp		# load key material
	li	$d1,8
	lwbrx	$h0,$h0,$inp
	li	$h1,12
	lwbrx	$d1,$d1,$inp
	lwbrx	$h1,$h1,$inp
	insrdi	$d0,$h0,32,0
	insrdi	$d1,$h1,32,0
___
$code.=<<___;
	lis	$h1,0xfff		# 0x0fff0000
	ori	$h1,$h1,0xfffc		# 0x0ffffffc
	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
	ori	$h0,$h1,3		# 0x0ffffffc0fffffff

	and	$d0,$d0,$h0
	and	$d1,$d1,$h1

	std	$d0,32($ctx)		# store key
	std	$d1,40($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int
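
# The two masks above implement Poly1305 key clamping (cf. RFC 7539):
# r &= 0x0ffffffc0ffffffc0ffffffc0fffffff. A minimal Perl sketch of the
# same step (kept in a comment so it has no effect on the generated code;
# the key halves are arbitrary example values, and a 64-bit perl is
# assumed):
#
#   my \$k_lo = 0x0123456789abcdef;		# example low key half
#   my \$k_hi = 0xfedcba9876543210;		# example high key half
#   my \$r_lo = \$k_lo & 0x0ffffffc0fffffff;	# clamp low 64 bits
#   my \$r_hi = \$k_hi & 0x0ffffffc0ffffffc;	# clamp high 64 bits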

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
Lpoly1305_blocks:
	srdi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	ld	$r0,32($ctx)		# load key
	ld	$r1,40($ctx)

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	srdi	$s1,$r1,2
	mtctr	$len
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
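	# s1 works because clamping forces r1 to be a multiple of 4 while
	# 2^130 mod (2^130-5) = 5, so the h1*r1 cross term at weight 2^128
	# folds back to weight 0 as h1*(5*r1/4) = h1*(r1 + (r1>>2)) = h1*s1.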
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___ if ($LITTLE_ENDIAN);
	ld	$t0,0($inp)		# load input
	ld	$t1,8($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$d0,4
	lwbrx	$t0,0,$inp		# load input
	li	$t1,8
	lwbrx	$d0,$d0,$inp
	li	$d1,12
	lwbrx	$t1,$t1,$inp
	lwbrx	$d1,$d1,$inp
	insrdi	$t0,$d0,32,0
	insrdi	$t1,$d1,32,0
___
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$t0		# accumulate input
	adde	$h1,$h1,$t1

	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0
	adde	$h2,$h2,$padbit

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2
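	# mask = 3, so h2 keeps its low 2 bits, while t0 = d2 & ~3 satisfies
	# t0 + (t0>>2) = 5*(d2>>2): everything at weight 2^130 and above
	# re-enters the accumulator multiplied by 5.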

	bdnz	Loop

	std	$h0,0($ctx)		# store hash value
	std	$h1,8($ctx)
	std	$h2,16($ctx)

	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,5,4,0
.size	.poly1305_blocks,.-.poly1305_blocks
___
{
my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12));

$code.=<<___;
.globl	.poly1305_emit
.align	5
.poly1305_emit:
	lwz	$h0,0($ctx)		# load hash value base 2^26
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)
	lwz	r0,24($ctx)		# is_base2_26

	sldi	$h1,$h1,26		# base 2^26 -> base 2^64
	sldi	$t0,$h2,52
	srdi	$h2,$h2,12
	sldi	$h3,$h3,14
	add	$h0,$h0,$h1
	addc	$h0,$h0,$t0
	sldi	$t0,$h4,40
	srdi	$h4,$h4,24
	adde	$h1,$h2,$h3
	addc	$h1,$h1,$t0
	addze	$h2,$h4

	ld	$h3,0($ctx)		# load hash value base 2^64
	ld	$h4,8($ctx)
	ld	$t0,16($ctx)

	neg	r0,r0
	xor	$h0,$h0,$h3		# choose between radixes
	xor	$h1,$h1,$h4
	xor	$h2,$h2,$t0
	and	$h0,$h0,r0
	and	$h1,$h1,r0
	and	$h2,$h2,r0
	xor	$h0,$h0,$h3
	xor	$h1,$h1,$h4
	xor	$h2,$h2,$t0

	addic	$h3,$h0,5		# compare to modulus
	addze	$h4,$h1
	addze	$t0,$h2

	srdi	$t0,$t0,2		# see if it carried/borrowed
	neg	$t0,$t0
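	# If h+5 carried into bit 130, then h was >= 2^130-5 and the reduced
	# result is h+5 mod 2^130; t0 is now an all-ones/all-zeros mask,
	# consumed below to select between h and h+5 without branching.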

	andc	$h0,$h0,$t0
	and	$h3,$h3,$t0
	andc	$h1,$h1,$t0
	and	$h4,$h4,$t0
	or	$h0,$h0,$h3
	or	$h1,$h1,$h4

	lwz	$t0,4($nonce)
	lwz	$h2,12($nonce)
	lwz	$h3,0($nonce)
	lwz	$h4,8($nonce)

	insrdi	$h3,$t0,32,0
	insrdi	$h4,$h2,32,0

	addc	$h0,$h0,$h3		# accumulate nonce
	adde	$h1,$h1,$h4

	addi	$ctx,$mac,-1
	addi	$mac,$mac,7

	stbu	$h0,1($ctx)		# write [little-endian] result
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	stbu	$h1,1($mac)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
} } else {
###############################################################################
# base 2^32 implementation

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
    $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..12,14..31));

$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	stw	r0,0($ctx)		# zero hash value
	stw	r0,4($ctx)
	stw	r0,8($ctx)
	stw	r0,12($ctx)
	stw	r0,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	$UCMP	$inp,r0
	beq-	Lno_key
___
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$h1,4
	lwbrx	$h0,0,$inp		# load key material
	li	$h2,8
	lwbrx	$h1,$h1,$inp
	li	$h3,12
	lwbrx	$h2,$h2,$inp
	lwbrx	$h3,$h3,$inp
___
$code.=<<___;
	lis	$mask,0xf000		# 0xf0000000
	li	$r0,-4
	andc	$r0,$r0,$mask		# 0x0ffffffc

	andc	$h0,$h0,$mask
	and	$h1,$h1,$r0
	and	$h2,$h2,$r0
	and	$h3,$h3,$r0

	stw	$h0,32($ctx)		# store key
	stw	$h1,36($ctx)
	stw	$h2,40($ctx)
	stw	$h3,44($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
Lpoly1305_blocks:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$r0,32($ctx)		# load key
	lwz	$r1,36($ctx)
	lwz	$r2,40($ctx)
	lwz	$r3,44($ctx)

	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	srwi	$s1,$r1,2
	srwi	$s2,$r2,2
	srwi	$s3,$r3,2
	add	$s1,$s1,$r1		# si = ri + ri>>2
	add	$s2,$s2,$r2
	add	$s3,$s3,$r3
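	# Multiplication schedule for h*r mod 2^130-5 in base 2^32, with
	# si = 5*ri/4 (valid since clamping makes ri a multiple of 4) and
	# h4 only a couple of bits wide, so h4*r0 may stay at weight 2^128:
	#
	# d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1
	# d1 = h1*r0 + h0*r1 + h2*s3 + h3*s2 + h4*s1
	# d2 = h2*r0 + h1*r1 + h0*r2 + h3*s3 + h4*s2
	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*s3
	# d4 = h4*r0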
	mtctr	$len
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$d0,0($inp)		# load input
	lwz	$d1,4($inp)
	lwz	$d2,8($inp)
	lwz	$d3,12($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$d1,4
	lwbrx	$d0,0,$inp		# load input
	li	$d2,8
	lwbrx	$d1,$d1,$inp
	li	$d3,12
	lwbrx	$d2,$d2,$inp
	lwbrx	$d3,$d3,$inp
___
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$d0		# accumulate input
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2

	mullw	$d0,$h0,$r0		# h0*r0
	mulhwu	$D0,$h0,$r0

	mullw	$d1,$h0,$r1		# h0*r1
	mulhwu	$D1,$h0,$r1

	mullw	$d2,$h0,$r2		# h0*r2
	mulhwu	$D2,$h0,$r2

	adde	$h3,$h3,$d3
	adde	$h4,$h4,$padbit

	mullw	$d3,$h0,$r3		# h0*r3
	mulhwu	$D3,$h0,$r3

	mullw	$t0,$h1,$s3		# h1*s3
	mulhwu	$t1,$h1,$s3

	mullw	$t2,$h1,$r0		# h1*r0
	mulhwu	$t3,$h1,$r0
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h1,$r1		# h1*r1
	mulhwu	$t1,$h1,$r1
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h1,$r2		# h1*r2
	mulhwu	$t3,$h1,$r2
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h2,$s2		# h2*s2
	mulhwu	$t1,$h2,$s2
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h2,$s3		# h2*s3
	mulhwu	$t3,$h2,$s3
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h2,$r0		# h2*r0
	mulhwu	$t1,$h2,$r0
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h2,$r1		# h2*r1
	mulhwu	$t3,$h2,$r1
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h3,$s1		# h3*s1
	mulhwu	$t1,$h3,$s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h3,$s2		# h3*s2
	mulhwu	$t3,$h3,$s2
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h3,$s3		# h3*s3
	mulhwu	$t1,$h3,$s3
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h3,$r0		# h3*r0
	mulhwu	$t3,$h3,$r0
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h4,$s1		# h4*s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3
	addc	$d1,$d1,$t0

	mullw	$t1,$h4,$s2		# h4*s2
	addze	$D1,$D1
	addc	$d2,$d2,$t1
	addze	$D2,$D2

	mullw	$t2,$h4,$s3		# h4*s3
	addc	$d3,$d3,$t2
	addze	$D3,$D3

	mullw	$h4,$h4,$r0		# h4*r0

	addc	$h1,$d1,$D0
	adde	$h2,$d2,$D1
	adde	$h3,$d3,$D2
	adde	$h4,$h4,$D3

	andc	$D0,$h4,$mask		# final reduction step
	and	$h4,$h4,$mask
	srwi	$D1,$D0,2
	add	$D0,$D0,$D1
	addc	$h0,$d0,$D0
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

	stw	$h0,0($ctx)		# store hash value
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)

	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
.size	.poly1305_blocks,.-.poly1305_blocks
___
{
my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12));

$code.=<<___;
.globl	.poly1305_emit
.align	5
.poly1305_emit:
	lwz	r0,24($ctx)		# is_base2_26
	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)
	cmplwi	r0,0
	beq	Lemit_base2_32

	slwi	$t0,$h1,26		# base 2^26 -> base 2^32
	srwi	$h1,$h1,6
	slwi	$t1,$h2,20
	srwi	$h2,$h2,12
	addc	$h0,$h0,$t0
	slwi	$t0,$h3,14
	srwi	$h3,$h3,18
	adde	$h1,$h1,$t1
	slwi	$t1,$h4,8
	srwi	$h4,$h4,24
	adde	$h2,$h2,$t0
	adde	$h3,$h3,$t1
	addze	$h4,$h4

Lemit_base2_32:
	addic	r0,$h0,5		# compare to modulus
	addze	r0,$h1
	addze	r0,$h2
	addze	r0,$h3
	addze	r0,$h4

	srwi	r0,r0,2			# see if it carried/borrowed
	neg	r0,r0
	andi.	r0,r0,5

	addc	$h0,$h0,r0
	lwz	r0,0($nonce)
	addze	$h1,$h1
	lwz	$t0,4($nonce)
	addze	$h2,$h2
	lwz	$t1,8($nonce)
	addze	$h3,$h3
	lwz	$h4,12($nonce)

	addc	$h0,$h0,r0		# accumulate nonce
	adde	$h1,$h1,$t0
	adde	$h2,$h2,$t1
	adde	$h3,$h3,$h4

	addi	$ctx,$mac,-1
	addi	$mac,$mac,7

	stbu	$h0,1($ctx)		# write [little-endian] result
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	stbu	$h2,1($mac)

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	stbu	$h3,1($mac)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
} }
{{{
########################################################################
# PowerISA 2.07/VSX section						#
########################################################################

my $LOCALS= 6*$SIZE_T;
my $VSXFRAME = $LOCALS + 6*$SIZE_T;
   $VSXFRAME += 128;	# local variables
   $VSXFRAME += 13*16;	# v20-v31 offload

my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0;

########################################################################
# Layout of the opaque area is as follows:
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 pad;
#	unsigned __int32 is_base2_26, pad;
#	unsigned __int64 r[2];		# key value base 2^64
#	struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9];
#
# where r^n are base 2^26 digits of powers of the multiplier key. There
# are 5 digits, but the last four are interleaved with multiples of 5,
# totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
# The order of powers is as they appear in registers, not in memory.
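#
# A small illustrative sketch in Perl (commented out, since this script
# must print only assembly) of how one power of r is split into five
# base 2^26 digits and how the interleaved multiples of 5 arise; the
# input value is an arbitrary example, not a real key power:
#
#   use Math::BigInt;
#   my $r = Math::BigInt->new("0x3456789abcdef0123456789abcdef");
#   my @d = map { ($r >> (26*$_)) & 0x3ffffff } (0..4);   # 5 x 26-bit digits
#   my @tab = ($d[0], map { ($d[$_], 5*$d[$_]) } (1..4)); # 9-element table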

my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4));
my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9));
my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14));
my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2);
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19));
my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24));
my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31));
my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31)));
my ($ctx_,$_ctx,$const) = map("r$_",(10..12));

if ($flavour =~ /64/) {
###############################################################################
# setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms,
# but the base 2^26 computational part is the same...

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31));
my $mask = "r0";

$code.=<<___;
.globl	.poly1305_blocks_vsx
.align	5
.poly1305_blocks_vsx:
	lwz	r7,24($ctx)		# is_base2_26
	cmpldi	$len,128
	bge	__poly1305_blocks_vsx

	neg	r0,r7			# is_base2_26 as mask
	lwz	r7,0($ctx)		# load hash base 2^26
	lwz	r8,4($ctx)
	lwz	r9,8($ctx)
	lwz	r10,12($ctx)
	lwz	r11,16($ctx)

	sldi	r8,r8,26		# base 2^26 -> base 2^64
	sldi	r12,r9,52
	add	r7,r7,r8
	srdi	r9,r9,12
	sldi	r10,r10,14
	addc	r7,r7,r12
	sldi	r8,r11,40
	adde	r9,r9,r10
	srdi	r11,r11,24
	addc	r9,r9,r8
	addze	r11,r11

	ld	r8,0($ctx)		# load hash base 2^64
	ld	r10,8($ctx)
	ld	r12,16($ctx)

	xor	r7,r7,r8		# select between radixes
	xor	r9,r9,r10
	xor	r11,r11,r12
	and	r7,r7,r0
	and	r9,r9,r0
	and	r11,r11,r0
	xor	r7,r7,r8
	xor	r9,r9,r10
	xor	r11,r11,r12

	li	r0,0
	std	r7,0($ctx)		# store hash base 2^64
	std	r9,8($ctx)
	std	r11,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	b	Lpoly1305_blocks
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
.size	.poly1305_blocks_vsx,.-.poly1305_blocks_vsx

.align	5
__poly1305_mul:
	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_mul,.-__poly1305_mul

.align	5
__poly1305_splat:
	extrdi	$d0,$h0,26,38
	extrdi	$d1,$h0,26,12
	stw	$d0,0x00($t1)

	extrdi	$d2,$h0,12,0
	slwi	$d0,$d1,2
	stw	$d1,0x10($t1)
	add	$d0,$d0,$d1		# * 5
	stw	$d0,0x20($t1)

	insrdi	$d2,$h1,14,38
	slwi	$d0,$d2,2
	stw	$d2,0x30($t1)
	add	$d0,$d0,$d2		# * 5
	stw	$d0,0x40($t1)

	extrdi	$d1,$h1,26,24
	extrdi	$d2,$h1,24,0
	slwi	$d0,$d1,2
	stw	$d1,0x50($t1)
	add	$d0,$d0,$d1		# * 5
	stw	$d0,0x60($t1)

	insrdi	$d2,$h2,3,37
	slwi	$d0,$d2,2
	stw	$d2,0x70($t1)
	add	$d0,$d0,$d2		# * 5
	stw	$d0,0x80($t1)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_splat,.-__poly1305_splat
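
# __poly1305_splat slices the 130-bit value held in h0:h1:h2 into five
# 26-bit digits (PPC numbers bits from the MSB, so "extrdi d0,h0,26,38"
# extracts the low 26 bits of h0) and stores each digit, plus the 5x
# multiple of every digit but the first, at 0x10-byte strides to match
# the r^n table layout described above.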

.align	5
__poly1305_blocks_vsx:
	$STU	$sp,-$VSXFRAME($sp)
	mflr	r0
	li	r10,`15+$LOCALS+128`
	li	r11,`31+$LOCALS+128`
	mfspr	r12,256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)	# save vrsave
	li	r12,-1
	mtspr	256,r12			# preserve all AltiVec registers
	$PUSH	r27,`$VSXFRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$VSXFRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$VSXFRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$VSXFRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$VSXFRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$VSXFRAME+$LRSAVE`($sp)

	bl	LPICmeup

	li	$x10,0x10
	li	$x20,0x20
	li	$x30,0x30
	li	$x40,0x40
	li	$x50,0x50
	lvx_u	$mask26,$x00,$const
	lvx_u	$_26,$x10,$const
	lvx_u	$_40,$x20,$const
	lvx_u	$I2perm,$x30,$const
	lvx_u	$padbits,$x40,$const

	cmplwi	r7,0			# is_base2_26?
	bne	Lskip_init_vsx

	ld	$r0,32($ctx)		# load key base 2^64
	ld	$r1,40($ctx)
	srdi	$s1,$r1,2
	li	$mask,3
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2

	mr	$h0,$r0			# "calculate" r^1
	mr	$h1,$r1
	li	$h2,0
	addi	$t1,$ctx,`48+(12^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^2
	addi	$t1,$ctx,`48+(4^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^3
	addi	$t1,$ctx,`48+(8^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^4
	addi	$t1,$ctx,`48+(0^$BIG_ENDIAN)`
	bl	__poly1305_splat

	ld	$h0,0($ctx)		# load hash
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	extrdi	$d0,$h0,26,38		# base 2^64 -> base 2^26
	extrdi	$d1,$h0,26,12
	extrdi	$d2,$h0,12,0
	mtvrwz	$H0,$d0
	insrdi	$d2,$h1,14,38
	mtvrwz	$H1,$d1
	extrdi	$d1,$h1,26,24
	mtvrwz	$H2,$d2
	extrdi	$d2,$h1,24,0
	mtvrwz	$H3,$d1
	insrdi	$d2,$h2,3,37
	mtvrwz	$H4,$d2
___
} else {
###############################################################################
# 32-bit initialization

my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12));
my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4);

$code.=<<___;
.globl	.poly1305_blocks_vsx
.align	5
.poly1305_blocks_vsx:
	lwz	r7,24($ctx)		# is_base2_26
	cmplwi	$len,128
	bge	__poly1305_blocks_vsx
	cmplwi	r7,0
	beq	Lpoly1305_blocks

	lwz	$h0,0($ctx)		# load hash
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	slwi	$t0,$h1,26		# base 2^26 -> base 2^32
	srwi	$h1,$h1,6
	slwi	$t1,$h2,20
	srwi	$h2,$h2,12
	addc	$h0,$h0,$t0
	slwi	$t0,$h3,14
	srwi	$h3,$h3,18
	adde	$h1,$h1,$t1
	slwi	$t1,$h4,8
	srwi	$h4,$h4,24
	adde	$h2,$h2,$t0
	li	$t0,0
	adde	$h3,$h3,$t1
	addze	$h4,$h4

	stw	$h0,0($ctx)		# store hash base 2^32
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)
	stw	$t0,24($ctx)		# clear is_base2_26

	b	Lpoly1305_blocks
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
.size	.poly1305_blocks_vsx,.-.poly1305_blocks_vsx

.align	5
__poly1305_mul:
	vmulouw	$ACC0,$H0,$R0
	vmulouw	$ACC1,$H1,$R0
	vmulouw	$ACC2,$H2,$R0
	vmulouw	$ACC3,$H3,$R0
	vmulouw	$ACC4,$H4,$R0

	vmulouw	$T0,$H4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H0,$R1
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H3,$R1
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H0,$R2
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H1,$R2
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H2,$R2
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H1,$R3
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H0,$R4
	vaddudm	$ACC4,$ACC4,$T0

	################################################################
	# lazy reduction

	vspltisb	$T0,2
	vsrd	$H4,$ACC3,$_26
	vsrd	$H1,$ACC0,$_26
	vand	$H3,$ACC3,$mask26
	vand	$H0,$ACC0,$mask26
	vaddudm	$H4,$H4,$ACC4		# h3 -> h4
	vaddudm	$H1,$H1,$ACC1		# h0 -> h1

	vsrd	$ACC4,$H4,$_26
	vsrd	$ACC1,$H1,$_26
	vand	$H4,$H4,$mask26
	vand	$H1,$H1,$mask26
	vaddudm	$H0,$H0,$ACC4
	vaddudm	$H2,$ACC2,$ACC1		# h1 -> h2

	vsld	$ACC4,$ACC4,$T0		# <<2
	vsrd	$ACC2,$H2,$_26
	vand	$H2,$H2,$mask26
	vaddudm	$H0,$H0,$ACC4		# h4 -> h0
	vaddudm	$H3,$H3,$ACC2		# h2 -> h3

	vsrd	$ACC0,$H0,$_26
	vsrd	$ACC3,$H3,$_26
	vand	$H0,$H0,$mask26
	vand	$H3,$H3,$mask26
	vaddudm	$H1,$H1,$ACC0		# h0 -> h1
	vaddudm	$H4,$H4,$ACC3		# h3 -> h4

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_mul,.-__poly1305_mul
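
# The carry chain above is a "lazy reduction": each limb is only brought
# back to roughly 26 bits, never fully normalized. A commented-out Perl
# model of one pass over plain-integer limbs (illustrative only):
#
#   sub lazy_reduce {			# takes and returns (h0,..,h4)
#	my \@h = \@_;
#	my \$M = 0x3ffffff;
#	my \$c;
#	\$c = \$h[3] >> 26; \$h[3] &= \$M; \$h[4] += \$c;	# h3 -> h4
#	\$c = \$h[0] >> 26; \$h[0] &= \$M; \$h[1] += \$c;	# h0 -> h1
#	\$c = \$h[4] >> 26; \$h[4] &= \$M; \$h[0] += 5*\$c;	# h4 -> h0
#	\$c = \$h[1] >> 26; \$h[1] &= \$M; \$h[2] += \$c;	# h1 -> h2
#	\$c = \$h[2] >> 26; \$h[2] &= \$M; \$h[3] += \$c;	# h2 -> h3
#	\$c = \$h[0] >> 26; \$h[0] &= \$M; \$h[1] += \$c;	# h0 -> h1
#	\$c = \$h[3] >> 26; \$h[3] &= \$M; \$h[4] += \$c;	# h3 -> h4
#	return \@h;
#   }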

.align	5
__poly1305_blocks_vsx:
	$STU	$sp,-$VSXFRAME($sp)
	mflr	r0
	li	r10,`15+$LOCALS+128`
	li	r11,`31+$LOCALS+128`
	mfspr	r12,256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)	# save vrsave
	li	r12,-1
	mtspr	256,r12			# preserve all AltiVec registers
	$PUSH	r27,`$VSXFRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$VSXFRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$VSXFRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$VSXFRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$VSXFRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$VSXFRAME+$LRSAVE`($sp)

	bl	LPICmeup

	li	$x10,0x10
	li	$x20,0x20
	li	$x30,0x30
	li	$x40,0x40
	li	$x50,0x50
	lvx_u	$mask26,$x00,$const
	lvx_u	$_26,$x10,$const
	lvx_u	$_40,$x20,$const
	lvx_u	$I2perm,$x30,$const
	lvx_u	$padbits,$x40,$const

	cmplwi	r7,0			# is_base2_26?
	bne	Lskip_init_vsx

	lwz	$h1,32($ctx)		# load key base 2^32
	lwz	$h2,36($ctx)
	lwz	$h3,40($ctx)
	lwz	$h4,44($ctx)

	extrwi	$h0,$h1,26,6		# base 2^32 -> base 2^26
	extrwi	$h1,$h1,6,0
	insrwi	$h1,$h2,20,6
	extrwi	$h2,$h2,12,0
	insrwi	$h2,$h3,14,6
	extrwi	$h3,$h3,18,0
	insrwi	$h3,$h4,8,6
	extrwi	$h4,$h4,24,0

	mtvrwz	$R0,$h0
	slwi	$h0,$h1,2
	mtvrwz	$R1,$h1
	add	$h1,$h1,$h0
	mtvrwz	$S1,$h1
	slwi	$h1,$h2,2
	mtvrwz	$R2,$h2
	add	$h2,$h2,$h1
	mtvrwz	$S2,$h2
	slwi	$h2,$h3,2
	mtvrwz	$R3,$h3
	add	$h3,$h3,$h2
	mtvrwz	$S3,$h3
	slwi	$h3,$h4,2
	mtvrwz	$R4,$h4
	add	$h4,$h4,$h3
	mtvrwz	$S4,$h4

	vmr	$H0,$R0
	vmr	$H1,$R1
	vmr	$H2,$R2
	vmr	$H3,$R3
	vmr	$H4,$R4

	bl	__poly1305_mul		# r^1:- * r^1:-

	vpermdi	$R0,$H0,$R0,0b00
	vpermdi	$R1,$H1,$R1,0b00
	vpermdi	$R2,$H2,$R2,0b00
	vpermdi	$R3,$H3,$R3,0b00
	vpermdi	$R4,$H4,$R4,0b00
	vpermdi	$H0,$H0,$H0,0b00
	vpermdi	$H1,$H1,$H1,0b00
	vpermdi	$H2,$H2,$H2,0b00
	vpermdi	$H3,$H3,$H3,0b00
	vpermdi	$H4,$H4,$H4,0b00
	vsld	$S1,$R1,$T0		# <<2
	vsld	$S2,$R2,$T0
	vsld	$S3,$R3,$T0
	vsld	$S4,$R4,$T0
	vaddudm	$S1,$S1,$R1
	vaddudm	$S2,$S2,$R2
	vaddudm	$S3,$S3,$R3
	vaddudm	$S4,$S4,$R4

	bl	__poly1305_mul		# r^2:r^2 * r^2:r^1

	addi	$h0,$ctx,0x60
	lwz	$h1,0($ctx)		# load hash
	lwz	$h2,4($ctx)
	lwz	$h3,8($ctx)
	lwz	$h4,12($ctx)
	lwz	$t0,16($ctx)

	vmrgow	$R0,$R0,$H0		# r^2:r^4:r^1:r^3
	vmrgow	$R1,$R1,$H1
	vmrgow	$R2,$R2,$H2
	vmrgow	$R3,$R3,$H3
	vmrgow	$R4,$R4,$H4
	vslw	$S1,$R1,$T0		# <<2
	vslw	$S2,$R2,$T0
	vslw	$S3,$R3,$T0
	vslw	$S4,$R4,$T0
	vadduwm	$S1,$S1,$R1
	vadduwm	$S2,$S2,$R2
	vadduwm	$S3,$S3,$R3
	vadduwm	$S4,$S4,$R4

	stvx_u	$R0,$x30,$ctx
	stvx_u	$R1,$x40,$ctx
	stvx_u	$S1,$x50,$ctx
	stvx_u	$R2,$x00,$h0
	stvx_u	$S2,$x10,$h0
	stvx_u	$R3,$x20,$h0
	stvx_u	$S3,$x30,$h0
	stvx_u	$R4,$x40,$h0
	stvx_u	$S4,$x50,$h0

	extrwi	$h0,$h1,26,6		# base 2^32 -> base 2^26
	extrwi	$h1,$h1,6,0
	mtvrwz	$H0,$h0
	insrwi	$h1,$h2,20,6
	extrwi	$h2,$h2,12,0
	mtvrwz	$H1,$h1
	insrwi	$h2,$h3,14,6
	extrwi	$h3,$h3,18,0
	mtvrwz	$H2,$h2
	insrwi	$h3,$h4,8,6
	extrwi	$h4,$h4,24,0
	mtvrwz	$H3,$h3
	insrwi	$h4,$t0,3,5
	mtvrwz	$H4,$h4
___
}
$code.=<<___;
	li	r0,1
	stw	r0,24($ctx)		# set is_base2_26
	b	Loaded_vsx

.align	4
Lskip_init_vsx:
	li	$x10,4
	li	$x20,8
	li	$x30,12
	li	$x40,16
	lvwzx_u	$H0,$x00,$ctx
	lvwzx_u	$H1,$x10,$ctx
	lvwzx_u	$H2,$x20,$ctx
	lvwzx_u	$H3,$x30,$ctx
	lvwzx_u	$H4,$x40,$ctx

Loaded_vsx:
	li	$x10,0x10
	li	$x20,0x20
	li	$x30,0x30
	li	$x40,0x40
	li	$x50,0x50
	li	$x60,0x60
	li	$x70,0x70
	addi	$ctx_,$ctx,64		# &ctx->r[1]
	addi	$_ctx,$sp,`$LOCALS+15`	# &ctx->r[1], r^2:r^4 shadow

	vxor	$T0,$T0,$T0		# ensure second half is zero
	vpermdi	$H0,$H0,$T0,0b00
	vpermdi	$H1,$H1,$T0,0b00
	vpermdi	$H2,$H2,$T0,0b00
	vpermdi	$H3,$H3,$T0,0b00
	vpermdi	$H4,$H4,$T0,0b00

	be?lvx_u	$_4,$x50,$const		# byte swap mask
	lvx_u	$T1,$x00,$inp		# load first input block
	lvx_u	$T2,$x10,$inp
	lvx_u	$T3,$x20,$inp
	lvx_u	$T4,$x30,$inp
	be?vperm	$T1,$T1,$T1,$_4
	be?vperm	$T2,$T2,$T2,$_4
	be?vperm	$T3,$T3,$T3,$_4
	be?vperm	$T4,$T4,$T4,$_4

	vpermdi	$I0,$T1,$T2,0b00	# smash input to base 2^26
	vspltisb	$_4,4
	vperm	$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
	vspltisb	$_14,14
	vpermdi	$I3,$T1,$T2,0b11

	vsrd	$I1,$I0,$_26
	vsrd	$I2,$I2,$_4
	vsrd	$I4,$I3,$_40
	vsrd	$I3,$I3,$_14
	vand	$I0,$I0,$mask26
	vand	$I1,$I1,$mask26
	vand	$I2,$I2,$mask26
	vand	$I3,$I3,$mask26

	vpermdi	$T1,$T3,$T4,0b00
	vperm	$T2,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$T3,$T3,$T4,0b11

	vsrd	$T0,$T1,$_26
	vsrd	$T2,$T2,$_4
	vsrd	$T4,$T3,$_40
	vsrd	$T3,$T3,$_14
	vand	$T1,$T1,$mask26
	vand	$T0,$T0,$mask26
	vand	$T2,$T2,$mask26
	vand	$T3,$T3,$mask26

	# inp[2]:inp[0]:inp[3]:inp[1]
	vmrgow	$I4,$T4,$I4
	vmrgow	$I0,$T1,$I0
	vmrgow	$I1,$T0,$I1
	vmrgow	$I2,$T2,$I2
	vmrgow	$I3,$T3,$I3
	vor	$I4,$I4,$padbits

	lvx_splt	$R0,$x30,$ctx		# taking lvx_vsplt out of loop
	lvx_splt	$R1,$x00,$ctx_		# gives ~8% improvement
	lvx_splt	$S1,$x10,$ctx_
	lvx_splt	$R2,$x20,$ctx_
	lvx_splt	$S2,$x30,$ctx_
	lvx_splt	$T1,$x40,$ctx_
	lvx_splt	$T2,$x50,$ctx_
	lvx_splt	$T3,$x60,$ctx_
	lvx_splt	$T4,$x70,$ctx_
	stvx	$R1,$x00,$_ctx
	stvx	$S1,$x10,$_ctx
	stvx	$R2,$x20,$_ctx
	stvx	$S2,$x30,$_ctx
	stvx	$T1,$x40,$_ctx
	stvx	$T2,$x50,$_ctx
	stvx	$T3,$x60,$_ctx
	stvx	$T4,$x70,$_ctx

	addi	$inp,$inp,0x40
	addi	$const,$const,0x50
	addi	r0,$len,-64
	srdi	r0,r0,6
	mtctr	r0
	b	Loop_vsx

.align	4
Loop_vsx:
	################################################################
	## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	##   \___________________/
	##
	## Note that we start with inp[2:3]*r^2, because it doesn't
	## depend on the reduction from the previous iteration.
	################################################################
	## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	vmuleuw	$ACC0,$I0,$R0
	vmuleuw	$ACC1,$I0,$R1
	vmuleuw	$ACC2,$I0,$R2
	vmuleuw	$ACC3,$I1,$R2

	vmuleuw	$T0,$I1,$R0
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$ACC4,$I2,$R2
	vmuleuw	$T0,$I4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	lvx	$S3,$x50,$_ctx
	vmuleuw	$T0,$I3,$R1
	vaddudm	$ACC4,$ACC4,$T0
	lvx	$R3,$x40,$_ctx

	vaddudm	$H2,$H2,$I2
	vaddudm	$H0,$H0,$I0
	vaddudm	$H3,$H3,$I3
	vaddudm	$H1,$H1,$I1
	vaddudm	$H4,$H4,$I4

	vmuleuw	$T0,$I3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I2,$R0
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I3,$R0
	vaddudm	$ACC3,$ACC3,$T0
	lvx	$S4,$x70,$_ctx
	vmuleuw	$T0,$I4,$R0
	vaddudm	$ACC4,$ACC4,$T0
	lvx	$R4,$x60,$_ctx

	vmuleuw	$T0,$I2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	vmuleuw	$T0,$I1,$R3
	vaddudm	$ACC4,$ACC4,$T0

	be?lvx_u	$_4,$x00,$const		# byte swap mask
	lvx_u	$T1,$x00,$inp		# load next input block
	lvx_u	$T2,$x10,$inp
	lvx_u	$T3,$x20,$inp
	lvx_u	$T4,$x30,$inp
	be?vperm	$T1,$T1,$T1,$_4
	be?vperm	$T2,$T2,$T2,$_4
	be?vperm	$T3,$T3,$T3,$_4
	be?vperm	$T4,$T4,$T4,$_4

	vmuleuw	$T0,$I1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	vmuleuw	$T0,$I0,$R4
	vaddudm	$ACC4,$ACC4,$T0

	vpermdi	$I0,$T1,$T2,0b00	# smash input to base 2^26
	vspltisb	$_4,4
	vperm	$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$I3,$T1,$T2,0b11

	# (hash + inp[0:1]) * r^4
	vmulouw	$T0,$H0,$R0
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H1,$R0
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H2,$R0
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H3,$R0
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H4,$R0
	vaddudm	$ACC4,$ACC4,$T0

	vpermdi	$T1,$T3,$T4,0b00
	vperm	$T2,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$T3,$T3,$T4,0b11

	vmulouw	$T0,$H2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	lvx	$S1,$x10,$_ctx
	vmulouw	$T0,$H1,$R3
	vaddudm	$ACC4,$ACC4,$T0
	lvx	$R1,$x00,$_ctx

	vsrd	$I1,$I0,$_26
	vsrd	$I2,$I2,$_4
	vsrd	$I4,$I3,$_40
	vsrd	$I3,$I3,$_14

	vmulouw	$T0,$H1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	lvx	$S2,$x30,$_ctx
	vmulouw	$T0,$H0,$R4
	vaddudm	$ACC4,$ACC4,$T0
	lvx	$R2,$x20,$_ctx

	vand	$I0,$I0,$mask26
	vand	$I1,$I1,$mask26
	vand	$I2,$I2,$mask26
	vand	$I3,$I3,$mask26

	vmulouw	$T0,$H4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H0,$R1
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H3,$R1
	vaddudm	$ACC4,$ACC4,$T0

	vsrd	$T2,$T2,$_4
	vsrd	$_4,$T1,$_26
	vsrd	$T4,$T3,$_40
	vsrd	$T3,$T3,$_14

	vmulouw	$T0,$H3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H0,$R2
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H1,$R2
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H2,$R2
	vaddudm	$ACC4,$ACC4,$T0

	vand	$T1,$T1,$mask26
	vand	$_4,$_4,$mask26
	vand	$T2,$T2,$mask26
	vand	$T3,$T3,$mask26

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe

	vspltisb	$T0,2
	vsrd	$H4,$ACC3,$_26
	vsrd	$H1,$ACC0,$_26
	vand	$H3,$ACC3,$mask26
	vand	$H0,$ACC0,$mask26
	vaddudm	$H4,$H4,$ACC4		# h3 -> h4
	vaddudm	$H1,$H1,$ACC1		# h0 -> h1

	vmrgow	$I4,$T4,$I4
	vmrgow	$I0,$T1,$I0
	vmrgow	$I1,$_4,$I1
	vmrgow	$I2,$T2,$I2
	vmrgow	$I3,$T3,$I3
	vor	$I4,$I4,$padbits

	vsrd	$ACC4,$H4,$_26
	vsrd	$ACC1,$H1,$_26
	vand	$H4,$H4,$mask26
	vand	$H1,$H1,$mask26
	vaddudm	$H0,$H0,$ACC4
	vaddudm	$H2,$ACC2,$ACC1		# h1 -> h2

	vsld	$ACC4,$ACC4,$T0		# <<2
	vsrd	$ACC2,$H2,$_26
	vand	$H2,$H2,$mask26
	vaddudm	$H0,$H0,$ACC4		# h4 -> h0
	vaddudm	$H3,$H3,$ACC2		# h2 -> h3

	vsrd	$ACC0,$H0,$_26
	vsrd	$ACC3,$H3,$_26
	vand	$H0,$H0,$mask26
	vand	$H3,$H3,$mask26
	vaddudm	$H1,$H1,$ACC0		# h0 -> h1
	vaddudm	$H4,$H4,$ACC3		# h3 -> h4

	addi	$inp,$inp,0x40
	bdnz	Loop_vsx

	neg	$len,$len
	andi.	$len,$len,0x30
	sub	$inp,$inp,$len

	lvx_u	$R0,$x30,$ctx		# load all powers
	lvx_u	$R1,$x00,$ctx_
	lvx_u	$S1,$x10,$ctx_
	lvx_u	$R2,$x20,$ctx_
	lvx_u	$S2,$x30,$ctx_

Last_vsx:
	vmuleuw	$ACC0,$I0,$R0
	vmuleuw	$ACC1,$I1,$R0
	vmuleuw	$ACC2,$I2,$R0
	vmuleuw	$ACC3,$I3,$R0
	vmuleuw	$ACC4,$I4,$R0

	vmuleuw	$T0,$I4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I0,$R1
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	lvx_u	$S3,$x50,$ctx_
	vmuleuw	$T0,$I3,$R1
	vaddudm	$ACC4,$ACC4,$T0
	lvx_u	$R3,$x40,$ctx_

	vaddudm	$H2,$H2,$I2
	vaddudm	$H0,$H0,$I0
	vaddudm	$H3,$H3,$I3
	vaddudm	$H1,$H1,$I1
	vaddudm	$H4,$H4,$I4

	vmuleuw	$T0,$I3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I0,$R2
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I1,$R2
	vaddudm	$ACC3,$ACC3,$T0
	lvx_u	$S4,$x70,$ctx_
	vmuleuw	$T0,$I2,$R2
	vaddudm	$ACC4,$ACC4,$T0
	lvx_u	$R4,$x60,$ctx_

	vmuleuw	$T0,$I2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	vmuleuw	$T0,$I1,$R3
	vaddudm	$ACC4,$ACC4,$T0

	vmuleuw	$T0,$I1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmuleuw	$T0,$I2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmuleuw	$T0,$I3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmuleuw	$T0,$I4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	vmuleuw	$T0,$I0,$R4
	vaddudm	$ACC4,$ACC4,$T0

	# (hash + inp[0:1]) * r^4
	vmulouw	$T0,$H0,$R0
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H1,$R0
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H2,$R0
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H3,$R0
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H4,$R0
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H2,$S3
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H3,$S3
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H4,$S3
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H0,$R3
	vaddudm	$ACC3,$ACC3,$T0
	lvx_u	$S1,$x10,$ctx_
	vmulouw	$T0,$H1,$R3
	vaddudm	$ACC4,$ACC4,$T0
	lvx_u	$R1,$x00,$ctx_

	vmulouw	$T0,$H1,$S4
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H2,$S4
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H3,$S4
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H4,$S4
	vaddudm	$ACC3,$ACC3,$T0
	lvx_u	$S2,$x30,$ctx_
	vmulouw	$T0,$H0,$R4
	vaddudm	$ACC4,$ACC4,$T0
	lvx_u	$R2,$x20,$ctx_

	vmulouw	$T0,$H4,$S1
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H0,$R1
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H1,$R1
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H2,$R1
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H3,$R1
	vaddudm	$ACC4,$ACC4,$T0

	vmulouw	$T0,$H3,$S2
	vaddudm	$ACC0,$ACC0,$T0
	vmulouw	$T0,$H4,$S2
	vaddudm	$ACC1,$ACC1,$T0
	vmulouw	$T0,$H0,$R2
	vaddudm	$ACC2,$ACC2,$T0
	vmulouw	$T0,$H1,$R2
	vaddudm	$ACC3,$ACC3,$T0
	vmulouw	$T0,$H2,$R2
	vaddudm	$ACC4,$ACC4,$T0

	################################################################
	# horizontal addition

	vpermdi	$H0,$ACC0,$ACC0,0b10
	vpermdi	$H1,$ACC1,$ACC1,0b10
	vpermdi	$H2,$ACC2,$ACC2,0b10
	vpermdi	$H3,$ACC3,$ACC3,0b10
	vpermdi	$H4,$ACC4,$ACC4,0b10
	vaddudm	$ACC0,$ACC0,$H0
	vaddudm	$ACC1,$ACC1,$H1
	vaddudm	$ACC2,$ACC2,$H2
	vaddudm	$ACC3,$ACC3,$H3
	vaddudm	$ACC4,$ACC4,$H4

	################################################################
	# lazy reduction

	vspltisb	$T0,2
	vsrd	$H4,$ACC3,$_26
	vsrd	$H1,$ACC0,$_26
	vand	$H3,$ACC3,$mask26
	vand	$H0,$ACC0,$mask26
	vaddudm	$H4,$H4,$ACC4		# h3 -> h4
	vaddudm	$H1,$H1,$ACC1		# h0 -> h1

	vsrd	$ACC4,$H4,$_26
	vsrd	$ACC1,$H1,$_26
	vand	$H4,$H4,$mask26
	vand	$H1,$H1,$mask26
	vaddudm	$H0,$H0,$ACC4
	vaddudm	$H2,$ACC2,$ACC1		# h1 -> h2

	vsld	$ACC4,$ACC4,$T0		# <<2
	vsrd	$ACC2,$H2,$_26
	vand	$H2,$H2,$mask26
	vaddudm	$H0,$H0,$ACC4		# h4 -> h0
	vaddudm	$H3,$H3,$ACC2		# h2 -> h3

	vsrd	$ACC0,$H0,$_26
	vsrd	$ACC3,$H3,$_26
	vand	$H0,$H0,$mask26
	vand	$H3,$H3,$mask26
	vaddudm	$H1,$H1,$ACC0		# h0 -> h1
	vaddudm	$H4,$H4,$ACC3		# h3 -> h4

	beq	Ldone_vsx

	add	r6,$const,$len

	be?lvx_u	$_4,$x00,$const		# byte swap mask
	lvx_u	$T1,$x00,$inp		# load last partial input block
	lvx_u	$T2,$x10,$inp
	lvx_u	$T3,$x20,$inp
	lvx_u	$T4,$x30,$inp
	be?vperm	$T1,$T1,$T1,$_4
	be?vperm	$T2,$T2,$T2,$_4
	be?vperm	$T3,$T3,$T3,$_4
	be?vperm	$T4,$T4,$T4,$_4

	vpermdi	$I0,$T1,$T2,0b00	# smash input to base 2^26
	vspltisb	$_4,4
	vperm	$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$I3,$T1,$T2,0b11

	vsrd	$I1,$I0,$_26
	vsrd	$I2,$I2,$_4
	vsrd	$I4,$I3,$_40
	vsrd	$I3,$I3,$_14
	vand	$I0,$I0,$mask26
	vand	$I1,$I1,$mask26
	vand	$I2,$I2,$mask26
	vand	$I3,$I3,$mask26

	vpermdi	$T0,$T3,$T4,0b00
	vperm	$T1,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi	$T2,$T3,$T4,0b11

	lvx_u	$ACC0,$x00,r6
	lvx_u	$ACC1,$x30,r6

	vsrd	$T3,$T0,$_26
	vsrd	$T1,$T1,$_4
	vsrd	$T4,$T2,$_40
	vsrd	$T2,$T2,$_14
	vand	$T0,$T0,$mask26
	vand	$T3,$T3,$mask26
	vand	$T1,$T1,$mask26
	vand	$T2,$T2,$mask26

	# inp[2]:inp[0]:inp[3]:inp[1]
	vmrgow	$I4,$T4,$I4
	vmrgow	$I0,$T0,$I0
	vmrgow	$I1,$T3,$I1
	vmrgow	$I2,$T1,$I2
	vmrgow	$I3,$T2,$I3
	vor	$I4,$I4,$padbits

	vperm	$H0,$H0,$H0,$ACC0	# move hash to right lane
	vand	$I0,$I0, $ACC1		# mask redundant input lane[s]
	vperm	$H1,$H1,$H1,$ACC0
	vand	$I1,$I1, $ACC1
	vperm	$H2,$H2,$H2,$ACC0
	vand	$I2,$I2, $ACC1
	vperm	$H3,$H3,$H3,$ACC0
	vand	$I3,$I3, $ACC1
	vperm	$H4,$H4,$H4,$ACC0
	vand	$I4,$I4, $ACC1

	vaddudm	$I0,$I0,$H0		# accumulate hash
	vxor	$H0,$H0,$H0		# wipe hash value
	vaddudm	$I1,$I1,$H1
	vxor	$H1,$H1,$H1
	vaddudm	$I2,$I2,$H2
	vxor	$H2,$H2,$H2
	vaddudm	$I3,$I3,$H3
	vxor	$H3,$H3,$H3
	vaddudm	$I4,$I4,$H4
	vxor	$H4,$H4,$H4

	xor.	$len,$len,$len
	b	Last_vsx

.align	4
Ldone_vsx:
	$POP	r0,`$VSXFRAME+$LRSAVE`($sp)
	li	$x10,4
	li	$x20,8
	li	$x30,12
	li	$x40,16
	stvwx_u	$H0,$x00,$ctx		# store hash
	stvwx_u	$H1,$x10,$ctx
	stvwx_u	$H2,$x20,$ctx
	stvwx_u	$H3,$x30,$ctx
	stvwx_u	$H4,$x40,$ctx

	lwz	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)	# pull vrsave
	mtlr	r0
	li	r10,`15+$LOCALS+128`
	li	r11,`31+$LOCALS+128`
	mtspr	256,r12			# restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	$POP	r27,`$VSXFRAME-$SIZE_T*5`($sp)
	$POP	r28,`$VSXFRAME-$SIZE_T*4`($sp)
	$POP	r29,`$VSXFRAME-$SIZE_T*3`($sp)
	$POP	r30,`$VSXFRAME-$SIZE_T*2`($sp)
	$POP	r31,`$VSXFRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$VSXFRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,5,4,0
	.long	0
.size	__poly1305_blocks_vsx,.-__poly1305_blocks_vsx

.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$const			# vvvvvv "distance" between . and 1st data entry
	addi	$const,$const,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`

.quad	0x0000000003ffffff,0x0000000003ffffff	# mask26
.quad	0x000000000000001a,0x000000000000001a	# _26
.quad	0x0000000000000028,0x0000000000000028	# _40
.quad	0x000000000e0f0001,0x000000001e1f1011	# I2perm
.quad	0x0100000001000000,0x0100000001000000	# padbits
.quad	0x0706050403020100,0x0f0e0d0c0b0a0908	# byte swap for big-endian

.quad	0x0000000000000000,0x0000000004050607	# magic tail masks
.quad	0x0405060700000000,0x0000000000000000
.quad	0x0000000000000000,0x0405060700000000

.quad	0xffffffff00000000,0xffffffffffffffff
.quad	0xffffffff00000000,0xffffffff00000000
.quad	0x0000000000000000,0xffffffff00000000
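
# The two groups of "magic tail mask" entries above appear to be indexed
# by the remaining length (Last_vsx adds it to the constant pointer in
# r6): the first group supplies the vperm pattern that moves the running
# hash into the lane of the last valid block, the second the AND mask
# that zeroes the redundant input lane[s] of a partial final load.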
___
}}}
$code.=<<___;
.asciz	"Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour !~ /le$/) {	# big-endian
	    s/be\?//		or
	    s/le\?/#le#/
	} else {			# little-endian
	    s/le\?//		or
	    s/be\?/#be#/
	}

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";