]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/poly1305/asm/poly1305-ppc.pl
Fix Typos
[thirdparty/openssl.git] / crypto / poly1305 / asm / poly1305-ppc.pl
CommitLineData
6aa36e8e 1#! /usr/bin/env perl
83cf7abf 2# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e 3#
49d3b641 4# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e
RS
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9e58d119
AP
9#
10# ====================================================================
a28e4890
AP
11# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
12# project. The module is dual licensed under OpenSSL and CRYPTOGAMS
13# licenses depending on where you obtain it. For further details see
14# https://github.com/dot-asm/cryptogams/.
9e58d119
AP
15# ====================================================================
16#
17# This module implements Poly1305 hash for PowerPC.
18#
19# June 2015
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone,
22# and improvement coefficients relative to gcc-generated code.
23#
24# -m32 -m64
25#
26# Freescale e300 14.8/+80% -
4b8736a2
AP
27# PPC74x0 7.60/+60% -
28# PPC970 7.00/+114% 3.51/+205%
29# POWER7 3.75/+260% 1.93/+100%
30# POWER8 - 2.03/+200%
41013cd6 31# POWER9 - 2.00/+150%
9e58d119
AP
32#
33# Do we need floating-point implementation for PPC? Results presented
34# in poly1305_ieee754.c are tricky to compare to, because they are for
35# compiler-generated code. On the other hand it's known that floating-
36# point performance can be dominated by FPU latency, which means that
37# there is a limit even for ideally optimized (and even vectorized) code.
38# And this limit is estimated to be higher than above -m64 results. Or
39# in other words floating-point implementation can be meaningful to
40# consider only in 32-bit application context. We probably have to
41# recognize that 32-bit builds are getting less popular on high-end
42# systems and therefore tend to target embedded ones, which might not
43# even have FPU...
44#
45# On a side note, Power ISA 2.07 enables vector base 2^26 implementation,
46# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
a28e4890
AP
47#
48# January 2019
49#
50# ... Unfortunately not:-( Estimate was a projection of ARM result,
51# but ARM has a vector multiply-n-add instruction, while PowerISA does
52# not, at least not one usable in this context. Improvement is ~40% over -m64
53# result above and is ~1.43 on little-endian systems.
9e58d119
AP
54
# Pick the word size and the matching stack-update/load/store/compare
# mnemonics from the flavour string, which is taken from the first
# command-line argument.  "64" is tested before "32"; a flavour that
# contains neither aborts generation.
#   $SIZE_T - size of a general-purpose register in bytes
#   $LRSAVE - offset added to the frame size when the link register is
#             stored by the prologues below
#   $UCMP   - unsigned compare mnemonic
#   $STU    - store-with-update mnemonic used to allocate the frame
#   $POP / $PUSH - register-sized load / store mnemonics
55$flavour = shift;
56
57if ($flavour =~ /64/) {
58 $SIZE_T =8;
59 $LRSAVE =2*$SIZE_T;
60 $UCMP ="cmpld";
61 $STU ="stdu";
62 $POP ="ld";
63 $PUSH ="std";
64} elsif ($flavour =~ /32/) {
65 $SIZE_T =4;
66 $LRSAVE =$SIZE_T;
67 $UCMP ="cmplw";
68 $STU ="stwu";
69 $POP ="lwz";
70 $PUSH ="stw";
71} else { die "nonsense $flavour"; }
72
60250017 73# Define endianness based on flavour
9e58d119
AP
74# i.e.: linux64le
# The value is $SIZE_T when the flavour name ends in "le" and 0 otherwise;
# the heredoc guards in this part of the file only test it for truth.
75$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
76
# Locate ppc-xlate.pl, first in the directory of this script and then
# under ../../perlasm/ relative to it, and die if neither file exists.
77$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
78( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
79( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
80die "can't locate ppc-xlate.pl";
81
# Everything printed to STDOUT from here on is piped through the
# translator, which receives the flavour and the next command-line
# argument.  Low-precedence "or" is required: with "||" the die would
# attach to the (always true) concatenated command string rather than to
# open(), so a failure to start the translator went unreported.
82open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
83
# Size of the stack frame allocated by the scalar poly1305_blocks
# prologues (24 register-sized slots).
84$FRAME=24*$SIZE_T;
85
# Symbolic register names shared by all routines: r1 is used as the stack
# pointer and r3..r6 as ctx/inp/len/padbit.  poly1305_emit refers to r4
# and r5 as $mac and $nonce.  r0 is the scratch register named $mask.
86$sp="r1";
87my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
88my ($mac,$nonce)=($inp,$len);
89my $mask = "r0";
90
# Start the generated assembly with the machine and section directives.
91$code=<<___;
92.machine "any"
93.text
94___
95 if ($flavour =~ /64/) {
96###############################################################################
97# base 2^64 implementation
98
# Register names for the base 2^64 code: hash limbs h0..h2 and product
# limbs d0..d2 live in r7..r12, key words r0/r1, s1 and temporaries t0/t1
# in r27..r31.
99my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));
100
# poly1305_init_int, first part: zero the three 64-bit hash words at
# offsets 0/8/16 of the context and the is_base2_26 word at offset 24,
# then skip key setup when the key pointer ($inp) is zero.
101$code.=<<___;
102.globl .poly1305_init_int
103.align 4
104.poly1305_init_int:
105 xor r0,r0,r0
106 std r0,0($ctx) # zero hash value
107 std r0,8($ctx)
108 std r0,16($ctx)
a28e4890 109 stw r0,24($ctx) # clear is_base2_26
9e58d119
AP
110
111 $UCMP $inp,r0
112 beq- Lno_key
113___
# Little-endian flavours load the 16 key bytes as two doublewords.
114$code.=<<___ if ($LITTLE_ENDIAN);
115 ld $d0,0($inp) # load key material
116 ld $d1,8($inp)
117___
# Other flavours load four byte-reversed words and merge each pair with
# insrdi, placing the word from the higher address in the upper half.
118$code.=<<___ if (!$LITTLE_ENDIAN);
119 li $h0,4
120 lwbrx $d0,0,$inp # load key material
121 li $d1,8
122 lwbrx $h0,$h0,$inp
123 li $h1,12
124 lwbrx $d1,$d1,$inp
125 lwbrx $h1,$h1,$inp
126 insrdi $d0,$h0,32,0
127 insrdi $d1,$h1,32,0
128___
# This heredoc finishes poly1305_init_int and opens poly1305_blocks.
#
# init_int tail: mask the two key words with 0x0ffffffc0fffffff and
# 0x0ffffffc0ffffffc, store them at offsets 32/40 of the context and
# return 0 in r3.
#
# poly1305_blocks prologue: the byte length is shifted right by 4 to get
# the loop count and the routine returns immediately when that is zero.
# Otherwise it allocates the frame, saves r27..r31 and the link register,
# loads the key and the hash, precomputes s1 = r1 + (r1 >> 2), moves the
# count to CTR and sets $mask to 3 for the reduction step.  Lpoly1305_blocks
# is the local entry label that poly1305_blocks_vsx branches to.
129$code.=<<___;
130 lis $h1,0xfff # 0x0fff0000
131 ori $h1,$h1,0xfffc # 0x0ffffffc
132 insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc
133 ori $h0,$h1,3 # 0x0ffffffc0fffffff
134
135 and $d0,$d0,$h0
136 and $d1,$d1,$h1
137
138 std $d0,32($ctx) # store key
139 std $d1,40($ctx)
140
141Lno_key:
142 xor r3,r3,r3
143 blr
144 .long 0
145 .byte 0,12,0x14,0,0,0,2,0
146.size .poly1305_init_int,.-.poly1305_init_int
147
148.globl .poly1305_blocks
149.align 4
150.poly1305_blocks:
a28e4890 151Lpoly1305_blocks:
9e58d119
AP
152 srdi. $len,$len,4
153 beq- Labort
154
155 $STU $sp,-$FRAME($sp)
156 mflr r0
157 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
158 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
159 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
160 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
161 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
162 $PUSH r0,`$FRAME+$LRSAVE`($sp)
163
164 ld $r0,32($ctx) # load key
165 ld $r1,40($ctx)
166
167 ld $h0,0($ctx) # load hash value
168 ld $h1,8($ctx)
169 ld $h2,16($ctx)
170
171 srdi $s1,$r1,2
172 mtctr $len
173 add $s1,$s1,$r1 # s1 = r1 + r1>>2
174 li $mask,3
175 b Loop
176
177.align 4
178Loop:
179___
# Load one 16-byte input block into $t0 (low half) and $t1 (high half).
# Little-endian flavours use two doubleword loads.
180$code.=<<___ if ($LITTLE_ENDIAN);
181 ld $t0,0($inp) # load input
182 ld $t1,8($inp)
183___
# Other flavours use four byte-reversed word loads merged with insrdi;
# $d0/$d1 serve as temporaries here and are overwritten by the multiply
# that follows.
184$code.=<<___ if (!$LITTLE_ENDIAN);
185 li $d0,4
186 lwbrx $t0,0,$inp # load input
187 li $t1,8
188 lwbrx $d0,$d0,$inp
189 li $d1,12
190 lwbrx $t1,$t1,$inp
191 lwbrx $d1,$d1,$inp
192 insrdi $t0,$d0,32,0
193 insrdi $t1,$d1,32,0
194___
# Loop body and epilogue of the base 2^64 poly1305_blocks.
#
# Per iteration: advance $inp by 16, add the block to h0:h1 and $padbit
# (plus carry) to h2, then form d0:d1:d2 from the partial products
# h0*r0, h1*s1, h0*r1, h1*r0, h2*s1 and h2*r0.  The final reduction keeps
# the low two bits of d2 as the new h2 and adds (d2 & ~3) + (d2 >> 2)
# back into h0 with carry propagation.  bdnz repeats for the block count
# placed in CTR by the prologue.
#
# After the loop the hash is stored back to the context, r27..r31 are
# restored and the frame is released.  Labort is the early-exit target
# used when the prologue finds no complete block.
195$code.=<<___;
196 addi $inp,$inp,16
197
198 addc $h0,$h0,$t0 # accumulate input
199 adde $h1,$h1,$t1
200
201 mulld $d0,$h0,$r0 # h0*r0
202 mulhdu $d1,$h0,$r0
203 adde $h2,$h2,$padbit
204
205 mulld $t0,$h1,$s1 # h1*5*r1
206 mulhdu $t1,$h1,$s1
207 addc $d0,$d0,$t0
208 adde $d1,$d1,$t1
209
210 mulld $t0,$h0,$r1 # h0*r1
211 mulhdu $d2,$h0,$r1
212 addc $d1,$d1,$t0
213 addze $d2,$d2
214
215 mulld $t0,$h1,$r0 # h1*r0
216 mulhdu $t1,$h1,$r0
217 addc $d1,$d1,$t0
218 adde $d2,$d2,$t1
219
220 mulld $t0,$h2,$s1 # h2*5*r1
221 mulld $t1,$h2,$r0 # h2*r0
222 addc $d1,$d1,$t0
223 adde $d2,$d2,$t1
224
225 andc $t0,$d2,$mask # final reduction step
226 and $h2,$d2,$mask
227 srdi $t1,$t0,2
228 add $t0,$t0,$t1
229 addc $h0,$d0,$t0
230 addze $h1,$d1
4b8736a2 231 addze $h2,$h2
9e58d119
AP
232
233 bdnz Loop
234
235 std $h0,0($ctx) # store hash value
236 std $h1,8($ctx)
237 std $h2,16($ctx)
238
239 $POP r27,`$FRAME-$SIZE_T*5`($sp)
240 $POP r28,`$FRAME-$SIZE_T*4`($sp)
241 $POP r29,`$FRAME-$SIZE_T*3`($sp)
242 $POP r30,`$FRAME-$SIZE_T*2`($sp)
243 $POP r31,`$FRAME-$SIZE_T*1`($sp)
244 addi $sp,$sp,$FRAME
245Labort:
246 blr
247 .long 0
248 .byte 0,12,4,1,0x80,5,4,0
249.size .poly1305_blocks,.-.poly1305_blocks
a28e4890
AP
250___
251{
# poly1305_emit for 64-bit flavours; works in r7..r12 plus r0.
#
# 1. Reads the hash both ways: as five base 2^26 words (converted to
#    three base 2^64 limbs with shifts and carried adds) and as three
#    base 2^64 doublewords.  The negated is_base2_26 word is used as a
#    mask in an xor/and/xor sequence to pick one of the two without
#    branching.
# 2. Adds 5 to the selected value and uses bit 2 of the resulting top
#    limb (shifted down and negated into a mask) to choose between h and
#    h+5 for the low 128 bits.
# 3. Assembles the four nonce words into two 64-bit values (the word at
#    the higher address goes into the upper half) and adds them with
#    carry.
# 4. Writes the 16 result bytes to $mac least-significant byte first
#    using stbu; $ctx is reused as the pointer for bytes 0..7 and $mac
#    for bytes 8..15.
252my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12));
9e58d119 253
a28e4890 254$code.=<<___;
9e58d119 255.globl .poly1305_emit
a28e4890 256.align 5
9e58d119 257.poly1305_emit:
a28e4890
AP
258 lwz $h0,0($ctx) # load hash value base 2^26
259 lwz $h1,4($ctx)
260 lwz $h2,8($ctx)
261 lwz $h3,12($ctx)
262 lwz $h4,16($ctx)
263 lwz r0,24($ctx) # is_base2_26
264
265 sldi $h1,$h1,26 # base 2^26 -> base 2^64
266 sldi $t0,$h2,52
267 srdi $h2,$h2,12
268 sldi $h3,$h3,14
269 add $h0,$h0,$h1
270 addc $h0,$h0,$t0
271 sldi $t0,$h4,40
272 srdi $h4,$h4,24
273 adde $h1,$h2,$h3
274 addc $h1,$h1,$t0
275 addze $h2,$h4
276
277 ld $h3,0($ctx) # load hash value base 2^64
278 ld $h4,8($ctx)
279 ld $t0,16($ctx)
280
281 neg r0,r0
282 xor $h0,$h0,$h3 # choose between radixes
283 xor $h1,$h1,$h4
284 xor $h2,$h2,$t0
285 and $h0,$h0,r0
286 and $h1,$h1,r0
287 and $h2,$h2,r0
288 xor $h0,$h0,$h3
289 xor $h1,$h1,$h4
290 xor $h2,$h2,$t0
291
292 addic $h3,$h0,5 # compare to modulus
293 addze $h4,$h1
294 addze $t0,$h2
295
296 srdi $t0,$t0,2 # see if it carried/borrowed
297 neg $t0,$t0
298
299 andc $h0,$h0,$t0
300 and $h3,$h3,$t0
301 andc $h1,$h1,$t0
302 and $h4,$h4,$t0
303 or $h0,$h0,$h3
304 or $h1,$h1,$h4
305
306 lwz $t0,4($nonce)
307 lwz $h2,12($nonce)
308 lwz $h3,0($nonce)
309 lwz $h4,8($nonce)
310
311 insrdi $h3,$t0,32,0
312 insrdi $h4,$h2,32,0
313
314 addc $h0,$h0,$h3 # accumulate nonce
315 adde $h1,$h1,$h4
316
317 addi $ctx,$mac,-1
318 addi $mac,$mac,7
319
320 stbu $h0,1($ctx) # write [little-endian] result
321 srdi $h0,$h0,8
322 stbu $h1,1($mac)
323 srdi $h1,$h1,8
324
325 stbu $h0,1($ctx)
326 srdi $h0,$h0,8
327 stbu $h1,1($mac)
328 srdi $h1,$h1,8
329
330 stbu $h0,1($ctx)
331 srdi $h0,$h0,8
332 stbu $h1,1($mac)
333 srdi $h1,$h1,8
334
335 stbu $h0,1($ctx)
336 srdi $h0,$h0,8
337 stbu $h1,1($mac)
338 srdi $h1,$h1,8
339
340 stbu $h0,1($ctx)
341 srdi $h0,$h0,8
342 stbu $h1,1($mac)
343 srdi $h1,$h1,8
344
345 stbu $h0,1($ctx)
346 srdi $h0,$h0,8
347 stbu $h1,1($mac)
348 srdi $h1,$h1,8
349
350 stbu $h0,1($ctx)
351 srdi $h0,$h0,8
352 stbu $h1,1($mac)
353 srdi $h1,$h1,8
354
355 stbu $h0,1($ctx)
356 stbu $h1,1($mac)
9e58d119 357
9e58d119
AP
358 blr
359 .long 0
360 .byte 0,12,0x14,0,0,0,3,0
361.size .poly1305_emit,.-.poly1305_emit
362___
a28e4890 363} } else {
9e58d119
AP
364###############################################################################
365# base 2^32 implementation
366
# Register names for the base 2^32 code.  The 24 names map, in order,
# onto r7..r12 and r14..r31: hash limbs h0..h4, key words r0..r3, the
# s1..s3 values, temporaries t0..t3 and the high (D) and low (d) halves
# of the four product columns.
367my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
368 $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
369 ) = map("r$_",(7..12,14..31));
370
# poly1305_init_int, first part: zero the five 32-bit hash words at
# offsets 0..16 of the context and the is_base2_26 word at offset 24,
# then skip key setup when the key pointer ($inp) is zero.
371$code.=<<___;
372.globl .poly1305_init_int
373.align 4
374.poly1305_init_int:
375 xor r0,r0,r0
376 stw r0,0($ctx) # zero hash value
377 stw r0,4($ctx)
378 stw r0,8($ctx)
379 stw r0,12($ctx)
380 stw r0,16($ctx)
a28e4890 381 stw r0,24($ctx) # clear is_base2_26
9e58d119
AP
382
383 $UCMP $inp,r0
384 beq- Lno_key
385___
# Little-endian flavours load the four 32-bit key words directly.
# The mnemonic must be "lwz" (load word and zero), as used by every other
# word load in this file; "lw" is not a PowerPC instruction and would be
# rejected by the assembler.
386$code.=<<___ if ($LITTLE_ENDIAN);
387 lwz $h0,0($inp) # load key material
388 lwz $h1,4($inp)
389 lwz $h2,8($inp)
390 lwz $h3,12($inp)
391___
# Other flavours load the four key words with byte-reversed loads; each
# destination register first holds its own byte offset and is then
# overwritten by the loaded word.
392$code.=<<___ if (!$LITTLE_ENDIAN);
393 li $h1,4
394 lwbrx $h0,0,$inp # load key material
395 li $h2,8
396 lwbrx $h1,$h1,$inp
397 li $h3,12
398 lwbrx $h2,$h2,$inp
399 lwbrx $h3,$h3,$inp
400___
# This heredoc finishes the 32-bit poly1305_init_int and opens
# poly1305_blocks.
#
# init_int tail: clear the top four bits of the first key word, mask the
# other three with 0x0ffffffc, store them at offsets 32..44 of the
# context and return 0 in r3.
#
# poly1305_blocks prologue: the byte length is shifted right by 4 to get
# the loop count and the routine returns immediately when that is zero.
# Otherwise it allocates the frame, saves r14..r31 and the link register,
# loads the key and the hash, precomputes si = ri + (ri >> 2) for
# i = 1..3, moves the count to CTR and sets $mask to 3 for the reduction
# step.  Lpoly1305_blocks is the local entry label that
# poly1305_blocks_vsx branches to.
401$code.=<<___;
402 lis $mask,0xf000 # 0xf0000000
403 li $r0,-4
404 andc $r0,$r0,$mask # 0x0ffffffc
405
406 andc $h0,$h0,$mask
407 and $h1,$h1,$r0
408 and $h2,$h2,$r0
409 and $h3,$h3,$r0
410
411 stw $h0,32($ctx) # store key
412 stw $h1,36($ctx)
413 stw $h2,40($ctx)
414 stw $h3,44($ctx)
415
416Lno_key:
417 xor r3,r3,r3
418 blr
419 .long 0
420 .byte 0,12,0x14,0,0,0,2,0
421.size .poly1305_init_int,.-.poly1305_init_int
422
423.globl .poly1305_blocks
424.align 4
425.poly1305_blocks:
a28e4890 426Lpoly1305_blocks:
9e58d119
AP
427 srwi. $len,$len,4
428 beq- Labort
429
430 $STU $sp,-$FRAME($sp)
431 mflr r0
432 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
433 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
434 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
435 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
436 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
437 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
438 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
439 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
440 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
441 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
442 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
443 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
444 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
445 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
446 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
447 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
448 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
449 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
450 $PUSH r0,`$FRAME+$LRSAVE`($sp)
451
452 lwz $r0,32($ctx) # load key
453 lwz $r1,36($ctx)
454 lwz $r2,40($ctx)
455 lwz $r3,44($ctx)
456
457 lwz $h0,0($ctx) # load hash value
458 lwz $h1,4($ctx)
459 lwz $h2,8($ctx)
460 lwz $h3,12($ctx)
461 lwz $h4,16($ctx)
462
463 srwi $s1,$r1,2
464 srwi $s2,$r2,2
465 srwi $s3,$r3,2
466 add $s1,$s1,$r1 # si = ri + ri>>2
467 add $s2,$s2,$r2
468 add $s3,$s3,$r3
469 mtctr $len
470 li $mask,3
471 b Loop
472
473.align 4
474Loop:
475___
# Load one 16-byte input block into $d0..$d3 (lowest word first).
# Little-endian flavours use plain word loads.
476$code.=<<___ if ($LITTLE_ENDIAN);
477 lwz $d0,0($inp) # load input
478 lwz $d1,4($inp)
479 lwz $d2,8($inp)
480 lwz $d3,12($inp)
481___
# Other flavours use byte-reversed loads; each destination register first
# holds its own byte offset and is then overwritten by the loaded word.
482$code.=<<___ if (!$LITTLE_ENDIAN);
483 li $d1,4
484 lwbrx $d0,0,$inp # load input
485 li $d2,8
486 lwbrx $d1,$d1,$inp
487 li $d3,12
488 lwbrx $d2,$d2,$inp
489 lwbrx $d3,$d3,$inp
490___
# Loop body and epilogue of the base 2^32 poly1305_blocks.
#
# Per iteration: advance $inp by 16 and add the block to h0..h3 with
# carry, adding $padbit (plus carry) to h4.  The carried additions into
# h3 and h4 are interleaved with the first multiplies.  Each hash limb is
# then multiplied by the key words r0..r3 or by the s1..s3 values, and
# the 64-bit partial products are accumulated into four columns held as
# low/high register pairs d0..d3 / D0..D3.  The h4 products use only the
# low 32 bits of the multiply (mullw).  The columns are then combined
# into h1..h4, and the final reduction keeps the low two bits of h4 and
# adds (h4 & ~3) + (h4 >> 2) back into h0 with carry propagation.  bdnz
# repeats for the block count placed in CTR by the prologue.
#
# After the loop the hash is stored back to the context, r14..r31 are
# restored and the frame is released.  Labort is the early-exit target
# used when the prologue finds no complete block.
491$code.=<<___;
492 addi $inp,$inp,16
493
494 addc $h0,$h0,$d0 # accumulate input
495 adde $h1,$h1,$d1
496 adde $h2,$h2,$d2
497
498 mullw $d0,$h0,$r0 # h0*r0
499 mulhwu $D0,$h0,$r0
500
501 mullw $d1,$h0,$r1 # h0*r1
502 mulhwu $D1,$h0,$r1
503
504 mullw $d2,$h0,$r2 # h0*r2
505 mulhwu $D2,$h0,$r2
506
507 adde $h3,$h3,$d3
508 adde $h4,$h4,$padbit
509
510 mullw $d3,$h0,$r3 # h0*r3
511 mulhwu $D3,$h0,$r3
512
513 mullw $t0,$h1,$s3 # h1*s3
514 mulhwu $t1,$h1,$s3
515
516 mullw $t2,$h1,$r0 # h1*r0
517 mulhwu $t3,$h1,$r0
518 addc $d0,$d0,$t0
519 adde $D0,$D0,$t1
520
521 mullw $t0,$h1,$r1 # h1*r1
522 mulhwu $t1,$h1,$r1
523 addc $d1,$d1,$t2
524 adde $D1,$D1,$t3
525
526 mullw $t2,$h1,$r2 # h1*r2
527 mulhwu $t3,$h1,$r2
528 addc $d2,$d2,$t0
529 adde $D2,$D2,$t1
530
531 mullw $t0,$h2,$s2 # h2*s2
532 mulhwu $t1,$h2,$s2
533 addc $d3,$d3,$t2
534 adde $D3,$D3,$t3
535
536 mullw $t2,$h2,$s3 # h2*s3
537 mulhwu $t3,$h2,$s3
538 addc $d0,$d0,$t0
539 adde $D0,$D0,$t1
540
541 mullw $t0,$h2,$r0 # h2*r0
542 mulhwu $t1,$h2,$r0
543 addc $d1,$d1,$t2
544 adde $D1,$D1,$t3
545
546 mullw $t2,$h2,$r1 # h2*r1
547 mulhwu $t3,$h2,$r1
548 addc $d2,$d2,$t0
549 adde $D2,$D2,$t1
550
551 mullw $t0,$h3,$s1 # h3*s1
552 mulhwu $t1,$h3,$s1
553 addc $d3,$d3,$t2
554 adde $D3,$D3,$t3
555
556 mullw $t2,$h3,$s2 # h3*s2
557 mulhwu $t3,$h3,$s2
558 addc $d0,$d0,$t0
559 adde $D0,$D0,$t1
560
561 mullw $t0,$h3,$s3 # h3*s3
562 mulhwu $t1,$h3,$s3
563 addc $d1,$d1,$t2
564 adde $D1,$D1,$t3
565
566 mullw $t2,$h3,$r0 # h3*r0
567 mulhwu $t3,$h3,$r0
568 addc $d2,$d2,$t0
569 adde $D2,$D2,$t1
570
571 mullw $t0,$h4,$s1 # h4*s1
572 addc $d3,$d3,$t2
573 adde $D3,$D3,$t3
574 addc $d1,$d1,$t0
575
576 mullw $t1,$h4,$s2 # h4*s2
577 addze $D1,$D1
578 addc $d2,$d2,$t1
579 addze $D2,$D2
580
581 mullw $t2,$h4,$s3 # h4*s3
582 addc $d3,$d3,$t2
583 addze $D3,$D3
584
585 mullw $h4,$h4,$r0 # h4*r0
586
587 addc $h1,$d1,$D0
588 adde $h2,$d2,$D1
589 adde $h3,$d3,$D2
590 adde $h4,$h4,$D3
591
592 andc $D0,$h4,$mask # final reduction step
593 and $h4,$h4,$mask
594 srwi $D1,$D0,2
595 add $D0,$D0,$D1
596 addc $h0,$d0,$D0
597 addze $h1,$h1
598 addze $h2,$h2
599 addze $h3,$h3
4b8736a2 600 addze $h4,$h4
9e58d119
AP
601
602 bdnz Loop
603
604 stw $h0,0($ctx) # store hash value
605 stw $h1,4($ctx)
606 stw $h2,8($ctx)
607 stw $h3,12($ctx)
608 stw $h4,16($ctx)
609
610 $POP r14,`$FRAME-$SIZE_T*18`($sp)
611 $POP r15,`$FRAME-$SIZE_T*17`($sp)
612 $POP r16,`$FRAME-$SIZE_T*16`($sp)
613 $POP r17,`$FRAME-$SIZE_T*15`($sp)
614 $POP r18,`$FRAME-$SIZE_T*14`($sp)
615 $POP r19,`$FRAME-$SIZE_T*13`($sp)
616 $POP r20,`$FRAME-$SIZE_T*12`($sp)
617 $POP r21,`$FRAME-$SIZE_T*11`($sp)
618 $POP r22,`$FRAME-$SIZE_T*10`($sp)
619 $POP r23,`$FRAME-$SIZE_T*9`($sp)
620 $POP r24,`$FRAME-$SIZE_T*8`($sp)
621 $POP r25,`$FRAME-$SIZE_T*7`($sp)
622 $POP r26,`$FRAME-$SIZE_T*6`($sp)
623 $POP r27,`$FRAME-$SIZE_T*5`($sp)
624 $POP r28,`$FRAME-$SIZE_T*4`($sp)
625 $POP r29,`$FRAME-$SIZE_T*3`($sp)
626 $POP r30,`$FRAME-$SIZE_T*2`($sp)
627 $POP r31,`$FRAME-$SIZE_T*1`($sp)
628 addi $sp,$sp,$FRAME
629Labort:
630 blr
631 .long 0
632 .byte 0,12,4,1,0x80,18,4,0
633.size .poly1305_blocks,.-.poly1305_blocks
a28e4890
AP
634___
635{
# poly1305_emit for 32-bit flavours; works in r6..r12 plus r0.
#
# 1. Loads is_base2_26 and the five hash words.  When the flag is
#    non-zero the words are treated as base 2^26 digits and repacked into
#    base 2^32 limbs with shift pairs and carried adds; otherwise that
#    step is skipped.
# 2. Adds 5 to the value, keeping only the carry chain in r0, and turns
#    bit 2 of the resulting top limb into either 5 or 0 (shift, negate,
#    andi. with 5).  That amount is then added to h0..h3 with carry.
# 3. Adds the four nonce words with carry.
# 4. Writes the 16 result bytes to $mac least-significant byte first
#    using stbu.  $ctx is reused as the pointer for bytes 0..7 (h0 then
#    h1) and $mac for bytes 8..15 (h2 then h3).
636my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12));
9e58d119 637
a28e4890 638$code.=<<___;
9e58d119 639.globl .poly1305_emit
a28e4890 640.align 5
9e58d119 641.poly1305_emit:
a28e4890
AP
642 lwz r0,24($ctx) # is_base2_26
643 lwz $h0,0($ctx) # load hash value
644 lwz $h1,4($ctx)
645 lwz $h2,8($ctx)
646 lwz $h3,12($ctx)
647 lwz $h4,16($ctx)
648 cmplwi r0,0
649 beq Lemit_base2_32
650
651 slwi $t0,$h1,26 # base 2^26 -> base 2^32
652 srwi $h1,$h1,6
653 slwi $t1,$h2,20
654 srwi $h2,$h2,12
655 addc $h0,$h0,$t0
656 slwi $t0,$h3,14
657 srwi $h3,$h3,18
658 adde $h1,$h1,$t1
659 slwi $t1,$h4,8
660 srwi $h4,$h4,24
661 adde $h2,$h2,$t0
662 adde $h3,$h3,$t1
663 addze $h4,$h4
664
665Lemit_base2_32:
666 addic r0,$h0,5 # compare to modulus
667 addze r0,$h1
668 addze r0,$h2
669 addze r0,$h3
670 addze r0,$h4
671
672 srwi r0,r0,2 # see if it carried/borrowed
673 neg r0,r0
674 andi. r0,r0,5
675
676 addc $h0,$h0,r0
677 lwz r0,0($nonce)
678 addze $h1,$h1
679 lwz $t0,4($nonce)
680 addze $h2,$h2
681 lwz $t1,8($nonce)
682 addze $h3,$h3
683 lwz $h4,12($nonce)
684
685 addc $h0,$h0,r0 # accumulate nonce
686 adde $h1,$h1,$t0
687 adde $h2,$h2,$t1
688 adde $h3,$h3,$h4
689
690 addi $ctx,$mac,-1
691 addi $mac,$mac,7
692
693 stbu $h0,1($ctx) # write [little-endian] result
694 srwi $h0,$h0,8
695 stbu $h2,1($mac)
696 srwi $h2,$h2,8
697
698 stbu $h0,1($ctx)
699 srwi $h0,$h0,8
700 stbu $h2,1($mac)
701 srwi $h2,$h2,8
702
703 stbu $h0,1($ctx)
704 srwi $h0,$h0,8
705 stbu $h2,1($mac)
706 srwi $h2,$h2,8
707
708 stbu $h0,1($ctx)
709 stbu $h2,1($mac)
710
711 stbu $h1,1($ctx)
712 srwi $h1,$h1,8
713 stbu $h3,1($mac)
714 srwi $h3,$h3,8
715
716 stbu $h1,1($ctx)
717 srwi $h1,$h1,8
718 stbu $h3,1($mac)
719 srwi $h3,$h3,8
720
721 stbu $h1,1($ctx)
722 srwi $h1,$h1,8
723 stbu $h3,1($mac)
724 srwi $h3,$h3,8
725
726 stbu $h1,1($ctx)
727 stbu $h3,1($mac)
728
729 blr
730 .long 0
731 .byte 0,12,0x14,0,0,0,3,0
732.size .poly1305_emit,.-.poly1305_emit
733___
734} }
735{{{
736########################################################################
737# PowerISA 2.07/VSX section #
738########################################################################
739
# Stack frame of the VSX routines: $LOCALS bytes at the bottom, six more
# register-sized slots, 128 bytes of local variables and 13*16 bytes for
# saving v20-v31.  The vector save area starts at offset $LOCALS+128
# rounded up with the 15/31 bias used in the prologues.
740my $LOCALS= 6*$SIZE_T;
741my $VSXFRAME = $LOCALS + 6*$SIZE_T;
742 $VSXFRAME += 128; # local variables
743 $VSXFRAME += 13*16; # v20-v31 offload
744
# 4 on flavours whose name does not contain "le", 0 otherwise.  It is
# xor-ed into word offsets inside the key-power table when
# __poly1305_splat destinations are computed.
745my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0;
746
747########################################################################
748# Layout of the opaque area is as follows:
749#
750# unsigned __int32 h[5]; # current hash value base 2^26
751# unsigned __int32 pad;
752# unsigned __int32 is_base2_26, pad;
753# unsigned __int64 r[2]; # key value base 2^64
754# struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9];
755#
756# where r^n are the base 2^26 digits of powers of the multiplier key. There
757# are 5 digits, but the last four are interleaved with their multiples of 5,
758# for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. The
759# order of powers is as they appear in a register, not in memory.
760
# Vector register names for the VSX code:
#   H0..H4     v0..v4    hash digits
#   I0..I4     v5..v9    input digits
#   R0,R1,S1,R2,S2  v10..v14  key-power digits and their multiples of 5
#   R3,S3,R4,S4     the same registers as R1,S1,R2,S2 (v11..v14); the
#                   32-bit setup code redeclares them onto I1..I4
#   ACC0..ACC4 v15..v19  accumulators
#   T0..T4     v20..v24  temporaries
#   v25..v31   shift counts, the 26-bit mask, pad bits and a permute mask
761my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4));
762my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9));
763my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14));
764my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2);
765my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19));
766my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24));
767my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31));
# Index operands for the indexed vector loads/stores.  $x00 is the literal
# 0, the others are general-purpose registers that the code loads with the
# offsets it needs.
768my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31)));
# $ctx_ and $_ctx hold secondary table pointers; $const is the base
# register for the constant loads in __poly1305_blocks_vsx.
769my ($ctx_,$_ctx,$const) = map("r$_",(10..12));
770
771 if ($flavour =~ /64/) {
772###############################################################################
773# setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms,
774# but the base 2^26 computational part is same...
775
# Scalar register names for the 64-bit VSX setup code.  Note that the
# mapping differs from the scalar base 2^64 code: h0..h2 and d0..d2 are
# r6..r11 here.
776my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31));
777my $mask = "r0";
778
# The heredoc below emits four routines.
#
# .poly1305_blocks_vsx
#     Entry point.  Inputs of 128 bytes or more go to
#     __poly1305_blocks_vsx.  Shorter inputs are handled by the scalar
#     code: the hash is read both as base 2^26 words (converted to base
#     2^64) and as base 2^64 doublewords, one of the two is selected with
#     the negated is_base2_26 word as mask, the result is stored back,
#     is_base2_26 is cleared and control passes to Lpoly1305_blocks.
#
# __poly1305_mul
#     One multiply-and-reduce step on h0:h1:h2 by r0/r1/s1, the same
#     instruction sequence as the body of the scalar block loop without
#     the input accumulation.  Expects $mask to hold 3.
#
# __poly1305_splat
#     Splits h0:h1:h2 into five 26-bit digits and stores them as words at
#     a stride of 0x10 from $t1 in the order d0, d1, 5*d1, d2, 5*d2, d3,
#     5*d3, d4, 5*d4.
#
# __poly1305_blocks_vsx (first part)
#     Allocates $VSXFRAME, saves v20-v31, vrsave, r27..r31 and the link
#     register, calls LPICmeup (defined outside this chunk; presumably it
#     leaves the address of the constant table in $const - TODO confirm)
#     and loads the vector constants.  If is_base2_26 (still in r7) is
#     zero it calls __poly1305_splat for the key and for each of three
#     successive __poly1305_mul results, writing four word columns of the
#     table that starts at offset 48 of the context, then converts the
#     base 2^64 hash to five 26-bit digits in H0..H4.
779$code.=<<___;
780.globl .poly1305_blocks_vsx
781.align 5
782.poly1305_blocks_vsx:
783 lwz r7,24($ctx) # is_base2_26
784 cmpldi $len,128
785 bge __poly1305_blocks_vsx
786
787 neg r0,r7 # is_base2_26 as mask
788 lwz r7,0($ctx) # load hash base 2^26
789 lwz r8,4($ctx)
790 lwz r9,8($ctx)
791 lwz r10,12($ctx)
792 lwz r11,16($ctx)
793
794 sldi r8,r8,26 # base 2^26 -> base 2^64
795 sldi r12,r9,52
796 add r7,r7,r8
797 srdi r9,r9,12
798 sldi r10,r10,14
799 addc r7,r7,r12
800 sldi r8,r11,40
801 adde r9,r9,r10
802 srdi r11,r11,24
803 addc r9,r9,r8
804 addze r11,r11
805
806 ld r8,0($ctx) # load hash base 2^64
807 ld r10,8($ctx)
808 ld r12,16($ctx)
809
810 xor r7,r7,r8 # select between radixes
811 xor r9,r9,r10
812 xor r11,r11,r12
813 and r7,r7,r0
814 and r9,r9,r0
815 and r11,r11,r0
816 xor r7,r7,r8
817 xor r9,r9,r10
818 xor r11,r11,r12
819
820 li r0,0
821 std r7,0($ctx) # store hash base 2^64
822 std r9,8($ctx)
823 std r11,16($ctx)
824 stw r0,24($ctx) # clear is_base2_26
825
826 b Lpoly1305_blocks
827 .long 0
828 .byte 0,12,0x14,0,0,0,4,0
829.size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx
830
831.align 5
832__poly1305_mul:
833 mulld $d0,$h0,$r0 # h0*r0
834 mulhdu $d1,$h0,$r0
835
836 mulld $t0,$h1,$s1 # h1*5*r1
837 mulhdu $t1,$h1,$s1
838 addc $d0,$d0,$t0
839 adde $d1,$d1,$t1
840
841 mulld $t0,$h0,$r1 # h0*r1
842 mulhdu $d2,$h0,$r1
843 addc $d1,$d1,$t0
844 addze $d2,$d2
845
846 mulld $t0,$h1,$r0 # h1*r0
847 mulhdu $t1,$h1,$r0
848 addc $d1,$d1,$t0
849 adde $d2,$d2,$t1
850
851 mulld $t0,$h2,$s1 # h2*5*r1
852 mulld $t1,$h2,$r0 # h2*r0
853 addc $d1,$d1,$t0
854 adde $d2,$d2,$t1
855
856 andc $t0,$d2,$mask # final reduction step
857 and $h2,$d2,$mask
858 srdi $t1,$t0,2
859 add $t0,$t0,$t1
860 addc $h0,$d0,$t0
861 addze $h1,$d1
862 addze $h2,$h2
863
864 blr
865 .long 0
866 .byte 0,12,0x14,0,0,0,0,0
867.size __poly1305_mul,.-__poly1305_mul
868
869.align 5
870__poly1305_splat:
871 extrdi $d0,$h0,26,38
872 extrdi $d1,$h0,26,12
873 stw $d0,0x00($t1)
874
875 extrdi $d2,$h0,12,0
876 slwi $d0,$d1,2
877 stw $d1,0x10($t1)
878 add $d0,$d0,$d1 # * 5
879 stw $d0,0x20($t1)
880
881 insrdi $d2,$h1,14,38
882 slwi $d0,$d2,2
883 stw $d2,0x30($t1)
884 add $d0,$d0,$d2 # * 5
885 stw $d0,0x40($t1)
886
887 extrdi $d1,$h1,26,24
888 extrdi $d2,$h1,24,0
889 slwi $d0,$d1,2
890 stw $d1,0x50($t1)
891 add $d0,$d0,$d1 # * 5
892 stw $d0,0x60($t1)
893
894 insrdi $d2,$h2,3,37
895 slwi $d0,$d2,2
896 stw $d2,0x70($t1)
897 add $d0,$d0,$d2 # * 5
898 stw $d0,0x80($t1)
899
900 blr
901 .long 0
902 .byte 0,12,0x14,0,0,0,0,0
903.size __poly1305_splat,.-__poly1305_splat
904
905.align 5
906__poly1305_blocks_vsx:
907 $STU $sp,-$VSXFRAME($sp)
9e58d119 908 mflr r0
a28e4890
AP
909 li r10,`15+$LOCALS+128`
910 li r11,`31+$LOCALS+128`
911 mfspr r12,256
912 stvx v20,r10,$sp
913 addi r10,r10,32
914 stvx v21,r11,$sp
915 addi r11,r11,32
916 stvx v22,r10,$sp
917 addi r10,r10,32
918 stvx v23,r10,$sp
919 addi r10,r10,32
920 stvx v24,r11,$sp
921 addi r11,r11,32
922 stvx v25,r10,$sp
923 addi r10,r10,32
924 stvx v26,r10,$sp
925 addi r10,r10,32
926 stvx v27,r11,$sp
927 addi r11,r11,32
928 stvx v28,r10,$sp
929 addi r10,r10,32
930 stvx v29,r11,$sp
931 addi r11,r11,32
932 stvx v30,r10,$sp
933 stvx v31,r11,$sp
934 stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
935 li r12,-1
936 mtspr 256,r12 # preserve all AltiVec registers
937 $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp)
938 $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp)
939 $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp)
940 $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp)
941 $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp)
942 $PUSH r0,`$VSXFRAME+$LRSAVE`($sp)
943
944 bl LPICmeup
945
946 li $x10,0x10
947 li $x20,0x20
948 li $x30,0x30
949 li $x40,0x40
950 li $x50,0x50
951 lvx_u $mask26,$x00,$const
952 lvx_u $_26,$x10,$const
953 lvx_u $_40,$x20,$const
954 lvx_u $I2perm,$x30,$const
955 lvx_u $padbits,$x40,$const
956
957 cmplwi r7,0 # is_base2_26?
958 bne Lskip_init_vsx
959
960 ld $r0,32($ctx) # load key base 2^64
961 ld $r1,40($ctx)
962 srdi $s1,$r1,2
963 li $mask,3
964 add $s1,$s1,$r1 # s1 = r1 + r1>>2
965
966 mr $h0,$r0 # "calculate" r^1
967 mr $h1,$r1
968 li $h2,0
969 addi $t1,$ctx,`48+(12^$BIG_ENDIAN)`
970 bl __poly1305_splat
971
c2969ff6 972 bl __poly1305_mul # calculate r^2
a28e4890
AP
973 addi $t1,$ctx,`48+(4^$BIG_ENDIAN)`
974 bl __poly1305_splat
975
c2969ff6 976 bl __poly1305_mul # calculate r^3
a28e4890
AP
977 addi $t1,$ctx,`48+(8^$BIG_ENDIAN)`
978 bl __poly1305_splat
979
c2969ff6 980 bl __poly1305_mul # calculate r^4
a28e4890
AP
981 addi $t1,$ctx,`48+(0^$BIG_ENDIAN)`
982 bl __poly1305_splat
983
984 ld $h0,0($ctx) # load hash
985 ld $h1,8($ctx)
986 ld $h2,16($ctx)
987
988 extrdi $d0,$h0,26,38 # base 2^64 -> base 2^26
989 extrdi $d1,$h0,26,12
990 extrdi $d2,$h0,12,0
991 mtvrwz $H0,$d0
992 insrdi $d2,$h1,14,38
993 mtvrwz $H1,$d1
994 extrdi $d1,$h1,26,24
995 mtvrwz $H2,$d2
996 extrdi $d2,$h1,24,0
997 mtvrwz $H3,$d1
998 insrdi $d2,$h2,3,37
999 mtvrwz $H4,$d2
1000___
1001 } else {
1002###############################################################################
1003# 32-bit initialization
1004
# Scalar register names for the 32-bit VSX setup code: h0..h4 are
# r7..r11, t0 is r0 and t1 is r12.
1005my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12));
# In this branch R3/S3/R4/S4 get vector registers of their own (those
# named I1..I4) instead of aliasing R1/S1/R2/S2.
1006my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4);
1007
# The heredoc below emits three routines.
#
# .poly1305_blocks_vsx
#     Entry point.  Inputs of 128 bytes or more go to
#     __poly1305_blocks_vsx.  Shorter inputs go to the scalar
#     Lpoly1305_blocks, directly when is_base2_26 is zero, otherwise after
#     repacking the five base 2^26 hash words into base 2^32 limbs and
#     clearing is_base2_26.
#
# __poly1305_mul
#     Vector multiply of H0..H4 by the key digits R0..R4 (with S1..S4 for
#     the wrapped terms) using vmulouw/vaddudm into ACC0..ACC4, followed
#     by the lazy carry propagation that leaves the digits in H0..H4.  It
#     also leaves a byte-splat of 2 in T0, which the caller reuses as a
#     shift count.
#
# __poly1305_blocks_vsx (first part)
#     Allocates $VSXFRAME, saves v20-v31, vrsave, r27..r31 and the link
#     register, calls LPICmeup (defined outside this chunk; presumably it
#     leaves the address of the constant table in $const - TODO confirm)
#     and loads the vector constants.  If is_base2_26 (still in r7) is
#     zero it converts the base 2^32 key to 26-bit digits, moves them and
#     their multiples of 5 into vector registers, calls __poly1305_mul
#     twice to obtain the higher key powers, merges the results into the
#     r^2:r^4:r^1:r^3 layout, stores the nine table vectors starting at
#     offset 0x30 of the context and converts the base 2^32 hash to five
#     26-bit digits in H0..H4.
1008$code.=<<___;
1009.globl .poly1305_blocks_vsx
1010.align 5
1011.poly1305_blocks_vsx:
1012 lwz r7,24($ctx) # is_base2_26
1013 cmplwi $len,128
1014 bge __poly1305_blocks_vsx
1015 cmplwi r7,0
1016 beq Lpoly1305_blocks
9e58d119
AP
1017
1018 lwz $h0,0($ctx) # load hash
1019 lwz $h1,4($ctx)
1020 lwz $h2,8($ctx)
1021 lwz $h3,12($ctx)
1022 lwz $h4,16($ctx)
1023
a28e4890
AP
1024 slwi $t0,$h1,26 # base 2^26 -> base 2^32
1025 srwi $h1,$h1,6
1026 slwi $t1,$h2,20
1027 srwi $h2,$h2,12
1028 addc $h0,$h0,$t0
1029 slwi $t0,$h3,14
1030 srwi $h3,$h3,18
1031 adde $h1,$h1,$t1
1032 slwi $t1,$h4,8
1033 srwi $h4,$h4,24
1034 adde $h2,$h2,$t0
1035 li $t0,0
1036 adde $h3,$h3,$t1
1037 addze $h4,$h4
9e58d119 1038
a28e4890
AP
1039 stw $h0,0($ctx) # store hash base 2^32
1040 stw $h1,4($ctx)
1041 stw $h2,8($ctx)
1042 stw $h3,12($ctx)
1043 stw $h4,16($ctx)
1044 stw $t0,24($ctx) # clear is_base2_26
9e58d119 1045
a28e4890
AP
1046 b Lpoly1305_blocks
1047 .long 0
1048 .byte 0,12,0x14,0,0,0,4,0
1049.size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx
1050
1051.align 5
1052__poly1305_mul:
1053 vmulouw $ACC0,$H0,$R0
1054 vmulouw $ACC1,$H1,$R0
1055 vmulouw $ACC2,$H2,$R0
1056 vmulouw $ACC3,$H3,$R0
1057 vmulouw $ACC4,$H4,$R0
1058
1059 vmulouw $T0,$H4,$S1
1060 vaddudm $ACC0,$ACC0,$T0
1061 vmulouw $T0,$H0,$R1
1062 vaddudm $ACC1,$ACC1,$T0
1063 vmulouw $T0,$H1,$R1
1064 vaddudm $ACC2,$ACC2,$T0
1065 vmulouw $T0,$H2,$R1
1066 vaddudm $ACC3,$ACC3,$T0
1067 vmulouw $T0,$H3,$R1
1068 vaddudm $ACC4,$ACC4,$T0
1069
1070 vmulouw $T0,$H3,$S2
1071 vaddudm $ACC0,$ACC0,$T0
1072 vmulouw $T0,$H4,$S2
1073 vaddudm $ACC1,$ACC1,$T0
1074 vmulouw $T0,$H0,$R2
1075 vaddudm $ACC2,$ACC2,$T0
1076 vmulouw $T0,$H1,$R2
1077 vaddudm $ACC3,$ACC3,$T0
1078 vmulouw $T0,$H2,$R2
1079 vaddudm $ACC4,$ACC4,$T0
1080
1081 vmulouw $T0,$H2,$S3
1082 vaddudm $ACC0,$ACC0,$T0
1083 vmulouw $T0,$H3,$S3
1084 vaddudm $ACC1,$ACC1,$T0
1085 vmulouw $T0,$H4,$S3
1086 vaddudm $ACC2,$ACC2,$T0
1087 vmulouw $T0,$H0,$R3
1088 vaddudm $ACC3,$ACC3,$T0
1089 vmulouw $T0,$H1,$R3
1090 vaddudm $ACC4,$ACC4,$T0
1091
1092 vmulouw $T0,$H1,$S4
1093 vaddudm $ACC0,$ACC0,$T0
1094 vmulouw $T0,$H2,$S4
1095 vaddudm $ACC1,$ACC1,$T0
1096 vmulouw $T0,$H3,$S4
1097 vaddudm $ACC2,$ACC2,$T0
1098 vmulouw $T0,$H4,$S4
1099 vaddudm $ACC3,$ACC3,$T0
1100 vmulouw $T0,$H0,$R4
1101 vaddudm $ACC4,$ACC4,$T0
1102
1103 ################################################################
1104 # lazy reduction
1105
1106 vspltisb $T0,2
1107 vsrd $H4,$ACC3,$_26
1108 vsrd $H1,$ACC0,$_26
1109 vand $H3,$ACC3,$mask26
1110 vand $H0,$ACC0,$mask26
1111 vaddudm $H4,$H4,$ACC4 # h3 -> h4
1112 vaddudm $H1,$H1,$ACC1 # h0 -> h1
1113
1114 vsrd $ACC4,$H4,$_26
1115 vsrd $ACC1,$H1,$_26
1116 vand $H4,$H4,$mask26
1117 vand $H1,$H1,$mask26
1118 vaddudm $H0,$H0,$ACC4
1119 vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
1120
1121 vsld $ACC4,$ACC4,$T0 # <<2
1122 vsrd $ACC2,$H2,$_26
1123 vand $H2,$H2,$mask26
1124 vaddudm $H0,$H0,$ACC4 # h4 -> h0
1125 vaddudm $H3,$H3,$ACC2 # h2 -> h3
1126
1127 vsrd $ACC0,$H0,$_26
1128 vsrd $ACC3,$H3,$_26
1129 vand $H0,$H0,$mask26
1130 vand $H3,$H3,$mask26
1131 vaddudm $H1,$H1,$ACC0 # h0 -> h1
1132 vaddudm $H4,$H4,$ACC3 # h3 -> h4
1133
1134 blr
1135 .long 0
1136 .byte 0,12,0x14,0,0,0,0,0
1137.size __poly1305_mul,.-__poly1305_mul
1138
1139.align 5
1140__poly1305_blocks_vsx:
1141 $STU $sp,-$VSXFRAME($sp)
1142 mflr r0
1143 li r10,`15+$LOCALS+128`
1144 li r11,`31+$LOCALS+128`
1145 mfspr r12,256
1146 stvx v20,r10,$sp
1147 addi r10,r10,32
1148 stvx v21,r11,$sp
1149 addi r11,r11,32
1150 stvx v22,r10,$sp
1151 addi r10,r10,32
1152 stvx v23,r10,$sp
1153 addi r10,r10,32
1154 stvx v24,r11,$sp
1155 addi r11,r11,32
1156 stvx v25,r10,$sp
1157 addi r10,r10,32
1158 stvx v26,r10,$sp
1159 addi r10,r10,32
1160 stvx v27,r11,$sp
1161 addi r11,r11,32
1162 stvx v28,r10,$sp
1163 addi r10,r10,32
1164 stvx v29,r11,$sp
1165 addi r11,r11,32
1166 stvx v30,r10,$sp
1167 stvx v31,r11,$sp
1168 stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
1169 li r12,-1
1170 mtspr 256,r12 # preserve all AltiVec registers
1171 $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp)
1172 $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp)
1173 $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp)
1174 $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp)
1175 $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp)
1176 $PUSH r0,`$VSXFRAME+$LRSAVE`($sp)
1177
1178 bl LPICmeup
1179
1180 li $x10,0x10
1181 li $x20,0x20
1182 li $x30,0x30
1183 li $x40,0x40
1184 li $x50,0x50
1185 lvx_u $mask26,$x00,$const
1186 lvx_u $_26,$x10,$const
1187 lvx_u $_40,$x20,$const
1188 lvx_u $I2perm,$x30,$const
1189 lvx_u $padbits,$x40,$const
1190
1191 cmplwi r7,0 # is_base2_26?
1192 bne Lskip_init_vsx
1193
1194 lwz $h1,32($ctx) # load key base 2^32
1195 lwz $h2,36($ctx)
1196 lwz $h3,40($ctx)
1197 lwz $h4,44($ctx)
1198
1199 extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26
1200 extrwi $h1,$h1,6,0
1201 insrwi $h1,$h2,20,6
1202 extrwi $h2,$h2,12,0
1203 insrwi $h2,$h3,14,6
1204 extrwi $h3,$h3,18,0
1205 insrwi $h3,$h4,8,6
1206 extrwi $h4,$h4,24,0
1207
1208 mtvrwz $R0,$h0
1209 slwi $h0,$h1,2
1210 mtvrwz $R1,$h1
1211 add $h1,$h1,$h0
1212 mtvrwz $S1,$h1
1213 slwi $h1,$h2,2
1214 mtvrwz $R2,$h2
1215 add $h2,$h2,$h1
1216 mtvrwz $S2,$h2
1217 slwi $h2,$h3,2
1218 mtvrwz $R3,$h3
1219 add $h3,$h3,$h2
1220 mtvrwz $S3,$h3
1221 slwi $h3,$h4,2
1222 mtvrwz $R4,$h4
1223 add $h4,$h4,$h3
1224 mtvrwz $S4,$h4
1225
1226 vmr $H0,$R0
1227 vmr $H1,$R1
1228 vmr $H2,$R2
1229 vmr $H3,$R3
1230 vmr $H4,$R4
1231
1232 bl __poly1305_mul # r^1:- * r^1:-
1233
1234 vpermdi $R0,$H0,$R0,0b00
1235 vpermdi $R1,$H1,$R1,0b00
1236 vpermdi $R2,$H2,$R2,0b00
1237 vpermdi $R3,$H3,$R3,0b00
1238 vpermdi $R4,$H4,$R4,0b00
1239 vpermdi $H0,$H0,$H0,0b00
1240 vpermdi $H1,$H1,$H1,0b00
1241 vpermdi $H2,$H2,$H2,0b00
1242 vpermdi $H3,$H3,$H3,0b00
1243 vpermdi $H4,$H4,$H4,0b00
1244 vsld $S1,$R1,$T0 # <<2
1245 vsld $S2,$R2,$T0
1246 vsld $S3,$R3,$T0
1247 vsld $S4,$R4,$T0
1248 vaddudm $S1,$S1,$R1
1249 vaddudm $S2,$S2,$R2
1250 vaddudm $S3,$S3,$R3
1251 vaddudm $S4,$S4,$R4
1252
1253 bl __poly1305_mul # r^2:r^2 * r^2:r^1
1254
1255 addi $h0,$ctx,0x60
1256 lwz $h1,0($ctx) # load hash
1257 lwz $h2,4($ctx)
1258 lwz $h3,8($ctx)
1259 lwz $h4,12($ctx)
1260 lwz $t0,16($ctx)
1261
1262 vmrgow $R0,$R0,$H0 # r^2:r^4:r^1:r^3
1263 vmrgow $R1,$R1,$H1
1264 vmrgow $R2,$R2,$H2
1265 vmrgow $R3,$R3,$H3
1266 vmrgow $R4,$R4,$H4
1267 vslw $S1,$R1,$T0 # <<2
1268 vslw $S2,$R2,$T0
1269 vslw $S3,$R3,$T0
1270 vslw $S4,$R4,$T0
1271 vadduwm $S1,$S1,$R1
1272 vadduwm $S2,$S2,$R2
1273 vadduwm $S3,$S3,$R3
1274 vadduwm $S4,$S4,$R4
1275
1276 stvx_u $R0,$x30,$ctx
1277 stvx_u $R1,$x40,$ctx
1278 stvx_u $S1,$x50,$ctx
1279 stvx_u $R2,$x00,$h0
1280 stvx_u $S2,$x10,$h0
1281 stvx_u $R3,$x20,$h0
1282 stvx_u $S3,$x30,$h0
1283 stvx_u $R4,$x40,$h0
1284 stvx_u $S4,$x50,$h0
1285
1286 extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26
1287 extrwi $h1,$h1,6,0
1288 mtvrwz $H0,$h0
1289 insrwi $h1,$h2,20,6
1290 extrwi $h2,$h2,12,0
1291 mtvrwz $H1,$h1
1292 insrwi $h2,$h3,14,6
1293 extrwi $h3,$h3,18,0
1294 mtvrwz $H2,$h2
1295 insrwi $h3,$h4,8,6
1296 extrwi $h4,$h4,24,0
1297 mtvrwz $H3,$h3
1298 insrwi $h4,$t0,3,5
1299 mtvrwz $H4,$h4
9e58d119 1300___
a28e4890 1301 }
9e58d119 1302$code.=<<___;
a28e4890
AP
1303 li r0,1
1304 stw r0,24($ctx) # set is_base2_26
1305 b Loaded_vsx
1306
1307.align 4
1308Lskip_init_vsx:
1309 li $x10,4
1310 li $x20,8
1311 li $x30,12
1312 li $x40,16
1313 lvwzx_u $H0,$x00,$ctx
1314 lvwzx_u $H1,$x10,$ctx
1315 lvwzx_u $H2,$x20,$ctx
1316 lvwzx_u $H3,$x30,$ctx
1317 lvwzx_u $H4,$x40,$ctx
1318
1319Loaded_vsx:
1320 li $x10,0x10
1321 li $x20,0x20
1322 li $x30,0x30
1323 li $x40,0x40
1324 li $x50,0x50
1325 li $x60,0x60
1326 li $x70,0x70
1327 addi $ctx_,$ctx,64 # &ctx->r[1]
1328 addi $_ctx,$sp,`$LOCALS+15` # &ctx->r[1], r^2:r^4 shadow
1329
1330 vxor $T0,$T0,$T0 # ensure second half is zero
1331 vpermdi $H0,$H0,$T0,0b00
1332 vpermdi $H1,$H1,$T0,0b00
1333 vpermdi $H2,$H2,$T0,0b00
1334 vpermdi $H3,$H3,$T0,0b00
1335 vpermdi $H4,$H4,$T0,0b00
1336
1337 be?lvx_u $_4,$x50,$const # byte swap mask
1338 lvx_u $T1,$x00,$inp # load first input block
1339 lvx_u $T2,$x10,$inp
1340 lvx_u $T3,$x20,$inp
1341 lvx_u $T4,$x30,$inp
1342 be?vperm $T1,$T1,$T1,$_4
1343 be?vperm $T2,$T2,$T2,$_4
1344 be?vperm $T3,$T3,$T3,$_4
1345 be?vperm $T4,$T4,$T4,$_4
1346
1347 vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
1348 vspltisb $_4,4
1349 vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
1350 vspltisb $_14,14
1351 vpermdi $I3,$T1,$T2,0b11
1352
1353 vsrd $I1,$I0,$_26
1354 vsrd $I2,$I2,$_4
1355 vsrd $I4,$I3,$_40
1356 vsrd $I3,$I3,$_14
1357 vand $I0,$I0,$mask26
1358 vand $I1,$I1,$mask26
1359 vand $I2,$I2,$mask26
1360 vand $I3,$I3,$mask26
1361
1362 vpermdi $T1,$T3,$T4,0b00
1363 vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
1364 vpermdi $T3,$T3,$T4,0b11
1365
1366 vsrd $T0,$T1,$_26
1367 vsrd $T2,$T2,$_4
1368 vsrd $T4,$T3,$_40
1369 vsrd $T3,$T3,$_14
1370 vand $T1,$T1,$mask26
1371 vand $T0,$T0,$mask26
1372 vand $T2,$T2,$mask26
1373 vand $T3,$T3,$mask26
1374
1375 # inp[2]:inp[0]:inp[3]:inp[1]
1376 vmrgow $I4,$T4,$I4
1377 vmrgow $I0,$T1,$I0
1378 vmrgow $I1,$T0,$I1
1379 vmrgow $I2,$T2,$I2
1380 vmrgow $I3,$T3,$I3
1381 vor $I4,$I4,$padbits
1382
1383 lvx_splt $R0,$x30,$ctx # taking lvx_vsplt out of loop
1384 lvx_splt $R1,$x00,$ctx_ # gives ~8% improvement
1385 lvx_splt $S1,$x10,$ctx_
1386 lvx_splt $R2,$x20,$ctx_
1387 lvx_splt $S2,$x30,$ctx_
1388 lvx_splt $T1,$x40,$ctx_
1389 lvx_splt $T2,$x50,$ctx_
1390 lvx_splt $T3,$x60,$ctx_
1391 lvx_splt $T4,$x70,$ctx_
1392 stvx $R1,$x00,$_ctx
1393 stvx $S1,$x10,$_ctx
1394 stvx $R2,$x20,$_ctx
1395 stvx $S2,$x30,$_ctx
1396 stvx $T1,$x40,$_ctx
1397 stvx $T2,$x50,$_ctx
1398 stvx $T3,$x60,$_ctx
1399 stvx $T4,$x70,$_ctx
1400
1401 addi $inp,$inp,0x40
1402 addi $const,$const,0x50
1403 addi r0,$len,-64
1404 srdi r0,r0,6
1405 mtctr r0
1406 b Loop_vsx
1407
1408.align 4
1409Loop_vsx:
1410 ################################################################
1411 ## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
1412 ## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
1413 ## \___________________/
1414 ##
1415 ## Note that we start with inp[2:3]*r^2. This is because it
1416 ## doesn't depend on reduction in previous iteration.
1417 ################################################################
1418 ## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1419 ## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1420 ## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1421 ## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1422 ## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1423
1424 vmuleuw $ACC0,$I0,$R0
1425 vmuleuw $ACC1,$I0,$R1
1426 vmuleuw $ACC2,$I0,$R2
1427 vmuleuw $ACC3,$I1,$R2
1428
1429 vmuleuw $T0,$I1,$R0
1430 vaddudm $ACC1,$ACC1,$T0
1431 vmuleuw $T0,$I1,$R1
1432 vaddudm $ACC2,$ACC2,$T0
1433 vmuleuw $ACC4,$I2,$R2
1434 vmuleuw $T0,$I4,$S1
1435 vaddudm $ACC0,$ACC0,$T0
1436 vmuleuw $T0,$I2,$R1
1437 vaddudm $ACC3,$ACC3,$T0
1438 lvx $S3,$x50,$_ctx
1439 vmuleuw $T0,$I3,$R1
1440 vaddudm $ACC4,$ACC4,$T0
1441 lvx $R3,$x40,$_ctx
1442
1443 vaddudm $H2,$H2,$I2
1444 vaddudm $H0,$H0,$I0
1445 vaddudm $H3,$H3,$I3
1446 vaddudm $H1,$H1,$I1
1447 vaddudm $H4,$H4,$I4
1448
1449 vmuleuw $T0,$I3,$S2
1450 vaddudm $ACC0,$ACC0,$T0
1451 vmuleuw $T0,$I4,$S2
1452 vaddudm $ACC1,$ACC1,$T0
1453 vmuleuw $T0,$I2,$R0
1454 vaddudm $ACC2,$ACC2,$T0
1455 vmuleuw $T0,$I3,$R0
1456 vaddudm $ACC3,$ACC3,$T0
1457 lvx $S4,$x70,$_ctx
1458 vmuleuw $T0,$I4,$R0
1459 vaddudm $ACC4,$ACC4,$T0
1460 lvx $R4,$x60,$_ctx
1461
1462 vmuleuw $T0,$I2,$S3
1463 vaddudm $ACC0,$ACC0,$T0
1464 vmuleuw $T0,$I3,$S3
1465 vaddudm $ACC1,$ACC1,$T0
1466 vmuleuw $T0,$I4,$S3
1467 vaddudm $ACC2,$ACC2,$T0
1468 vmuleuw $T0,$I0,$R3
1469 vaddudm $ACC3,$ACC3,$T0
1470 vmuleuw $T0,$I1,$R3
1471 vaddudm $ACC4,$ACC4,$T0
1472
1473 be?lvx_u $_4,$x00,$const # byte swap mask
1474 lvx_u $T1,$x00,$inp # load next input block
1475 lvx_u $T2,$x10,$inp
1476 lvx_u $T3,$x20,$inp
1477 lvx_u $T4,$x30,$inp
1478 be?vperm $T1,$T1,$T1,$_4
1479 be?vperm $T2,$T2,$T2,$_4
1480 be?vperm $T3,$T3,$T3,$_4
1481 be?vperm $T4,$T4,$T4,$_4
1482
1483 vmuleuw $T0,$I1,$S4
1484 vaddudm $ACC0,$ACC0,$T0
1485 vmuleuw $T0,$I2,$S4
1486 vaddudm $ACC1,$ACC1,$T0
1487 vmuleuw $T0,$I3,$S4
1488 vaddudm $ACC2,$ACC2,$T0
1489 vmuleuw $T0,$I4,$S4
1490 vaddudm $ACC3,$ACC3,$T0
1491 vmuleuw $T0,$I0,$R4
1492 vaddudm $ACC4,$ACC4,$T0
1493
1494 vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
1495 vspltisb $_4,4
1496 vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
1497 vpermdi $I3,$T1,$T2,0b11
1498
1499 # (hash + inp[0:1]) * r^4
1500 vmulouw $T0,$H0,$R0
1501 vaddudm $ACC0,$ACC0,$T0
1502 vmulouw $T0,$H1,$R0
1503 vaddudm $ACC1,$ACC1,$T0
1504 vmulouw $T0,$H2,$R0
1505 vaddudm $ACC2,$ACC2,$T0
1506 vmulouw $T0,$H3,$R0
1507 vaddudm $ACC3,$ACC3,$T0
1508 vmulouw $T0,$H4,$R0
1509 vaddudm $ACC4,$ACC4,$T0
1510
1511 vpermdi $T1,$T3,$T4,0b00
1512 vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
1513 vpermdi $T3,$T3,$T4,0b11
1514
1515 vmulouw $T0,$H2,$S3
1516 vaddudm $ACC0,$ACC0,$T0
1517 vmulouw $T0,$H3,$S3
1518 vaddudm $ACC1,$ACC1,$T0
1519 vmulouw $T0,$H4,$S3
1520 vaddudm $ACC2,$ACC2,$T0
1521 vmulouw $T0,$H0,$R3
1522 vaddudm $ACC3,$ACC3,$T0
1523 lvx $S1,$x10,$_ctx
1524 vmulouw $T0,$H1,$R3
1525 vaddudm $ACC4,$ACC4,$T0
1526 lvx $R1,$x00,$_ctx
1527
1528 vsrd $I1,$I0,$_26
1529 vsrd $I2,$I2,$_4
1530 vsrd $I4,$I3,$_40
1531 vsrd $I3,$I3,$_14
1532
1533 vmulouw $T0,$H1,$S4
1534 vaddudm $ACC0,$ACC0,$T0
1535 vmulouw $T0,$H2,$S4
1536 vaddudm $ACC1,$ACC1,$T0
1537 vmulouw $T0,$H3,$S4
1538 vaddudm $ACC2,$ACC2,$T0
1539 vmulouw $T0,$H4,$S4
1540 vaddudm $ACC3,$ACC3,$T0
1541 lvx $S2,$x30,$_ctx
1542 vmulouw $T0,$H0,$R4
1543 vaddudm $ACC4,$ACC4,$T0
1544 lvx $R2,$x20,$_ctx
1545
1546 vand $I0,$I0,$mask26
1547 vand $I1,$I1,$mask26
1548 vand $I2,$I2,$mask26
1549 vand $I3,$I3,$mask26
1550
1551 vmulouw $T0,$H4,$S1
1552 vaddudm $ACC0,$ACC0,$T0
1553 vmulouw $T0,$H0,$R1
1554 vaddudm $ACC1,$ACC1,$T0
1555 vmulouw $T0,$H1,$R1
1556 vaddudm $ACC2,$ACC2,$T0
1557 vmulouw $T0,$H2,$R1
1558 vaddudm $ACC3,$ACC3,$T0
1559 vmulouw $T0,$H3,$R1
1560 vaddudm $ACC4,$ACC4,$T0
1561
1562 vsrd $T2,$T2,$_4
1563 vsrd $_4,$T1,$_26
1564 vsrd $T4,$T3,$_40
1565 vsrd $T3,$T3,$_14
1566
1567 vmulouw $T0,$H3,$S2
1568 vaddudm $ACC0,$ACC0,$T0
1569 vmulouw $T0,$H4,$S2
1570 vaddudm $ACC1,$ACC1,$T0
1571 vmulouw $T0,$H0,$R2
1572 vaddudm $ACC2,$ACC2,$T0
1573 vmulouw $T0,$H1,$R2
1574 vaddudm $ACC3,$ACC3,$T0
1575 vmulouw $T0,$H2,$R2
1576 vaddudm $ACC4,$ACC4,$T0
1577
1578 vand $T1,$T1,$mask26
1579 vand $_4,$_4,$mask26
1580 vand $T2,$T2,$mask26
1581 vand $T3,$T3,$mask26
1582
1583 ################################################################
1584 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1585 # and P. Schwabe
1586
1587 vspltisb $T0,2
1588 vsrd $H4,$ACC3,$_26
1589 vsrd $H1,$ACC0,$_26
1590 vand $H3,$ACC3,$mask26
1591 vand $H0,$ACC0,$mask26
1592 vaddudm $H4,$H4,$ACC4 # h3 -> h4
1593 vaddudm $H1,$H1,$ACC1 # h0 -> h1
1594
1595 vmrgow $I4,$T4,$I4
1596 vmrgow $I0,$T1,$I0
1597 vmrgow $I1,$_4,$I1
1598 vmrgow $I2,$T2,$I2
1599 vmrgow $I3,$T3,$I3
1600 vor $I4,$I4,$padbits
1601
1602 vsrd $ACC4,$H4,$_26
1603 vsrd $ACC1,$H1,$_26
1604 vand $H4,$H4,$mask26
1605 vand $H1,$H1,$mask26
1606 vaddudm $H0,$H0,$ACC4
1607 vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
1608
1609 vsld $ACC4,$ACC4,$T0 # <<2
1610 vsrd $ACC2,$H2,$_26
1611 vand $H2,$H2,$mask26
1612 vaddudm $H0,$H0,$ACC4 # h4 -> h0
1613 vaddudm $H3,$H3,$ACC2 # h2 -> h3
1614
1615 vsrd $ACC0,$H0,$_26
1616 vsrd $ACC3,$H3,$_26
1617 vand $H0,$H0,$mask26
1618 vand $H3,$H3,$mask26
1619 vaddudm $H1,$H1,$ACC0 # h0 -> h1
1620 vaddudm $H4,$H4,$ACC3 # h3 -> h4
1621
1622 addi $inp,$inp,0x40
1623 bdnz Loop_vsx
1624
1625 neg $len,$len
1626 andi. $len,$len,0x30
1627 sub $inp,$inp,$len
1628
1629 lvx_u $R0,$x30,$ctx # load all powers
1630 lvx_u $R1,$x00,$ctx_
1631 lvx_u $S1,$x10,$ctx_
1632 lvx_u $R2,$x20,$ctx_
1633 lvx_u $S2,$x30,$ctx_
1634
1635Last_vsx:
1636 vmuleuw $ACC0,$I0,$R0
1637 vmuleuw $ACC1,$I1,$R0
1638 vmuleuw $ACC2,$I2,$R0
1639 vmuleuw $ACC3,$I3,$R0
1640 vmuleuw $ACC4,$I4,$R0
1641
1642 vmuleuw $T0,$I4,$S1
1643 vaddudm $ACC0,$ACC0,$T0
1644 vmuleuw $T0,$I0,$R1
1645 vaddudm $ACC1,$ACC1,$T0
1646 vmuleuw $T0,$I1,$R1
1647 vaddudm $ACC2,$ACC2,$T0
1648 vmuleuw $T0,$I2,$R1
1649 vaddudm $ACC3,$ACC3,$T0
1650 lvx_u $S3,$x50,$ctx_
1651 vmuleuw $T0,$I3,$R1
1652 vaddudm $ACC4,$ACC4,$T0
1653 lvx_u $R3,$x40,$ctx_
1654
1655 vaddudm $H2,$H2,$I2
1656 vaddudm $H0,$H0,$I0
1657 vaddudm $H3,$H3,$I3
1658 vaddudm $H1,$H1,$I1
1659 vaddudm $H4,$H4,$I4
1660
1661 vmuleuw $T0,$I3,$S2
1662 vaddudm $ACC0,$ACC0,$T0
1663 vmuleuw $T0,$I4,$S2
1664 vaddudm $ACC1,$ACC1,$T0
1665 vmuleuw $T0,$I0,$R2
1666 vaddudm $ACC2,$ACC2,$T0
1667 vmuleuw $T0,$I1,$R2
1668 vaddudm $ACC3,$ACC3,$T0
1669 lvx_u $S4,$x70,$ctx_
1670 vmuleuw $T0,$I2,$R2
1671 vaddudm $ACC4,$ACC4,$T0
1672 lvx_u $R4,$x60,$ctx_
1673
1674 vmuleuw $T0,$I2,$S3
1675 vaddudm $ACC0,$ACC0,$T0
1676 vmuleuw $T0,$I3,$S3
1677 vaddudm $ACC1,$ACC1,$T0
1678 vmuleuw $T0,$I4,$S3
1679 vaddudm $ACC2,$ACC2,$T0
1680 vmuleuw $T0,$I0,$R3
1681 vaddudm $ACC3,$ACC3,$T0
1682 vmuleuw $T0,$I1,$R3
1683 vaddudm $ACC4,$ACC4,$T0
1684
1685 vmuleuw $T0,$I1,$S4
1686 vaddudm $ACC0,$ACC0,$T0
1687 vmuleuw $T0,$I2,$S4
1688 vaddudm $ACC1,$ACC1,$T0
1689 vmuleuw $T0,$I3,$S4
1690 vaddudm $ACC2,$ACC2,$T0
1691 vmuleuw $T0,$I4,$S4
1692 vaddudm $ACC3,$ACC3,$T0
1693 vmuleuw $T0,$I0,$R4
1694 vaddudm $ACC4,$ACC4,$T0
1695
1696 # (hash + inp[0:1]) * r^4
1697 vmulouw $T0,$H0,$R0
1698 vaddudm $ACC0,$ACC0,$T0
1699 vmulouw $T0,$H1,$R0
1700 vaddudm $ACC1,$ACC1,$T0
1701 vmulouw $T0,$H2,$R0
1702 vaddudm $ACC2,$ACC2,$T0
1703 vmulouw $T0,$H3,$R0
1704 vaddudm $ACC3,$ACC3,$T0
1705 vmulouw $T0,$H4,$R0
1706 vaddudm $ACC4,$ACC4,$T0
1707
1708 vmulouw $T0,$H2,$S3
1709 vaddudm $ACC0,$ACC0,$T0
1710 vmulouw $T0,$H3,$S3
1711 vaddudm $ACC1,$ACC1,$T0
1712 vmulouw $T0,$H4,$S3
1713 vaddudm $ACC2,$ACC2,$T0
1714 vmulouw $T0,$H0,$R3
1715 vaddudm $ACC3,$ACC3,$T0
1716 lvx_u $S1,$x10,$ctx_
1717 vmulouw $T0,$H1,$R3
1718 vaddudm $ACC4,$ACC4,$T0
1719 lvx_u $R1,$x00,$ctx_
1720
1721 vmulouw $T0,$H1,$S4
1722 vaddudm $ACC0,$ACC0,$T0
1723 vmulouw $T0,$H2,$S4
1724 vaddudm $ACC1,$ACC1,$T0
1725 vmulouw $T0,$H3,$S4
1726 vaddudm $ACC2,$ACC2,$T0
1727 vmulouw $T0,$H4,$S4
1728 vaddudm $ACC3,$ACC3,$T0
1729 lvx_u $S2,$x30,$ctx_
1730 vmulouw $T0,$H0,$R4
1731 vaddudm $ACC4,$ACC4,$T0
1732 lvx_u $R2,$x20,$ctx_
1733
1734 vmulouw $T0,$H4,$S1
1735 vaddudm $ACC0,$ACC0,$T0
1736 vmulouw $T0,$H0,$R1
1737 vaddudm $ACC1,$ACC1,$T0
1738 vmulouw $T0,$H1,$R1
1739 vaddudm $ACC2,$ACC2,$T0
1740 vmulouw $T0,$H2,$R1
1741 vaddudm $ACC3,$ACC3,$T0
1742 vmulouw $T0,$H3,$R1
1743 vaddudm $ACC4,$ACC4,$T0
1744
1745 vmulouw $T0,$H3,$S2
1746 vaddudm $ACC0,$ACC0,$T0
1747 vmulouw $T0,$H4,$S2
1748 vaddudm $ACC1,$ACC1,$T0
1749 vmulouw $T0,$H0,$R2
1750 vaddudm $ACC2,$ACC2,$T0
1751 vmulouw $T0,$H1,$R2
1752 vaddudm $ACC3,$ACC3,$T0
1753 vmulouw $T0,$H2,$R2
1754 vaddudm $ACC4,$ACC4,$T0
1755
1756 ################################################################
1757 # horizontal addition
1758
1759 vpermdi $H0,$ACC0,$ACC0,0b10
1760 vpermdi $H1,$ACC1,$ACC1,0b10
1761 vpermdi $H2,$ACC2,$ACC2,0b10
1762 vpermdi $H3,$ACC3,$ACC3,0b10
1763 vpermdi $H4,$ACC4,$ACC4,0b10
1764 vaddudm $ACC0,$ACC0,$H0
1765 vaddudm $ACC1,$ACC1,$H1
1766 vaddudm $ACC2,$ACC2,$H2
1767 vaddudm $ACC3,$ACC3,$H3
1768 vaddudm $ACC4,$ACC4,$H4
1769
1770 ################################################################
1771 # lazy reduction
1772
1773 vspltisb $T0,2
1774 vsrd $H4,$ACC3,$_26
1775 vsrd $H1,$ACC0,$_26
1776 vand $H3,$ACC3,$mask26
1777 vand $H0,$ACC0,$mask26
1778 vaddudm $H4,$H4,$ACC4 # h3 -> h4
1779 vaddudm $H1,$H1,$ACC1 # h0 -> h1
1780
1781 vsrd $ACC4,$H4,$_26
1782 vsrd $ACC1,$H1,$_26
1783 vand $H4,$H4,$mask26
1784 vand $H1,$H1,$mask26
1785 vaddudm $H0,$H0,$ACC4
1786 vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
1787
1788 vsld $ACC4,$ACC4,$T0 # <<2
1789 vsrd $ACC2,$H2,$_26
1790 vand $H2,$H2,$mask26
1791 vaddudm $H0,$H0,$ACC4 # h4 -> h0
1792 vaddudm $H3,$H3,$ACC2 # h2 -> h3
1793
1794 vsrd $ACC0,$H0,$_26
1795 vsrd $ACC3,$H3,$_26
1796 vand $H0,$H0,$mask26
1797 vand $H3,$H3,$mask26
1798 vaddudm $H1,$H1,$ACC0 # h0 -> h1
1799 vaddudm $H4,$H4,$ACC3 # h3 -> h4
1800
1801 beq Ldone_vsx
1802
1803 add r6,$const,$len
1804
1805 be?lvx_u $_4,$x00,$const # byte swap mask
1806 lvx_u $T1,$x00,$inp # load last partial input block
1807 lvx_u $T2,$x10,$inp
1808 lvx_u $T3,$x20,$inp
1809 lvx_u $T4,$x30,$inp
1810 be?vperm $T1,$T1,$T1,$_4
1811 be?vperm $T2,$T2,$T2,$_4
1812 be?vperm $T3,$T3,$T3,$_4
1813 be?vperm $T4,$T4,$T4,$_4
1814
1815 vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
1816 vspltisb $_4,4
1817 vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
1818 vpermdi $I3,$T1,$T2,0b11
1819
1820 vsrd $I1,$I0,$_26
1821 vsrd $I2,$I2,$_4
1822 vsrd $I4,$I3,$_40
1823 vsrd $I3,$I3,$_14
1824 vand $I0,$I0,$mask26
1825 vand $I1,$I1,$mask26
1826 vand $I2,$I2,$mask26
1827 vand $I3,$I3,$mask26
1828
1829 vpermdi $T0,$T3,$T4,0b00
1830 vperm $T1,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
1831 vpermdi $T2,$T3,$T4,0b11
1832
1833 lvx_u $ACC0,$x00,r6
1834 lvx_u $ACC1,$x30,r6
1835
1836 vsrd $T3,$T0,$_26
1837 vsrd $T1,$T1,$_4
1838 vsrd $T4,$T2,$_40
1839 vsrd $T2,$T2,$_14
1840 vand $T0,$T0,$mask26
1841 vand $T3,$T3,$mask26
1842 vand $T1,$T1,$mask26
1843 vand $T2,$T2,$mask26
1844
1845 # inp[2]:inp[0]:inp[3]:inp[1]
1846 vmrgow $I4,$T4,$I4
1847 vmrgow $I0,$T0,$I0
1848 vmrgow $I1,$T3,$I1
1849 vmrgow $I2,$T1,$I2
1850 vmrgow $I3,$T2,$I3
1851 vor $I4,$I4,$padbits
1852
1853 vperm $H0,$H0,$H0,$ACC0 # move hash to right lane
1854 vand $I0,$I0, $ACC1 # mask redundant input lane[s]
1855 vperm $H1,$H1,$H1,$ACC0
1856 vand $I1,$I1, $ACC1
1857 vperm $H2,$H2,$H2,$ACC0
1858 vand $I2,$I2, $ACC1
1859 vperm $H3,$H3,$H3,$ACC0
1860 vand $I3,$I3, $ACC1
1861 vperm $H4,$H4,$H4,$ACC0
1862 vand $I4,$I4, $ACC1
1863
1864 vaddudm $I0,$I0,$H0 # accumulate hash
1865 vxor $H0,$H0,$H0 # wipe hash value
1866 vaddudm $I1,$I1,$H1
1867 vxor $H1,$H1,$H1
1868 vaddudm $I2,$I2,$H2
1869 vxor $H2,$H2,$H2
1870 vaddudm $I3,$I3,$H3
1871 vxor $H3,$H3,$H3
1872 vaddudm $I4,$I4,$H4
1873 vxor $H4,$H4,$H4
1874
1875 xor. $len,$len,$len
1876 b Last_vsx
1877
1878.align 4
1879Ldone_vsx:
1880 $POP r0,`$VSXFRAME+$LRSAVE`($sp)
1881 li $x10,4
1882 li $x20,8
1883 li $x30,12
1884 li $x40,16
1885 stvwx_u $H0,$x00,$ctx # store hash
1886 stvwx_u $H1,$x10,$ctx
1887 stvwx_u $H2,$x20,$ctx
1888 stvwx_u $H3,$x30,$ctx
1889 stvwx_u $H4,$x40,$ctx
1890
1891 lwz r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# pull vrsave
1892 mtlr r0
1893 li r10,`15+$LOCALS+128`
1894 li r11,`31+$LOCALS+128`
1895 mtspr 256,r12 # restore vrsave
1896 lvx v20,r10,$sp
1897 addi r10,r10,32
1898 lvx v21,r10,$sp
1899 addi r10,r10,32
1900 lvx v22,r11,$sp
1901 addi r11,r11,32
1902 lvx v23,r10,$sp
1903 addi r10,r10,32
1904 lvx v24,r11,$sp
1905 addi r11,r11,32
1906 lvx v25,r10,$sp
1907 addi r10,r10,32
1908 lvx v26,r11,$sp
1909 addi r11,r11,32
1910 lvx v27,r10,$sp
1911 addi r10,r10,32
1912 lvx v28,r11,$sp
1913 addi r11,r11,32
1914 lvx v29,r10,$sp
1915 addi r10,r10,32
1916 lvx v30,r11,$sp
1917 lvx v31,r10,$sp
1918 $POP r27,`$VSXFRAME-$SIZE_T*5`($sp)
1919 $POP r28,`$VSXFRAME-$SIZE_T*4`($sp)
1920 $POP r29,`$VSXFRAME-$SIZE_T*3`($sp)
1921 $POP r30,`$VSXFRAME-$SIZE_T*2`($sp)
1922 $POP r31,`$VSXFRAME-$SIZE_T*1`($sp)
1923 addi $sp,$sp,$VSXFRAME
9e58d119
AP
1924 blr
1925 .long 0
a28e4890
AP
1926 .byte 0,12,0x04,1,0x80,5,4,0
1927 .long 0
1928.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx
1929
# LPICmeup: position-independent lookup of the constant table that follows.
# The caller's link register is parked in r0, bcl branches-and-links to the
# very next instruction so that LR holds that instruction's address (which is
# the label + 8), and the addi turns it into the address of the first table
# entry.  The six instructions plus the trailing .long/.byte words occupy
# 9*4 bytes and .space pads the routine to 64 bytes, hence the 64-8 below.
1930.align 6
1931LPICmeup:
1932 mflr r0
1933 bcl 20,31,\$+4
1934 mflr $const # vvvvvv "distance" between . and 1st data entry
1935 addi $const,$const,`64-8`
1936 mtlr r0
1937 blr
1938 .long 0
1939 .byte 0,12,0x14,0,0,0,0,0
1940 .space `64-9*4`
1941
# Constant table, one 16-byte row per entry, addressed by byte offset from
# the pointer computed above: 0x00 mask26, 0x10 _26, 0x20 _40, 0x30 I2perm,
# 0x40 padbits, 0x50 byte-swap mask (only loaded on big-endian builds).
1942.quad 0x0000000003ffffff,0x0000000003ffffff # mask26
1943.quad 0x000000000000001a,0x000000000000001a # _26
1944.quad 0x0000000000000028,0x0000000000000028 # _40
1945.quad 0x000000000e0f0001,0x000000001e1f1011 # I2perm
1946.quad 0x0100000001000000,0x0100000001000000 # padbits
1947.quad 0x0706050403020100,0x0f0e0d0c0b0a0908 # byte swap for big-endian
1948
# Tail handling: after the main VSX loop the table pointer has been advanced
# by 0x50 and the residual length (0x10, 0x20 or 0x30) is added to it, so the
# rows below are selected by how much input is left.  The first group of
# three rows is used as a vperm control that moves the hash to the right
# lane; the second group (0x30 further on) is used as an AND mask that
# clears the redundant input lanes.
1949.quad 0x0000000000000000,0x0000000004050607 # magic tail masks
1950.quad 0x0405060700000000,0x0000000000000000
1951.quad 0x0000000000000000,0x0405060700000000
1952
1953.quad 0xffffffff00000000,0xffffffffffffffff
1954.quad 0xffffffff00000000,0xffffffff00000000
1955.quad 0x0000000000000000,0xffffffff00000000
9e58d119 1956___
a28e4890 1957}}}
# Identification string embedded in the generated object file.
$code.=<<___;
.asciz "Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"
___

# Post-process the accumulated assembly template one line at a time and
# write the result to STDOUT (which the build normally has pointed at the
# assembler translator or the output file).
foreach (split("\n",$code)) {
    # Fold generation-time arithmetic: anything written between backticks
    # in the template is evaluated as a Perl expression and replaced by
    # its value.  The expressions come from this script itself, never from
    # external input.
    s/\`([^\`]*)\`/eval($1)/ge;

    # instructions prefixed with '?' are endian-specific and need
    # to be adjusted accordingly...
    if ($flavour !~ /le$/) {    # big-endian
        # keep be?-tagged instructions, comment out le?-tagged ones
        s/be\?//                or
        s/le\?/#le#/
    } else {                    # little-endian
        # keep le?-tagged instructions, comment out be?-tagged ones
        s/le\?//                or
        s/be\?/#be#/
    }

    print $_,"\n";
}

# Output is buffered, so a failed write (disk full, broken pipe to the
# translator, ...) may only be reported when the handle is closed.  Fail
# loudly instead of leaving a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";