]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
b6461792 | 2 | # Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
49d3b641 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
9e58d119 AP |
9 | # |
10 | # ==================================================================== | |
a28e4890 AP |
11 | # Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL |
12 | # project. The module is dual licensed under OpenSSL and CRYPTOGAMS | |
13 | # licenses depending on where you obtain it. For further details see | |
14 | # https://github.com/dot-asm/cryptogams/. | |
9e58d119 AP |
15 | # ==================================================================== |
16 | # | |
17 | # This module implements Poly1305 hash for PowerPC. | |
18 | # | |
19 | # June 2015 | |
20 | # | |
21 | # Numbers are cycles per processed byte with poly1305_blocks alone, | |
22 | # and improvement coefficients relative to gcc-generated code. | |
23 | # | |
24 | # -m32 -m64 | |
25 | # | |
26 | # Freescale e300 14.8/+80% - | |
4b8736a2 AP |
27 | # PPC74x0 7.60/+60% - |
28 | # PPC970 7.00/+114% 3.51/+205% | |
29 | # POWER7 3.75/+260% 1.93/+100% | |
30 | # POWER8 - 2.03/+200% | |
41013cd6 | 31 | # POWER9 - 2.00/+150% |
9e58d119 AP |
32 | # |
33 | # Do we need floating-point implementation for PPC? Results presented | |
34 | # in poly1305_ieee754.c are tricky to compare to, because they are for | |
35 | # compiler-generated code. On the other hand it's known that floating- | |
36 | # point performance can be dominated by FPU latency, which means that | |
37 | # there is a limit even for ideally optimized (and even vectorized) code. | |
38 | # And this limit is estimated to be higher than the -m64 results above. Or | |
39 | # in other words floating-point implementation can be meaningful to | |
40 | # consider only in 32-bit application context. We probably have to | |
41 | # recognize that 32-bit builds are getting less popular on high-end | |
42 | # systems and therefore tend to target embedded ones, which might not | |
43 | # even have FPU... | |
44 | # | |
45 | # On a side note, Power ISA 2.07 enables a vector base 2^26 implementation, | |
46 | # and POWER8 might have capacity to break 1.0 cycle per byte barrier... | |
a28e4890 AP |
47 | # |
48 | # January 2019 | |
49 | # | |
50 | # ... Unfortunately not:-( Estimate was a projection of ARM result, | |
51 | # but ARM has a vector multiply-n-add instruction, while PowerISA does | |
52 | # not, at least not one usable in this context. Improvement is ~40% over -m64 | |
53 | # result above and is ~1.43 on little-endian systems. | |
9e58d119 | 54 | |
1aa89a7a RL |
55 | # $output is the last argument if it looks like a file (it has an extension) |
56 | # $flavour is the first argument if it doesn't look like a file | |
57 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
58 | $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | |
9e58d119 AP |
59 | |
60 | if ($flavour =~ /64/) { # 64-bit flavour: 8-byte slots, doubleword mnemonics | |
61 | $SIZE_T =8; # bytes per stack slot; scales $FRAME and save-area offsets | |
62 | $LRSAVE =2*$SIZE_T; # offset past $FRAME at which the mflr'd link register is stored | |
63 | $UCMP ="cmpld"; # compare mnemonic used for the NULL-key check in init | |
64 | $STU ="stdu"; # store-with-update mnemonic that allocates the stack frame | |
65 | $POP ="ld"; # mnemonic that reloads a saved register from the frame | |
66 | $PUSH ="std"; # mnemonic that saves a register into the frame | |
67 | } elsif ($flavour =~ /32/) { # 32-bit flavour: 4-byte slots, word mnemonics | |
68 | $SIZE_T =4; # bytes per stack slot; scales $FRAME and save-area offsets | |
69 | $LRSAVE =$SIZE_T; # offset past $FRAME at which the mflr'd link register is stored | |
70 | $UCMP ="cmplw"; # compare mnemonic used for the NULL-key check in init | |
71 | $STU ="stwu"; # store-with-update mnemonic that allocates the stack frame | |
72 | $POP ="lwz"; # mnemonic that reloads a saved register from the frame | |
73 | $PUSH ="stw"; # mnemonic that saves a register into the frame | |
74 | } else { die "nonsense $flavour"; } # flavour must contain "64" or "32" | |
75 | ||
60250017 | 76 | # Define endianness based on flavour |
9e58d119 AP |
77 | # i.e.: linux64le |
78 | $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; | |
79 | ||
80 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
81 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
82 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
83 | die "can't locate ppc-xlate.pl"; | |
84 | ||
1aa89a7a RL |
85 | open STDOUT,"| $^X $xlate $flavour \"$output\"" |
86 | or die "can't call $xlate: $!"; | |
9e58d119 AP |
87 | |
88 | $FRAME=24*$SIZE_T; | |
89 | ||
90 | $sp="r1"; | |
91 | my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6)); | |
92 | my ($mac,$nonce)=($inp,$len); | |
93 | my $mask = "r0"; | |
94 | ||
95 | $code=<<___; | |
96 | .machine "any" | |
97 | .text | |
98 | ___ | |
99 | if ($flavour =~ /64/) { | |
100 | ############################################################################### | |
101 | # base 2^64 implementation | |
102 | ||
103 | my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31)); | |
104 | ||
105 | $code.=<<___; | |
106 | .globl .poly1305_init_int | |
107 | .align 4 | |
108 | .poly1305_init_int: | |
109 | xor r0,r0,r0 | |
110 | std r0,0($ctx) # zero hash value | |
111 | std r0,8($ctx) | |
112 | std r0,16($ctx) | |
a28e4890 | 113 | stw r0,24($ctx) # clear is_base2_26 |
9e58d119 AP |
114 | |
115 | $UCMP $inp,r0 | |
116 | beq- Lno_key | |
117 | ___ | |
118 | $code.=<<___ if ($LITTLE_ENDIAN); | |
119 | ld $d0,0($inp) # load key material | |
120 | ld $d1,8($inp) | |
121 | ___ | |
122 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
123 | li $h0,4 | |
124 | lwbrx $d0,0,$inp # load key material | |
125 | li $d1,8 | |
126 | lwbrx $h0,$h0,$inp | |
127 | li $h1,12 | |
128 | lwbrx $d1,$d1,$inp | |
129 | lwbrx $h1,$h1,$inp | |
130 | insrdi $d0,$h0,32,0 | |
131 | insrdi $d1,$h1,32,0 | |
132 | ___ | |
133 | $code.=<<___; | |
134 | lis $h1,0xfff # 0x0fff0000 | |
135 | ori $h1,$h1,0xfffc # 0x0ffffffc | |
136 | insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc | |
137 | ori $h0,$h1,3 # 0x0ffffffc0fffffff | |
138 | ||
139 | and $d0,$d0,$h0 | |
140 | and $d1,$d1,$h1 | |
141 | ||
142 | std $d0,32($ctx) # store key | |
143 | std $d1,40($ctx) | |
144 | ||
145 | Lno_key: | |
146 | xor r3,r3,r3 | |
147 | blr | |
148 | .long 0 | |
149 | .byte 0,12,0x14,0,0,0,2,0 | |
150 | .size .poly1305_init_int,.-.poly1305_init_int | |
151 | ||
152 | .globl .poly1305_blocks | |
153 | .align 4 | |
154 | .poly1305_blocks: | |
a28e4890 | 155 | Lpoly1305_blocks: |
9e58d119 AP |
156 | srdi. $len,$len,4 |
157 | beq- Labort | |
158 | ||
159 | $STU $sp,-$FRAME($sp) | |
160 | mflr r0 | |
161 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) | |
162 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
163 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
164 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
165 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
166 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | |
167 | ||
168 | ld $r0,32($ctx) # load key | |
169 | ld $r1,40($ctx) | |
170 | ||
171 | ld $h0,0($ctx) # load hash value | |
172 | ld $h1,8($ctx) | |
173 | ld $h2,16($ctx) | |
174 | ||
175 | srdi $s1,$r1,2 | |
176 | mtctr $len | |
177 | add $s1,$s1,$r1 # s1 = r1 + r1>>2 | |
178 | li $mask,3 | |
179 | b Loop | |
180 | ||
181 | .align 4 | |
182 | Loop: | |
183 | ___ | |
184 | $code.=<<___ if ($LITTLE_ENDIAN); | |
185 | ld $t0,0($inp) # load input | |
186 | ld $t1,8($inp) | |
187 | ___ | |
188 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
189 | li $d0,4 | |
190 | lwbrx $t0,0,$inp # load input | |
191 | li $t1,8 | |
192 | lwbrx $d0,$d0,$inp | |
193 | li $d1,12 | |
194 | lwbrx $t1,$t1,$inp | |
195 | lwbrx $d1,$d1,$inp | |
196 | insrdi $t0,$d0,32,0 | |
197 | insrdi $t1,$d1,32,0 | |
198 | ___ | |
199 | $code.=<<___; | |
200 | addi $inp,$inp,16 | |
201 | ||
202 | addc $h0,$h0,$t0 # accumulate input | |
203 | adde $h1,$h1,$t1 | |
204 | ||
205 | mulld $d0,$h0,$r0 # h0*r0 | |
206 | mulhdu $d1,$h0,$r0 | |
207 | adde $h2,$h2,$padbit | |
208 | ||
209 | mulld $t0,$h1,$s1 # h1*5*r1 | |
210 | mulhdu $t1,$h1,$s1 | |
211 | addc $d0,$d0,$t0 | |
212 | adde $d1,$d1,$t1 | |
213 | ||
214 | mulld $t0,$h0,$r1 # h0*r1 | |
215 | mulhdu $d2,$h0,$r1 | |
216 | addc $d1,$d1,$t0 | |
217 | addze $d2,$d2 | |
218 | ||
219 | mulld $t0,$h1,$r0 # h1*r0 | |
220 | mulhdu $t1,$h1,$r0 | |
221 | addc $d1,$d1,$t0 | |
222 | adde $d2,$d2,$t1 | |
223 | ||
224 | mulld $t0,$h2,$s1 # h2*5*r1 | |
225 | mulld $t1,$h2,$r0 # h2*r0 | |
226 | addc $d1,$d1,$t0 | |
227 | adde $d2,$d2,$t1 | |
228 | ||
229 | andc $t0,$d2,$mask # final reduction step | |
230 | and $h2,$d2,$mask | |
231 | srdi $t1,$t0,2 | |
232 | add $t0,$t0,$t1 | |
233 | addc $h0,$d0,$t0 | |
234 | addze $h1,$d1 | |
4b8736a2 | 235 | addze $h2,$h2 |
9e58d119 AP |
236 | |
237 | bdnz Loop | |
238 | ||
239 | std $h0,0($ctx) # store hash value | |
240 | std $h1,8($ctx) | |
241 | std $h2,16($ctx) | |
242 | ||
243 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | |
244 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
245 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
246 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
247 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
248 | addi $sp,$sp,$FRAME | |
249 | Labort: | |
250 | blr | |
251 | .long 0 | |
252 | .byte 0,12,4,1,0x80,5,4,0 | |
253 | .size .poly1305_blocks,.-.poly1305_blocks | |
a28e4890 AP |
254 | ___ |
255 | { | |
256 | my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12)); | |
9e58d119 | 257 | |
a28e4890 | 258 | $code.=<<___; |
9e58d119 | 259 | .globl .poly1305_emit |
a28e4890 | 260 | .align 5 |
9e58d119 | 261 | .poly1305_emit: |
a28e4890 AP |
262 | lwz $h0,0($ctx) # load hash value base 2^26 |
263 | lwz $h1,4($ctx) | |
264 | lwz $h2,8($ctx) | |
265 | lwz $h3,12($ctx) | |
266 | lwz $h4,16($ctx) | |
267 | lwz r0,24($ctx) # is_base2_26 | |
268 | ||
269 | sldi $h1,$h1,26 # base 2^26 -> base 2^64 | |
270 | sldi $t0,$h2,52 | |
271 | srdi $h2,$h2,12 | |
272 | sldi $h3,$h3,14 | |
273 | add $h0,$h0,$h1 | |
274 | addc $h0,$h0,$t0 | |
275 | sldi $t0,$h4,40 | |
276 | srdi $h4,$h4,24 | |
277 | adde $h1,$h2,$h3 | |
278 | addc $h1,$h1,$t0 | |
279 | addze $h2,$h4 | |
280 | ||
281 | ld $h3,0($ctx) # load hash value base 2^64 | |
282 | ld $h4,8($ctx) | |
283 | ld $t0,16($ctx) | |
284 | ||
285 | neg r0,r0 | |
286 | xor $h0,$h0,$h3 # choose between radixes | |
287 | xor $h1,$h1,$h4 | |
288 | xor $h2,$h2,$t0 | |
289 | and $h0,$h0,r0 | |
290 | and $h1,$h1,r0 | |
291 | and $h2,$h2,r0 | |
292 | xor $h0,$h0,$h3 | |
293 | xor $h1,$h1,$h4 | |
294 | xor $h2,$h2,$t0 | |
295 | ||
296 | addic $h3,$h0,5 # compare to modulus | |
297 | addze $h4,$h1 | |
298 | addze $t0,$h2 | |
299 | ||
300 | srdi $t0,$t0,2 # see if it carried/borrowed | |
301 | neg $t0,$t0 | |
302 | ||
303 | andc $h0,$h0,$t0 | |
304 | and $h3,$h3,$t0 | |
305 | andc $h1,$h1,$t0 | |
306 | and $h4,$h4,$t0 | |
307 | or $h0,$h0,$h3 | |
308 | or $h1,$h1,$h4 | |
309 | ||
310 | lwz $t0,4($nonce) | |
311 | lwz $h2,12($nonce) | |
312 | lwz $h3,0($nonce) | |
313 | lwz $h4,8($nonce) | |
314 | ||
315 | insrdi $h3,$t0,32,0 | |
316 | insrdi $h4,$h2,32,0 | |
317 | ||
318 | addc $h0,$h0,$h3 # accumulate nonce | |
319 | adde $h1,$h1,$h4 | |
320 | ||
321 | addi $ctx,$mac,-1 | |
322 | addi $mac,$mac,7 | |
323 | ||
324 | stbu $h0,1($ctx) # write [little-endian] result | |
325 | srdi $h0,$h0,8 | |
326 | stbu $h1,1($mac) | |
327 | srdi $h1,$h1,8 | |
328 | ||
329 | stbu $h0,1($ctx) | |
330 | srdi $h0,$h0,8 | |
331 | stbu $h1,1($mac) | |
332 | srdi $h1,$h1,8 | |
333 | ||
334 | stbu $h0,1($ctx) | |
335 | srdi $h0,$h0,8 | |
336 | stbu $h1,1($mac) | |
337 | srdi $h1,$h1,8 | |
338 | ||
339 | stbu $h0,1($ctx) | |
340 | srdi $h0,$h0,8 | |
341 | stbu $h1,1($mac) | |
342 | srdi $h1,$h1,8 | |
343 | ||
344 | stbu $h0,1($ctx) | |
345 | srdi $h0,$h0,8 | |
346 | stbu $h1,1($mac) | |
347 | srdi $h1,$h1,8 | |
348 | ||
349 | stbu $h0,1($ctx) | |
350 | srdi $h0,$h0,8 | |
351 | stbu $h1,1($mac) | |
352 | srdi $h1,$h1,8 | |
353 | ||
354 | stbu $h0,1($ctx) | |
355 | srdi $h0,$h0,8 | |
356 | stbu $h1,1($mac) | |
357 | srdi $h1,$h1,8 | |
358 | ||
359 | stbu $h0,1($ctx) | |
360 | stbu $h1,1($mac) | |
9e58d119 | 361 | |
9e58d119 AP |
362 | blr |
363 | .long 0 | |
364 | .byte 0,12,0x14,0,0,0,3,0 | |
365 | .size .poly1305_emit,.-.poly1305_emit | |
366 | ___ | |
a28e4890 | 367 | } } else { |
9e58d119 AP |
368 | ############################################################################### |
369 | # base 2^32 implementation | |
370 | ||
371 | my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3, | |
372 | $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3 | |
373 | ) = map("r$_",(7..12,14..31)); | |
374 | ||
375 | $code.=<<___; | |
376 | .globl .poly1305_init_int | |
377 | .align 4 | |
378 | .poly1305_init_int: | |
379 | xor r0,r0,r0 | |
380 | stw r0,0($ctx) # zero hash value | |
381 | stw r0,4($ctx) | |
382 | stw r0,8($ctx) | |
383 | stw r0,12($ctx) | |
384 | stw r0,16($ctx) | |
a28e4890 | 385 | stw r0,24($ctx) # clear is_base2_26 |
9e58d119 AP |
386 | |
387 | $UCMP $inp,r0 | |
388 | beq- Lno_key | |
389 | ___ | |
390 | $code.=<<___ if ($LITTLE_ENDIAN); # LE target: plain word loads, no lwbrx byte reversal needed | |
391 | lwz $h0,0($inp) # load key material | |
392 | lwz $h1,4($inp) | |
393 | lwz $h2,8($inp) | |
394 | lwz $h3,12($inp) | |
395 | ___ | |
396 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
397 | li $h1,4 | |
398 | lwbrx $h0,0,$inp # load key material | |
399 | li $h2,8 | |
400 | lwbrx $h1,$h1,$inp | |
401 | li $h3,12 | |
402 | lwbrx $h2,$h2,$inp | |
403 | lwbrx $h3,$h3,$inp | |
404 | ___ | |
405 | $code.=<<___; | |
406 | lis $mask,0xf000 # 0xf0000000 | |
407 | li $r0,-4 | |
408 | andc $r0,$r0,$mask # 0x0ffffffc | |
409 | ||
410 | andc $h0,$h0,$mask | |
411 | and $h1,$h1,$r0 | |
412 | and $h2,$h2,$r0 | |
413 | and $h3,$h3,$r0 | |
414 | ||
415 | stw $h0,32($ctx) # store key | |
416 | stw $h1,36($ctx) | |
417 | stw $h2,40($ctx) | |
418 | stw $h3,44($ctx) | |
419 | ||
420 | Lno_key: | |
421 | xor r3,r3,r3 | |
422 | blr | |
423 | .long 0 | |
424 | .byte 0,12,0x14,0,0,0,2,0 | |
425 | .size .poly1305_init_int,.-.poly1305_init_int | |
426 | ||
427 | .globl .poly1305_blocks | |
428 | .align 4 | |
429 | .poly1305_blocks: | |
a28e4890 | 430 | Lpoly1305_blocks: |
9e58d119 AP |
431 | srwi. $len,$len,4 |
432 | beq- Labort | |
433 | ||
434 | $STU $sp,-$FRAME($sp) | |
435 | mflr r0 | |
436 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | |
437 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | |
438 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | |
439 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | |
440 | $PUSH r18,`$FRAME-$SIZE_T*14`($sp) | |
441 | $PUSH r19,`$FRAME-$SIZE_T*13`($sp) | |
442 | $PUSH r20,`$FRAME-$SIZE_T*12`($sp) | |
443 | $PUSH r21,`$FRAME-$SIZE_T*11`($sp) | |
444 | $PUSH r22,`$FRAME-$SIZE_T*10`($sp) | |
445 | $PUSH r23,`$FRAME-$SIZE_T*9`($sp) | |
446 | $PUSH r24,`$FRAME-$SIZE_T*8`($sp) | |
447 | $PUSH r25,`$FRAME-$SIZE_T*7`($sp) | |
448 | $PUSH r26,`$FRAME-$SIZE_T*6`($sp) | |
449 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) | |
450 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
451 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
452 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
453 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
454 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | |
455 | ||
456 | lwz $r0,32($ctx) # load key | |
457 | lwz $r1,36($ctx) | |
458 | lwz $r2,40($ctx) | |
459 | lwz $r3,44($ctx) | |
460 | ||
461 | lwz $h0,0($ctx) # load hash value | |
462 | lwz $h1,4($ctx) | |
463 | lwz $h2,8($ctx) | |
464 | lwz $h3,12($ctx) | |
465 | lwz $h4,16($ctx) | |
466 | ||
467 | srwi $s1,$r1,2 | |
468 | srwi $s2,$r2,2 | |
469 | srwi $s3,$r3,2 | |
470 | add $s1,$s1,$r1 # si = ri + ri>>2 | |
471 | add $s2,$s2,$r2 | |
472 | add $s3,$s3,$r3 | |
473 | mtctr $len | |
474 | li $mask,3 | |
475 | b Loop | |
476 | ||
477 | .align 4 | |
478 | Loop: | |
479 | ___ | |
480 | $code.=<<___ if ($LITTLE_ENDIAN); | |
481 | lwz $d0,0($inp) # load input | |
482 | lwz $d1,4($inp) | |
483 | lwz $d2,8($inp) | |
484 | lwz $d3,12($inp) | |
485 | ___ | |
486 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
487 | li $d1,4 | |
488 | lwbrx $d0,0,$inp # load input | |
489 | li $d2,8 | |
490 | lwbrx $d1,$d1,$inp | |
491 | li $d3,12 | |
492 | lwbrx $d2,$d2,$inp | |
493 | lwbrx $d3,$d3,$inp | |
494 | ___ | |
495 | $code.=<<___; | |
496 | addi $inp,$inp,16 | |
497 | ||
498 | addc $h0,$h0,$d0 # accumulate input | |
499 | adde $h1,$h1,$d1 | |
500 | adde $h2,$h2,$d2 | |
501 | ||
502 | mullw $d0,$h0,$r0 # h0*r0 | |
503 | mulhwu $D0,$h0,$r0 | |
504 | ||
505 | mullw $d1,$h0,$r1 # h0*r1 | |
506 | mulhwu $D1,$h0,$r1 | |
507 | ||
508 | mullw $d2,$h0,$r2 # h0*r2 | |
509 | mulhwu $D2,$h0,$r2 | |
510 | ||
511 | adde $h3,$h3,$d3 | |
512 | adde $h4,$h4,$padbit | |
513 | ||
514 | mullw $d3,$h0,$r3 # h0*r3 | |
515 | mulhwu $D3,$h0,$r3 | |
516 | ||
517 | mullw $t0,$h1,$s3 # h1*s3 | |
518 | mulhwu $t1,$h1,$s3 | |
519 | ||
520 | mullw $t2,$h1,$r0 # h1*r0 | |
521 | mulhwu $t3,$h1,$r0 | |
522 | addc $d0,$d0,$t0 | |
523 | adde $D0,$D0,$t1 | |
524 | ||
525 | mullw $t0,$h1,$r1 # h1*r1 | |
526 | mulhwu $t1,$h1,$r1 | |
527 | addc $d1,$d1,$t2 | |
528 | adde $D1,$D1,$t3 | |
529 | ||
530 | mullw $t2,$h1,$r2 # h1*r2 | |
531 | mulhwu $t3,$h1,$r2 | |
532 | addc $d2,$d2,$t0 | |
533 | adde $D2,$D2,$t1 | |
534 | ||
535 | mullw $t0,$h2,$s2 # h2*s2 | |
536 | mulhwu $t1,$h2,$s2 | |
537 | addc $d3,$d3,$t2 | |
538 | adde $D3,$D3,$t3 | |
539 | ||
540 | mullw $t2,$h2,$s3 # h2*s3 | |
541 | mulhwu $t3,$h2,$s3 | |
542 | addc $d0,$d0,$t0 | |
543 | adde $D0,$D0,$t1 | |
544 | ||
545 | mullw $t0,$h2,$r0 # h2*r0 | |
546 | mulhwu $t1,$h2,$r0 | |
547 | addc $d1,$d1,$t2 | |
548 | adde $D1,$D1,$t3 | |
549 | ||
550 | mullw $t2,$h2,$r1 # h2*r1 | |
551 | mulhwu $t3,$h2,$r1 | |
552 | addc $d2,$d2,$t0 | |
553 | adde $D2,$D2,$t1 | |
554 | ||
555 | mullw $t0,$h3,$s1 # h3*s1 | |
556 | mulhwu $t1,$h3,$s1 | |
557 | addc $d3,$d3,$t2 | |
558 | adde $D3,$D3,$t3 | |
559 | ||
560 | mullw $t2,$h3,$s2 # h3*s2 | |
561 | mulhwu $t3,$h3,$s2 | |
562 | addc $d0,$d0,$t0 | |
563 | adde $D0,$D0,$t1 | |
564 | ||
565 | mullw $t0,$h3,$s3 # h3*s3 | |
566 | mulhwu $t1,$h3,$s3 | |
567 | addc $d1,$d1,$t2 | |
568 | adde $D1,$D1,$t3 | |
569 | ||
570 | mullw $t2,$h3,$r0 # h3*r0 | |
571 | mulhwu $t3,$h3,$r0 | |
572 | addc $d2,$d2,$t0 | |
573 | adde $D2,$D2,$t1 | |
574 | ||
575 | mullw $t0,$h4,$s1 # h4*s1 | |
576 | addc $d3,$d3,$t2 | |
577 | adde $D3,$D3,$t3 | |
578 | addc $d1,$d1,$t0 | |
579 | ||
580 | mullw $t1,$h4,$s2 # h4*s2 | |
581 | addze $D1,$D1 | |
582 | addc $d2,$d2,$t1 | |
583 | addze $D2,$D2 | |
584 | ||
585 | mullw $t2,$h4,$s3 # h4*s3 | |
586 | addc $d3,$d3,$t2 | |
587 | addze $D3,$D3 | |
588 | ||
589 | mullw $h4,$h4,$r0 # h4*r0 | |
590 | ||
591 | addc $h1,$d1,$D0 | |
592 | adde $h2,$d2,$D1 | |
593 | adde $h3,$d3,$D2 | |
594 | adde $h4,$h4,$D3 | |
595 | ||
596 | andc $D0,$h4,$mask # final reduction step | |
597 | and $h4,$h4,$mask | |
598 | srwi $D1,$D0,2 | |
599 | add $D0,$D0,$D1 | |
600 | addc $h0,$d0,$D0 | |
601 | addze $h1,$h1 | |
602 | addze $h2,$h2 | |
603 | addze $h3,$h3 | |
4b8736a2 | 604 | addze $h4,$h4 |
9e58d119 AP |
605 | |
606 | bdnz Loop | |
607 | ||
608 | stw $h0,0($ctx) # store hash value | |
609 | stw $h1,4($ctx) | |
610 | stw $h2,8($ctx) | |
611 | stw $h3,12($ctx) | |
612 | stw $h4,16($ctx) | |
613 | ||
614 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | |
615 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | |
616 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | |
617 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | |
618 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | |
619 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | |
620 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | |
621 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | |
622 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | |
623 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | |
624 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | |
625 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | |
626 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | |
627 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | |
628 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
629 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
630 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
631 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
632 | addi $sp,$sp,$FRAME | |
633 | Labort: | |
634 | blr | |
635 | .long 0 | |
636 | .byte 0,12,4,1,0x80,18,4,0 | |
637 | .size .poly1305_blocks,.-.poly1305_blocks | |
a28e4890 AP |
638 | ___ |
639 | { | |
640 | my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12)); | |
9e58d119 | 641 | |
a28e4890 | 642 | $code.=<<___; |
9e58d119 | 643 | .globl .poly1305_emit |
a28e4890 | 644 | .align 5 |
9e58d119 | 645 | .poly1305_emit: |
a28e4890 AP |
646 | lwz r0,24($ctx) # is_base2_26 |
647 | lwz $h0,0($ctx) # load hash value | |
648 | lwz $h1,4($ctx) | |
649 | lwz $h2,8($ctx) | |
650 | lwz $h3,12($ctx) | |
651 | lwz $h4,16($ctx) | |
652 | cmplwi r0,0 | |
653 | beq Lemit_base2_32 | |
654 | ||
655 | slwi $t0,$h1,26 # base 2^26 -> base 2^32 | |
656 | srwi $h1,$h1,6 | |
657 | slwi $t1,$h2,20 | |
658 | srwi $h2,$h2,12 | |
659 | addc $h0,$h0,$t0 | |
660 | slwi $t0,$h3,14 | |
661 | srwi $h3,$h3,18 | |
662 | adde $h1,$h1,$t1 | |
663 | slwi $t1,$h4,8 | |
664 | srwi $h4,$h4,24 | |
665 | adde $h2,$h2,$t0 | |
666 | adde $h3,$h3,$t1 | |
667 | addze $h4,$h4 | |
668 | ||
669 | Lemit_base2_32: | |
670 | addic r0,$h0,5 # compare to modulus | |
671 | addze r0,$h1 | |
672 | addze r0,$h2 | |
673 | addze r0,$h3 | |
674 | addze r0,$h4 | |
675 | ||
676 | srwi r0,r0,2 # see if it carried/borrowed | |
677 | neg r0,r0 | |
678 | andi. r0,r0,5 | |
679 | ||
680 | addc $h0,$h0,r0 | |
681 | lwz r0,0($nonce) | |
682 | addze $h1,$h1 | |
683 | lwz $t0,4($nonce) | |
684 | addze $h2,$h2 | |
685 | lwz $t1,8($nonce) | |
686 | addze $h3,$h3 | |
687 | lwz $h4,12($nonce) | |
688 | ||
689 | addc $h0,$h0,r0 # accumulate nonce | |
690 | adde $h1,$h1,$t0 | |
691 | adde $h2,$h2,$t1 | |
692 | adde $h3,$h3,$h4 | |
693 | ||
694 | addi $ctx,$mac,-1 | |
695 | addi $mac,$mac,7 | |
696 | ||
697 | stbu $h0,1($ctx) # write [little-endian] result | |
698 | srwi $h0,$h0,8 | |
699 | stbu $h2,1($mac) | |
700 | srwi $h2,$h2,8 | |
701 | ||
702 | stbu $h0,1($ctx) | |
703 | srwi $h0,$h0,8 | |
704 | stbu $h2,1($mac) | |
705 | srwi $h2,$h2,8 | |
706 | ||
707 | stbu $h0,1($ctx) | |
708 | srwi $h0,$h0,8 | |
709 | stbu $h2,1($mac) | |
710 | srwi $h2,$h2,8 | |
711 | ||
712 | stbu $h0,1($ctx) | |
713 | stbu $h2,1($mac) | |
714 | ||
715 | stbu $h1,1($ctx) | |
716 | srwi $h1,$h1,8 | |
717 | stbu $h3,1($mac) | |
718 | srwi $h3,$h3,8 | |
719 | ||
720 | stbu $h1,1($ctx) | |
721 | srwi $h1,$h1,8 | |
722 | stbu $h3,1($mac) | |
723 | srwi $h3,$h3,8 | |
724 | ||
725 | stbu $h1,1($ctx) | |
726 | srwi $h1,$h1,8 | |
727 | stbu $h3,1($mac) | |
728 | srwi $h3,$h3,8 | |
729 | ||
730 | stbu $h1,1($ctx) | |
731 | stbu $h3,1($mac) | |
732 | ||
733 | blr | |
734 | .long 0 | |
735 | .byte 0,12,0x14,0,0,0,3,0 | |
736 | .size .poly1305_emit,.-.poly1305_emit | |
737 | ___ | |
738 | } } | |
739 | {{{ | |
740 | ######################################################################## | |
741 | # PowerISA 2.07/VSX section # | |
742 | ######################################################################## | |
743 | ||
744 | my $LOCALS= 6*$SIZE_T; | |
745 | my $VSXFRAME = $LOCALS + 6*$SIZE_T; | |
746 | $VSXFRAME += 128; # local variables | |
8d847a3f | 747 | $VSXFRAME += 12*16; # v20-v31 offload |
a28e4890 AP |
748 | |
749 | my $BIG_ENDIAN = $LITTLE_ENDIAN ? 0 : 4; # derived from $LITTLE_ENDIAN so scalar and VSX code agree on endianness | |
750 | ||
751 | ######################################################################## | |
752 | # Layout of the opaque area is as follows: | |
753 | # | |
754 | # unsigned __int32 h[5]; # current hash value base 2^26 | |
755 | # unsigned __int32 pad; | |
756 | # unsigned __int32 is_base2_26, pad; | |
757 | # unsigned __int64 r[2]; # key value base 2^64 | |
758 | # struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9]; | |
759 | # | |
760 | # where r^n are base 2^26 digits of powers of the multiplier key. There are | |
761 | # 5 digits, but the last four are interleaved with their multiples of 5, | |
762 | # totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. The order | |
763 | # of powers is as they appear in a register, not in memory. | |
764 | ||
765 | my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4)); | |
766 | my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9)); | |
767 | my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14)); | |
768 | my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2); | |
769 | my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19)); | |
770 | my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24)); | |
771 | my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31)); | |
772 | my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31))); | |
773 | my ($ctx_,$_ctx,$const) = map("r$_",(10..12)); | |
774 | ||
775 | if ($flavour =~ /64/) { | |
776 | ############################################################################### | |
777 | # setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms, | |
778 | # but the base 2^26 computational part is same... | |
779 | ||
780 | my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31)); | |
781 | my $mask = "r0"; | |
782 | ||
783 | $code.=<<___; | |
784 | .globl .poly1305_blocks_vsx | |
785 | .align 5 | |
786 | .poly1305_blocks_vsx: | |
787 | lwz r7,24($ctx) # is_base2_26 | |
788 | cmpldi $len,128 | |
789 | bge __poly1305_blocks_vsx | |
790 | ||
791 | neg r0,r7 # is_base2_26 as mask | |
792 | lwz r7,0($ctx) # load hash base 2^26 | |
793 | lwz r8,4($ctx) | |
794 | lwz r9,8($ctx) | |
795 | lwz r10,12($ctx) | |
796 | lwz r11,16($ctx) | |
797 | ||
798 | sldi r8,r8,26 # base 2^26 -> base 2^64 | |
799 | sldi r12,r9,52 | |
800 | add r7,r7,r8 | |
801 | srdi r9,r9,12 | |
802 | sldi r10,r10,14 | |
803 | addc r7,r7,r12 | |
804 | sldi r8,r11,40 | |
805 | adde r9,r9,r10 | |
806 | srdi r11,r11,24 | |
807 | addc r9,r9,r8 | |
808 | addze r11,r11 | |
809 | ||
810 | ld r8,0($ctx) # load hash base 2^64 | |
811 | ld r10,8($ctx) | |
812 | ld r12,16($ctx) | |
813 | ||
814 | xor r7,r7,r8 # select between radixes | |
815 | xor r9,r9,r10 | |
816 | xor r11,r11,r12 | |
817 | and r7,r7,r0 | |
818 | and r9,r9,r0 | |
819 | and r11,r11,r0 | |
820 | xor r7,r7,r8 | |
821 | xor r9,r9,r10 | |
822 | xor r11,r11,r12 | |
823 | ||
824 | li r0,0 | |
825 | std r7,0($ctx) # store hash base 2^64 | |
826 | std r9,8($ctx) | |
827 | std r11,16($ctx) | |
828 | stw r0,24($ctx) # clear is_base2_26 | |
829 | ||
830 | b Lpoly1305_blocks | |
831 | .long 0 | |
832 | .byte 0,12,0x14,0,0,0,4,0 | |
833 | .size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx | |
834 | ||
835 | .align 5 | |
836 | __poly1305_mul: | |
837 | mulld $d0,$h0,$r0 # h0*r0 | |
838 | mulhdu $d1,$h0,$r0 | |
839 | ||
840 | mulld $t0,$h1,$s1 # h1*5*r1 | |
841 | mulhdu $t1,$h1,$s1 | |
842 | addc $d0,$d0,$t0 | |
843 | adde $d1,$d1,$t1 | |
844 | ||
845 | mulld $t0,$h0,$r1 # h0*r1 | |
846 | mulhdu $d2,$h0,$r1 | |
847 | addc $d1,$d1,$t0 | |
848 | addze $d2,$d2 | |
849 | ||
850 | mulld $t0,$h1,$r0 # h1*r0 | |
851 | mulhdu $t1,$h1,$r0 | |
852 | addc $d1,$d1,$t0 | |
853 | adde $d2,$d2,$t1 | |
854 | ||
855 | mulld $t0,$h2,$s1 # h2*5*r1 | |
856 | mulld $t1,$h2,$r0 # h2*r0 | |
857 | addc $d1,$d1,$t0 | |
858 | adde $d2,$d2,$t1 | |
859 | ||
860 | andc $t0,$d2,$mask # final reduction step | |
861 | and $h2,$d2,$mask | |
862 | srdi $t1,$t0,2 | |
863 | add $t0,$t0,$t1 | |
864 | addc $h0,$d0,$t0 | |
865 | addze $h1,$d1 | |
866 | addze $h2,$h2 | |
867 | ||
868 | blr | |
869 | .long 0 | |
870 | .byte 0,12,0x14,0,0,0,0,0 | |
871 | .size __poly1305_mul,.-__poly1305_mul | |
872 | ||
873 | .align 5 | |
874 | __poly1305_splat: | |
875 | extrdi $d0,$h0,26,38 | |
876 | extrdi $d1,$h0,26,12 | |
877 | stw $d0,0x00($t1) | |
878 | ||
879 | extrdi $d2,$h0,12,0 | |
880 | slwi $d0,$d1,2 | |
881 | stw $d1,0x10($t1) | |
882 | add $d0,$d0,$d1 # * 5 | |
883 | stw $d0,0x20($t1) | |
884 | ||
885 | insrdi $d2,$h1,14,38 | |
886 | slwi $d0,$d2,2 | |
887 | stw $d2,0x30($t1) | |
888 | add $d0,$d0,$d2 # * 5 | |
889 | stw $d0,0x40($t1) | |
890 | ||
891 | extrdi $d1,$h1,26,24 | |
892 | extrdi $d2,$h1,24,0 | |
893 | slwi $d0,$d1,2 | |
894 | stw $d1,0x50($t1) | |
895 | add $d0,$d0,$d1 # * 5 | |
896 | stw $d0,0x60($t1) | |
897 | ||
898 | insrdi $d2,$h2,3,37 | |
899 | slwi $d0,$d2,2 | |
900 | stw $d2,0x70($t1) | |
901 | add $d0,$d0,$d2 # * 5 | |
902 | stw $d0,0x80($t1) | |
903 | ||
904 | blr | |
905 | .long 0 | |
906 | .byte 0,12,0x14,0,0,0,0,0 | |
907 | .size __poly1305_splat,.-__poly1305_splat | |
908 | ||
909 | .align 5 | |
910 | __poly1305_blocks_vsx: | |
911 | $STU $sp,-$VSXFRAME($sp) | |
9e58d119 | 912 | mflr r0 |
a28e4890 AP |
913 | li r10,`15+$LOCALS+128` |
914 | li r11,`31+$LOCALS+128` | |
915 | mfspr r12,256 | |
916 | stvx v20,r10,$sp | |
917 | addi r10,r10,32 | |
918 | stvx v21,r11,$sp | |
919 | addi r11,r11,32 | |
920 | stvx v22,r10,$sp | |
921 | addi r10,r10,32 | |
8d847a3f | 922 | stvx v23,r11,$sp |
a28e4890 | 923 | addi r11,r11,32 |
8d847a3f | 924 | stvx v24,r10,$sp |
a28e4890 | 925 | addi r10,r10,32 |
8d847a3f RM |
926 | stvx v25,r11,$sp |
927 | addi r11,r11,32 | |
a28e4890 AP |
928 | stvx v26,r10,$sp |
929 | addi r10,r10,32 | |
930 | stvx v27,r11,$sp | |
931 | addi r11,r11,32 | |
932 | stvx v28,r10,$sp | |
933 | addi r10,r10,32 | |
934 | stvx v29,r11,$sp | |
935 | addi r11,r11,32 | |
936 | stvx v30,r10,$sp | |
937 | stvx v31,r11,$sp | |
938 | stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave | |
939 | li r12,-1 | |
940 | mtspr 256,r12 # preserve all AltiVec registers | |
941 | $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp) | |
942 | $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp) | |
943 | $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp) | |
944 | $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp) | |
945 | $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp) | |
946 | $PUSH r0,`$VSXFRAME+$LRSAVE`($sp) | |
947 | ||
948 | bl LPICmeup | |
949 | ||
950 | li $x10,0x10 | |
951 | li $x20,0x20 | |
952 | li $x30,0x30 | |
953 | li $x40,0x40 | |
954 | li $x50,0x50 | |
955 | lvx_u $mask26,$x00,$const | |
956 | lvx_u $_26,$x10,$const | |
957 | lvx_u $_40,$x20,$const | |
958 | lvx_u $I2perm,$x30,$const | |
959 | lvx_u $padbits,$x40,$const | |
960 | ||
961 | cmplwi r7,0 # is_base2_26? | |
962 | bne Lskip_init_vsx | |
963 | ||
964 | ld $r0,32($ctx) # load key base 2^64 | |
965 | ld $r1,40($ctx) | |
966 | srdi $s1,$r1,2 | |
967 | li $mask,3 | |
968 | add $s1,$s1,$r1 # s1 = r1 + r1>>2 | |
969 | ||
970 | mr $h0,$r0 # "calculate" r^1 | |
971 | mr $h1,$r1 | |
972 | li $h2,0 | |
973 | addi $t1,$ctx,`48+(12^$BIG_ENDIAN)` | |
974 | bl __poly1305_splat | |
975 | ||
c2969ff6 | 976 | bl __poly1305_mul # calculate r^2 |
a28e4890 AP |
977 | addi $t1,$ctx,`48+(4^$BIG_ENDIAN)` |
978 | bl __poly1305_splat | |
979 | ||
c2969ff6 | 980 | bl __poly1305_mul # calculate r^3 |
a28e4890 AP |
981 | addi $t1,$ctx,`48+(8^$BIG_ENDIAN)` |
982 | bl __poly1305_splat | |
983 | ||
c2969ff6 | 984 | bl __poly1305_mul # calculate r^4 |
a28e4890 AP |
985 | addi $t1,$ctx,`48+(0^$BIG_ENDIAN)` |
986 | bl __poly1305_splat | |
987 | ||
988 | ld $h0,0($ctx) # load hash | |
989 | ld $h1,8($ctx) | |
990 | ld $h2,16($ctx) | |
991 | ||
992 | extrdi $d0,$h0,26,38 # base 2^64 -> base 2^26 | |
993 | extrdi $d1,$h0,26,12 | |
994 | extrdi $d2,$h0,12,0 | |
995 | mtvrwz $H0,$d0 | |
996 | insrdi $d2,$h1,14,38 | |
997 | mtvrwz $H1,$d1 | |
998 | extrdi $d1,$h1,26,24 | |
999 | mtvrwz $H2,$d2 | |
1000 | extrdi $d2,$h1,24,0 | |
1001 | mtvrwz $H3,$d1 | |
1002 | insrdi $d2,$h2,3,37 | |
1003 | mtvrwz $H4,$d2 | |
1004 | ___ | |
1005 | } else { | |
1006 | ############################################################################### | |
1007 | # 32-bit initialization | |
1008 | ||
1009 | my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12)); | |
1010 | my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4); | |
1011 | ||
1012 | $code.=<<___; | |
1013 | .globl .poly1305_blocks_vsx | |
1014 | .align 5 | |
1015 | .poly1305_blocks_vsx: | |
1016 | lwz r7,24($ctx) # is_base2_26 | |
1017 | cmplwi $len,128 | |
1018 | bge __poly1305_blocks_vsx | |
1019 | cmplwi r7,0 | |
1020 | beq Lpoly1305_blocks | |
9e58d119 AP |
1021 | |
1022 | lwz $h0,0($ctx) # load hash | |
1023 | lwz $h1,4($ctx) | |
1024 | lwz $h2,8($ctx) | |
1025 | lwz $h3,12($ctx) | |
1026 | lwz $h4,16($ctx) | |
1027 | ||
a28e4890 AP |
1028 | slwi $t0,$h1,26 # base 2^26 -> base 2^32 |
1029 | srwi $h1,$h1,6 | |
1030 | slwi $t1,$h2,20 | |
1031 | srwi $h2,$h2,12 | |
1032 | addc $h0,$h0,$t0 | |
1033 | slwi $t0,$h3,14 | |
1034 | srwi $h3,$h3,18 | |
1035 | adde $h1,$h1,$t1 | |
1036 | slwi $t1,$h4,8 | |
1037 | srwi $h4,$h4,24 | |
1038 | adde $h2,$h2,$t0 | |
1039 | li $t0,0 | |
1040 | adde $h3,$h3,$t1 | |
1041 | addze $h4,$h4 | |
9e58d119 | 1042 | |
a28e4890 AP |
1043 | stw $h0,0($ctx) # store hash base 2^32 |
1044 | stw $h1,4($ctx) | |
1045 | stw $h2,8($ctx) | |
1046 | stw $h3,12($ctx) | |
1047 | stw $h4,16($ctx) | |
1048 | stw $t0,24($ctx) # clear is_base2_26 | |
9e58d119 | 1049 | |
a28e4890 AP |
1050 | b Lpoly1305_blocks |
1051 | .long 0 | |
1052 | .byte 0,12,0x14,0,0,0,4,0 | |
1053 | .size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx | |
1054 | ||
1055 | .align 5 | |
1056 | __poly1305_mul: | |
1057 | vmulouw $ACC0,$H0,$R0 | |
1058 | vmulouw $ACC1,$H1,$R0 | |
1059 | vmulouw $ACC2,$H2,$R0 | |
1060 | vmulouw $ACC3,$H3,$R0 | |
1061 | vmulouw $ACC4,$H4,$R0 | |
1062 | ||
1063 | vmulouw $T0,$H4,$S1 | |
1064 | vaddudm $ACC0,$ACC0,$T0 | |
1065 | vmulouw $T0,$H0,$R1 | |
1066 | vaddudm $ACC1,$ACC1,$T0 | |
1067 | vmulouw $T0,$H1,$R1 | |
1068 | vaddudm $ACC2,$ACC2,$T0 | |
1069 | vmulouw $T0,$H2,$R1 | |
1070 | vaddudm $ACC3,$ACC3,$T0 | |
1071 | vmulouw $T0,$H3,$R1 | |
1072 | vaddudm $ACC4,$ACC4,$T0 | |
1073 | ||
1074 | vmulouw $T0,$H3,$S2 | |
1075 | vaddudm $ACC0,$ACC0,$T0 | |
1076 | vmulouw $T0,$H4,$S2 | |
1077 | vaddudm $ACC1,$ACC1,$T0 | |
1078 | vmulouw $T0,$H0,$R2 | |
1079 | vaddudm $ACC2,$ACC2,$T0 | |
1080 | vmulouw $T0,$H1,$R2 | |
1081 | vaddudm $ACC3,$ACC3,$T0 | |
1082 | vmulouw $T0,$H2,$R2 | |
1083 | vaddudm $ACC4,$ACC4,$T0 | |
1084 | ||
1085 | vmulouw $T0,$H2,$S3 | |
1086 | vaddudm $ACC0,$ACC0,$T0 | |
1087 | vmulouw $T0,$H3,$S3 | |
1088 | vaddudm $ACC1,$ACC1,$T0 | |
1089 | vmulouw $T0,$H4,$S3 | |
1090 | vaddudm $ACC2,$ACC2,$T0 | |
1091 | vmulouw $T0,$H0,$R3 | |
1092 | vaddudm $ACC3,$ACC3,$T0 | |
1093 | vmulouw $T0,$H1,$R3 | |
1094 | vaddudm $ACC4,$ACC4,$T0 | |
1095 | ||
1096 | vmulouw $T0,$H1,$S4 | |
1097 | vaddudm $ACC0,$ACC0,$T0 | |
1098 | vmulouw $T0,$H2,$S4 | |
1099 | vaddudm $ACC1,$ACC1,$T0 | |
1100 | vmulouw $T0,$H3,$S4 | |
1101 | vaddudm $ACC2,$ACC2,$T0 | |
1102 | vmulouw $T0,$H4,$S4 | |
1103 | vaddudm $ACC3,$ACC3,$T0 | |
1104 | vmulouw $T0,$H0,$R4 | |
1105 | vaddudm $ACC4,$ACC4,$T0 | |
1106 | ||
1107 | ################################################################ | |
1108 | # lazy reduction | |
1109 | ||
1110 | vspltisb $T0,2 | |
1111 | vsrd $H4,$ACC3,$_26 | |
1112 | vsrd $H1,$ACC0,$_26 | |
1113 | vand $H3,$ACC3,$mask26 | |
1114 | vand $H0,$ACC0,$mask26 | |
1115 | vaddudm $H4,$H4,$ACC4 # h3 -> h4 | |
1116 | vaddudm $H1,$H1,$ACC1 # h0 -> h1 | |
1117 | ||
1118 | vsrd $ACC4,$H4,$_26 | |
1119 | vsrd $ACC1,$H1,$_26 | |
1120 | vand $H4,$H4,$mask26 | |
1121 | vand $H1,$H1,$mask26 | |
1122 | vaddudm $H0,$H0,$ACC4 | |
1123 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 | |
1124 | ||
1125 | vsld $ACC4,$ACC4,$T0 # <<2 | |
1126 | vsrd $ACC2,$H2,$_26 | |
1127 | vand $H2,$H2,$mask26 | |
1128 | vaddudm $H0,$H0,$ACC4 # h4 -> h0 | |
1129 | vaddudm $H3,$H3,$ACC2 # h2 -> h3 | |
1130 | ||
1131 | vsrd $ACC0,$H0,$_26 | |
1132 | vsrd $ACC3,$H3,$_26 | |
1133 | vand $H0,$H0,$mask26 | |
1134 | vand $H3,$H3,$mask26 | |
1135 | vaddudm $H1,$H1,$ACC0 # h0 -> h1 | |
1136 | vaddudm $H4,$H4,$ACC3 # h3 -> h4 | |
1137 | ||
1138 | blr | |
1139 | .long 0 | |
1140 | .byte 0,12,0x14,0,0,0,0,0 | |
1141 | .size __poly1305_mul,.-__poly1305_mul | |
1142 | ||
1143 | .align 5 | |
1144 | __poly1305_blocks_vsx: | |
1145 | $STU $sp,-$VSXFRAME($sp) | |
1146 | mflr r0 | |
1147 | li r10,`15+$LOCALS+128` | |
1148 | li r11,`31+$LOCALS+128` | |
1149 | mfspr r12,256 | |
1150 | stvx v20,r10,$sp | |
1151 | addi r10,r10,32 | |
1152 | stvx v21,r11,$sp | |
1153 | addi r11,r11,32 | |
1154 | stvx v22,r10,$sp | |
1155 | addi r10,r10,32 | |
8d847a3f | 1156 | stvx v23,r11,$sp |
a28e4890 | 1157 | addi r11,r11,32 |
8d847a3f | 1158 | stvx v24,r10,$sp |
a28e4890 | 1159 | addi r10,r10,32 |
8d847a3f RM |
1160 | stvx v25,r11,$sp |
1161 | addi r11,r11,32 | |
a28e4890 AP |
1162 | stvx v26,r10,$sp |
1163 | addi r10,r10,32 | |
1164 | stvx v27,r11,$sp | |
1165 | addi r11,r11,32 | |
1166 | stvx v28,r10,$sp | |
1167 | addi r10,r10,32 | |
1168 | stvx v29,r11,$sp | |
1169 | addi r11,r11,32 | |
1170 | stvx v30,r10,$sp | |
1171 | stvx v31,r11,$sp | |
1172 | stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave | |
1173 | li r12,-1 | |
1174 | mtspr 256,r12 # preserve all AltiVec registers | |
1175 | $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp) | |
1176 | $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp) | |
1177 | $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp) | |
1178 | $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp) | |
1179 | $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp) | |
1180 | $PUSH r0,`$VSXFRAME+$LRSAVE`($sp) | |
1181 | ||
1182 | bl LPICmeup | |
1183 | ||
1184 | li $x10,0x10 | |
1185 | li $x20,0x20 | |
1186 | li $x30,0x30 | |
1187 | li $x40,0x40 | |
1188 | li $x50,0x50 | |
1189 | lvx_u $mask26,$x00,$const | |
1190 | lvx_u $_26,$x10,$const | |
1191 | lvx_u $_40,$x20,$const | |
1192 | lvx_u $I2perm,$x30,$const | |
1193 | lvx_u $padbits,$x40,$const | |
1194 | ||
1195 | cmplwi r7,0 # is_base2_26? | |
1196 | bne Lskip_init_vsx | |
1197 | ||
1198 | lwz $h1,32($ctx) # load key base 2^32 | |
1199 | lwz $h2,36($ctx) | |
1200 | lwz $h3,40($ctx) | |
1201 | lwz $h4,44($ctx) | |
1202 | ||
1203 | extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26 | |
1204 | extrwi $h1,$h1,6,0 | |
1205 | insrwi $h1,$h2,20,6 | |
1206 | extrwi $h2,$h2,12,0 | |
1207 | insrwi $h2,$h3,14,6 | |
1208 | extrwi $h3,$h3,18,0 | |
1209 | insrwi $h3,$h4,8,6 | |
1210 | extrwi $h4,$h4,24,0 | |
1211 | ||
1212 | mtvrwz $R0,$h0 | |
1213 | slwi $h0,$h1,2 | |
1214 | mtvrwz $R1,$h1 | |
1215 | add $h1,$h1,$h0 | |
1216 | mtvrwz $S1,$h1 | |
1217 | slwi $h1,$h2,2 | |
1218 | mtvrwz $R2,$h2 | |
1219 | add $h2,$h2,$h1 | |
1220 | mtvrwz $S2,$h2 | |
1221 | slwi $h2,$h3,2 | |
1222 | mtvrwz $R3,$h3 | |
1223 | add $h3,$h3,$h2 | |
1224 | mtvrwz $S3,$h3 | |
1225 | slwi $h3,$h4,2 | |
1226 | mtvrwz $R4,$h4 | |
1227 | add $h4,$h4,$h3 | |
1228 | mtvrwz $S4,$h4 | |
1229 | ||
1230 | vmr $H0,$R0 | |
1231 | vmr $H1,$R1 | |
1232 | vmr $H2,$R2 | |
1233 | vmr $H3,$R3 | |
1234 | vmr $H4,$R4 | |
1235 | ||
1236 | bl __poly1305_mul # r^1:- * r^1:- | |
1237 | ||
1238 | vpermdi $R0,$H0,$R0,0b00 | |
1239 | vpermdi $R1,$H1,$R1,0b00 | |
1240 | vpermdi $R2,$H2,$R2,0b00 | |
1241 | vpermdi $R3,$H3,$R3,0b00 | |
1242 | vpermdi $R4,$H4,$R4,0b00 | |
1243 | vpermdi $H0,$H0,$H0,0b00 | |
1244 | vpermdi $H1,$H1,$H1,0b00 | |
1245 | vpermdi $H2,$H2,$H2,0b00 | |
1246 | vpermdi $H3,$H3,$H3,0b00 | |
1247 | vpermdi $H4,$H4,$H4,0b00 | |
1248 | vsld $S1,$R1,$T0 # <<2 | |
1249 | vsld $S2,$R2,$T0 | |
1250 | vsld $S3,$R3,$T0 | |
1251 | vsld $S4,$R4,$T0 | |
1252 | vaddudm $S1,$S1,$R1 | |
1253 | vaddudm $S2,$S2,$R2 | |
1254 | vaddudm $S3,$S3,$R3 | |
1255 | vaddudm $S4,$S4,$R4 | |
1256 | ||
1257 | bl __poly1305_mul # r^2:r^2 * r^2:r^1 | |
1258 | ||
1259 | addi $h0,$ctx,0x60 | |
1260 | lwz $h1,0($ctx) # load hash | |
1261 | lwz $h2,4($ctx) | |
1262 | lwz $h3,8($ctx) | |
1263 | lwz $h4,12($ctx) | |
1264 | lwz $t0,16($ctx) | |
1265 | ||
1266 | vmrgow $R0,$R0,$H0 # r^2:r^4:r^1:r^3 | |
1267 | vmrgow $R1,$R1,$H1 | |
1268 | vmrgow $R2,$R2,$H2 | |
1269 | vmrgow $R3,$R3,$H3 | |
1270 | vmrgow $R4,$R4,$H4 | |
1271 | vslw $S1,$R1,$T0 # <<2 | |
1272 | vslw $S2,$R2,$T0 | |
1273 | vslw $S3,$R3,$T0 | |
1274 | vslw $S4,$R4,$T0 | |
1275 | vadduwm $S1,$S1,$R1 | |
1276 | vadduwm $S2,$S2,$R2 | |
1277 | vadduwm $S3,$S3,$R3 | |
1278 | vadduwm $S4,$S4,$R4 | |
1279 | ||
1280 | stvx_u $R0,$x30,$ctx | |
1281 | stvx_u $R1,$x40,$ctx | |
1282 | stvx_u $S1,$x50,$ctx | |
1283 | stvx_u $R2,$x00,$h0 | |
1284 | stvx_u $S2,$x10,$h0 | |
1285 | stvx_u $R3,$x20,$h0 | |
1286 | stvx_u $S3,$x30,$h0 | |
1287 | stvx_u $R4,$x40,$h0 | |
1288 | stvx_u $S4,$x50,$h0 | |
1289 | ||
1290 | extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26 | |
1291 | extrwi $h1,$h1,6,0 | |
1292 | mtvrwz $H0,$h0 | |
1293 | insrwi $h1,$h2,20,6 | |
1294 | extrwi $h2,$h2,12,0 | |
1295 | mtvrwz $H1,$h1 | |
1296 | insrwi $h2,$h3,14,6 | |
1297 | extrwi $h3,$h3,18,0 | |
1298 | mtvrwz $H2,$h2 | |
1299 | insrwi $h3,$h4,8,6 | |
1300 | extrwi $h4,$h4,24,0 | |
1301 | mtvrwz $H3,$h3 | |
1302 | insrwi $h4,$t0,3,5 | |
1303 | mtvrwz $H4,$h4 | |
9e58d119 | 1304 | ___ |
a28e4890 | 1305 | } |
9e58d119 | 1306 | $code.=<<___; |
a28e4890 AP |
1307 | li r0,1 |
1308 | stw r0,24($ctx) # set is_base2_26 | |
1309 | b Loaded_vsx | |
1310 | ||
1311 | .align 4 | |
1312 | Lskip_init_vsx: | |
1313 | li $x10,4 | |
1314 | li $x20,8 | |
1315 | li $x30,12 | |
1316 | li $x40,16 | |
1317 | lvwzx_u $H0,$x00,$ctx | |
1318 | lvwzx_u $H1,$x10,$ctx | |
1319 | lvwzx_u $H2,$x20,$ctx | |
1320 | lvwzx_u $H3,$x30,$ctx | |
1321 | lvwzx_u $H4,$x40,$ctx | |
1322 | ||
1323 | Loaded_vsx: | |
1324 | li $x10,0x10 | |
1325 | li $x20,0x20 | |
1326 | li $x30,0x30 | |
1327 | li $x40,0x40 | |
1328 | li $x50,0x50 | |
1329 | li $x60,0x60 | |
1330 | li $x70,0x70 | |
1331 | addi $ctx_,$ctx,64 # &ctx->r[1] | |
1332 | addi $_ctx,$sp,`$LOCALS+15` # &ctx->r[1], r^2:r^4 shadow | |
1333 | ||
1334 | vxor $T0,$T0,$T0 # ensure second half is zero | |
1335 | vpermdi $H0,$H0,$T0,0b00 | |
1336 | vpermdi $H1,$H1,$T0,0b00 | |
1337 | vpermdi $H2,$H2,$T0,0b00 | |
1338 | vpermdi $H3,$H3,$T0,0b00 | |
1339 | vpermdi $H4,$H4,$T0,0b00 | |
1340 | ||
1341 | be?lvx_u $_4,$x50,$const # byte swap mask | |
1342 | lvx_u $T1,$x00,$inp # load first input block | |
1343 | lvx_u $T2,$x10,$inp | |
1344 | lvx_u $T3,$x20,$inp | |
1345 | lvx_u $T4,$x30,$inp | |
1346 | be?vperm $T1,$T1,$T1,$_4 | |
1347 | be?vperm $T2,$T2,$T2,$_4 | |
1348 | be?vperm $T3,$T3,$T3,$_4 | |
1349 | be?vperm $T4,$T4,$T4,$_4 | |
1350 | ||
1351 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 | |
1352 | vspltisb $_4,4 | |
1353 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 | |
1354 | vspltisb $_14,14 | |
1355 | vpermdi $I3,$T1,$T2,0b11 | |
1356 | ||
1357 | vsrd $I1,$I0,$_26 | |
1358 | vsrd $I2,$I2,$_4 | |
1359 | vsrd $I4,$I3,$_40 | |
1360 | vsrd $I3,$I3,$_14 | |
1361 | vand $I0,$I0,$mask26 | |
1362 | vand $I1,$I1,$mask26 | |
1363 | vand $I2,$I2,$mask26 | |
1364 | vand $I3,$I3,$mask26 | |
1365 | ||
1366 | vpermdi $T1,$T3,$T4,0b00 | |
1367 | vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 | |
1368 | vpermdi $T3,$T3,$T4,0b11 | |
1369 | ||
1370 | vsrd $T0,$T1,$_26 | |
1371 | vsrd $T2,$T2,$_4 | |
1372 | vsrd $T4,$T3,$_40 | |
1373 | vsrd $T3,$T3,$_14 | |
1374 | vand $T1,$T1,$mask26 | |
1375 | vand $T0,$T0,$mask26 | |
1376 | vand $T2,$T2,$mask26 | |
1377 | vand $T3,$T3,$mask26 | |
1378 | ||
1379 | # inp[2]:inp[0]:inp[3]:inp[1] | |
1380 | vmrgow $I4,$T4,$I4 | |
1381 | vmrgow $I0,$T1,$I0 | |
1382 | vmrgow $I1,$T0,$I1 | |
1383 | vmrgow $I2,$T2,$I2 | |
1384 | vmrgow $I3,$T3,$I3 | |
1385 | vor $I4,$I4,$padbits | |
1386 | ||
1387 | lvx_splt $R0,$x30,$ctx # taking lvx_vsplt out of loop | |
1388 | lvx_splt $R1,$x00,$ctx_ # gives ~8% improvement | |
1389 | lvx_splt $S1,$x10,$ctx_ | |
1390 | lvx_splt $R2,$x20,$ctx_ | |
1391 | lvx_splt $S2,$x30,$ctx_ | |
1392 | lvx_splt $T1,$x40,$ctx_ | |
1393 | lvx_splt $T2,$x50,$ctx_ | |
1394 | lvx_splt $T3,$x60,$ctx_ | |
1395 | lvx_splt $T4,$x70,$ctx_ | |
1396 | stvx $R1,$x00,$_ctx | |
1397 | stvx $S1,$x10,$_ctx | |
1398 | stvx $R2,$x20,$_ctx | |
1399 | stvx $S2,$x30,$_ctx | |
1400 | stvx $T1,$x40,$_ctx | |
1401 | stvx $T2,$x50,$_ctx | |
1402 | stvx $T3,$x60,$_ctx | |
1403 | stvx $T4,$x70,$_ctx | |
1404 | ||
1405 | addi $inp,$inp,0x40 | |
1406 | addi $const,$const,0x50 | |
1407 | addi r0,$len,-64 | |
1408 | srdi r0,r0,6 | |
1409 | mtctr r0 | |
1410 | b Loop_vsx | |
1411 | ||
1412 | .align 4 | |
1413 | Loop_vsx: | |
1414 | ################################################################ | |
1415 | ## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 | |
1416 | ## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r | |
1417 | ## \___________________/ | |
1418 | ## | |
1419 | ## Note that we start with inp[2:3]*r^2. This is because it | |
1420 | ## doesn't depend on reduction in previous iteration. | |
1421 | ################################################################ | |
1422 | ## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 | |
1423 | ## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 | |
1424 | ## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 | |
1425 | ## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 | |
1426 | ## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 | |
1427 | ||
1428 | vmuleuw $ACC0,$I0,$R0 | |
1429 | vmuleuw $ACC1,$I0,$R1 | |
1430 | vmuleuw $ACC2,$I0,$R2 | |
1431 | vmuleuw $ACC3,$I1,$R2 | |
1432 | ||
1433 | vmuleuw $T0,$I1,$R0 | |
1434 | vaddudm $ACC1,$ACC1,$T0 | |
1435 | vmuleuw $T0,$I1,$R1 | |
1436 | vaddudm $ACC2,$ACC2,$T0 | |
1437 | vmuleuw $ACC4,$I2,$R2 | |
1438 | vmuleuw $T0,$I4,$S1 | |
1439 | vaddudm $ACC0,$ACC0,$T0 | |
1440 | vmuleuw $T0,$I2,$R1 | |
1441 | vaddudm $ACC3,$ACC3,$T0 | |
1442 | lvx $S3,$x50,$_ctx | |
1443 | vmuleuw $T0,$I3,$R1 | |
1444 | vaddudm $ACC4,$ACC4,$T0 | |
1445 | lvx $R3,$x40,$_ctx | |
1446 | ||
1447 | vaddudm $H2,$H2,$I2 | |
1448 | vaddudm $H0,$H0,$I0 | |
1449 | vaddudm $H3,$H3,$I3 | |
1450 | vaddudm $H1,$H1,$I1 | |
1451 | vaddudm $H4,$H4,$I4 | |
1452 | ||
1453 | vmuleuw $T0,$I3,$S2 | |
1454 | vaddudm $ACC0,$ACC0,$T0 | |
1455 | vmuleuw $T0,$I4,$S2 | |
1456 | vaddudm $ACC1,$ACC1,$T0 | |
1457 | vmuleuw $T0,$I2,$R0 | |
1458 | vaddudm $ACC2,$ACC2,$T0 | |
1459 | vmuleuw $T0,$I3,$R0 | |
1460 | vaddudm $ACC3,$ACC3,$T0 | |
1461 | lvx $S4,$x70,$_ctx | |
1462 | vmuleuw $T0,$I4,$R0 | |
1463 | vaddudm $ACC4,$ACC4,$T0 | |
1464 | lvx $R4,$x60,$_ctx | |
1465 | ||
1466 | vmuleuw $T0,$I2,$S3 | |
1467 | vaddudm $ACC0,$ACC0,$T0 | |
1468 | vmuleuw $T0,$I3,$S3 | |
1469 | vaddudm $ACC1,$ACC1,$T0 | |
1470 | vmuleuw $T0,$I4,$S3 | |
1471 | vaddudm $ACC2,$ACC2,$T0 | |
1472 | vmuleuw $T0,$I0,$R3 | |
1473 | vaddudm $ACC3,$ACC3,$T0 | |
1474 | vmuleuw $T0,$I1,$R3 | |
1475 | vaddudm $ACC4,$ACC4,$T0 | |
1476 | ||
1477 | be?lvx_u $_4,$x00,$const # byte swap mask | |
1478 | lvx_u $T1,$x00,$inp # load next input block | |
1479 | lvx_u $T2,$x10,$inp | |
1480 | lvx_u $T3,$x20,$inp | |
1481 | lvx_u $T4,$x30,$inp | |
1482 | be?vperm $T1,$T1,$T1,$_4 | |
1483 | be?vperm $T2,$T2,$T2,$_4 | |
1484 | be?vperm $T3,$T3,$T3,$_4 | |
1485 | be?vperm $T4,$T4,$T4,$_4 | |
1486 | ||
1487 | vmuleuw $T0,$I1,$S4 | |
1488 | vaddudm $ACC0,$ACC0,$T0 | |
1489 | vmuleuw $T0,$I2,$S4 | |
1490 | vaddudm $ACC1,$ACC1,$T0 | |
1491 | vmuleuw $T0,$I3,$S4 | |
1492 | vaddudm $ACC2,$ACC2,$T0 | |
1493 | vmuleuw $T0,$I4,$S4 | |
1494 | vaddudm $ACC3,$ACC3,$T0 | |
1495 | vmuleuw $T0,$I0,$R4 | |
1496 | vaddudm $ACC4,$ACC4,$T0 | |
1497 | ||
1498 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 | |
1499 | vspltisb $_4,4 | |
1500 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 | |
1501 | vpermdi $I3,$T1,$T2,0b11 | |
1502 | ||
1503 | # (hash + inp[0:1]) * r^4 | |
1504 | vmulouw $T0,$H0,$R0 | |
1505 | vaddudm $ACC0,$ACC0,$T0 | |
1506 | vmulouw $T0,$H1,$R0 | |
1507 | vaddudm $ACC1,$ACC1,$T0 | |
1508 | vmulouw $T0,$H2,$R0 | |
1509 | vaddudm $ACC2,$ACC2,$T0 | |
1510 | vmulouw $T0,$H3,$R0 | |
1511 | vaddudm $ACC3,$ACC3,$T0 | |
1512 | vmulouw $T0,$H4,$R0 | |
1513 | vaddudm $ACC4,$ACC4,$T0 | |
1514 | ||
1515 | vpermdi $T1,$T3,$T4,0b00 | |
1516 | vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 | |
1517 | vpermdi $T3,$T3,$T4,0b11 | |
1518 | ||
1519 | vmulouw $T0,$H2,$S3 | |
1520 | vaddudm $ACC0,$ACC0,$T0 | |
1521 | vmulouw $T0,$H3,$S3 | |
1522 | vaddudm $ACC1,$ACC1,$T0 | |
1523 | vmulouw $T0,$H4,$S3 | |
1524 | vaddudm $ACC2,$ACC2,$T0 | |
1525 | vmulouw $T0,$H0,$R3 | |
1526 | vaddudm $ACC3,$ACC3,$T0 | |
1527 | lvx $S1,$x10,$_ctx | |
1528 | vmulouw $T0,$H1,$R3 | |
1529 | vaddudm $ACC4,$ACC4,$T0 | |
1530 | lvx $R1,$x00,$_ctx | |
1531 | ||
1532 | vsrd $I1,$I0,$_26 | |
1533 | vsrd $I2,$I2,$_4 | |
1534 | vsrd $I4,$I3,$_40 | |
1535 | vsrd $I3,$I3,$_14 | |
1536 | ||
1537 | vmulouw $T0,$H1,$S4 | |
1538 | vaddudm $ACC0,$ACC0,$T0 | |
1539 | vmulouw $T0,$H2,$S4 | |
1540 | vaddudm $ACC1,$ACC1,$T0 | |
1541 | vmulouw $T0,$H3,$S4 | |
1542 | vaddudm $ACC2,$ACC2,$T0 | |
1543 | vmulouw $T0,$H4,$S4 | |
1544 | vaddudm $ACC3,$ACC3,$T0 | |
1545 | lvx $S2,$x30,$_ctx | |
1546 | vmulouw $T0,$H0,$R4 | |
1547 | vaddudm $ACC4,$ACC4,$T0 | |
1548 | lvx $R2,$x20,$_ctx | |
1549 | ||
1550 | vand $I0,$I0,$mask26 | |
1551 | vand $I1,$I1,$mask26 | |
1552 | vand $I2,$I2,$mask26 | |
1553 | vand $I3,$I3,$mask26 | |
1554 | ||
1555 | vmulouw $T0,$H4,$S1 | |
1556 | vaddudm $ACC0,$ACC0,$T0 | |
1557 | vmulouw $T0,$H0,$R1 | |
1558 | vaddudm $ACC1,$ACC1,$T0 | |
1559 | vmulouw $T0,$H1,$R1 | |
1560 | vaddudm $ACC2,$ACC2,$T0 | |
1561 | vmulouw $T0,$H2,$R1 | |
1562 | vaddudm $ACC3,$ACC3,$T0 | |
1563 | vmulouw $T0,$H3,$R1 | |
1564 | vaddudm $ACC4,$ACC4,$T0 | |
1565 | ||
1566 | vsrd $T2,$T2,$_4 | |
1567 | vsrd $_4,$T1,$_26 | |
1568 | vsrd $T4,$T3,$_40 | |
1569 | vsrd $T3,$T3,$_14 | |
1570 | ||
1571 | vmulouw $T0,$H3,$S2 | |
1572 | vaddudm $ACC0,$ACC0,$T0 | |
1573 | vmulouw $T0,$H4,$S2 | |
1574 | vaddudm $ACC1,$ACC1,$T0 | |
1575 | vmulouw $T0,$H0,$R2 | |
1576 | vaddudm $ACC2,$ACC2,$T0 | |
1577 | vmulouw $T0,$H1,$R2 | |
1578 | vaddudm $ACC3,$ACC3,$T0 | |
1579 | vmulouw $T0,$H2,$R2 | |
1580 | vaddudm $ACC4,$ACC4,$T0 | |
1581 | ||
1582 | vand $T1,$T1,$mask26 | |
1583 | vand $_4,$_4,$mask26 | |
1584 | vand $T2,$T2,$mask26 | |
1585 | vand $T3,$T3,$mask26 | |
1586 | ||
1587 | ################################################################ | |
1588 | # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein | |
1589 | # and P. Schwabe | |
1590 | ||
1591 | vspltisb $T0,2 | |
1592 | vsrd $H4,$ACC3,$_26 | |
1593 | vsrd $H1,$ACC0,$_26 | |
1594 | vand $H3,$ACC3,$mask26 | |
1595 | vand $H0,$ACC0,$mask26 | |
1596 | vaddudm $H4,$H4,$ACC4 # h3 -> h4 | |
1597 | vaddudm $H1,$H1,$ACC1 # h0 -> h1 | |
1598 | ||
1599 | vmrgow $I4,$T4,$I4 | |
1600 | vmrgow $I0,$T1,$I0 | |
1601 | vmrgow $I1,$_4,$I1 | |
1602 | vmrgow $I2,$T2,$I2 | |
1603 | vmrgow $I3,$T3,$I3 | |
1604 | vor $I4,$I4,$padbits | |
1605 | ||
1606 | vsrd $ACC4,$H4,$_26 | |
1607 | vsrd $ACC1,$H1,$_26 | |
1608 | vand $H4,$H4,$mask26 | |
1609 | vand $H1,$H1,$mask26 | |
1610 | vaddudm $H0,$H0,$ACC4 | |
1611 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 | |
1612 | ||
1613 | vsld $ACC4,$ACC4,$T0 # <<2 | |
1614 | vsrd $ACC2,$H2,$_26 | |
1615 | vand $H2,$H2,$mask26 | |
1616 | vaddudm $H0,$H0,$ACC4 # h4 -> h0 | |
1617 | vaddudm $H3,$H3,$ACC2 # h2 -> h3 | |
1618 | ||
1619 | vsrd $ACC0,$H0,$_26 | |
1620 | vsrd $ACC3,$H3,$_26 | |
1621 | vand $H0,$H0,$mask26 | |
1622 | vand $H3,$H3,$mask26 | |
1623 | vaddudm $H1,$H1,$ACC0 # h0 -> h1 | |
1624 | vaddudm $H4,$H4,$ACC3 # h3 -> h4 | |
1625 | ||
1626 | addi $inp,$inp,0x40 | |
1627 | bdnz Loop_vsx | |
1628 | ||
1629 | neg $len,$len | |
1630 | andi. $len,$len,0x30 | |
1631 | sub $inp,$inp,$len | |
1632 | ||
1633 | lvx_u $R0,$x30,$ctx # load all powers | |
1634 | lvx_u $R1,$x00,$ctx_ | |
1635 | lvx_u $S1,$x10,$ctx_ | |
1636 | lvx_u $R2,$x20,$ctx_ | |
1637 | lvx_u $S2,$x30,$ctx_ | |
1638 | ||
1639 | Last_vsx: | |
1640 | vmuleuw $ACC0,$I0,$R0 | |
1641 | vmuleuw $ACC1,$I1,$R0 | |
1642 | vmuleuw $ACC2,$I2,$R0 | |
1643 | vmuleuw $ACC3,$I3,$R0 | |
1644 | vmuleuw $ACC4,$I4,$R0 | |
1645 | ||
1646 | vmuleuw $T0,$I4,$S1 | |
1647 | vaddudm $ACC0,$ACC0,$T0 | |
1648 | vmuleuw $T0,$I0,$R1 | |
1649 | vaddudm $ACC1,$ACC1,$T0 | |
1650 | vmuleuw $T0,$I1,$R1 | |
1651 | vaddudm $ACC2,$ACC2,$T0 | |
1652 | vmuleuw $T0,$I2,$R1 | |
1653 | vaddudm $ACC3,$ACC3,$T0 | |
1654 | lvx_u $S3,$x50,$ctx_ | |
1655 | vmuleuw $T0,$I3,$R1 | |
1656 | vaddudm $ACC4,$ACC4,$T0 | |
1657 | lvx_u $R3,$x40,$ctx_ | |
1658 | ||
1659 | vaddudm $H2,$H2,$I2 | |
1660 | vaddudm $H0,$H0,$I0 | |
1661 | vaddudm $H3,$H3,$I3 | |
1662 | vaddudm $H1,$H1,$I1 | |
1663 | vaddudm $H4,$H4,$I4 | |
1664 | ||
1665 | vmuleuw $T0,$I3,$S2 | |
1666 | vaddudm $ACC0,$ACC0,$T0 | |
1667 | vmuleuw $T0,$I4,$S2 | |
1668 | vaddudm $ACC1,$ACC1,$T0 | |
1669 | vmuleuw $T0,$I0,$R2 | |
1670 | vaddudm $ACC2,$ACC2,$T0 | |
1671 | vmuleuw $T0,$I1,$R2 | |
1672 | vaddudm $ACC3,$ACC3,$T0 | |
1673 | lvx_u $S4,$x70,$ctx_ | |
1674 | vmuleuw $T0,$I2,$R2 | |
1675 | vaddudm $ACC4,$ACC4,$T0 | |
1676 | lvx_u $R4,$x60,$ctx_ | |
1677 | ||
1678 | vmuleuw $T0,$I2,$S3 | |
1679 | vaddudm $ACC0,$ACC0,$T0 | |
1680 | vmuleuw $T0,$I3,$S3 | |
1681 | vaddudm $ACC1,$ACC1,$T0 | |
1682 | vmuleuw $T0,$I4,$S3 | |
1683 | vaddudm $ACC2,$ACC2,$T0 | |
1684 | vmuleuw $T0,$I0,$R3 | |
1685 | vaddudm $ACC3,$ACC3,$T0 | |
1686 | vmuleuw $T0,$I1,$R3 | |
1687 | vaddudm $ACC4,$ACC4,$T0 | |
1688 | ||
1689 | vmuleuw $T0,$I1,$S4 | |
1690 | vaddudm $ACC0,$ACC0,$T0 | |
1691 | vmuleuw $T0,$I2,$S4 | |
1692 | vaddudm $ACC1,$ACC1,$T0 | |
1693 | vmuleuw $T0,$I3,$S4 | |
1694 | vaddudm $ACC2,$ACC2,$T0 | |
1695 | vmuleuw $T0,$I4,$S4 | |
1696 | vaddudm $ACC3,$ACC3,$T0 | |
1697 | vmuleuw $T0,$I0,$R4 | |
1698 | vaddudm $ACC4,$ACC4,$T0 | |
1699 | ||
1700 | # (hash + inp[0:1]) * r^4 | |
1701 | vmulouw $T0,$H0,$R0 | |
1702 | vaddudm $ACC0,$ACC0,$T0 | |
1703 | vmulouw $T0,$H1,$R0 | |
1704 | vaddudm $ACC1,$ACC1,$T0 | |
1705 | vmulouw $T0,$H2,$R0 | |
1706 | vaddudm $ACC2,$ACC2,$T0 | |
1707 | vmulouw $T0,$H3,$R0 | |
1708 | vaddudm $ACC3,$ACC3,$T0 | |
1709 | vmulouw $T0,$H4,$R0 | |
1710 | vaddudm $ACC4,$ACC4,$T0 | |
1711 | ||
1712 | vmulouw $T0,$H2,$S3 | |
1713 | vaddudm $ACC0,$ACC0,$T0 | |
1714 | vmulouw $T0,$H3,$S3 | |
1715 | vaddudm $ACC1,$ACC1,$T0 | |
1716 | vmulouw $T0,$H4,$S3 | |
1717 | vaddudm $ACC2,$ACC2,$T0 | |
1718 | vmulouw $T0,$H0,$R3 | |
1719 | vaddudm $ACC3,$ACC3,$T0 | |
1720 | lvx_u $S1,$x10,$ctx_ | |
1721 | vmulouw $T0,$H1,$R3 | |
1722 | vaddudm $ACC4,$ACC4,$T0 | |
1723 | lvx_u $R1,$x00,$ctx_ | |
1724 | ||
1725 | vmulouw $T0,$H1,$S4 | |
1726 | vaddudm $ACC0,$ACC0,$T0 | |
1727 | vmulouw $T0,$H2,$S4 | |
1728 | vaddudm $ACC1,$ACC1,$T0 | |
1729 | vmulouw $T0,$H3,$S4 | |
1730 | vaddudm $ACC2,$ACC2,$T0 | |
1731 | vmulouw $T0,$H4,$S4 | |
1732 | vaddudm $ACC3,$ACC3,$T0 | |
1733 | lvx_u $S2,$x30,$ctx_ | |
1734 | vmulouw $T0,$H0,$R4 | |
1735 | vaddudm $ACC4,$ACC4,$T0 | |
1736 | lvx_u $R2,$x20,$ctx_ | |
1737 | ||
1738 | vmulouw $T0,$H4,$S1 | |
1739 | vaddudm $ACC0,$ACC0,$T0 | |
1740 | vmulouw $T0,$H0,$R1 | |
1741 | vaddudm $ACC1,$ACC1,$T0 | |
1742 | vmulouw $T0,$H1,$R1 | |
1743 | vaddudm $ACC2,$ACC2,$T0 | |
1744 | vmulouw $T0,$H2,$R1 | |
1745 | vaddudm $ACC3,$ACC3,$T0 | |
1746 | vmulouw $T0,$H3,$R1 | |
1747 | vaddudm $ACC4,$ACC4,$T0 | |
1748 | ||
1749 | vmulouw $T0,$H3,$S2 | |
1750 | vaddudm $ACC0,$ACC0,$T0 | |
1751 | vmulouw $T0,$H4,$S2 | |
1752 | vaddudm $ACC1,$ACC1,$T0 | |
1753 | vmulouw $T0,$H0,$R2 | |
1754 | vaddudm $ACC2,$ACC2,$T0 | |
1755 | vmulouw $T0,$H1,$R2 | |
1756 | vaddudm $ACC3,$ACC3,$T0 | |
1757 | vmulouw $T0,$H2,$R2 | |
1758 | vaddudm $ACC4,$ACC4,$T0 | |
1759 | ||
1760 | ################################################################ | |
1761 | # horizontal addition | |
1762 | ||
1763 | vpermdi $H0,$ACC0,$ACC0,0b10 | |
1764 | vpermdi $H1,$ACC1,$ACC1,0b10 | |
1765 | vpermdi $H2,$ACC2,$ACC2,0b10 | |
1766 | vpermdi $H3,$ACC3,$ACC3,0b10 | |
1767 | vpermdi $H4,$ACC4,$ACC4,0b10 | |
1768 | vaddudm $ACC0,$ACC0,$H0 | |
1769 | vaddudm $ACC1,$ACC1,$H1 | |
1770 | vaddudm $ACC2,$ACC2,$H2 | |
1771 | vaddudm $ACC3,$ACC3,$H3 | |
1772 | vaddudm $ACC4,$ACC4,$H4 | |
1773 | ||
1774 | ################################################################ | |
1775 | # lazy reduction | |
1776 | ||
1777 | vspltisb $T0,2 | |
1778 | vsrd $H4,$ACC3,$_26 | |
1779 | vsrd $H1,$ACC0,$_26 | |
1780 | vand $H3,$ACC3,$mask26 | |
1781 | vand $H0,$ACC0,$mask26 | |
1782 | vaddudm $H4,$H4,$ACC4 # h3 -> h4 | |
1783 | vaddudm $H1,$H1,$ACC1 # h0 -> h1 | |
1784 | ||
1785 | vsrd $ACC4,$H4,$_26 | |
1786 | vsrd $ACC1,$H1,$_26 | |
1787 | vand $H4,$H4,$mask26 | |
1788 | vand $H1,$H1,$mask26 | |
1789 | vaddudm $H0,$H0,$ACC4 | |
1790 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 | |
1791 | ||
1792 | vsld $ACC4,$ACC4,$T0 # <<2 | |
1793 | vsrd $ACC2,$H2,$_26 | |
1794 | vand $H2,$H2,$mask26 | |
1795 | vaddudm $H0,$H0,$ACC4 # h4 -> h0 | |
1796 | vaddudm $H3,$H3,$ACC2 # h2 -> h3 | |
1797 | ||
1798 | vsrd $ACC0,$H0,$_26 | |
1799 | vsrd $ACC3,$H3,$_26 | |
1800 | vand $H0,$H0,$mask26 | |
1801 | vand $H3,$H3,$mask26 | |
1802 | vaddudm $H1,$H1,$ACC0 # h0 -> h1 | |
1803 | vaddudm $H4,$H4,$ACC3 # h3 -> h4 | |
1804 | ||
1805 | beq Ldone_vsx | |
1806 | ||
1807 | add r6,$const,$len | |
1808 | ||
1809 | be?lvx_u $_4,$x00,$const # byte swap mask | |
1810 | lvx_u $T1,$x00,$inp # load last partial input block | |
1811 | lvx_u $T2,$x10,$inp | |
1812 | lvx_u $T3,$x20,$inp | |
1813 | lvx_u $T4,$x30,$inp | |
1814 | be?vperm $T1,$T1,$T1,$_4 | |
1815 | be?vperm $T2,$T2,$T2,$_4 | |
1816 | be?vperm $T3,$T3,$T3,$_4 | |
1817 | be?vperm $T4,$T4,$T4,$_4 | |
1818 | ||
1819 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 | |
1820 | vspltisb $_4,4 | |
1821 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 | |
1822 | vpermdi $I3,$T1,$T2,0b11 | |
1823 | ||
1824 | vsrd $I1,$I0,$_26 | |
1825 | vsrd $I2,$I2,$_4 | |
1826 | vsrd $I4,$I3,$_40 | |
1827 | vsrd $I3,$I3,$_14 | |
1828 | vand $I0,$I0,$mask26 | |
1829 | vand $I1,$I1,$mask26 | |
1830 | vand $I2,$I2,$mask26 | |
1831 | vand $I3,$I3,$mask26 | |
1832 | ||
1833 | vpermdi $T0,$T3,$T4,0b00 | |
1834 | vperm $T1,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 | |
1835 | vpermdi $T2,$T3,$T4,0b11 | |
1836 | ||
1837 | lvx_u $ACC0,$x00,r6 | |
1838 | lvx_u $ACC1,$x30,r6 | |
1839 | ||
1840 | vsrd $T3,$T0,$_26 | |
1841 | vsrd $T1,$T1,$_4 | |
1842 | vsrd $T4,$T2,$_40 | |
1843 | vsrd $T2,$T2,$_14 | |
1844 | vand $T0,$T0,$mask26 | |
1845 | vand $T3,$T3,$mask26 | |
1846 | vand $T1,$T1,$mask26 | |
1847 | vand $T2,$T2,$mask26 | |
1848 | ||
1849 | # inp[2]:inp[0]:inp[3]:inp[1] | |
1850 | vmrgow $I4,$T4,$I4 | |
1851 | vmrgow $I0,$T0,$I0 | |
1852 | vmrgow $I1,$T3,$I1 | |
1853 | vmrgow $I2,$T1,$I2 | |
1854 | vmrgow $I3,$T2,$I3 | |
1855 | vor $I4,$I4,$padbits | |
1856 | ||
1857 | vperm $H0,$H0,$H0,$ACC0 # move hash to right lane | |
1858 | vand $I0,$I0, $ACC1 # mask redundant input lane[s] | |
1859 | vperm $H1,$H1,$H1,$ACC0 | |
1860 | vand $I1,$I1, $ACC1 | |
1861 | vperm $H2,$H2,$H2,$ACC0 | |
1862 | vand $I2,$I2, $ACC1 | |
1863 | vperm $H3,$H3,$H3,$ACC0 | |
1864 | vand $I3,$I3, $ACC1 | |
1865 | vperm $H4,$H4,$H4,$ACC0 | |
1866 | vand $I4,$I4, $ACC1 | |
1867 | ||
1868 | vaddudm $I0,$I0,$H0 # accumulate hash | |
1869 | vxor $H0,$H0,$H0 # wipe hash value | |
1870 | vaddudm $I1,$I1,$H1 | |
1871 | vxor $H1,$H1,$H1 | |
1872 | vaddudm $I2,$I2,$H2 | |
1873 | vxor $H2,$H2,$H2 | |
1874 | vaddudm $I3,$I3,$H3 | |
1875 | vxor $H3,$H3,$H3 | |
1876 | vaddudm $I4,$I4,$H4 | |
1877 | vxor $H4,$H4,$H4 | |
1878 | ||
1879 | xor. $len,$len,$len | |
1880 | b Last_vsx | |
1881 | ||
1882 | .align 4 | |
1883 | Ldone_vsx: | |
1884 | $POP r0,`$VSXFRAME+$LRSAVE`($sp) | |
1885 | li $x10,4 | |
1886 | li $x20,8 | |
1887 | li $x30,12 | |
1888 | li $x40,16 | |
1889 | stvwx_u $H0,$x00,$ctx # store hash | |
1890 | stvwx_u $H1,$x10,$ctx | |
1891 | stvwx_u $H2,$x20,$ctx | |
1892 | stvwx_u $H3,$x30,$ctx | |
1893 | stvwx_u $H4,$x40,$ctx | |
1894 | ||
1895 | lwz r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# pull vrsave | |
1896 | mtlr r0 | |
1897 | li r10,`15+$LOCALS+128` | |
1898 | li r11,`31+$LOCALS+128` | |
1899 | mtspr 256,r12 # restore vrsave | |
1900 | lvx v20,r10,$sp | |
1901 | addi r10,r10,32 | |
8d847a3f | 1902 | lvx v21,r11,$sp |
a28e4890 | 1903 | addi r11,r11,32 |
8d847a3f | 1904 | lvx v22,r10,$sp |
a28e4890 | 1905 | addi r10,r10,32 |
8d847a3f | 1906 | lvx v23,r11,$sp |
a28e4890 | 1907 | addi r11,r11,32 |
8d847a3f | 1908 | lvx v24,r10,$sp |
a28e4890 | 1909 | addi r10,r10,32 |
8d847a3f | 1910 | lvx v25,r11,$sp |
a28e4890 | 1911 | addi r11,r11,32 |
8d847a3f | 1912 | lvx v26,r10,$sp |
a28e4890 | 1913 | addi r10,r10,32 |
8d847a3f | 1914 | lvx v27,r11,$sp |
a28e4890 | 1915 | addi r11,r11,32 |
8d847a3f | 1916 | lvx v28,r10,$sp |
a28e4890 | 1917 | addi r10,r10,32 |
8d847a3f RM |
1918 | lvx v29,r11,$sp |
1919 | addi r11,r11,32 | |
1920 | lvx v30,r10,$sp | |
1921 | lvx v31,r11,$sp | |
a28e4890 AP |
1922 | $POP r27,`$VSXFRAME-$SIZE_T*5`($sp) |
1923 | $POP r28,`$VSXFRAME-$SIZE_T*4`($sp) | |
1924 | $POP r29,`$VSXFRAME-$SIZE_T*3`($sp) | |
1925 | $POP r30,`$VSXFRAME-$SIZE_T*2`($sp) | |
1926 | $POP r31,`$VSXFRAME-$SIZE_T*1`($sp) | |
1927 | addi $sp,$sp,$VSXFRAME | |
9e58d119 AP |
1928 | blr |
1929 | .long 0 | |
a28e4890 AP |
1930 | .byte 0,12,0x04,1,0x80,5,4,0 |
1931 | .long 0 | |
1932 | .size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx | |
1933 | ||
1934 | .align 6 | |
1935 | LPICmeup: | |
1936 | mflr r0 | |
1937 | bcl 20,31,\$+4 | |
1938 | mflr $const # vvvvvv "distance" between . and 1st data entry | |
1939 | addi $const,$const,`64-8` | |
1940 | mtlr r0 | |
1941 | blr | |
1942 | .long 0 | |
1943 | .byte 0,12,0x14,0,0,0,0,0 | |
1944 | .space `64-9*4` | |
1945 | ||
1946 | .quad 0x0000000003ffffff,0x0000000003ffffff # mask26 | |
1947 | .quad 0x000000000000001a,0x000000000000001a # _26 | |
1948 | .quad 0x0000000000000028,0x0000000000000028 # _40 | |
1949 | .quad 0x000000000e0f0001,0x000000001e1f1011 # I2perm | |
1950 | .quad 0x0100000001000000,0x0100000001000000 # padbits | |
1951 | .quad 0x0706050403020100,0x0f0e0d0c0b0a0908 # byte swap for big-endian | |
1952 | ||
1953 | .quad 0x0000000000000000,0x0000000004050607 # magic tail masks | |
1954 | .quad 0x0405060700000000,0x0000000000000000 | |
1955 | .quad 0x0000000000000000,0x0405060700000000 | |
1956 | ||
1957 | .quad 0xffffffff00000000,0xffffffffffffffff | |
1958 | .quad 0xffffffff00000000,0xffffffff00000000 | |
1959 | .quad 0x0000000000000000,0xffffffff00000000 | |
9e58d119 | 1960 | ___ |
a28e4890 | 1961 | }}} |
9e58d119 | 1962 | $code.=<<___; |
a28e4890 | 1963 | .asciz "Poly1305 for PPC, CRYPTOGAMS by \@dot-asm" |
9e58d119 AP |
1964 | ___ |
1965 | ||
a28e4890 AP |
1966 | foreach (split("\n",$code)) { |
1967 | s/\`([^\`]*)\`/eval($1)/ge; | |
1968 | ||
1969 | # lines tagged with 'be?' or 'le?' are endian-specific: the tag is | |
1970 | # stripped when it matches the target flavour, otherwise it is rewritten to '#be#'/'#le#', which comments the instruction out in the emitted assembly | |
1971 | if ($flavour !~ /le$/) { # big-endian: keep 'be?' lines, disable 'le?' lines | |
1972 | s/be\?// or | |
1973 | s/le\?/#le#/ | |
1974 | } else { # little-endian: keep 'le?' lines, disable 'be?' lines | |
1975 | s/le\?// or | |
1976 | s/be\?/#be#/ | |
1977 | } | |
1978 | ||
1979 | print $_,"\n"; | |
1980 | } | |
a21314db | 1981 | close STDOUT or die "error closing STDOUT: $!"; |