]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
83cf7abf | 2 | # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
49d3b641 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
9e58d119 AP |
9 | # |
10 | # ==================================================================== | |
a28e4890 AP |
11 | # Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL |
12 | # project. The module is dual licensed under OpenSSL and CRYPTOGAMS | |
13 | # licenses depending on where you obtain it. For further details see | |
14 | # https://github.com/dot-asm/cryptogams/. | |
9e58d119 AP |
15 | # ==================================================================== |
16 | # | |
17 | # This module implements Poly1305 hash for PowerPC. | |
18 | # | |
19 | # June 2015 | |
20 | # | |
21 | # Numbers are cycles per processed byte with poly1305_blocks alone, | |
22 | # and improvement coefficients relative to gcc-generated code. | |
23 | # | |
24 | # -m32 -m64 | |
25 | # | |
26 | # Freescale e300 14.8/+80% - | |
4b8736a2 AP |
27 | # PPC74x0 7.60/+60% - |
28 | # PPC970 7.00/+114% 3.51/+205% | |
29 | # POWER7 3.75/+260% 1.93/+100% | |
30 | # POWER8 - 2.03/+200% | |
41013cd6 | 31 | # POWER9 - 2.00/+150% |
9e58d119 AP |
32 | # |
33 | # Do we need floating-point implementation for PPC? Results presented | |
34 | # in poly1305_ieee754.c are tricky to compare to, because they are for | |
35 | # compiler-generated code. On the other hand it's known that floating- | |
36 | # point performance can be dominated by FPU latency, which means that | |
37 | # there is limit even for ideally optimized (and even vectorized) code. | |
38 | # And this limit is estimated to be higher than above -m64 results. Or | |
39 | # in other words floating-point implementation can be meaningful to | |
40 | # consider only in 32-bit application context. We probably have to | |
41 | # recognize that 32-bit builds are getting less popular on high-end | |
42 | # systems and therefore tend to target embedded ones, which might not | |
43 | # even have FPU... | |
44 | # | |
45 | # On side note, Power ISA 2.07 enables vector base 2^26 implementation, | |
46 | # and POWER8 might have capacity to break 1.0 cycle per byte barrier... | |
a28e4890 AP |
47 | # |
48 | # January 2019 | |
49 | # | |
50 | # ... Unfortunately not:-( Estimate was a projection of ARM result, | |
51 | # but ARM has vector multiply-n-add instruction, while PowerISA does | |
52 | # not, not one usable in the context. Improvement is ~40% over -m64 | |
53 | # result above and is ~1.43 on little-endian systems. | |
9e58d119 AP |
54 | |
55 | $flavour = shift; | |
56 | ||
57 | if ($flavour =~ /64/) { | |
58 | $SIZE_T =8; | |
59 | $LRSAVE =2*$SIZE_T; | |
60 | $UCMP ="cmpld"; | |
61 | $STU ="stdu"; | |
62 | $POP ="ld"; | |
63 | $PUSH ="std"; | |
64 | } elsif ($flavour =~ /32/) { | |
65 | $SIZE_T =4; | |
66 | $LRSAVE =$SIZE_T; | |
67 | $UCMP ="cmplw"; | |
68 | $STU ="stwu"; | |
69 | $POP ="lwz"; | |
70 | $PUSH ="stw"; | |
71 | } else { die "nonsense $flavour"; } | |
72 | ||
60250017 | 73 | # Define endianness based on flavour |
9e58d119 AP |
74 | # i.e.: linux64le |
75 | $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; | |
76 | ||
77 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
78 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
79 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
80 | die "can't locate ppc-xlate.pl"; | |
81 | ||
82 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | |
83 | ||
84 | $FRAME=24*$SIZE_T; | |
85 | ||
86 | $sp="r1"; | |
87 | my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6)); | |
88 | my ($mac,$nonce)=($inp,$len); | |
89 | my $mask = "r0"; | |
90 | ||
91 | $code=<<___; | |
92 | .machine "any" | |
93 | .text | |
94 | ___ | |
95 | if ($flavour =~ /64/) { | |
96 | ############################################################################### | |
97 | # base 2^64 implementation | |
98 | ||
99 | my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31)); | |
100 | ||
101 | $code.=<<___; | |
102 | .globl .poly1305_init_int | |
103 | .align 4 | |
104 | .poly1305_init_int: | |
105 | xor r0,r0,r0 | |
106 | std r0,0($ctx) # zero hash value | |
107 | std r0,8($ctx) | |
108 | std r0,16($ctx) | |
a28e4890 | 109 | stw r0,24($ctx) # clear is_base2_26 |
9e58d119 AP |
110 | |
111 | $UCMP $inp,r0 | |
112 | beq- Lno_key | |
113 | ___ | |
114 | $code.=<<___ if ($LITTLE_ENDIAN); | |
115 | ld $d0,0($inp) # load key material | |
116 | ld $d1,8($inp) | |
117 | ___ | |
118 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
119 | li $h0,4 | |
120 | lwbrx $d0,0,$inp # load key material | |
121 | li $d1,8 | |
122 | lwbrx $h0,$h0,$inp | |
123 | li $h1,12 | |
124 | lwbrx $d1,$d1,$inp | |
125 | lwbrx $h1,$h1,$inp | |
126 | insrdi $d0,$h0,32,0 | |
127 | insrdi $d1,$h1,32,0 | |
128 | ___ | |
129 | $code.=<<___; | |
130 | lis $h1,0xfff # 0x0fff0000 | |
131 | ori $h1,$h1,0xfffc # 0x0ffffffc | |
132 | insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc | |
133 | ori $h0,$h1,3 # 0x0ffffffc0fffffff | |
134 | ||
135 | and $d0,$d0,$h0 | |
136 | and $d1,$d1,$h1 | |
137 | ||
138 | std $d0,32($ctx) # store key | |
139 | std $d1,40($ctx) | |
140 | ||
141 | Lno_key: | |
142 | xor r3,r3,r3 | |
143 | blr | |
144 | .long 0 | |
145 | .byte 0,12,0x14,0,0,0,2,0 | |
146 | .size .poly1305_init_int,.-.poly1305_init_int | |
147 | ||
148 | .globl .poly1305_blocks | |
149 | .align 4 | |
150 | .poly1305_blocks: | |
a28e4890 | 151 | Lpoly1305_blocks: |
9e58d119 AP |
152 | srdi. $len,$len,4 |
153 | beq- Labort | |
154 | ||
155 | $STU $sp,-$FRAME($sp) | |
156 | mflr r0 | |
157 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) | |
158 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
159 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
160 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
161 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
162 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | |
163 | ||
164 | ld $r0,32($ctx) # load key | |
165 | ld $r1,40($ctx) | |
166 | ||
167 | ld $h0,0($ctx) # load hash value | |
168 | ld $h1,8($ctx) | |
169 | ld $h2,16($ctx) | |
170 | ||
171 | srdi $s1,$r1,2 | |
172 | mtctr $len | |
173 | add $s1,$s1,$r1 # s1 = r1 + r1>>2 | |
174 | li $mask,3 | |
175 | b Loop | |
176 | ||
177 | .align 4 | |
178 | Loop: | |
179 | ___ | |
180 | $code.=<<___ if ($LITTLE_ENDIAN); | |
181 | ld $t0,0($inp) # load input | |
182 | ld $t1,8($inp) | |
183 | ___ | |
184 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
185 | li $d0,4 | |
186 | lwbrx $t0,0,$inp # load input | |
187 | li $t1,8 | |
188 | lwbrx $d0,$d0,$inp | |
189 | li $d1,12 | |
190 | lwbrx $t1,$t1,$inp | |
191 | lwbrx $d1,$d1,$inp | |
192 | insrdi $t0,$d0,32,0 | |
193 | insrdi $t1,$d1,32,0 | |
194 | ___ | |
195 | $code.=<<___; | |
196 | addi $inp,$inp,16 | |
197 | ||
198 | addc $h0,$h0,$t0 # accumulate input | |
199 | adde $h1,$h1,$t1 | |
200 | ||
201 | mulld $d0,$h0,$r0 # h0*r0 | |
202 | mulhdu $d1,$h0,$r0 | |
203 | adde $h2,$h2,$padbit | |
204 | ||
205 | mulld $t0,$h1,$s1 # h1*5*r1 | |
206 | mulhdu $t1,$h1,$s1 | |
207 | addc $d0,$d0,$t0 | |
208 | adde $d1,$d1,$t1 | |
209 | ||
210 | mulld $t0,$h0,$r1 # h0*r1 | |
211 | mulhdu $d2,$h0,$r1 | |
212 | addc $d1,$d1,$t0 | |
213 | addze $d2,$d2 | |
214 | ||
215 | mulld $t0,$h1,$r0 # h1*r0 | |
216 | mulhdu $t1,$h1,$r0 | |
217 | addc $d1,$d1,$t0 | |
218 | adde $d2,$d2,$t1 | |
219 | ||
220 | mulld $t0,$h2,$s1 # h2*5*r1 | |
221 | mulld $t1,$h2,$r0 # h2*r0 | |
222 | addc $d1,$d1,$t0 | |
223 | adde $d2,$d2,$t1 | |
224 | ||
225 | andc $t0,$d2,$mask # final reduction step | |
226 | and $h2,$d2,$mask | |
227 | srdi $t1,$t0,2 | |
228 | add $t0,$t0,$t1 | |
229 | addc $h0,$d0,$t0 | |
230 | addze $h1,$d1 | |
4b8736a2 | 231 | addze $h2,$h2 |
9e58d119 AP |
232 | |
233 | bdnz Loop | |
234 | ||
235 | std $h0,0($ctx) # store hash value | |
236 | std $h1,8($ctx) | |
237 | std $h2,16($ctx) | |
238 | ||
239 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | |
240 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
241 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
242 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
243 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
244 | addi $sp,$sp,$FRAME | |
245 | Labort: | |
246 | blr | |
247 | .long 0 | |
248 | .byte 0,12,4,1,0x80,5,4,0 | |
249 | .size .poly1305_blocks,.-.poly1305_blocks | |
a28e4890 AP |
250 | ___ |
251 | { | |
252 | my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12)); | |
9e58d119 | 253 | |
a28e4890 | 254 | $code.=<<___; |
9e58d119 | 255 | .globl .poly1305_emit |
a28e4890 | 256 | .align 5 |
9e58d119 | 257 | .poly1305_emit: |
a28e4890 AP |
258 | lwz $h0,0($ctx) # load hash value base 2^26 |
259 | lwz $h1,4($ctx) | |
260 | lwz $h2,8($ctx) | |
261 | lwz $h3,12($ctx) | |
262 | lwz $h4,16($ctx) | |
263 | lwz r0,24($ctx) # is_base2_26 | |
264 | ||
265 | sldi $h1,$h1,26 # base 2^26 -> base 2^64 | |
266 | sldi $t0,$h2,52 | |
267 | srdi $h2,$h2,12 | |
268 | sldi $h3,$h3,14 | |
269 | add $h0,$h0,$h1 | |
270 | addc $h0,$h0,$t0 | |
271 | sldi $t0,$h4,40 | |
272 | srdi $h4,$h4,24 | |
273 | adde $h1,$h2,$h3 | |
274 | addc $h1,$h1,$t0 | |
275 | addze $h2,$h4 | |
276 | ||
277 | ld $h3,0($ctx) # load hash value base 2^64 | |
278 | ld $h4,8($ctx) | |
279 | ld $t0,16($ctx) | |
280 | ||
281 | neg r0,r0 | |
282 | xor $h0,$h0,$h3 # choose between radixes | |
283 | xor $h1,$h1,$h4 | |
284 | xor $h2,$h2,$t0 | |
285 | and $h0,$h0,r0 | |
286 | and $h1,$h1,r0 | |
287 | and $h2,$h2,r0 | |
288 | xor $h0,$h0,$h3 | |
289 | xor $h1,$h1,$h4 | |
290 | xor $h2,$h2,$t0 | |
291 | ||
292 | addic $h3,$h0,5 # compare to modulus | |
293 | addze $h4,$h1 | |
294 | addze $t0,$h2 | |
295 | ||
296 | srdi $t0,$t0,2 # see if it carried/borrowed | |
297 | neg $t0,$t0 | |
298 | ||
299 | andc $h0,$h0,$t0 | |
300 | and $h3,$h3,$t0 | |
301 | andc $h1,$h1,$t0 | |
302 | and $h4,$h4,$t0 | |
303 | or $h0,$h0,$h3 | |
304 | or $h1,$h1,$h4 | |
305 | ||
306 | lwz $t0,4($nonce) | |
307 | lwz $h2,12($nonce) | |
308 | lwz $h3,0($nonce) | |
309 | lwz $h4,8($nonce) | |
310 | ||
311 | insrdi $h3,$t0,32,0 | |
312 | insrdi $h4,$h2,32,0 | |
313 | ||
314 | addc $h0,$h0,$h3 # accumulate nonce | |
315 | adde $h1,$h1,$h4 | |
316 | ||
317 | addi $ctx,$mac,-1 | |
318 | addi $mac,$mac,7 | |
319 | ||
320 | stbu $h0,1($ctx) # write [little-endian] result | |
321 | srdi $h0,$h0,8 | |
322 | stbu $h1,1($mac) | |
323 | srdi $h1,$h1,8 | |
324 | ||
325 | stbu $h0,1($ctx) | |
326 | srdi $h0,$h0,8 | |
327 | stbu $h1,1($mac) | |
328 | srdi $h1,$h1,8 | |
329 | ||
330 | stbu $h0,1($ctx) | |
331 | srdi $h0,$h0,8 | |
332 | stbu $h1,1($mac) | |
333 | srdi $h1,$h1,8 | |
334 | ||
335 | stbu $h0,1($ctx) | |
336 | srdi $h0,$h0,8 | |
337 | stbu $h1,1($mac) | |
338 | srdi $h1,$h1,8 | |
339 | ||
340 | stbu $h0,1($ctx) | |
341 | srdi $h0,$h0,8 | |
342 | stbu $h1,1($mac) | |
343 | srdi $h1,$h1,8 | |
344 | ||
345 | stbu $h0,1($ctx) | |
346 | srdi $h0,$h0,8 | |
347 | stbu $h1,1($mac) | |
348 | srdi $h1,$h1,8 | |
349 | ||
350 | stbu $h0,1($ctx) | |
351 | srdi $h0,$h0,8 | |
352 | stbu $h1,1($mac) | |
353 | srdi $h1,$h1,8 | |
354 | ||
355 | stbu $h0,1($ctx) | |
356 | stbu $h1,1($mac) | |
9e58d119 | 357 | |
9e58d119 AP |
358 | blr |
359 | .long 0 | |
360 | .byte 0,12,0x14,0,0,0,3,0 | |
361 | .size .poly1305_emit,.-.poly1305_emit | |
362 | ___ | |
a28e4890 | 363 | } } else { |
9e58d119 AP |
364 | ############################################################################### |
365 | # base 2^32 implementation | |
366 | ||
367 | my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3, | |
368 | $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3 | |
369 | ) = map("r$_",(7..12,14..31)); | |
370 | ||
371 | $code.=<<___; | |
372 | .globl .poly1305_init_int | |
373 | .align 4 | |
374 | .poly1305_init_int: | |
375 | xor r0,r0,r0 | |
376 | stw r0,0($ctx) # zero hash value | |
377 | stw r0,4($ctx) | |
378 | stw r0,8($ctx) | |
379 | stw r0,12($ctx) | |
380 | stw r0,16($ctx) | |
a28e4890 | 381 | stw r0,24($ctx) # clear is_base2_26 |
9e58d119 AP |
382 | |
383 | $UCMP $inp,r0 | |
384 | beq- Lno_key | |
385 | ___ | |
386 | $code.=<<___ if ($LITTLE_ENDIAN); | |
387 | lw $h0,0($inp) # load key material | |
388 | lw $h1,4($inp) | |
389 | lw $h2,8($inp) | |
390 | lw $h3,12($inp) | |
391 | ___ | |
392 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
393 | li $h1,4 | |
394 | lwbrx $h0,0,$inp # load key material | |
395 | li $h2,8 | |
396 | lwbrx $h1,$h1,$inp | |
397 | li $h3,12 | |
398 | lwbrx $h2,$h2,$inp | |
399 | lwbrx $h3,$h3,$inp | |
400 | ___ | |
401 | $code.=<<___; | |
402 | lis $mask,0xf000 # 0xf0000000 | |
403 | li $r0,-4 | |
404 | andc $r0,$r0,$mask # 0x0ffffffc | |
405 | ||
406 | andc $h0,$h0,$mask | |
407 | and $h1,$h1,$r0 | |
408 | and $h2,$h2,$r0 | |
409 | and $h3,$h3,$r0 | |
410 | ||
411 | stw $h0,32($ctx) # store key | |
412 | stw $h1,36($ctx) | |
413 | stw $h2,40($ctx) | |
414 | stw $h3,44($ctx) | |
415 | ||
416 | Lno_key: | |
417 | xor r3,r3,r3 | |
418 | blr | |
419 | .long 0 | |
420 | .byte 0,12,0x14,0,0,0,2,0 | |
421 | .size .poly1305_init_int,.-.poly1305_init_int | |
422 | ||
423 | .globl .poly1305_blocks | |
424 | .align 4 | |
425 | .poly1305_blocks: | |
a28e4890 | 426 | Lpoly1305_blocks: |
9e58d119 AP |
427 | srwi. $len,$len,4 |
428 | beq- Labort | |
429 | ||
430 | $STU $sp,-$FRAME($sp) | |
431 | mflr r0 | |
432 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | |
433 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | |
434 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | |
435 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | |
436 | $PUSH r18,`$FRAME-$SIZE_T*14`($sp) | |
437 | $PUSH r19,`$FRAME-$SIZE_T*13`($sp) | |
438 | $PUSH r20,`$FRAME-$SIZE_T*12`($sp) | |
439 | $PUSH r21,`$FRAME-$SIZE_T*11`($sp) | |
440 | $PUSH r22,`$FRAME-$SIZE_T*10`($sp) | |
441 | $PUSH r23,`$FRAME-$SIZE_T*9`($sp) | |
442 | $PUSH r24,`$FRAME-$SIZE_T*8`($sp) | |
443 | $PUSH r25,`$FRAME-$SIZE_T*7`($sp) | |
444 | $PUSH r26,`$FRAME-$SIZE_T*6`($sp) | |
445 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) | |
446 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
447 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
448 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
449 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
450 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | |
451 | ||
452 | lwz $r0,32($ctx) # load key | |
453 | lwz $r1,36($ctx) | |
454 | lwz $r2,40($ctx) | |
455 | lwz $r3,44($ctx) | |
456 | ||
457 | lwz $h0,0($ctx) # load hash value | |
458 | lwz $h1,4($ctx) | |
459 | lwz $h2,8($ctx) | |
460 | lwz $h3,12($ctx) | |
461 | lwz $h4,16($ctx) | |
462 | ||
463 | srwi $s1,$r1,2 | |
464 | srwi $s2,$r2,2 | |
465 | srwi $s3,$r3,2 | |
466 | add $s1,$s1,$r1 # si = ri + ri>>2 | |
467 | add $s2,$s2,$r2 | |
468 | add $s3,$s3,$r3 | |
469 | mtctr $len | |
470 | li $mask,3 | |
471 | b Loop | |
472 | ||
473 | .align 4 | |
474 | Loop: | |
475 | ___ | |
476 | $code.=<<___ if ($LITTLE_ENDIAN); | |
477 | lwz $d0,0($inp) # load input | |
478 | lwz $d1,4($inp) | |
479 | lwz $d2,8($inp) | |
480 | lwz $d3,12($inp) | |
481 | ___ | |
482 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
483 | li $d1,4 | |
484 | lwbrx $d0,0,$inp # load input | |
485 | li $d2,8 | |
486 | lwbrx $d1,$d1,$inp | |
487 | li $d3,12 | |
488 | lwbrx $d2,$d2,$inp | |
489 | lwbrx $d3,$d3,$inp | |
490 | ___ | |
491 | $code.=<<___; | |
492 | addi $inp,$inp,16 | |
493 | ||
494 | addc $h0,$h0,$d0 # accumulate input | |
495 | adde $h1,$h1,$d1 | |
496 | adde $h2,$h2,$d2 | |
497 | ||
498 | mullw $d0,$h0,$r0 # h0*r0 | |
499 | mulhwu $D0,$h0,$r0 | |
500 | ||
501 | mullw $d1,$h0,$r1 # h0*r1 | |
502 | mulhwu $D1,$h0,$r1 | |
503 | ||
504 | mullw $d2,$h0,$r2 # h0*r2 | |
505 | mulhwu $D2,$h0,$r2 | |
506 | ||
507 | adde $h3,$h3,$d3 | |
508 | adde $h4,$h4,$padbit | |
509 | ||
510 | mullw $d3,$h0,$r3 # h0*r3 | |
511 | mulhwu $D3,$h0,$r3 | |
512 | ||
513 | mullw $t0,$h1,$s3 # h1*s3 | |
514 | mulhwu $t1,$h1,$s3 | |
515 | ||
516 | mullw $t2,$h1,$r0 # h1*r0 | |
517 | mulhwu $t3,$h1,$r0 | |
518 | addc $d0,$d0,$t0 | |
519 | adde $D0,$D0,$t1 | |
520 | ||
521 | mullw $t0,$h1,$r1 # h1*r1 | |
522 | mulhwu $t1,$h1,$r1 | |
523 | addc $d1,$d1,$t2 | |
524 | adde $D1,$D1,$t3 | |
525 | ||
526 | mullw $t2,$h1,$r2 # h1*r2 | |
527 | mulhwu $t3,$h1,$r2 | |
528 | addc $d2,$d2,$t0 | |
529 | adde $D2,$D2,$t1 | |
530 | ||
531 | mullw $t0,$h2,$s2 # h2*s2 | |
532 | mulhwu $t1,$h2,$s2 | |
533 | addc $d3,$d3,$t2 | |
534 | adde $D3,$D3,$t3 | |
535 | ||
536 | mullw $t2,$h2,$s3 # h2*s3 | |
537 | mulhwu $t3,$h2,$s3 | |
538 | addc $d0,$d0,$t0 | |
539 | adde $D0,$D0,$t1 | |
540 | ||
541 | mullw $t0,$h2,$r0 # h2*r0 | |
542 | mulhwu $t1,$h2,$r0 | |
543 | addc $d1,$d1,$t2 | |
544 | adde $D1,$D1,$t3 | |
545 | ||
546 | mullw $t2,$h2,$r1 # h2*r1 | |
547 | mulhwu $t3,$h2,$r1 | |
548 | addc $d2,$d2,$t0 | |
549 | adde $D2,$D2,$t1 | |
550 | ||
551 | mullw $t0,$h3,$s1 # h3*s1 | |
552 | mulhwu $t1,$h3,$s1 | |
553 | addc $d3,$d3,$t2 | |
554 | adde $D3,$D3,$t3 | |
555 | ||
556 | mullw $t2,$h3,$s2 # h3*s2 | |
557 | mulhwu $t3,$h3,$s2 | |
558 | addc $d0,$d0,$t0 | |
559 | adde $D0,$D0,$t1 | |
560 | ||
561 | mullw $t0,$h3,$s3 # h3*s3 | |
562 | mulhwu $t1,$h3,$s3 | |
563 | addc $d1,$d1,$t2 | |
564 | adde $D1,$D1,$t3 | |
565 | ||
566 | mullw $t2,$h3,$r0 # h3*r0 | |
567 | mulhwu $t3,$h3,$r0 | |
568 | addc $d2,$d2,$t0 | |
569 | adde $D2,$D2,$t1 | |
570 | ||
571 | mullw $t0,$h4,$s1 # h4*s1 | |
572 | addc $d3,$d3,$t2 | |
573 | adde $D3,$D3,$t3 | |
574 | addc $d1,$d1,$t0 | |
575 | ||
576 | mullw $t1,$h4,$s2 # h4*s2 | |
577 | addze $D1,$D1 | |
578 | addc $d2,$d2,$t1 | |
579 | addze $D2,$D2 | |
580 | ||
581 | mullw $t2,$h4,$s3 # h4*s3 | |
582 | addc $d3,$d3,$t2 | |
583 | addze $D3,$D3 | |
584 | ||
585 | mullw $h4,$h4,$r0 # h4*r0 | |
586 | ||
587 | addc $h1,$d1,$D0 | |
588 | adde $h2,$d2,$D1 | |
589 | adde $h3,$d3,$D2 | |
590 | adde $h4,$h4,$D3 | |
591 | ||
592 | andc $D0,$h4,$mask # final reduction step | |
593 | and $h4,$h4,$mask | |
594 | srwi $D1,$D0,2 | |
595 | add $D0,$D0,$D1 | |
596 | addc $h0,$d0,$D0 | |
597 | addze $h1,$h1 | |
598 | addze $h2,$h2 | |
599 | addze $h3,$h3 | |
4b8736a2 | 600 | addze $h4,$h4 |
9e58d119 AP |
601 | |
602 | bdnz Loop | |
603 | ||
604 | stw $h0,0($ctx) # store hash value | |
605 | stw $h1,4($ctx) | |
606 | stw $h2,8($ctx) | |
607 | stw $h3,12($ctx) | |
608 | stw $h4,16($ctx) | |
609 | ||
610 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | |
611 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | |
612 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | |
613 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | |
614 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | |
615 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | |
616 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | |
617 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | |
618 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | |
619 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | |
620 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | |
621 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | |
622 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | |
623 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | |
624 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
625 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
626 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
627 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
628 | addi $sp,$sp,$FRAME | |
629 | Labort: | |
630 | blr | |
631 | .long 0 | |
632 | .byte 0,12,4,1,0x80,18,4,0 | |
633 | .size .poly1305_blocks,.-.poly1305_blocks | |
a28e4890 AP |
634 | ___ |
635 | { | |
636 | my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12)); | |
9e58d119 | 637 | |
a28e4890 | 638 | $code.=<<___; |
9e58d119 | 639 | .globl .poly1305_emit |
a28e4890 | 640 | .align 5 |
9e58d119 | 641 | .poly1305_emit: |
a28e4890 AP |
642 | lwz r0,24($ctx) # is_base2_26 |
643 | lwz $h0,0($ctx) # load hash value | |
644 | lwz $h1,4($ctx) | |
645 | lwz $h2,8($ctx) | |
646 | lwz $h3,12($ctx) | |
647 | lwz $h4,16($ctx) | |
648 | cmplwi r0,0 | |
649 | beq Lemit_base2_32 | |
650 | ||
651 | slwi $t0,$h1,26 # base 2^26 -> base 2^32 | |
652 | srwi $h1,$h1,6 | |
653 | slwi $t1,$h2,20 | |
654 | srwi $h2,$h2,12 | |
655 | addc $h0,$h0,$t0 | |
656 | slwi $t0,$h3,14 | |
657 | srwi $h3,$h3,18 | |
658 | adde $h1,$h1,$t1 | |
659 | slwi $t1,$h4,8 | |
660 | srwi $h4,$h4,24 | |
661 | adde $h2,$h2,$t0 | |
662 | adde $h3,$h3,$t1 | |
663 | addze $h4,$h4 | |
664 | ||
665 | Lemit_base2_32: | |
666 | addic r0,$h0,5 # compare to modulus | |
667 | addze r0,$h1 | |
668 | addze r0,$h2 | |
669 | addze r0,$h3 | |
670 | addze r0,$h4 | |
671 | ||
672 | srwi r0,r0,2 # see if it carried/borrowed | |
673 | neg r0,r0 | |
674 | andi. r0,r0,5 | |
675 | ||
676 | addc $h0,$h0,r0 | |
677 | lwz r0,0($nonce) | |
678 | addze $h1,$h1 | |
679 | lwz $t0,4($nonce) | |
680 | addze $h2,$h2 | |
681 | lwz $t1,8($nonce) | |
682 | addze $h3,$h3 | |
683 | lwz $h4,12($nonce) | |
684 | ||
685 | addc $h0,$h0,r0 # accumulate nonce | |
686 | adde $h1,$h1,$t0 | |
687 | adde $h2,$h2,$t1 | |
688 | adde $h3,$h3,$h4 | |
689 | ||
690 | addi $ctx,$mac,-1 | |
691 | addi $mac,$mac,7 | |
692 | ||
693 | stbu $h0,1($ctx) # write [little-endian] result | |
694 | srwi $h0,$h0,8 | |
695 | stbu $h2,1($mac) | |
696 | srwi $h2,$h2,8 | |
697 | ||
698 | stbu $h0,1($ctx) | |
699 | srwi $h0,$h0,8 | |
700 | stbu $h2,1($mac) | |
701 | srwi $h2,$h2,8 | |
702 | ||
703 | stbu $h0,1($ctx) | |
704 | srwi $h0,$h0,8 | |
705 | stbu $h2,1($mac) | |
706 | srwi $h2,$h2,8 | |
707 | ||
708 | stbu $h0,1($ctx) | |
709 | stbu $h2,1($mac) | |
710 | ||
711 | stbu $h1,1($ctx) | |
712 | srwi $h1,$h1,8 | |
713 | stbu $h3,1($mac) | |
714 | srwi $h3,$h3,8 | |
715 | ||
716 | stbu $h1,1($ctx) | |
717 | srwi $h1,$h1,8 | |
718 | stbu $h3,1($mac) | |
719 | srwi $h3,$h3,8 | |
720 | ||
721 | stbu $h1,1($ctx) | |
722 | srwi $h1,$h1,8 | |
723 | stbu $h3,1($mac) | |
724 | srwi $h3,$h3,8 | |
725 | ||
726 | stbu $h1,1($ctx) | |
727 | stbu $h3,1($mac) | |
728 | ||
729 | blr | |
730 | .long 0 | |
731 | .byte 0,12,0x14,0,0,0,3,0 | |
732 | .size .poly1305_emit,.-.poly1305_emit | |
733 | ___ | |
734 | } } | |
735 | {{{ | |
736 | ######################################################################## | |
737 | # PowerISA 2.07/VSX section # | |
738 | ######################################################################## | |
739 | ||
740 | my $LOCALS= 6*$SIZE_T; | |
741 | my $VSXFRAME = $LOCALS + 6*$SIZE_T; | |
742 | $VSXFRAME += 128; # local variables | |
743 | $VSXFRAME += 13*16; # v20-v31 offload | |
744 | ||
745 | my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0; | |
746 | ||
747 | ######################################################################## | |
748 | # Layout of opaque area is following: | |
749 | # | |
750 | # unsigned __int32 h[5]; # current hash value base 2^26 | |
751 | # unsigned __int32 pad; | |
752 | # unsigned __int32 is_base2_26, pad; | |
753 | # unsigned __int64 r[2]; # key value base 2^64 | |
754 | # struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9]; | |
755 | # | |
756 | # where r^n are base 2^26 digits of powers of multiplier key. There are | |
757 | # 5 digits, but last four are interleaved with multiples of 5, totalling | |
758 | # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. Order of | |
759 | # powers is as they appear in register, not memory. | |
760 | ||
761 | my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4)); | |
762 | my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9)); | |
763 | my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14)); | |
764 | my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2); | |
765 | my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19)); | |
766 | my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24)); | |
767 | my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31)); | |
768 | my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31))); | |
769 | my ($ctx_,$_ctx,$const) = map("r$_",(10..12)); | |
770 | ||
771 | if ($flavour =~ /64/) { | |
772 | ############################################################################### | |
773 | # setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms, | |
774 | # but the base 2^26 computational part is same... | |
775 | ||
776 | my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31)); | |
777 | my $mask = "r0"; | |
778 | ||
779 | $code.=<<___; | |
780 | .globl .poly1305_blocks_vsx | |
781 | .align 5 | |
782 | .poly1305_blocks_vsx: | |
783 | lwz r7,24($ctx) # is_base2_26 | |
784 | cmpldi $len,128 | |
785 | bge __poly1305_blocks_vsx | |
786 | ||
787 | neg r0,r7 # is_base2_26 as mask | |
788 | lwz r7,0($ctx) # load hash base 2^26 | |
789 | lwz r8,4($ctx) | |
790 | lwz r9,8($ctx) | |
791 | lwz r10,12($ctx) | |
792 | lwz r11,16($ctx) | |
793 | ||
794 | sldi r8,r8,26 # base 2^26 -> base 2^64 | |
795 | sldi r12,r9,52 | |
796 | add r7,r7,r8 | |
797 | srdi r9,r9,12 | |
798 | sldi r10,r10,14 | |
799 | addc r7,r7,r12 | |
800 | sldi r8,r11,40 | |
801 | adde r9,r9,r10 | |
802 | srdi r11,r11,24 | |
803 | addc r9,r9,r8 | |
804 | addze r11,r11 | |
805 | ||
806 | ld r8,0($ctx) # load hash base 2^64 | |
807 | ld r10,8($ctx) | |
808 | ld r12,16($ctx) | |
809 | ||
810 | xor r7,r7,r8 # select between radixes | |
811 | xor r9,r9,r10 | |
812 | xor r11,r11,r12 | |
813 | and r7,r7,r0 | |
814 | and r9,r9,r0 | |
815 | and r11,r11,r0 | |
816 | xor r7,r7,r8 | |
817 | xor r9,r9,r10 | |
818 | xor r11,r11,r12 | |
819 | ||
820 | li r0,0 | |
821 | std r7,0($ctx) # store hash base 2^64 | |
822 | std r9,8($ctx) | |
823 | std r11,16($ctx) | |
824 | stw r0,24($ctx) # clear is_base2_26 | |
825 | ||
826 | b Lpoly1305_blocks | |
827 | .long 0 | |
828 | .byte 0,12,0x14,0,0,0,4,0 | |
829 | .size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx | |
830 | ||
831 | .align 5 | |
832 | __poly1305_mul: | |
833 | mulld $d0,$h0,$r0 # h0*r0 | |
834 | mulhdu $d1,$h0,$r0 | |
835 | ||
836 | mulld $t0,$h1,$s1 # h1*5*r1 | |
837 | mulhdu $t1,$h1,$s1 | |
838 | addc $d0,$d0,$t0 | |
839 | adde $d1,$d1,$t1 | |
840 | ||
841 | mulld $t0,$h0,$r1 # h0*r1 | |
842 | mulhdu $d2,$h0,$r1 | |
843 | addc $d1,$d1,$t0 | |
844 | addze $d2,$d2 | |
845 | ||
846 | mulld $t0,$h1,$r0 # h1*r0 | |
847 | mulhdu $t1,$h1,$r0 | |
848 | addc $d1,$d1,$t0 | |
849 | adde $d2,$d2,$t1 | |
850 | ||
851 | mulld $t0,$h2,$s1 # h2*5*r1 | |
852 | mulld $t1,$h2,$r0 # h2*r0 | |
853 | addc $d1,$d1,$t0 | |
854 | adde $d2,$d2,$t1 | |
855 | ||
856 | andc $t0,$d2,$mask # final reduction step | |
857 | and $h2,$d2,$mask | |
858 | srdi $t1,$t0,2 | |
859 | add $t0,$t0,$t1 | |
860 | addc $h0,$d0,$t0 | |
861 | addze $h1,$d1 | |
862 | addze $h2,$h2 | |
863 | ||
864 | blr | |
865 | .long 0 | |
866 | .byte 0,12,0x14,0,0,0,0,0 | |
867 | .size __poly1305_mul,.-__poly1305_mul | |
868 | ||
869 | .align 5 | |
870 | __poly1305_splat: | |
871 | extrdi $d0,$h0,26,38 | |
872 | extrdi $d1,$h0,26,12 | |
873 | stw $d0,0x00($t1) | |
874 | ||
875 | extrdi $d2,$h0,12,0 | |
876 | slwi $d0,$d1,2 | |
877 | stw $d1,0x10($t1) | |
878 | add $d0,$d0,$d1 # * 5 | |
879 | stw $d0,0x20($t1) | |
880 | ||
881 | insrdi $d2,$h1,14,38 | |
882 | slwi $d0,$d2,2 | |
883 | stw $d2,0x30($t1) | |
884 | add $d0,$d0,$d2 # * 5 | |
885 | stw $d0,0x40($t1) | |
886 | ||
887 | extrdi $d1,$h1,26,24 | |
888 | extrdi $d2,$h1,24,0 | |
889 | slwi $d0,$d1,2 | |
890 | stw $d1,0x50($t1) | |
891 | add $d0,$d0,$d1 # * 5 | |
892 | stw $d0,0x60($t1) | |
893 | ||
894 | insrdi $d2,$h2,3,37 | |
895 | slwi $d0,$d2,2 | |
896 | stw $d2,0x70($t1) | |
897 | add $d0,$d0,$d2 # * 5 | |
898 | stw $d0,0x80($t1) | |
899 | ||
900 | blr | |
901 | .long 0 | |
902 | .byte 0,12,0x14,0,0,0,0,0 | |
903 | .size __poly1305_splat,.-__poly1305_splat | |
904 | ||
905 | .align 5 | |
906 | __poly1305_blocks_vsx: | |
907 | $STU $sp,-$VSXFRAME($sp) | |
9e58d119 | 908 | mflr r0 |
a28e4890 AP |
909 | li r10,`15+$LOCALS+128` |
910 | li r11,`31+$LOCALS+128` | |
911 | mfspr r12,256 | |
912 | stvx v20,r10,$sp | |
913 | addi r10,r10,32 | |
914 | stvx v21,r11,$sp | |
915 | addi r11,r11,32 | |
916 | stvx v22,r10,$sp | |
917 | addi r10,r10,32 | |
918 | stvx v23,r10,$sp | |
919 | addi r10,r10,32 | |
920 | stvx v24,r11,$sp | |
921 | addi r11,r11,32 | |
922 | stvx v25,r10,$sp | |
923 | addi r10,r10,32 | |
924 | stvx v26,r10,$sp | |
925 | addi r10,r10,32 | |
926 | stvx v27,r11,$sp | |
927 | addi r11,r11,32 | |
928 | stvx v28,r10,$sp | |
929 | addi r10,r10,32 | |
930 | stvx v29,r11,$sp | |
931 | addi r11,r11,32 | |
932 | stvx v30,r10,$sp | |
933 | stvx v31,r11,$sp | |
934 | stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave | |
935 | li r12,-1 | |
936 | mtspr 256,r12 # preserve all AltiVec registers | |
937 | $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp) | |
938 | $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp) | |
939 | $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp) | |
940 | $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp) | |
941 | $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp) | |
942 | $PUSH r0,`$VSXFRAME+$LRSAVE`($sp) | |
943 | ||
944 | bl LPICmeup | |
945 | ||
946 | li $x10,0x10 | |
947 | li $x20,0x20 | |
948 | li $x30,0x30 | |
949 | li $x40,0x40 | |
950 | li $x50,0x50 | |
951 | lvx_u $mask26,$x00,$const | |
952 | lvx_u $_26,$x10,$const | |
953 | lvx_u $_40,$x20,$const | |
954 | lvx_u $I2perm,$x30,$const | |
955 | lvx_u $padbits,$x40,$const | |
956 | ||
957 | cmplwi r7,0 # is_base2_26? | |
958 | bne Lskip_init_vsx | |
959 | ||
960 | ld $r0,32($ctx) # load key base 2^64 | |
961 | ld $r1,40($ctx) | |
962 | srdi $s1,$r1,2 | |
963 | li $mask,3 | |
964 | add $s1,$s1,$r1 # s1 = r1 + r1>>2 | |
965 | ||
966 | mr $h0,$r0 # "calculate" r^1 | |
967 | mr $h1,$r1 | |
968 | li $h2,0 | |
969 | addi $t1,$ctx,`48+(12^$BIG_ENDIAN)` | |
970 | bl __poly1305_splat | |
971 | ||
c2969ff6 | 972 | bl __poly1305_mul # calculate r^2 |
a28e4890 AP |
973 | addi $t1,$ctx,`48+(4^$BIG_ENDIAN)` |
974 | bl __poly1305_splat | |
975 | ||
c2969ff6 | 976 | bl __poly1305_mul # calculate r^3 |
a28e4890 AP |
977 | addi $t1,$ctx,`48+(8^$BIG_ENDIAN)` |
978 | bl __poly1305_splat | |
979 | ||
c2969ff6 | 980 | bl __poly1305_mul # calculate r^4 |
a28e4890 AP |
981 | addi $t1,$ctx,`48+(0^$BIG_ENDIAN)` |
982 | bl __poly1305_splat | |
983 | ||
984 | ld $h0,0($ctx) # load hash | |
985 | ld $h1,8($ctx) | |
986 | ld $h2,16($ctx) | |
987 | ||
988 | extrdi $d0,$h0,26,38 # base 2^64 -> base 2^26 | |
989 | extrdi $d1,$h0,26,12 | |
990 | extrdi $d2,$h0,12,0 | |
991 | mtvrwz $H0,$d0 | |
992 | insrdi $d2,$h1,14,38 | |
993 | mtvrwz $H1,$d1 | |
994 | extrdi $d1,$h1,26,24 | |
995 | mtvrwz $H2,$d2 | |
996 | extrdi $d2,$h1,24,0 | |
997 | mtvrwz $H3,$d1 | |
998 | insrdi $d2,$h2,3,37 | |
999 | mtvrwz $H4,$d2 | |
1000 | ___ | |
1001 | } else { | |
1002 | ############################################################################### | |
1003 | # 32-bit initialization | |
1004 | ||
1005 | my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12)); | |
1006 | my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4); | |
1007 | ||
1008 | $code.=<<___; | |
1009 | .globl .poly1305_blocks_vsx | |
1010 | .align 5 | |
1011 | .poly1305_blocks_vsx: | |
1012 | lwz r7,24($ctx) # is_base2_26 | |
1013 | cmplwi $len,128 | |
1014 | bge __poly1305_blocks_vsx | |
1015 | cmplwi r7,0 | |
1016 | beq Lpoly1305_blocks | |
9e58d119 AP |
1017 | |
1018 | lwz $h0,0($ctx) # load hash | |
1019 | lwz $h1,4($ctx) | |
1020 | lwz $h2,8($ctx) | |
1021 | lwz $h3,12($ctx) | |
1022 | lwz $h4,16($ctx) | |
1023 | ||
a28e4890 AP |
1024 | slwi $t0,$h1,26 # base 2^26 -> base 2^32 |
1025 | srwi $h1,$h1,6 | |
1026 | slwi $t1,$h2,20 | |
1027 | srwi $h2,$h2,12 | |
1028 | addc $h0,$h0,$t0 | |
1029 | slwi $t0,$h3,14 | |
1030 | srwi $h3,$h3,18 | |
1031 | adde $h1,$h1,$t1 | |
1032 | slwi $t1,$h4,8 | |
1033 | srwi $h4,$h4,24 | |
1034 | adde $h2,$h2,$t0 | |
1035 | li $t0,0 | |
1036 | adde $h3,$h3,$t1 | |
1037 | addze $h4,$h4 | |
9e58d119 | 1038 | |
a28e4890 AP |
1039 | stw $h0,0($ctx) # store hash base 2^32 |
1040 | stw $h1,4($ctx) | |
1041 | stw $h2,8($ctx) | |
1042 | stw $h3,12($ctx) | |
1043 | stw $h4,16($ctx) | |
1044 | stw $t0,24($ctx) # clear is_base2_26 | |
9e58d119 | 1045 | |
a28e4890 AP |
1046 | b Lpoly1305_blocks |
1047 | .long 0 | |
1048 | .byte 0,12,0x14,0,0,0,4,0 | |
1049 | .size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx | |
1050 | ||
1051 | .align 5 | |
1052 | __poly1305_mul: | |
1053 | vmulouw $ACC0,$H0,$R0 | |
1054 | vmulouw $ACC1,$H1,$R0 | |
1055 | vmulouw $ACC2,$H2,$R0 | |
1056 | vmulouw $ACC3,$H3,$R0 | |
1057 | vmulouw $ACC4,$H4,$R0 | |
1058 | ||
1059 | vmulouw $T0,$H4,$S1 | |
1060 | vaddudm $ACC0,$ACC0,$T0 | |
1061 | vmulouw $T0,$H0,$R1 | |
1062 | vaddudm $ACC1,$ACC1,$T0 | |
1063 | vmulouw $T0,$H1,$R1 | |
1064 | vaddudm $ACC2,$ACC2,$T0 | |
1065 | vmulouw $T0,$H2,$R1 | |
1066 | vaddudm $ACC3,$ACC3,$T0 | |
1067 | vmulouw $T0,$H3,$R1 | |
1068 | vaddudm $ACC4,$ACC4,$T0 | |
1069 | ||
1070 | vmulouw $T0,$H3,$S2 | |
1071 | vaddudm $ACC0,$ACC0,$T0 | |
1072 | vmulouw $T0,$H4,$S2 | |
1073 | vaddudm $ACC1,$ACC1,$T0 | |
1074 | vmulouw $T0,$H0,$R2 | |
1075 | vaddudm $ACC2,$ACC2,$T0 | |
1076 | vmulouw $T0,$H1,$R2 | |
1077 | vaddudm $ACC3,$ACC3,$T0 | |
1078 | vmulouw $T0,$H2,$R2 | |
1079 | vaddudm $ACC4,$ACC4,$T0 | |
1080 | ||
1081 | vmulouw $T0,$H2,$S3 | |
1082 | vaddudm $ACC0,$ACC0,$T0 | |
1083 | vmulouw $T0,$H3,$S3 | |
1084 | vaddudm $ACC1,$ACC1,$T0 | |
1085 | vmulouw $T0,$H4,$S3 | |
1086 | vaddudm $ACC2,$ACC2,$T0 | |
1087 | vmulouw $T0,$H0,$R3 | |
1088 | vaddudm $ACC3,$ACC3,$T0 | |
1089 | vmulouw $T0,$H1,$R3 | |
1090 | vaddudm $ACC4,$ACC4,$T0 | |
1091 | ||
1092 | vmulouw $T0,$H1,$S4 | |
1093 | vaddudm $ACC0,$ACC0,$T0 | |
1094 | vmulouw $T0,$H2,$S4 | |
1095 | vaddudm $ACC1,$ACC1,$T0 | |
1096 | vmulouw $T0,$H3,$S4 | |
1097 | vaddudm $ACC2,$ACC2,$T0 | |
1098 | vmulouw $T0,$H4,$S4 | |
1099 | vaddudm $ACC3,$ACC3,$T0 | |
1100 | vmulouw $T0,$H0,$R4 | |
1101 | vaddudm $ACC4,$ACC4,$T0 | |
1102 | ||
1103 | ################################################################ | |
1104 | # lazy reduction | |
1105 | ||
1106 | vspltisb $T0,2 | |
1107 | vsrd $H4,$ACC3,$_26 | |
1108 | vsrd $H1,$ACC0,$_26 | |
1109 | vand $H3,$ACC3,$mask26 | |
1110 | vand $H0,$ACC0,$mask26 | |
1111 | vaddudm $H4,$H4,$ACC4 # h3 -> h4 | |
1112 | vaddudm $H1,$H1,$ACC1 # h0 -> h1 | |
1113 | ||
1114 | vsrd $ACC4,$H4,$_26 | |
1115 | vsrd $ACC1,$H1,$_26 | |
1116 | vand $H4,$H4,$mask26 | |
1117 | vand $H1,$H1,$mask26 | |
1118 | vaddudm $H0,$H0,$ACC4 | |
1119 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 | |
1120 | ||
1121 | vsld $ACC4,$ACC4,$T0 # <<2 | |
1122 | vsrd $ACC2,$H2,$_26 | |
1123 | vand $H2,$H2,$mask26 | |
1124 | vaddudm $H0,$H0,$ACC4 # h4 -> h0 | |
1125 | vaddudm $H3,$H3,$ACC2 # h2 -> h3 | |
1126 | ||
1127 | vsrd $ACC0,$H0,$_26 | |
1128 | vsrd $ACC3,$H3,$_26 | |
1129 | vand $H0,$H0,$mask26 | |
1130 | vand $H3,$H3,$mask26 | |
1131 | vaddudm $H1,$H1,$ACC0 # h0 -> h1 | |
1132 | vaddudm $H4,$H4,$ACC3 # h3 -> h4 | |
1133 | ||
1134 | blr | |
1135 | .long 0 | |
1136 | .byte 0,12,0x14,0,0,0,0,0 | |
1137 | .size __poly1305_mul,.-__poly1305_mul | |
1138 | ||
1139 | .align 5 | |
1140 | __poly1305_blocks_vsx: | |
1141 | $STU $sp,-$VSXFRAME($sp) | |
1142 | mflr r0 | |
1143 | li r10,`15+$LOCALS+128` | |
1144 | li r11,`31+$LOCALS+128` | |
1145 | mfspr r12,256 | |
1146 | stvx v20,r10,$sp | |
1147 | addi r10,r10,32 | |
1148 | stvx v21,r11,$sp | |
1149 | addi r11,r11,32 | |
1150 | stvx v22,r10,$sp | |
1151 | addi r10,r10,32 | |
1152 | stvx v23,r10,$sp | |
1153 | addi r10,r10,32 | |
1154 | stvx v24,r11,$sp | |
1155 | addi r11,r11,32 | |
1156 | stvx v25,r10,$sp | |
1157 | addi r10,r10,32 | |
1158 | stvx v26,r10,$sp | |
1159 | addi r10,r10,32 | |
1160 | stvx v27,r11,$sp | |
1161 | addi r11,r11,32 | |
1162 | stvx v28,r10,$sp | |
1163 | addi r10,r10,32 | |
1164 | stvx v29,r11,$sp | |
1165 | addi r11,r11,32 | |
1166 | stvx v30,r10,$sp | |
1167 | stvx v31,r11,$sp | |
1168 | stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave | |
1169 | li r12,-1 | |
1170 | mtspr 256,r12 # preserve all AltiVec registers | |
1171 | $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp) | |
1172 | $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp) | |
1173 | $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp) | |
1174 | $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp) | |
1175 | $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp) | |
1176 | $PUSH r0,`$VSXFRAME+$LRSAVE`($sp) | |
1177 | ||
1178 | bl LPICmeup | |
1179 | ||
1180 | li $x10,0x10 | |
1181 | li $x20,0x20 | |
1182 | li $x30,0x30 | |
1183 | li $x40,0x40 | |
1184 | li $x50,0x50 | |
1185 | lvx_u $mask26,$x00,$const | |
1186 | lvx_u $_26,$x10,$const | |
1187 | lvx_u $_40,$x20,$const | |
1188 | lvx_u $I2perm,$x30,$const | |
1189 | lvx_u $padbits,$x40,$const | |
1190 | ||
1191 | cmplwi r7,0 # is_base2_26? | |
1192 | bne Lskip_init_vsx | |
1193 | ||
1194 | lwz $h1,32($ctx) # load key base 2^32 | |
1195 | lwz $h2,36($ctx) | |
1196 | lwz $h3,40($ctx) | |
1197 | lwz $h4,44($ctx) | |
1198 | ||
1199 | extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26 | |
1200 | extrwi $h1,$h1,6,0 | |
1201 | insrwi $h1,$h2,20,6 | |
1202 | extrwi $h2,$h2,12,0 | |
1203 | insrwi $h2,$h3,14,6 | |
1204 | extrwi $h3,$h3,18,0 | |
1205 | insrwi $h3,$h4,8,6 | |
1206 | extrwi $h4,$h4,24,0 | |
1207 | ||
1208 | mtvrwz $R0,$h0 | |
1209 | slwi $h0,$h1,2 | |
1210 | mtvrwz $R1,$h1 | |
1211 | add $h1,$h1,$h0 | |
1212 | mtvrwz $S1,$h1 | |
1213 | slwi $h1,$h2,2 | |
1214 | mtvrwz $R2,$h2 | |
1215 | add $h2,$h2,$h1 | |
1216 | mtvrwz $S2,$h2 | |
1217 | slwi $h2,$h3,2 | |
1218 | mtvrwz $R3,$h3 | |
1219 | add $h3,$h3,$h2 | |
1220 | mtvrwz $S3,$h3 | |
1221 | slwi $h3,$h4,2 | |
1222 | mtvrwz $R4,$h4 | |
1223 | add $h4,$h4,$h3 | |
1224 | mtvrwz $S4,$h4 | |
1225 | ||
1226 | vmr $H0,$R0 | |
1227 | vmr $H1,$R1 | |
1228 | vmr $H2,$R2 | |
1229 | vmr $H3,$R3 | |
1230 | vmr $H4,$R4 | |
1231 | ||
1232 | bl __poly1305_mul # r^1:- * r^1:- | |
1233 | ||
1234 | vpermdi $R0,$H0,$R0,0b00 | |
1235 | vpermdi $R1,$H1,$R1,0b00 | |
1236 | vpermdi $R2,$H2,$R2,0b00 | |
1237 | vpermdi $R3,$H3,$R3,0b00 | |
1238 | vpermdi $R4,$H4,$R4,0b00 | |
1239 | vpermdi $H0,$H0,$H0,0b00 | |
1240 | vpermdi $H1,$H1,$H1,0b00 | |
1241 | vpermdi $H2,$H2,$H2,0b00 | |
1242 | vpermdi $H3,$H3,$H3,0b00 | |
1243 | vpermdi $H4,$H4,$H4,0b00 | |
1244 | vsld $S1,$R1,$T0 # <<2 | |
1245 | vsld $S2,$R2,$T0 | |
1246 | vsld $S3,$R3,$T0 | |
1247 | vsld $S4,$R4,$T0 | |
1248 | vaddudm $S1,$S1,$R1 | |
1249 | vaddudm $S2,$S2,$R2 | |
1250 | vaddudm $S3,$S3,$R3 | |
1251 | vaddudm $S4,$S4,$R4 | |
1252 | ||
1253 | bl __poly1305_mul # r^2:r^2 * r^2:r^1 | |
1254 | ||
1255 | addi $h0,$ctx,0x60 | |
1256 | lwz $h1,0($ctx) # load hash | |
1257 | lwz $h2,4($ctx) | |
1258 | lwz $h3,8($ctx) | |
1259 | lwz $h4,12($ctx) | |
1260 | lwz $t0,16($ctx) | |
1261 | ||
1262 | vmrgow $R0,$R0,$H0 # r^2:r^4:r^1:r^3 | |
1263 | vmrgow $R1,$R1,$H1 | |
1264 | vmrgow $R2,$R2,$H2 | |
1265 | vmrgow $R3,$R3,$H3 | |
1266 | vmrgow $R4,$R4,$H4 | |
1267 | vslw $S1,$R1,$T0 # <<2 | |
1268 | vslw $S2,$R2,$T0 | |
1269 | vslw $S3,$R3,$T0 | |
1270 | vslw $S4,$R4,$T0 | |
1271 | vadduwm $S1,$S1,$R1 | |
1272 | vadduwm $S2,$S2,$R2 | |
1273 | vadduwm $S3,$S3,$R3 | |
1274 | vadduwm $S4,$S4,$R4 | |
1275 | ||
1276 | stvx_u $R0,$x30,$ctx | |
1277 | stvx_u $R1,$x40,$ctx | |
1278 | stvx_u $S1,$x50,$ctx | |
1279 | stvx_u $R2,$x00,$h0 | |
1280 | stvx_u $S2,$x10,$h0 | |
1281 | stvx_u $R3,$x20,$h0 | |
1282 | stvx_u $S3,$x30,$h0 | |
1283 | stvx_u $R4,$x40,$h0 | |
1284 | stvx_u $S4,$x50,$h0 | |
1285 | ||
1286 | extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26 | |
1287 | extrwi $h1,$h1,6,0 | |
1288 | mtvrwz $H0,$h0 | |
1289 | insrwi $h1,$h2,20,6 | |
1290 | extrwi $h2,$h2,12,0 | |
1291 | mtvrwz $H1,$h1 | |
1292 | insrwi $h2,$h3,14,6 | |
1293 | extrwi $h3,$h3,18,0 | |
1294 | mtvrwz $H2,$h2 | |
1295 | insrwi $h3,$h4,8,6 | |
1296 | extrwi $h4,$h4,24,0 | |
1297 | mtvrwz $H3,$h3 | |
1298 | insrwi $h4,$t0,3,5 | |
1299 | mtvrwz $H4,$h4 | |
9e58d119 | 1300 | ___ |
a28e4890 | 1301 | } |
9e58d119 | 1302 | $code.=<<___; |
a28e4890 AP |
1303 | li r0,1 |
1304 | stw r0,24($ctx) # set is_base2_26 | |
1305 | b Loaded_vsx | |
1306 | ||
1307 | .align 4 | |
1308 | Lskip_init_vsx: | |
1309 | li $x10,4 | |
1310 | li $x20,8 | |
1311 | li $x30,12 | |
1312 | li $x40,16 | |
1313 | lvwzx_u $H0,$x00,$ctx | |
1314 | lvwzx_u $H1,$x10,$ctx | |
1315 | lvwzx_u $H2,$x20,$ctx | |
1316 | lvwzx_u $H3,$x30,$ctx | |
1317 | lvwzx_u $H4,$x40,$ctx | |
1318 | ||
1319 | Loaded_vsx: | |
1320 | li $x10,0x10 | |
1321 | li $x20,0x20 | |
1322 | li $x30,0x30 | |
1323 | li $x40,0x40 | |
1324 | li $x50,0x50 | |
1325 | li $x60,0x60 | |
1326 | li $x70,0x70 | |
1327 | addi $ctx_,$ctx,64 # &ctx->r[1] | |
1328 | addi $_ctx,$sp,`$LOCALS+15` # &ctx->r[1], r^2:r^4 shadow | |
1329 | ||
1330 | vxor $T0,$T0,$T0 # ensure second half is zero | |
1331 | vpermdi $H0,$H0,$T0,0b00 | |
1332 | vpermdi $H1,$H1,$T0,0b00 | |
1333 | vpermdi $H2,$H2,$T0,0b00 | |
1334 | vpermdi $H3,$H3,$T0,0b00 | |
1335 | vpermdi $H4,$H4,$T0,0b00 | |
1336 | ||
1337 | be?lvx_u $_4,$x50,$const # byte swap mask | |
1338 | lvx_u $T1,$x00,$inp # load first input block | |
1339 | lvx_u $T2,$x10,$inp | |
1340 | lvx_u $T3,$x20,$inp | |
1341 | lvx_u $T4,$x30,$inp | |
1342 | be?vperm $T1,$T1,$T1,$_4 | |
1343 | be?vperm $T2,$T2,$T2,$_4 | |
1344 | be?vperm $T3,$T3,$T3,$_4 | |
1345 | be?vperm $T4,$T4,$T4,$_4 | |
1346 | ||
1347 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 | |
1348 | vspltisb $_4,4 | |
1349 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 | |
1350 | vspltisb $_14,14 | |
1351 | vpermdi $I3,$T1,$T2,0b11 | |
1352 | ||
1353 | vsrd $I1,$I0,$_26 | |
1354 | vsrd $I2,$I2,$_4 | |
1355 | vsrd $I4,$I3,$_40 | |
1356 | vsrd $I3,$I3,$_14 | |
1357 | vand $I0,$I0,$mask26 | |
1358 | vand $I1,$I1,$mask26 | |
1359 | vand $I2,$I2,$mask26 | |
1360 | vand $I3,$I3,$mask26 | |
1361 | ||
1362 | vpermdi $T1,$T3,$T4,0b00 | |
1363 | vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 | |
1364 | vpermdi $T3,$T3,$T4,0b11 | |
1365 | ||
1366 | vsrd $T0,$T1,$_26 | |
1367 | vsrd $T2,$T2,$_4 | |
1368 | vsrd $T4,$T3,$_40 | |
1369 | vsrd $T3,$T3,$_14 | |
1370 | vand $T1,$T1,$mask26 | |
1371 | vand $T0,$T0,$mask26 | |
1372 | vand $T2,$T2,$mask26 | |
1373 | vand $T3,$T3,$mask26 | |
1374 | ||
1375 | # inp[2]:inp[0]:inp[3]:inp[1] | |
1376 | vmrgow $I4,$T4,$I4 | |
1377 | vmrgow $I0,$T1,$I0 | |
1378 | vmrgow $I1,$T0,$I1 | |
1379 | vmrgow $I2,$T2,$I2 | |
1380 | vmrgow $I3,$T3,$I3 | |
1381 | vor $I4,$I4,$padbits | |
1382 | ||
1383 | lvx_splt $R0,$x30,$ctx # taking lvx_vsplt out of loop | |
1384 | lvx_splt $R1,$x00,$ctx_ # gives ~8% improvement | |
1385 | lvx_splt $S1,$x10,$ctx_ | |
1386 | lvx_splt $R2,$x20,$ctx_ | |
1387 | lvx_splt $S2,$x30,$ctx_ | |
1388 | lvx_splt $T1,$x40,$ctx_ | |
1389 | lvx_splt $T2,$x50,$ctx_ | |
1390 | lvx_splt $T3,$x60,$ctx_ | |
1391 | lvx_splt $T4,$x70,$ctx_ | |
1392 | stvx $R1,$x00,$_ctx | |
1393 | stvx $S1,$x10,$_ctx | |
1394 | stvx $R2,$x20,$_ctx | |
1395 | stvx $S2,$x30,$_ctx | |
1396 | stvx $T1,$x40,$_ctx | |
1397 | stvx $T2,$x50,$_ctx | |
1398 | stvx $T3,$x60,$_ctx | |
1399 | stvx $T4,$x70,$_ctx | |
1400 | ||
1401 | addi $inp,$inp,0x40 | |
1402 | addi $const,$const,0x50 | |
1403 | addi r0,$len,-64 | |
1404 | srdi r0,r0,6 | |
1405 | mtctr r0 | |
1406 | b Loop_vsx | |
1407 | ||
1408 | .align 4 | |
1409 | Loop_vsx: | |
1410 | ################################################################ | |
1411 | ## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 | |
1412 | ## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r | |
1413 | ## \___________________/ | |
1414 | ## | |
1415 | ## Note that we start with inp[2:3]*r^2. This is because it | |
1416 | ## doesn't depend on reduction in previous iteration. | |
1417 | ################################################################ | |
1418 | ## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 | |
1419 | ## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 | |
1420 | ## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 | |
1421 | ## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 | |
1422 | ## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 | |
1423 | ||
1424 | vmuleuw $ACC0,$I0,$R0 | |
1425 | vmuleuw $ACC1,$I0,$R1 | |
1426 | vmuleuw $ACC2,$I0,$R2 | |
1427 | vmuleuw $ACC3,$I1,$R2 | |
1428 | ||
1429 | vmuleuw $T0,$I1,$R0 | |
1430 | vaddudm $ACC1,$ACC1,$T0 | |
1431 | vmuleuw $T0,$I1,$R1 | |
1432 | vaddudm $ACC2,$ACC2,$T0 | |
1433 | vmuleuw $ACC4,$I2,$R2 | |
1434 | vmuleuw $T0,$I4,$S1 | |
1435 | vaddudm $ACC0,$ACC0,$T0 | |
1436 | vmuleuw $T0,$I2,$R1 | |
1437 | vaddudm $ACC3,$ACC3,$T0 | |
1438 | lvx $S3,$x50,$_ctx | |
1439 | vmuleuw $T0,$I3,$R1 | |
1440 | vaddudm $ACC4,$ACC4,$T0 | |
1441 | lvx $R3,$x40,$_ctx | |
1442 | ||
1443 | vaddudm $H2,$H2,$I2 | |
1444 | vaddudm $H0,$H0,$I0 | |
1445 | vaddudm $H3,$H3,$I3 | |
1446 | vaddudm $H1,$H1,$I1 | |
1447 | vaddudm $H4,$H4,$I4 | |
1448 | ||
1449 | vmuleuw $T0,$I3,$S2 | |
1450 | vaddudm $ACC0,$ACC0,$T0 | |
1451 | vmuleuw $T0,$I4,$S2 | |
1452 | vaddudm $ACC1,$ACC1,$T0 | |
1453 | vmuleuw $T0,$I2,$R0 | |
1454 | vaddudm $ACC2,$ACC2,$T0 | |
1455 | vmuleuw $T0,$I3,$R0 | |
1456 | vaddudm $ACC3,$ACC3,$T0 | |
1457 | lvx $S4,$x70,$_ctx | |
1458 | vmuleuw $T0,$I4,$R0 | |
1459 | vaddudm $ACC4,$ACC4,$T0 | |
1460 | lvx $R4,$x60,$_ctx | |
1461 | ||
1462 | vmuleuw $T0,$I2,$S3 | |
1463 | vaddudm $ACC0,$ACC0,$T0 | |
1464 | vmuleuw $T0,$I3,$S3 | |
1465 | vaddudm $ACC1,$ACC1,$T0 | |
1466 | vmuleuw $T0,$I4,$S3 | |
1467 | vaddudm $ACC2,$ACC2,$T0 | |
1468 | vmuleuw $T0,$I0,$R3 | |
1469 | vaddudm $ACC3,$ACC3,$T0 | |
1470 | vmuleuw $T0,$I1,$R3 | |
1471 | vaddudm $ACC4,$ACC4,$T0 | |
1472 | ||
1473 | be?lvx_u $_4,$x00,$const # byte swap mask | |
1474 | lvx_u $T1,$x00,$inp # load next input block | |
1475 | lvx_u $T2,$x10,$inp | |
1476 | lvx_u $T3,$x20,$inp | |
1477 | lvx_u $T4,$x30,$inp | |
1478 | be?vperm $T1,$T1,$T1,$_4 | |
1479 | be?vperm $T2,$T2,$T2,$_4 | |
1480 | be?vperm $T3,$T3,$T3,$_4 | |
1481 | be?vperm $T4,$T4,$T4,$_4 | |
1482 | ||
1483 | vmuleuw $T0,$I1,$S4 | |
1484 | vaddudm $ACC0,$ACC0,$T0 | |
1485 | vmuleuw $T0,$I2,$S4 | |
1486 | vaddudm $ACC1,$ACC1,$T0 | |
1487 | vmuleuw $T0,$I3,$S4 | |
1488 | vaddudm $ACC2,$ACC2,$T0 | |
1489 | vmuleuw $T0,$I4,$S4 | |
1490 | vaddudm $ACC3,$ACC3,$T0 | |
1491 | vmuleuw $T0,$I0,$R4 | |
1492 | vaddudm $ACC4,$ACC4,$T0 | |
1493 | ||
1494 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 | |
1495 | vspltisb $_4,4 | |
1496 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 | |
1497 | vpermdi $I3,$T1,$T2,0b11 | |
1498 | ||
1499 | # (hash + inp[0:1]) * r^4 | |
1500 | vmulouw $T0,$H0,$R0 | |
1501 | vaddudm $ACC0,$ACC0,$T0 | |
1502 | vmulouw $T0,$H1,$R0 | |
1503 | vaddudm $ACC1,$ACC1,$T0 | |
1504 | vmulouw $T0,$H2,$R0 | |
1505 | vaddudm $ACC2,$ACC2,$T0 | |
1506 | vmulouw $T0,$H3,$R0 | |
1507 | vaddudm $ACC3,$ACC3,$T0 | |
1508 | vmulouw $T0,$H4,$R0 | |
1509 | vaddudm $ACC4,$ACC4,$T0 | |
1510 | ||
1511 | vpermdi $T1,$T3,$T4,0b00 | |
1512 | vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 | |
1513 | vpermdi $T3,$T3,$T4,0b11 | |
1514 | ||
1515 | vmulouw $T0,$H2,$S3 | |
1516 | vaddudm $ACC0,$ACC0,$T0 | |
1517 | vmulouw $T0,$H3,$S3 | |
1518 | vaddudm $ACC1,$ACC1,$T0 | |
1519 | vmulouw $T0,$H4,$S3 | |
1520 | vaddudm $ACC2,$ACC2,$T0 | |
1521 | vmulouw $T0,$H0,$R3 | |
1522 | vaddudm $ACC3,$ACC3,$T0 | |
1523 | lvx $S1,$x10,$_ctx | |
1524 | vmulouw $T0,$H1,$R3 | |
1525 | vaddudm $ACC4,$ACC4,$T0 | |
1526 | lvx $R1,$x00,$_ctx | |
1527 | ||
1528 | vsrd $I1,$I0,$_26 | |
1529 | vsrd $I2,$I2,$_4 | |
1530 | vsrd $I4,$I3,$_40 | |
1531 | vsrd $I3,$I3,$_14 | |
1532 | ||
1533 | vmulouw $T0,$H1,$S4 | |
1534 | vaddudm $ACC0,$ACC0,$T0 | |
1535 | vmulouw $T0,$H2,$S4 | |
1536 | vaddudm $ACC1,$ACC1,$T0 | |
1537 | vmulouw $T0,$H3,$S4 | |
1538 | vaddudm $ACC2,$ACC2,$T0 | |
1539 | vmulouw $T0,$H4,$S4 | |
1540 | vaddudm $ACC3,$ACC3,$T0 | |
1541 | lvx $S2,$x30,$_ctx | |
1542 | vmulouw $T0,$H0,$R4 | |
1543 | vaddudm $ACC4,$ACC4,$T0 | |
1544 | lvx $R2,$x20,$_ctx | |
1545 | ||
1546 | vand $I0,$I0,$mask26 | |
1547 | vand $I1,$I1,$mask26 | |
1548 | vand $I2,$I2,$mask26 | |
1549 | vand $I3,$I3,$mask26 | |
1550 | ||
1551 | vmulouw $T0,$H4,$S1 | |
1552 | vaddudm $ACC0,$ACC0,$T0 | |
1553 | vmulouw $T0,$H0,$R1 | |
1554 | vaddudm $ACC1,$ACC1,$T0 | |
1555 | vmulouw $T0,$H1,$R1 | |
1556 | vaddudm $ACC2,$ACC2,$T0 | |
1557 | vmulouw $T0,$H2,$R1 | |
1558 | vaddudm $ACC3,$ACC3,$T0 | |
1559 | vmulouw $T0,$H3,$R1 | |
1560 | vaddudm $ACC4,$ACC4,$T0 | |
1561 | ||
1562 | vsrd $T2,$T2,$_4 | |
1563 | vsrd $_4,$T1,$_26 | |
1564 | vsrd $T4,$T3,$_40 | |
1565 | vsrd $T3,$T3,$_14 | |
1566 | ||
1567 | vmulouw $T0,$H3,$S2 | |
1568 | vaddudm $ACC0,$ACC0,$T0 | |
1569 | vmulouw $T0,$H4,$S2 | |
1570 | vaddudm $ACC1,$ACC1,$T0 | |
1571 | vmulouw $T0,$H0,$R2 | |
1572 | vaddudm $ACC2,$ACC2,$T0 | |
1573 | vmulouw $T0,$H1,$R2 | |
1574 | vaddudm $ACC3,$ACC3,$T0 | |
1575 | vmulouw $T0,$H2,$R2 | |
1576 | vaddudm $ACC4,$ACC4,$T0 | |
1577 | ||
1578 | vand $T1,$T1,$mask26 | |
1579 | vand $_4,$_4,$mask26 | |
1580 | vand $T2,$T2,$mask26 | |
1581 | vand $T3,$T3,$mask26 | |
1582 | ||
1583 | ################################################################ | |
1584 | # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein | |
1585 | # and P. Schwabe | |
1586 | ||
1587 | vspltisb $T0,2 | |
1588 | vsrd $H4,$ACC3,$_26 | |
1589 | vsrd $H1,$ACC0,$_26 | |
1590 | vand $H3,$ACC3,$mask26 | |
1591 | vand $H0,$ACC0,$mask26 | |
1592 | vaddudm $H4,$H4,$ACC4 # h3 -> h4 | |
1593 | vaddudm $H1,$H1,$ACC1 # h0 -> h1 | |
1594 | ||
1595 | vmrgow $I4,$T4,$I4 | |
1596 | vmrgow $I0,$T1,$I0 | |
1597 | vmrgow $I1,$_4,$I1 | |
1598 | vmrgow $I2,$T2,$I2 | |
1599 | vmrgow $I3,$T3,$I3 | |
1600 | vor $I4,$I4,$padbits | |
1601 | ||
1602 | vsrd $ACC4,$H4,$_26 | |
1603 | vsrd $ACC1,$H1,$_26 | |
1604 | vand $H4,$H4,$mask26 | |
1605 | vand $H1,$H1,$mask26 | |
1606 | vaddudm $H0,$H0,$ACC4 | |
1607 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 | |
1608 | ||
1609 | vsld $ACC4,$ACC4,$T0 # <<2 | |
1610 | vsrd $ACC2,$H2,$_26 | |
1611 | vand $H2,$H2,$mask26 | |
1612 | vaddudm $H0,$H0,$ACC4 # h4 -> h0 | |
1613 | vaddudm $H3,$H3,$ACC2 # h2 -> h3 | |
1614 | ||
1615 | vsrd $ACC0,$H0,$_26 | |
1616 | vsrd $ACC3,$H3,$_26 | |
1617 | vand $H0,$H0,$mask26 | |
1618 | vand $H3,$H3,$mask26 | |
1619 | vaddudm $H1,$H1,$ACC0 # h0 -> h1 | |
1620 | vaddudm $H4,$H4,$ACC3 # h3 -> h4 | |
1621 | ||
1622 | addi $inp,$inp,0x40 | |
1623 | bdnz Loop_vsx | |
1624 | ||
1625 | neg $len,$len | |
1626 | andi. $len,$len,0x30 | |
1627 | sub $inp,$inp,$len | |
1628 | ||
1629 | lvx_u $R0,$x30,$ctx # load all powers | |
1630 | lvx_u $R1,$x00,$ctx_ | |
1631 | lvx_u $S1,$x10,$ctx_ | |
1632 | lvx_u $R2,$x20,$ctx_ | |
1633 | lvx_u $S2,$x30,$ctx_ | |
1634 | ||
1635 | Last_vsx: | |
1636 | vmuleuw $ACC0,$I0,$R0 | |
1637 | vmuleuw $ACC1,$I1,$R0 | |
1638 | vmuleuw $ACC2,$I2,$R0 | |
1639 | vmuleuw $ACC3,$I3,$R0 | |
1640 | vmuleuw $ACC4,$I4,$R0 | |
1641 | ||
1642 | vmuleuw $T0,$I4,$S1 | |
1643 | vaddudm $ACC0,$ACC0,$T0 | |
1644 | vmuleuw $T0,$I0,$R1 | |
1645 | vaddudm $ACC1,$ACC1,$T0 | |
1646 | vmuleuw $T0,$I1,$R1 | |
1647 | vaddudm $ACC2,$ACC2,$T0 | |
1648 | vmuleuw $T0,$I2,$R1 | |
1649 | vaddudm $ACC3,$ACC3,$T0 | |
1650 | lvx_u $S3,$x50,$ctx_ | |
1651 | vmuleuw $T0,$I3,$R1 | |
1652 | vaddudm $ACC4,$ACC4,$T0 | |
1653 | lvx_u $R3,$x40,$ctx_ | |
1654 | ||
1655 | vaddudm $H2,$H2,$I2 | |
1656 | vaddudm $H0,$H0,$I0 | |
1657 | vaddudm $H3,$H3,$I3 | |
1658 | vaddudm $H1,$H1,$I1 | |
1659 | vaddudm $H4,$H4,$I4 | |
1660 | ||
1661 | vmuleuw $T0,$I3,$S2 | |
1662 | vaddudm $ACC0,$ACC0,$T0 | |
1663 | vmuleuw $T0,$I4,$S2 | |
1664 | vaddudm $ACC1,$ACC1,$T0 | |
1665 | vmuleuw $T0,$I0,$R2 | |
1666 | vaddudm $ACC2,$ACC2,$T0 | |
1667 | vmuleuw $T0,$I1,$R2 | |
1668 | vaddudm $ACC3,$ACC3,$T0 | |
1669 | lvx_u $S4,$x70,$ctx_ | |
1670 | vmuleuw $T0,$I2,$R2 | |
1671 | vaddudm $ACC4,$ACC4,$T0 | |
1672 | lvx_u $R4,$x60,$ctx_ | |
1673 | ||
1674 | vmuleuw $T0,$I2,$S3 | |
1675 | vaddudm $ACC0,$ACC0,$T0 | |
1676 | vmuleuw $T0,$I3,$S3 | |
1677 | vaddudm $ACC1,$ACC1,$T0 | |
1678 | vmuleuw $T0,$I4,$S3 | |
1679 | vaddudm $ACC2,$ACC2,$T0 | |
1680 | vmuleuw $T0,$I0,$R3 | |
1681 | vaddudm $ACC3,$ACC3,$T0 | |
1682 | vmuleuw $T0,$I1,$R3 | |
1683 | vaddudm $ACC4,$ACC4,$T0 | |
1684 | ||
1685 | vmuleuw $T0,$I1,$S4 | |
1686 | vaddudm $ACC0,$ACC0,$T0 | |
1687 | vmuleuw $T0,$I2,$S4 | |
1688 | vaddudm $ACC1,$ACC1,$T0 | |
1689 | vmuleuw $T0,$I3,$S4 | |
1690 | vaddudm $ACC2,$ACC2,$T0 | |
1691 | vmuleuw $T0,$I4,$S4 | |
1692 | vaddudm $ACC3,$ACC3,$T0 | |
1693 | vmuleuw $T0,$I0,$R4 | |
1694 | vaddudm $ACC4,$ACC4,$T0 | |
1695 | ||
1696 | # (hash + inp[0:1]) * r^4 | |
1697 | vmulouw $T0,$H0,$R0 | |
1698 | vaddudm $ACC0,$ACC0,$T0 | |
1699 | vmulouw $T0,$H1,$R0 | |
1700 | vaddudm $ACC1,$ACC1,$T0 | |
1701 | vmulouw $T0,$H2,$R0 | |
1702 | vaddudm $ACC2,$ACC2,$T0 | |
1703 | vmulouw $T0,$H3,$R0 | |
1704 | vaddudm $ACC3,$ACC3,$T0 | |
1705 | vmulouw $T0,$H4,$R0 | |
1706 | vaddudm $ACC4,$ACC4,$T0 | |
1707 | ||
1708 | vmulouw $T0,$H2,$S3 | |
1709 | vaddudm $ACC0,$ACC0,$T0 | |
1710 | vmulouw $T0,$H3,$S3 | |
1711 | vaddudm $ACC1,$ACC1,$T0 | |
1712 | vmulouw $T0,$H4,$S3 | |
1713 | vaddudm $ACC2,$ACC2,$T0 | |
1714 | vmulouw $T0,$H0,$R3 | |
1715 | vaddudm $ACC3,$ACC3,$T0 | |
1716 | lvx_u $S1,$x10,$ctx_ | |
1717 | vmulouw $T0,$H1,$R3 | |
1718 | vaddudm $ACC4,$ACC4,$T0 | |
1719 | lvx_u $R1,$x00,$ctx_ | |
1720 | ||
1721 | vmulouw $T0,$H1,$S4 | |
1722 | vaddudm $ACC0,$ACC0,$T0 | |
1723 | vmulouw $T0,$H2,$S4 | |
1724 | vaddudm $ACC1,$ACC1,$T0 | |
1725 | vmulouw $T0,$H3,$S4 | |
1726 | vaddudm $ACC2,$ACC2,$T0 | |
1727 | vmulouw $T0,$H4,$S4 | |
1728 | vaddudm $ACC3,$ACC3,$T0 | |
1729 | lvx_u $S2,$x30,$ctx_ | |
1730 | vmulouw $T0,$H0,$R4 | |
1731 | vaddudm $ACC4,$ACC4,$T0 | |
1732 | lvx_u $R2,$x20,$ctx_ | |
1733 | ||
1734 | vmulouw $T0,$H4,$S1 | |
1735 | vaddudm $ACC0,$ACC0,$T0 | |
1736 | vmulouw $T0,$H0,$R1 | |
1737 | vaddudm $ACC1,$ACC1,$T0 | |
1738 | vmulouw $T0,$H1,$R1 | |
1739 | vaddudm $ACC2,$ACC2,$T0 | |
1740 | vmulouw $T0,$H2,$R1 | |
1741 | vaddudm $ACC3,$ACC3,$T0 | |
1742 | vmulouw $T0,$H3,$R1 | |
1743 | vaddudm $ACC4,$ACC4,$T0 | |
1744 | ||
1745 | vmulouw $T0,$H3,$S2 | |
1746 | vaddudm $ACC0,$ACC0,$T0 | |
1747 | vmulouw $T0,$H4,$S2 | |
1748 | vaddudm $ACC1,$ACC1,$T0 | |
1749 | vmulouw $T0,$H0,$R2 | |
1750 | vaddudm $ACC2,$ACC2,$T0 | |
1751 | vmulouw $T0,$H1,$R2 | |
1752 | vaddudm $ACC3,$ACC3,$T0 | |
1753 | vmulouw $T0,$H2,$R2 | |
1754 | vaddudm $ACC4,$ACC4,$T0 | |
1755 | ||
1756 | ################################################################ | |
1757 | # horizontal addition | |
1758 | ||
1759 | vpermdi $H0,$ACC0,$ACC0,0b10 | |
1760 | vpermdi $H1,$ACC1,$ACC1,0b10 | |
1761 | vpermdi $H2,$ACC2,$ACC2,0b10 | |
1762 | vpermdi $H3,$ACC3,$ACC3,0b10 | |
1763 | vpermdi $H4,$ACC4,$ACC4,0b10 | |
1764 | vaddudm $ACC0,$ACC0,$H0 | |
1765 | vaddudm $ACC1,$ACC1,$H1 | |
1766 | vaddudm $ACC2,$ACC2,$H2 | |
1767 | vaddudm $ACC3,$ACC3,$H3 | |
1768 | vaddudm $ACC4,$ACC4,$H4 | |
1769 | ||
1770 | ################################################################ | |
1771 | # lazy reduction | |
1772 | ||
1773 | vspltisb $T0,2 | |
1774 | vsrd $H4,$ACC3,$_26 | |
1775 | vsrd $H1,$ACC0,$_26 | |
1776 | vand $H3,$ACC3,$mask26 | |
1777 | vand $H0,$ACC0,$mask26 | |
1778 | vaddudm $H4,$H4,$ACC4 # h3 -> h4 | |
1779 | vaddudm $H1,$H1,$ACC1 # h0 -> h1 | |
1780 | ||
1781 | vsrd $ACC4,$H4,$_26 | |
1782 | vsrd $ACC1,$H1,$_26 | |
1783 | vand $H4,$H4,$mask26 | |
1784 | vand $H1,$H1,$mask26 | |
1785 | vaddudm $H0,$H0,$ACC4 | |
1786 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 | |
1787 | ||
1788 | vsld $ACC4,$ACC4,$T0 # <<2 | |
1789 | vsrd $ACC2,$H2,$_26 | |
1790 | vand $H2,$H2,$mask26 | |
1791 | vaddudm $H0,$H0,$ACC4 # h4 -> h0 | |
1792 | vaddudm $H3,$H3,$ACC2 # h2 -> h3 | |
1793 | ||
1794 | vsrd $ACC0,$H0,$_26 | |
1795 | vsrd $ACC3,$H3,$_26 | |
1796 | vand $H0,$H0,$mask26 | |
1797 | vand $H3,$H3,$mask26 | |
1798 | vaddudm $H1,$H1,$ACC0 # h0 -> h1 | |
1799 | vaddudm $H4,$H4,$ACC3 # h3 -> h4 | |
1800 | ||
1801 | beq Ldone_vsx | |
1802 | ||
1803 | add r6,$const,$len | |
1804 | ||
1805 | be?lvx_u $_4,$x00,$const # byte swap mask | |
1806 | lvx_u $T1,$x00,$inp # load last partial input block | |
1807 | lvx_u $T2,$x10,$inp | |
1808 | lvx_u $T3,$x20,$inp | |
1809 | lvx_u $T4,$x30,$inp | |
1810 | be?vperm $T1,$T1,$T1,$_4 | |
1811 | be?vperm $T2,$T2,$T2,$_4 | |
1812 | be?vperm $T3,$T3,$T3,$_4 | |
1813 | be?vperm $T4,$T4,$T4,$_4 | |
1814 | ||
1815 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 | |
1816 | vspltisb $_4,4 | |
1817 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 | |
1818 | vpermdi $I3,$T1,$T2,0b11 | |
1819 | ||
1820 | vsrd $I1,$I0,$_26 | |
1821 | vsrd $I2,$I2,$_4 | |
1822 | vsrd $I4,$I3,$_40 | |
1823 | vsrd $I3,$I3,$_14 | |
1824 | vand $I0,$I0,$mask26 | |
1825 | vand $I1,$I1,$mask26 | |
1826 | vand $I2,$I2,$mask26 | |
1827 | vand $I3,$I3,$mask26 | |
1828 | ||
1829 | vpermdi $T0,$T3,$T4,0b00 | |
1830 | vperm $T1,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 | |
1831 | vpermdi $T2,$T3,$T4,0b11 | |
1832 | ||
1833 | lvx_u $ACC0,$x00,r6 | |
1834 | lvx_u $ACC1,$x30,r6 | |
1835 | ||
1836 | vsrd $T3,$T0,$_26 | |
1837 | vsrd $T1,$T1,$_4 | |
1838 | vsrd $T4,$T2,$_40 | |
1839 | vsrd $T2,$T2,$_14 | |
1840 | vand $T0,$T0,$mask26 | |
1841 | vand $T3,$T3,$mask26 | |
1842 | vand $T1,$T1,$mask26 | |
1843 | vand $T2,$T2,$mask26 | |
1844 | ||
1845 | # inp[2]:inp[0]:inp[3]:inp[1] | |
1846 | vmrgow $I4,$T4,$I4 | |
1847 | vmrgow $I0,$T0,$I0 | |
1848 | vmrgow $I1,$T3,$I1 | |
1849 | vmrgow $I2,$T1,$I2 | |
1850 | vmrgow $I3,$T2,$I3 | |
1851 | vor $I4,$I4,$padbits | |
1852 | ||
1853 | vperm $H0,$H0,$H0,$ACC0 # move hash to right lane | |
1854 | vand $I0,$I0, $ACC1 # mask redundant input lane[s] | |
1855 | vperm $H1,$H1,$H1,$ACC0 | |
1856 | vand $I1,$I1, $ACC1 | |
1857 | vperm $H2,$H2,$H2,$ACC0 | |
1858 | vand $I2,$I2, $ACC1 | |
1859 | vperm $H3,$H3,$H3,$ACC0 | |
1860 | vand $I3,$I3, $ACC1 | |
1861 | vperm $H4,$H4,$H4,$ACC0 | |
1862 | vand $I4,$I4, $ACC1 | |
1863 | ||
1864 | vaddudm $I0,$I0,$H0 # accumulate hash | |
1865 | vxor $H0,$H0,$H0 # wipe hash value | |
1866 | vaddudm $I1,$I1,$H1 | |
1867 | vxor $H1,$H1,$H1 | |
1868 | vaddudm $I2,$I2,$H2 | |
1869 | vxor $H2,$H2,$H2 | |
1870 | vaddudm $I3,$I3,$H3 | |
1871 | vxor $H3,$H3,$H3 | |
1872 | vaddudm $I4,$I4,$H4 | |
1873 | vxor $H4,$H4,$H4 | |
1874 | ||
1875 | xor. $len,$len,$len | |
1876 | b Last_vsx | |
1877 | ||
1878 | .align 4 | |
1879 | Ldone_vsx: | |
1880 | $POP r0,`$VSXFRAME+$LRSAVE`($sp) | |
1881 | li $x10,4 | |
1882 | li $x20,8 | |
1883 | li $x30,12 | |
1884 | li $x40,16 | |
1885 | stvwx_u $H0,$x00,$ctx # store hash | |
1886 | stvwx_u $H1,$x10,$ctx | |
1887 | stvwx_u $H2,$x20,$ctx | |
1888 | stvwx_u $H3,$x30,$ctx | |
1889 | stvwx_u $H4,$x40,$ctx | |
1890 | ||
1891 | lwz r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# pull vrsave | |
1892 | mtlr r0 | |
1893 | li r10,`15+$LOCALS+128` | |
1894 | li r11,`31+$LOCALS+128` | |
1895 | mtspr 256,r12 # restore vrsave | |
1896 | lvx v20,r10,$sp | |
1897 | addi r10,r10,32 | |
1898 | lvx v21,r10,$sp | |
1899 | addi r10,r10,32 | |
1900 | lvx v22,r11,$sp | |
1901 | addi r11,r11,32 | |
1902 | lvx v23,r10,$sp | |
1903 | addi r10,r10,32 | |
1904 | lvx v24,r11,$sp | |
1905 | addi r11,r11,32 | |
1906 | lvx v25,r10,$sp | |
1907 | addi r10,r10,32 | |
1908 | lvx v26,r11,$sp | |
1909 | addi r11,r11,32 | |
1910 | lvx v27,r10,$sp | |
1911 | addi r10,r10,32 | |
1912 | lvx v28,r11,$sp | |
1913 | addi r11,r11,32 | |
1914 | lvx v29,r10,$sp | |
1915 | addi r10,r10,32 | |
1916 | lvx v30,r11,$sp | |
1917 | lvx v31,r10,$sp | |
1918 | $POP r27,`$VSXFRAME-$SIZE_T*5`($sp) | |
1919 | $POP r28,`$VSXFRAME-$SIZE_T*4`($sp) | |
1920 | $POP r29,`$VSXFRAME-$SIZE_T*3`($sp) | |
1921 | $POP r30,`$VSXFRAME-$SIZE_T*2`($sp) | |
1922 | $POP r31,`$VSXFRAME-$SIZE_T*1`($sp) | |
1923 | addi $sp,$sp,$VSXFRAME | |
9e58d119 AP |
1924 | blr |
1925 | .long 0 | |
a28e4890 AP |
1926 | .byte 0,12,0x04,1,0x80,5,4,0 |
1927 | .long 0 | |
1928 | .size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx | |
1929 | ||
1930 | .align 6 | |
1931 | LPICmeup: | |
1932 | mflr r0 | |
1933 | bcl 20,31,\$+4 | |
1934 | mflr $const # vvvvvv "distance" between . and 1st data entry | |
1935 | addi $const,$const,`64-8` | |
1936 | mtlr r0 | |
1937 | blr | |
1938 | .long 0 | |
1939 | .byte 0,12,0x14,0,0,0,0,0 | |
1940 | .space `64-9*4` | |
1941 | ||
1942 | .quad 0x0000000003ffffff,0x0000000003ffffff # mask26 | |
1943 | .quad 0x000000000000001a,0x000000000000001a # _26 | |
1944 | .quad 0x0000000000000028,0x0000000000000028 # _40 | |
1945 | .quad 0x000000000e0f0001,0x000000001e1f1011 # I2perm | |
1946 | .quad 0x0100000001000000,0x0100000001000000 # padbits | |
1947 | .quad 0x0706050403020100,0x0f0e0d0c0b0a0908 # byte swap for big-endian | |
1948 | ||
1949 | .quad 0x0000000000000000,0x0000000004050607 # magic tail masks | |
1950 | .quad 0x0405060700000000,0x0000000000000000 | |
1951 | .quad 0x0000000000000000,0x0405060700000000 | |
1952 | ||
1953 | .quad 0xffffffff00000000,0xffffffffffffffff | |
1954 | .quad 0xffffffff00000000,0xffffffff00000000 | |
1955 | .quad 0x0000000000000000,0xffffffff00000000 | |
9e58d119 | 1956 | ___ |
a28e4890 | 1957 | }}} |
# Append the CRYPTOGAMS identification string to the generated module's
# constant data.  (Payload of the heredoc is emitted verbatim into the
# assembly output; do not alter it.)
$code.=<<___;
.asciz	"Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"
___
1961 | ||
a28e4890 AP |
1962 | foreach (split("\n",$code)) { |
1963 | s/\`([^\`]*)\`/eval($1)/ge; | |
1964 | ||
1965 | # instructions prefixed with '?' are endian-specific and need | |
1966 | # to be adjusted accordingly... | |
1967 | if ($flavour !~ /le$/) { # big-endian | |
1968 | s/be\?// or | |
1969 | s/le\?/#le#/ | |
1970 | } else { # little-endian | |
1971 | s/le\?// or | |
1972 | s/be\?/#be#/ | |
1973 | } | |
1974 | ||
1975 | print $_,"\n"; | |
1976 | } | |
9e58d119 | 1977 | close STDOUT; |