]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
49d3b641 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
9e58d119 AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # This module implements Poly1305 hash for PowerPC FPU. | |
18 | # | |
19 | # June 2015 | |
20 | # | |
21 | # Numbers are cycles per processed byte with poly1305_blocks alone, | |
22 | # and improvement coefficients relative to gcc-generated code. | |
23 | # | |
24 | # Freescale e300 9.78/+30% | |
4b8736a2 AP |
25 | # PPC74x0 6.92/+50% |
26 | # PPC970 6.03/+80% | |
9e58d119 AP |
27 | # POWER7 3.50/+30% |
28 | # POWER8 3.75/+10% | |
29 | ||
1aa89a7a RL |
30 | # $output is the last argument if it looks like a file (it ends in .ext); otherwise it is undef |
31 | # $flavour is the first argument if it doesn't look like a file (it contains no dot); otherwise it is undef | |
32 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; | |
33 | $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; | |
9e58d119 AP |
34 | |
35 | if ($flavour =~ /64/) { | |
36 | $SIZE_T =8; | |
37 | $LRSAVE =2*$SIZE_T; | |
38 | $UCMP ="cmpld"; | |
39 | $STU ="stdu"; | |
40 | $POP ="ld"; | |
41 | $PUSH ="std"; | |
42 | } elsif ($flavour =~ /32/) { | |
43 | $SIZE_T =4; | |
44 | $LRSAVE =$SIZE_T; | |
45 | $UCMP ="cmplw"; | |
46 | $STU ="stwu"; | |
47 | $POP ="lwz"; | |
48 | $PUSH ="stw"; | |
49 | } else { die "nonsense $flavour"; } | |
50 | ||
51 | $LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0; | |
52 | ||
53 | $LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx"; | |
54 | ||
55 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
56 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
57 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
58 | die "can't locate ppc-xlate.pl"; | |
59 | ||
1aa89a7a RL |
60 | open STDOUT,"| $^X $xlate $flavour \"$output\"" |
61 | or die "can't call $xlate: $!"; | |
9e58d119 AP |
62 | |
63 | $LOCALS=6*$SIZE_T; | |
64 | $FRAME=$LOCALS+6*8+18*8; | |
65 | ||
66 | my $sp="r1"; | |
67 | ||
68 | my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6)); | |
69 | my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6)); | |
70 | ||
71 | my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi, | |
72 | $two0,$two32,$two64,$two96,$two130,$five_two130, | |
73 | $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi, | |
74 | $s2lo,$s2hi,$s3lo,$s3hi, | |
75 | $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31)); | |
76 | # borrowings: the names below reuse FP registers already assigned to c0..c3 above | |
77 | my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi); | |
78 | my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi); | |
79 | my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi); | |
80 | ||
81 | $code.=<<___; | |
82 | .machine "any" | |
83 | .text | |
84 | ||
85 | .globl .poly1305_init_fpu | |
86 | .align 6 | |
87 | .poly1305_init_fpu: | |
88 | $STU $sp,-$LOCALS($sp) # minimal frame | |
89 | mflr $padbit | |
90 | $PUSH $padbit,`$LOCALS+$LRSAVE`($sp) | |
91 | ||
92 | bl LPICmeup | |
93 | ||
94 | xor r0,r0,r0 | |
95 | mtlr $padbit # restore lr | |
96 | ||
97 | lfd $two0,8*0($len) # load constants from the table located by LPICmeup | |
98 | lfd $two32,8*1($len) | |
99 | lfd $two64,8*2($len) | |
100 | lfd $two96,8*3($len) | |
101 | lfd $two130,8*4($len) | |
102 | lfd $five_two130,8*5($len) | |
103 | ||
104 | stfd $two0,8*0($ctx) # initial hash value, biased 0 | |
105 | stfd $two32,8*1($ctx) | |
106 | stfd $two64,8*2($ctx) | |
107 | stfd $two96,8*3($ctx) | |
108 | ||
109 | $UCMP $inp,r0 | |
110 | beq- Lno_key | |
111 | ||
112 | lfd $h3lo,8*13($len) # new fpscr (last table entry: truncate, no exceptions) | |
113 | mffs $h3hi # old fpscr | |
114 | ||
115 | stfd $two0,8*4($ctx) # key "template" | |
116 | stfd $two32,8*5($ctx) | |
117 | stfd $two64,8*6($ctx) | |
118 | stfd $two96,8*7($ctx) | |
119 | ||
120 | li $in1,4 | |
121 | li $in2,8 | |
122 | li $in3,12 | |
123 | $LWXLE $in0,0,$inp # load key | |
124 | $LWXLE $in1,$in1,$inp | |
125 | $LWXLE $in2,$in2,$inp | |
126 | $LWXLE $in3,$in3,$inp | |
127 | ||
128 | lis $i1,0xf000 # 0xf0000000 | |
129 | ori $i2,$i1,3 # 0xf0000003 | |
130 | andc $in0,$in0,$i1 # &=0x0fffffff | |
131 | andc $in1,$in1,$i2 # &=0x0ffffffc | |
132 | andc $in2,$in2,$i2 | |
133 | andc $in3,$in3,$i2 | |
134 | ||
135 | stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template" | |
136 | stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx) | |
137 | stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx) | |
138 | stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx) | |
139 | ||
140 | mtfsf 255,$h3lo # fpscr | |
141 | stfd $two0,8*18($ctx) # copy constants to context | |
142 | stfd $two32,8*19($ctx) | |
143 | stfd $two64,8*20($ctx) | |
144 | stfd $two96,8*21($ctx) | |
145 | stfd $two130,8*22($ctx) | |
146 | stfd $five_two130,8*23($ctx) | |
147 | ||
148 | lfd $h0lo,8*4($ctx) # load [biased] key | |
149 | lfd $h1lo,8*5($ctx) | |
150 | lfd $h2lo,8*6($ctx) | |
151 | lfd $h3lo,8*7($ctx) | |
152 | ||
153 | fsub $h0lo,$h0lo,$two0 # r0 | |
154 | fsub $h1lo,$h1lo,$two32 # r1 | |
155 | fsub $h2lo,$h2lo,$two64 # r2 | |
156 | fsub $h3lo,$h3lo,$two96 # r3 | |
157 | ||
158 | lfd $two0,8*6($len) # more constants (the 2^(52+16+n) table entries) | |
159 | lfd $two32,8*7($len) | |
160 | lfd $two64,8*8($len) | |
161 | lfd $two96,8*9($len) | |
162 | ||
163 | fmul $h1hi,$h1lo,$five_two130 # s1 | |
164 | fmul $h2hi,$h2lo,$five_two130 # s2 | |
165 | stfd $h3hi,8*15($ctx) # borrow slot for original fpscr | |
166 | fmul $h3hi,$h3lo,$five_two130 # s3 | |
167 | ||
168 | fadd $h0hi,$h0lo,$two0 | |
169 | stfd $h1hi,8*12($ctx) # put aside for now | |
170 | fadd $h1hi,$h1lo,$two32 | |
171 | stfd $h2hi,8*13($ctx) | |
172 | fadd $h2hi,$h2lo,$two64 | |
173 | stfd $h3hi,8*14($ctx) | |
174 | fadd $h3hi,$h3lo,$two96 | |
175 | ||
176 | fsub $h0hi,$h0hi,$two0 | |
177 | fsub $h1hi,$h1hi,$two32 | |
178 | fsub $h2hi,$h2hi,$two64 | |
179 | fsub $h3hi,$h3hi,$two96 | |
180 | ||
181 | lfd $two0,8*10($len) # more constants (the 2^(52+16+n-96) table entries) | |
182 | lfd $two32,8*11($len) | |
183 | lfd $two64,8*12($len) | |
184 | ||
185 | fsub $h0lo,$h0lo,$h0hi | |
186 | fsub $h1lo,$h1lo,$h1hi | |
187 | fsub $h2lo,$h2lo,$h2hi | |
188 | fsub $h3lo,$h3lo,$h3hi | |
189 | ||
190 | stfd $h0hi,8*5($ctx) # r0hi | |
191 | stfd $h1hi,8*7($ctx) # r1hi | |
192 | stfd $h2hi,8*9($ctx) # r2hi | |
193 | stfd $h3hi,8*11($ctx) # r3hi | |
194 | ||
195 | stfd $h0lo,8*4($ctx) # r0lo | |
196 | stfd $h1lo,8*6($ctx) # r1lo | |
197 | stfd $h2lo,8*8($ctx) # r2lo | |
198 | stfd $h3lo,8*10($ctx) # r3lo | |
199 | ||
200 | lfd $h1lo,8*12($ctx) # s1 | |
201 | lfd $h2lo,8*13($ctx) # s2 | |
202 | lfd $h3lo,8*14($ctx) # s3 | |
203 | lfd $h0lo,8*15($ctx) # pull original fpscr | |
204 | ||
205 | fadd $h1hi,$h1lo,$two0 | |
206 | fadd $h2hi,$h2lo,$two32 | |
207 | fadd $h3hi,$h3lo,$two64 | |
208 | ||
209 | fsub $h1hi,$h1hi,$two0 | |
210 | fsub $h2hi,$h2hi,$two32 | |
211 | fsub $h3hi,$h3hi,$two64 | |
212 | ||
213 | fsub $h1lo,$h1lo,$h1hi | |
214 | fsub $h2lo,$h2lo,$h2hi | |
215 | fsub $h3lo,$h3lo,$h3hi | |
216 | ||
217 | stfd $h1hi,8*13($ctx) # s1hi | |
218 | stfd $h2hi,8*15($ctx) # s2hi | |
219 | stfd $h3hi,8*17($ctx) # s3hi | |
220 | ||
221 | stfd $h1lo,8*12($ctx) # s1lo | |
222 | stfd $h2lo,8*14($ctx) # s2lo | |
223 | stfd $h3lo,8*16($ctx) # s3lo | |
224 | ||
225 | mtfsf 255,$h0lo # restore fpscr | |
226 | Lno_key: | |
227 | xor r3,r3,r3 | |
228 | addi $sp,$sp,$LOCALS | |
229 | blr | |
230 | .long 0 | |
231 | .byte 0,12,4,1,0x80,0,2,0 | |
232 | .size .poly1305_init_fpu,.-.poly1305_init_fpu | |
233 | ||
234 | .globl .poly1305_blocks_fpu | |
235 | .align 4 | |
236 | .poly1305_blocks_fpu: | |
237 | srwi. $len,$len,4 | |
238 | beq- Labort | |
239 | ||
240 | $STU $sp,-$FRAME($sp) | |
241 | mflr r0 | |
242 | stfd f14,`$FRAME-8*18`($sp) | |
243 | stfd f15,`$FRAME-8*17`($sp) | |
244 | stfd f16,`$FRAME-8*16`($sp) | |
245 | stfd f17,`$FRAME-8*15`($sp) | |
246 | stfd f18,`$FRAME-8*14`($sp) | |
247 | stfd f19,`$FRAME-8*13`($sp) | |
248 | stfd f20,`$FRAME-8*12`($sp) | |
249 | stfd f21,`$FRAME-8*11`($sp) | |
250 | stfd f22,`$FRAME-8*10`($sp) | |
251 | stfd f23,`$FRAME-8*9`($sp) | |
252 | stfd f24,`$FRAME-8*8`($sp) | |
253 | stfd f25,`$FRAME-8*7`($sp) | |
254 | stfd f26,`$FRAME-8*6`($sp) | |
255 | stfd f27,`$FRAME-8*5`($sp) | |
256 | stfd f28,`$FRAME-8*4`($sp) | |
257 | stfd f29,`$FRAME-8*3`($sp) | |
258 | stfd f30,`$FRAME-8*2`($sp) | |
259 | stfd f31,`$FRAME-8*1`($sp) | |
260 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | |
261 | ||
262 | xor r0,r0,r0 | |
263 | li $in3,1 | |
264 | mtctr $len | |
265 | neg $len,$len | |
266 | stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp) | |
267 | stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp) | |
268 | ||
269 | lfd $two0,8*18($ctx) # load constants | |
270 | lfd $two32,8*19($ctx) | |
271 | lfd $two64,8*20($ctx) | |
272 | lfd $two96,8*21($ctx) | |
273 | lfd $two130,8*22($ctx) | |
274 | lfd $five_two130,8*23($ctx) | |
275 | ||
276 | lfd $h0lo,8*0($ctx) # load [biased] hash value | |
277 | lfd $h1lo,8*1($ctx) | |
278 | lfd $h2lo,8*2($ctx) | |
279 | lfd $h3lo,8*3($ctx) | |
280 | ||
281 | stfd $two0,`$LOCALS+8*0`($sp) # input "template" | |
282 | oris $in3,$padbit,`(1023+52+96)<<4` | |
283 | stfd $two32,`$LOCALS+8*1`($sp) | |
284 | stfd $two64,`$LOCALS+8*2`($sp) | |
285 | stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp) | |
286 | ||
287 | li $i1,4 | |
288 | li $i2,8 | |
289 | li $i3,12 | |
290 | $LWXLE $in0,0,$inp # load input | |
291 | $LWXLE $in1,$i1,$inp | |
292 | $LWXLE $in2,$i2,$inp | |
293 | $LWXLE $in3,$i3,$inp | |
294 | addi $inp,$inp,16 | |
295 | ||
296 | stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template" | |
297 | stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp) | |
298 | stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp) | |
299 | stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp) | |
300 | ||
301 | mffs $x0 # original fpscr | |
302 | lfd $x1,`$LOCALS+8*4`($sp) # new fpscr (the 0/1 word pair stored to the frame above) | |
303 | lfd $r0lo,8*4($ctx) # load key | |
304 | lfd $r0hi,8*5($ctx) | |
305 | lfd $r1lo,8*6($ctx) | |
306 | lfd $r1hi,8*7($ctx) | |
307 | lfd $r2lo,8*8($ctx) | |
308 | lfd $r2hi,8*9($ctx) | |
309 | lfd $r3lo,8*10($ctx) | |
310 | lfd $r3hi,8*11($ctx) | |
311 | lfd $s1lo,8*12($ctx) | |
312 | lfd $s1hi,8*13($ctx) | |
313 | lfd $s2lo,8*14($ctx) | |
314 | lfd $s2hi,8*15($ctx) | |
315 | lfd $s3lo,8*16($ctx) | |
316 | lfd $s3hi,8*17($ctx) | |
317 | ||
318 | stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr | |
319 | mtfsf 255,$x1 | |
320 | ||
321 | addic $len,$len,1 | |
322 | addze r0,r0 | |
323 | slwi. r0,r0,4 | |
324 | sub $inp,$inp,r0 # conditional rewind: r0 is 16 if the addic above carried, else 0 | |
325 | ||
326 | lfd $x0,`$LOCALS+8*0`($sp) | |
327 | lfd $x1,`$LOCALS+8*1`($sp) | |
328 | lfd $x2,`$LOCALS+8*2`($sp) | |
329 | lfd $x3,`$LOCALS+8*3`($sp) | |
330 | ||
331 | fsub $h0lo,$h0lo,$two0 # de-bias hash value | |
332 | $LWXLE $in0,0,$inp # modulo-scheduled input load | |
333 | fsub $h1lo,$h1lo,$two32 | |
334 | $LWXLE $in1,$i1,$inp | |
335 | fsub $h2lo,$h2lo,$two64 | |
336 | $LWXLE $in2,$i2,$inp | |
337 | fsub $h3lo,$h3lo,$two96 | |
338 | $LWXLE $in3,$i3,$inp | |
339 | ||
340 | fsub $x0,$x0,$two0 # de-bias input | |
341 | addi $inp,$inp,16 | |
342 | fsub $x1,$x1,$two32 | |
343 | fsub $x2,$x2,$two64 | |
344 | fsub $x3,$x3,$two96 | |
345 | ||
346 | fadd $x0,$x0,$h0lo # accumulate input | |
347 | stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) | |
348 | fadd $x1,$x1,$h1lo | |
349 | stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp) | |
350 | fadd $x2,$x2,$h2lo | |
351 | stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp) | |
352 | fadd $x3,$x3,$h3lo | |
353 | stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp) | |
354 | ||
355 | b Lentry | |
356 | ||
357 | .align 4 | |
358 | Loop: | |
359 | fsub $y0,$y0,$two0 # de-bias input | |
360 | addic $len,$len,1 | |
361 | fsub $y1,$y1,$two32 | |
362 | addze r0,r0 | |
363 | fsub $y2,$y2,$two64 | |
364 | slwi. r0,r0,4 | |
365 | fsub $y3,$y3,$two96 | |
366 | sub $inp,$inp,r0 # conditional rewind | |
367 | ||
368 | fadd $h0lo,$h0lo,$y0 # accumulate input | |
369 | fadd $h0hi,$h0hi,$y1 | |
370 | fadd $h2lo,$h2lo,$y2 | |
371 | fadd $h2hi,$h2hi,$y3 | |
372 | ||
373 | ######################################### base 2^48 -> base 2^32 | |
374 | fadd $c1lo,$h1lo,$two64 | |
375 | $LWXLE $in0,0,$inp # modulo-scheduled input load | |
376 | fadd $c1hi,$h1hi,$two64 | |
377 | $LWXLE $in1,$i1,$inp | |
378 | fadd $c3lo,$h3lo,$two130 | |
379 | $LWXLE $in2,$i2,$inp | |
380 | fadd $c3hi,$h3hi,$two130 | |
381 | $LWXLE $in3,$i3,$inp | |
382 | fadd $c0lo,$h0lo,$two32 | |
383 | addi $inp,$inp,16 | |
384 | fadd $c0hi,$h0hi,$two32 | |
385 | fadd $c2lo,$h2lo,$two96 | |
386 | fadd $c2hi,$h2hi,$two96 | |
387 | ||
388 | fsub $c1lo,$c1lo,$two64 | |
389 | stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template" | |
390 | fsub $c1hi,$c1hi,$two64 | |
391 | stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp) | |
392 | fsub $c3lo,$c3lo,$two130 | |
393 | stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp) | |
394 | fsub $c3hi,$c3hi,$two130 | |
395 | stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp) | |
396 | fsub $c0lo,$c0lo,$two32 | |
397 | fsub $c0hi,$c0hi,$two32 | |
398 | fsub $c2lo,$c2lo,$two96 | |
399 | fsub $c2hi,$c2hi,$two96 | |
400 | ||
401 | fsub $h1lo,$h1lo,$c1lo | |
402 | fsub $h1hi,$h1hi,$c1hi | |
403 | fsub $h3lo,$h3lo,$c3lo | |
404 | fsub $h3hi,$h3hi,$c3hi | |
405 | fsub $h2lo,$h2lo,$c2lo | |
406 | fsub $h2hi,$h2hi,$c2hi | |
407 | fsub $h0lo,$h0lo,$c0lo | |
408 | fsub $h0hi,$h0hi,$c0hi | |
409 | ||
410 | fadd $h1lo,$h1lo,$c0lo | |
411 | fadd $h1hi,$h1hi,$c0hi | |
412 | fadd $h3lo,$h3lo,$c2lo | |
413 | fadd $h3hi,$h3hi,$c2hi | |
414 | fadd $h2lo,$h2lo,$c1lo | |
415 | fadd $h2hi,$h2hi,$c1hi | |
416 | fmadd $h0lo,$c3lo,$five_two130,$h0lo | |
417 | fmadd $h0hi,$c3hi,$five_two130,$h0hi | |
418 | ||
419 | fadd $x1,$h1lo,$h1hi | |
420 | lfd $s1lo,8*12($ctx) # reload constants | |
421 | fadd $x3,$h3lo,$h3hi | |
422 | lfd $s1hi,8*13($ctx) | |
423 | fadd $x2,$h2lo,$h2hi | |
424 | lfd $r3lo,8*10($ctx) | |
425 | fadd $x0,$h0lo,$h0hi | |
426 | lfd $r3hi,8*11($ctx) | |
427 | Lentry: | |
428 | fmul $h0lo,$s3lo,$x1 | |
429 | fmul $h0hi,$s3hi,$x1 | |
430 | fmul $h2lo,$r1lo,$x1 | |
431 | fmul $h2hi,$r1hi,$x1 | |
432 | fmul $h1lo,$r0lo,$x1 | |
433 | fmul $h1hi,$r0hi,$x1 | |
434 | fmul $h3lo,$r2lo,$x1 | |
435 | fmul $h3hi,$r2hi,$x1 | |
436 | ||
437 | fmadd $h0lo,$s1lo,$x3,$h0lo | |
438 | fmadd $h0hi,$s1hi,$x3,$h0hi | |
439 | fmadd $h2lo,$s3lo,$x3,$h2lo | |
440 | fmadd $h2hi,$s3hi,$x3,$h2hi | |
441 | fmadd $h1lo,$s2lo,$x3,$h1lo | |
442 | fmadd $h1hi,$s2hi,$x3,$h1hi | |
443 | fmadd $h3lo,$r0lo,$x3,$h3lo | |
444 | fmadd $h3hi,$r0hi,$x3,$h3hi | |
445 | ||
446 | fmadd $h0lo,$s2lo,$x2,$h0lo | |
447 | fmadd $h0hi,$s2hi,$x2,$h0hi | |
448 | fmadd $h2lo,$r0lo,$x2,$h2lo | |
449 | fmadd $h2hi,$r0hi,$x2,$h2hi | |
450 | fmadd $h1lo,$s3lo,$x2,$h1lo | |
451 | fmadd $h1hi,$s3hi,$x2,$h1hi | |
452 | fmadd $h3lo,$r1lo,$x2,$h3lo | |
453 | fmadd $h3hi,$r1hi,$x2,$h3hi | |
454 | ||
455 | fmadd $h0lo,$r0lo,$x0,$h0lo | |
456 | lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input | |
457 | fmadd $h0hi,$r0hi,$x0,$h0hi | |
458 | lfd $y1,`$LOCALS+8*1`($sp) | |
459 | fmadd $h2lo,$r2lo,$x0,$h2lo | |
460 | lfd $y2,`$LOCALS+8*2`($sp) | |
461 | fmadd $h2hi,$r2hi,$x0,$h2hi | |
462 | lfd $y3,`$LOCALS+8*3`($sp) | |
463 | fmadd $h1lo,$r1lo,$x0,$h1lo | |
464 | fmadd $h1hi,$r1hi,$x0,$h1hi | |
465 | fmadd $h3lo,$r3lo,$x0,$h3lo | |
466 | fmadd $h3hi,$r3hi,$x0,$h3hi | |
467 | ||
468 | bdnz Loop | |
469 | ||
470 | ######################################### base 2^48 -> base 2^32 | |
471 | fadd $c0lo,$h0lo,$two32 | |
472 | fadd $c0hi,$h0hi,$two32 | |
473 | fadd $c2lo,$h2lo,$two96 | |
474 | fadd $c2hi,$h2hi,$two96 | |
475 | fadd $c1lo,$h1lo,$two64 | |
476 | fadd $c1hi,$h1hi,$two64 | |
477 | fadd $c3lo,$h3lo,$two130 | |
478 | fadd $c3hi,$h3hi,$two130 | |
479 | ||
480 | fsub $c0lo,$c0lo,$two32 | |
481 | fsub $c0hi,$c0hi,$two32 | |
482 | fsub $c2lo,$c2lo,$two96 | |
483 | fsub $c2hi,$c2hi,$two96 | |
484 | fsub $c1lo,$c1lo,$two64 | |
485 | fsub $c1hi,$c1hi,$two64 | |
486 | fsub $c3lo,$c3lo,$two130 | |
487 | fsub $c3hi,$c3hi,$two130 | |
488 | ||
489 | fsub $h1lo,$h1lo,$c1lo | |
490 | fsub $h1hi,$h1hi,$c1hi | |
491 | fsub $h3lo,$h3lo,$c3lo | |
492 | fsub $h3hi,$h3hi,$c3hi | |
493 | fsub $h2lo,$h2lo,$c2lo | |
494 | fsub $h2hi,$h2hi,$c2hi | |
495 | fsub $h0lo,$h0lo,$c0lo | |
496 | fsub $h0hi,$h0hi,$c0hi | |
497 | ||
498 | fadd $h1lo,$h1lo,$c0lo | |
499 | fadd $h1hi,$h1hi,$c0hi | |
500 | fadd $h3lo,$h3lo,$c2lo | |
501 | fadd $h3hi,$h3hi,$c2hi | |
502 | fadd $h2lo,$h2lo,$c1lo | |
503 | fadd $h2hi,$h2hi,$c1hi | |
504 | fmadd $h0lo,$c3lo,$five_two130,$h0lo | |
505 | fmadd $h0hi,$c3hi,$five_two130,$h0hi | |
506 | ||
507 | fadd $x1,$h1lo,$h1hi | |
508 | fadd $x3,$h3lo,$h3hi | |
509 | fadd $x2,$h2lo,$h2hi | |
510 | fadd $x0,$h0lo,$h0hi | |
511 | ||
512 | lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr | |
513 | fadd $x1,$x1,$two32 # bias | |
514 | fadd $x3,$x3,$two96 | |
515 | fadd $x2,$x2,$two64 | |
516 | fadd $x0,$x0,$two0 | |
517 | ||
518 | stfd $x1,8*1($ctx) # store [biased] hash value | |
519 | stfd $x3,8*3($ctx) | |
520 | stfd $x2,8*2($ctx) | |
521 | stfd $x0,8*0($ctx) | |
522 | ||
523 | mtfsf 255,$h0lo # restore original fpscr | |
524 | lfd f14,`$FRAME-8*18`($sp) | |
525 | lfd f15,`$FRAME-8*17`($sp) | |
526 | lfd f16,`$FRAME-8*16`($sp) | |
527 | lfd f17,`$FRAME-8*15`($sp) | |
528 | lfd f18,`$FRAME-8*14`($sp) | |
529 | lfd f19,`$FRAME-8*13`($sp) | |
530 | lfd f20,`$FRAME-8*12`($sp) | |
531 | lfd f21,`$FRAME-8*11`($sp) | |
532 | lfd f22,`$FRAME-8*10`($sp) | |
533 | lfd f23,`$FRAME-8*9`($sp) | |
534 | lfd f24,`$FRAME-8*8`($sp) | |
535 | lfd f25,`$FRAME-8*7`($sp) | |
536 | lfd f26,`$FRAME-8*6`($sp) | |
537 | lfd f27,`$FRAME-8*5`($sp) | |
538 | lfd f28,`$FRAME-8*4`($sp) | |
539 | lfd f29,`$FRAME-8*3`($sp) | |
540 | lfd f30,`$FRAME-8*2`($sp) | |
541 | lfd f31,`$FRAME-8*1`($sp) | |
542 | addi $sp,$sp,$FRAME | |
543 | Labort: | |
544 | blr | |
545 | .long 0 | |
546 | .byte 0,12,4,1,0x80,0,4,0 | |
547 | .size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu | |
548 | ___ | |
549 | { | |
550 | my ($mac,$nonce)=($inp,$len); | |
551 | ||
552 | my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3 | |
553 | ) = map("r$_",(7..11,28..31)); | |
554 | my $mask = "r0"; | |
555 | my $FRAME = (6+4)*$SIZE_T; | |
556 | ||
557 | $code.=<<___; | |
558 | .globl .poly1305_emit_fpu | |
559 | .align 4 | |
560 | .poly1305_emit_fpu: | |
561 | $STU $sp,-$FRAME($sp) | |
562 | mflr r0 | |
563 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
564 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
565 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
566 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
567 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | |
568 | ||
569 | lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash | |
570 | lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx) | |
571 | lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx) | |
572 | lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx) | |
573 | lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx) | |
574 | lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx) | |
575 | lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx) | |
576 | lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx) | |
577 | ||
578 | lis $mask,0xfff0 | |
579 | andc $d0,$d0,$mask # mask exponent | |
580 | andc $d1,$d1,$mask | |
581 | andc $d2,$d2,$mask | |
582 | andc $d3,$d3,$mask # can be partially reduced... | |
583 | li $mask,3 | |
584 | ||
585 | srwi $padbit,$d3,2 # ... so reduce | |
586 | and $h4,$d3,$mask | |
587 | andc $d3,$d3,$mask | |
588 | add $d3,$d3,$padbit | |
589 | ___ | |
590 | if ($SIZE_T==4) { | |
591 | $code.=<<___; | |
592 | addc $h0,$h0,$d3 | |
593 | adde $h1,$h1,$d0 | |
594 | adde $h2,$h2,$d1 | |
595 | adde $h3,$h3,$d2 | |
596 | addze $h4,$h4 | |
597 | ||
598 | addic $d0,$h0,5 # compare to modulus | |
599 | addze $d1,$h1 | |
600 | addze $d2,$h2 | |
601 | addze $d3,$h3 | |
602 | addze $mask,$h4 | |
603 | ||
604 | srwi $mask,$mask,2 # did it carry/borrow? | |
605 | neg $mask,$mask | |
606 | srawi $mask,$mask,31 # mask: all ones if the shifted value was nonzero, else zero | |
607 | ||
608 | andc $h0,$h0,$mask | |
609 | and $d0,$d0,$mask | |
610 | andc $h1,$h1,$mask | |
611 | and $d1,$d1,$mask | |
612 | or $h0,$h0,$d0 | |
613 | lwz $d0,0($nonce) # load nonce | |
614 | andc $h2,$h2,$mask | |
615 | and $d2,$d2,$mask | |
616 | or $h1,$h1,$d1 | |
617 | lwz $d1,4($nonce) | |
618 | andc $h3,$h3,$mask | |
619 | and $d3,$d3,$mask | |
620 | or $h2,$h2,$d2 | |
621 | lwz $d2,8($nonce) | |
622 | or $h3,$h3,$d3 | |
623 | lwz $d3,12($nonce) | |
624 | ||
625 | addc $h0,$h0,$d0 # accumulate nonce | |
626 | adde $h1,$h1,$d1 | |
627 | adde $h2,$h2,$d2 | |
628 | adde $h3,$h3,$d3 | |
629 | ___ | |
630 | } else { | |
631 | $code.=<<___; | |
632 | add $h0,$h0,$d3 | |
633 | add $h1,$h1,$d0 | |
634 | add $h2,$h2,$d1 | |
635 | add $h3,$h3,$d2 | |
636 | ||
637 | srdi $d0,$h0,32 | |
638 | add $h1,$h1,$d0 | |
639 | srdi $d1,$h1,32 | |
640 | add $h2,$h2,$d1 | |
641 | srdi $d2,$h2,32 | |
642 | add $h3,$h3,$d2 | |
643 | srdi $d3,$h3,32 | |
644 | add $h4,$h4,$d3 | |
645 | ||
646 | insrdi $h0,$h1,32,0 | |
647 | insrdi $h2,$h3,32,0 | |
648 | ||
649 | addic $d0,$h0,5 # compare to modulus | |
650 | addze $d1,$h2 | |
651 | addze $d2,$h4 | |
652 | ||
653 | srdi $mask,$d2,2 # did it carry/borrow? | |
654 | neg $mask,$mask | |
655 | sradi $mask,$mask,63 # mask: all ones if the shifted value was nonzero, else zero | |
656 | ld $d2,0($nonce) # load nonce | |
657 | ld $d3,8($nonce) | |
658 | ||
659 | andc $h0,$h0,$mask | |
660 | and $d0,$d0,$mask | |
661 | andc $h2,$h2,$mask | |
662 | and $d1,$d1,$mask | |
663 | or $h0,$h0,$d0 | |
664 | or $h2,$h2,$d1 | |
665 | ___ | |
666 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
667 | rotldi $d2,$d2,32 # flip nonce words | |
668 | rotldi $d3,$d3,32 | |
669 | ___ | |
670 | $code.=<<___; | |
671 | addc $h0,$h0,$d2 # accumulate nonce | |
672 | adde $h2,$h2,$d3 | |
673 | ||
674 | srdi $h1,$h0,32 | |
675 | srdi $h3,$h2,32 | |
676 | ___ | |
677 | } | |
678 | $code.=<<___ if ($LITTLE_ENDIAN); | |
679 | stw $h0,0($mac) # write result | |
680 | stw $h1,4($mac) | |
681 | stw $h2,8($mac) | |
682 | stw $h3,12($mac) | |
683 | ___ | |
684 | $code.=<<___ if (!$LITTLE_ENDIAN); | |
685 | li $d1,4 | |
686 | stwbrx $h0,0,$mac # write result | |
687 | li $d2,8 | |
688 | stwbrx $h1,$d1,$mac | |
689 | li $d3,12 | |
690 | stwbrx $h2,$d2,$mac | |
691 | stwbrx $h3,$d3,$mac | |
692 | ___ | |
693 | $code.=<<___; | |
694 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
695 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
696 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
697 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
698 | addi $sp,$sp,$FRAME | |
699 | blr | |
700 | .long 0 | |
701 | .byte 0,12,4,1,0x80,4,3,0 | |
702 | .size .poly1305_emit_fpu,.-.poly1305_emit_fpu | |
703 | ___ | |
704 | } | |
705 | # Ugly hack here, because PPC assembler syntax seems to vary too | |
706 | # much from platform to platform... | |
707 | $code.=<<___; | |
708 | .align 6 | |
709 | LPICmeup: | |
710 | mflr r0 | |
711 | bcl 20,31,\$+4 | |
712 | mflr $len # vvvvvv "distance" between . and 1st data entry | |
713 | addi $len,$len,`64-8` # borrow $len | |
714 | mtlr r0 | |
715 | blr | |
716 | .long 0 | |
717 | .byte 0,12,0x14,0,0,0,0,0 | |
718 | .space `64-9*4` | |
719 | ||
720 | .quad 0x4330000000000000 # 2^(52+0) | |
721 | .quad 0x4530000000000000 # 2^(52+32) | |
722 | .quad 0x4730000000000000 # 2^(52+64) | |
723 | .quad 0x4930000000000000 # 2^(52+96) | |
724 | .quad 0x4b50000000000000 # 2^(52+130) | |
725 | ||
726 | .quad 0x37f4000000000000 # 5/2^130 | |
727 | ||
728 | .quad 0x4430000000000000 # 2^(52+16+0) | |
729 | .quad 0x4630000000000000 # 2^(52+16+32) | |
730 | .quad 0x4830000000000000 # 2^(52+16+64) | |
731 | .quad 0x4a30000000000000 # 2^(52+16+96) | |
732 | .quad 0x3e30000000000000 # 2^(52+16+0-96) | |
733 | .quad 0x4030000000000000 # 2^(52+16+32-96) | |
734 | .quad 0x4230000000000000 # 2^(52+16+64-96) | |
735 | ||
736 | .quad 0x0000000000000001 # fpscr: truncate, no exceptions | |
737 | .asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>" | |
738 | .align 4 | |
739 | ___ | |
740 | ||
741 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | |
742 | print $code; | |
a21314db | 743 | close STDOUT or die "error closing STDOUT: $!"; |