]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
03d770d9 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
9e58d119 AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # October 2015 | |
609b0852 | 18 | # |
9e58d119 AP |
19 | # ChaCha20 for PowerPC/AltiVec. |
20 | # | |
791cc302 AP |
21 | # June 2018 |
22 | # | |
23 | # Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for | |
24 | # processors that can't issue more than one vector instruction per | |
25 | # cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x | |
26 | # interleave would perform better. Incidentally PowerISA 2.07 (first | |
27 | # implemented by POWER8) defined new usable instructions, hence 4xVSX | |
28 | # code path... | |
29 | # | |
9e58d119 AP |
30 | # Performance in cycles per byte out of large buffer. |
31 | # | |
791cc302 | 32 | # IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX |
9e58d119 | 33 | # |
791cc302 AP |
34 | # Freescale e300 13.6/+115% - - |
35 | # PPC74x0/G4e 6.81/+310% 3.81 - | |
36 | # PPC970/G5 9.29/+160% ? - | |
37 | # POWER7 8.62/+61% 3.35 - | |
38 | # POWER8 8.70/+51% 2.91 2.09 | |
39 | # POWER9 8.80/+29% 4.44(*) 2.45(**) | |
c869c3ad AP |
40 | # |
41 | # (*) this is trade-off result, it's possible to improve it, but | |
42 | # then it would negatively affect all others; | |
791cc302 AP |
43 | # (**) POWER9 seems to be "allergic" to mixing vector and integer |
44 | # instructions, which is why switch to vector-only code pays | |
45 | # off that much; | |
9e58d119 | 46 | |
1aa89a7a RL |
# Command-line convention shared by the perlasm scripts:
# $output is the last argument if it looks like a file (it has an extension);
# $flavour is the first argument if it doesn't look like a file.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# ABI-dependent parameters: stack-slot size, link-register save offset,
# and the store-update/load/store/compare mnemonics for pointer-sized
# values on 64- vs 32-bit PowerPC.
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;

# Locate the ppc-xlate.pl translator either next to this script or in the
# shared perlasm directory, then pipe everything we print through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$LOCALS=6*$SIZE_T;			# on-stack scratch above the ABI area
$FRAME=$LOCALS+64+18*$SIZE_T;	# 64 is for local variables
80 | ||
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{   # Any call to an undefined sub lands here; treat the sub's name as an
    # assembler mnemonic and append "\tmnemonic\targ,arg,...\n" to $code.
    # The first "_" in a name stands for "." in the opcode (e.g. lvx_u).
    my $mnemonic = $AUTOLOAD;
    $mnemonic =~ s/.*:://;	# strip package qualifier
    $mnemonic =~ s/_/\./;	# note: first underscore only, by design
    $code .= "\t$mnemonic\t".join(',',@_)."\n";
}
85 | ||
my $sp = "r1";				# PPC ABI stack pointer

# Function arguments arrive in r3..r7 per the PPC calling convention.
my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));

my @x=map("r$_",(16..31));		# the 16-word ChaCha state, in GPRs
my @d=map("r$_",(11,12,14,15));	# counter/nonce words held across blocks
my @t=map("r$_",(7..10));		# scratch; overlaps the argument
					# registers once those are consumed
93 | ||
sub ROUND {
    # Emit one ChaCha "double round" on the 16-word scalar state in @x.
    # ($a0,$b0,$c0,$d0) index the first quarter-round; the other three
    # quarter-rounds are derived by rotating each index within its row of
    # the 4x4 state matrix, so the same sub serves both the column round
    # (0,4,8,12) and the diagonal round (0,5,10,15).
    #
    # The four quarter-rounds are data-independent, so their instructions
    # are interleaved four-wide to help in-order pipelines.  Returns a
    # list of perlasm strings for the caller to eval (each lands in
    # AUTOLOAD, which appends the instruction to $code).  Do not reorder:
    # callers interleave these strings with vector code one-by-one.
    my ($a0,$b0,$c0,$d0)=@_;
    my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
    my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
    my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	# a += b; d ^= a; d <<<= 16
	"&add (@x[$a0],@x[$a0],@x[$b0])",
	"&add (@x[$a1],@x[$a1],@x[$b1])",
	"&add (@x[$a2],@x[$a2],@x[$b2])",
	"&add (@x[$a3],@x[$a3],@x[$b3])",
	"&xor (@x[$d0],@x[$d0],@x[$a0])",
	"&xor (@x[$d1],@x[$d1],@x[$a1])",
	"&xor (@x[$d2],@x[$d2],@x[$a2])",
	"&xor (@x[$d3],@x[$d3],@x[$a3])",
	"&rotlwi (@x[$d0],@x[$d0],16)",
	"&rotlwi (@x[$d1],@x[$d1],16)",
	"&rotlwi (@x[$d2],@x[$d2],16)",
	"&rotlwi (@x[$d3],@x[$d3],16)",

	# c += d; b ^= c; b <<<= 12
	"&add (@x[$c0],@x[$c0],@x[$d0])",
	"&add (@x[$c1],@x[$c1],@x[$d1])",
	"&add (@x[$c2],@x[$c2],@x[$d2])",
	"&add (@x[$c3],@x[$c3],@x[$d3])",
	"&xor (@x[$b0],@x[$b0],@x[$c0])",
	"&xor (@x[$b1],@x[$b1],@x[$c1])",
	"&xor (@x[$b2],@x[$b2],@x[$c2])",
	"&xor (@x[$b3],@x[$b3],@x[$c3])",
	"&rotlwi (@x[$b0],@x[$b0],12)",
	"&rotlwi (@x[$b1],@x[$b1],12)",
	"&rotlwi (@x[$b2],@x[$b2],12)",
	"&rotlwi (@x[$b3],@x[$b3],12)",

	# a += b; d ^= a; d <<<= 8
	"&add (@x[$a0],@x[$a0],@x[$b0])",
	"&add (@x[$a1],@x[$a1],@x[$b1])",
	"&add (@x[$a2],@x[$a2],@x[$b2])",
	"&add (@x[$a3],@x[$a3],@x[$b3])",
	"&xor (@x[$d0],@x[$d0],@x[$a0])",
	"&xor (@x[$d1],@x[$d1],@x[$a1])",
	"&xor (@x[$d2],@x[$d2],@x[$a2])",
	"&xor (@x[$d3],@x[$d3],@x[$a3])",
	"&rotlwi (@x[$d0],@x[$d0],8)",
	"&rotlwi (@x[$d1],@x[$d1],8)",
	"&rotlwi (@x[$d2],@x[$d2],8)",
	"&rotlwi (@x[$d3],@x[$d3],8)",

	# c += d; b ^= c; b <<<= 7
	"&add (@x[$c0],@x[$c0],@x[$d0])",
	"&add (@x[$c1],@x[$c1],@x[$d1])",
	"&add (@x[$c2],@x[$c2],@x[$d2])",
	"&add (@x[$c3],@x[$c3],@x[$d3])",
	"&xor (@x[$b0],@x[$b0],@x[$c0])",
	"&xor (@x[$b1],@x[$b1],@x[$c1])",
	"&xor (@x[$b2],@x[$b2],@x[$c2])",
	"&xor (@x[$b3],@x[$b3],@x[$c3])",
	"&rotlwi (@x[$b0],@x[$b0],7)",
	"&rotlwi (@x[$b1],@x[$b1],7)",
	"&rotlwi (@x[$b2],@x[$b2],7)",
	"&rotlwi (@x[$b3],@x[$b3],7)"
    );
}
154 | ||
155 | $code.=<<___; | |
156 | .machine "any" | |
abcbf7ed | 157 | .text |
9e58d119 AP |
158 | |
159 | .globl .ChaCha20_ctr32_int | |
160 | .align 5 | |
161 | .ChaCha20_ctr32_int: | |
162 | __ChaCha20_ctr32_int: | |
163 | ${UCMP}i $len,0 | |
164 | beqlr- | |
165 | ||
166 | $STU $sp,-$FRAME($sp) | |
167 | mflr r0 | |
168 | ||
169 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | |
170 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | |
171 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | |
172 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | |
173 | $PUSH r18,`$FRAME-$SIZE_T*14`($sp) | |
174 | $PUSH r19,`$FRAME-$SIZE_T*13`($sp) | |
175 | $PUSH r20,`$FRAME-$SIZE_T*12`($sp) | |
176 | $PUSH r21,`$FRAME-$SIZE_T*11`($sp) | |
177 | $PUSH r22,`$FRAME-$SIZE_T*10`($sp) | |
178 | $PUSH r23,`$FRAME-$SIZE_T*9`($sp) | |
179 | $PUSH r24,`$FRAME-$SIZE_T*8`($sp) | |
180 | $PUSH r25,`$FRAME-$SIZE_T*7`($sp) | |
181 | $PUSH r26,`$FRAME-$SIZE_T*6`($sp) | |
182 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) | |
183 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
184 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
185 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
186 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
187 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | |
188 | ||
189 | lwz @d[0],0($ctr) # load counter | |
190 | lwz @d[1],4($ctr) | |
191 | lwz @d[2],8($ctr) | |
192 | lwz @d[3],12($ctr) | |
193 | ||
194 | bl __ChaCha20_1x | |
195 | ||
196 | $POP r0,`$FRAME+$LRSAVE`($sp) | |
197 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | |
198 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | |
199 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | |
200 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | |
201 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | |
202 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | |
203 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | |
204 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | |
205 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | |
206 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | |
207 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | |
208 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | |
209 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | |
210 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | |
211 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
212 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
213 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
214 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
215 | mtlr r0 | |
216 | addi $sp,$sp,$FRAME | |
217 | blr | |
218 | .long 0 | |
219 | .byte 0,12,4,1,0x80,18,5,0 | |
220 | .long 0 | |
221 | .size .ChaCha20_ctr32_int,.-.ChaCha20_ctr32_int | |
222 | ||
223 | .align 5 | |
224 | __ChaCha20_1x: | |
225 | Loop_outer: | |
226 | lis @x[0],0x6170 # synthesize sigma | |
227 | lis @x[1],0x3320 | |
228 | lis @x[2],0x7962 | |
229 | lis @x[3],0x6b20 | |
230 | ori @x[0],@x[0],0x7865 | |
231 | ori @x[1],@x[1],0x646e | |
232 | ori @x[2],@x[2],0x2d32 | |
233 | ori @x[3],@x[3],0x6574 | |
234 | ||
235 | li r0,10 # inner loop counter | |
236 | lwz @x[4],0($key) # load key | |
237 | lwz @x[5],4($key) | |
238 | lwz @x[6],8($key) | |
239 | lwz @x[7],12($key) | |
240 | lwz @x[8],16($key) | |
241 | mr @x[12],@d[0] # copy counter | |
242 | lwz @x[9],20($key) | |
243 | mr @x[13],@d[1] | |
244 | lwz @x[10],24($key) | |
245 | mr @x[14],@d[2] | |
246 | lwz @x[11],28($key) | |
247 | mr @x[15],@d[3] | |
248 | ||
249 | mr @t[0],@x[4] | |
250 | mr @t[1],@x[5] | |
251 | mr @t[2],@x[6] | |
252 | mr @t[3],@x[7] | |
253 | ||
254 | mtctr r0 | |
255 | Loop: | |
256 | ___ | |
257 | foreach (&ROUND(0, 4, 8,12)) { eval; } | |
258 | foreach (&ROUND(0, 5,10,15)) { eval; } | |
259 | $code.=<<___; | |
260 | bdnz Loop | |
261 | ||
262 | subic $len,$len,64 # $len-=64 | |
263 | addi @x[0],@x[0],0x7865 # accumulate key block | |
264 | addi @x[1],@x[1],0x646e | |
265 | addi @x[2],@x[2],0x2d32 | |
266 | addi @x[3],@x[3],0x6574 | |
267 | addis @x[0],@x[0],0x6170 | |
268 | addis @x[1],@x[1],0x3320 | |
269 | addis @x[2],@x[2],0x7962 | |
270 | addis @x[3],@x[3],0x6b20 | |
271 | ||
272 | subfe. r0,r0,r0 # borrow?-1:0 | |
273 | add @x[4],@x[4],@t[0] | |
274 | lwz @t[0],16($key) | |
275 | add @x[5],@x[5],@t[1] | |
276 | lwz @t[1],20($key) | |
277 | add @x[6],@x[6],@t[2] | |
278 | lwz @t[2],24($key) | |
279 | add @x[7],@x[7],@t[3] | |
280 | lwz @t[3],28($key) | |
281 | add @x[8],@x[8],@t[0] | |
282 | add @x[9],@x[9],@t[1] | |
283 | add @x[10],@x[10],@t[2] | |
284 | add @x[11],@x[11],@t[3] | |
285 | ||
286 | add @x[12],@x[12],@d[0] | |
287 | add @x[13],@x[13],@d[1] | |
288 | add @x[14],@x[14],@d[2] | |
289 | add @x[15],@x[15],@d[3] | |
290 | addi @d[0],@d[0],1 # increment counter | |
291 | ___ | |
292 | if (!$LITTLE_ENDIAN) { for($i=0;$i<16;$i++) { # flip byte order | |
293 | $code.=<<___; | |
294 | mr @t[$i&3],@x[$i] | |
295 | rotlwi @x[$i],@x[$i],8 | |
296 | rlwimi @x[$i],@t[$i&3],24,0,7 | |
297 | rlwimi @x[$i],@t[$i&3],24,16,23 | |
298 | ___ | |
299 | } } | |
300 | $code.=<<___; | |
301 | bne Ltail # $len-=64 borrowed | |
302 | ||
303 | lwz @t[0],0($inp) # load input, aligned or not | |
304 | lwz @t[1],4($inp) | |
305 | ${UCMP}i $len,0 # done already? | |
306 | lwz @t[2],8($inp) | |
307 | lwz @t[3],12($inp) | |
308 | xor @x[0],@x[0],@t[0] # xor with input | |
309 | lwz @t[0],16($inp) | |
310 | xor @x[1],@x[1],@t[1] | |
311 | lwz @t[1],20($inp) | |
312 | xor @x[2],@x[2],@t[2] | |
313 | lwz @t[2],24($inp) | |
314 | xor @x[3],@x[3],@t[3] | |
315 | lwz @t[3],28($inp) | |
316 | xor @x[4],@x[4],@t[0] | |
317 | lwz @t[0],32($inp) | |
318 | xor @x[5],@x[5],@t[1] | |
319 | lwz @t[1],36($inp) | |
320 | xor @x[6],@x[6],@t[2] | |
321 | lwz @t[2],40($inp) | |
322 | xor @x[7],@x[7],@t[3] | |
323 | lwz @t[3],44($inp) | |
324 | xor @x[8],@x[8],@t[0] | |
325 | lwz @t[0],48($inp) | |
326 | xor @x[9],@x[9],@t[1] | |
327 | lwz @t[1],52($inp) | |
328 | xor @x[10],@x[10],@t[2] | |
329 | lwz @t[2],56($inp) | |
330 | xor @x[11],@x[11],@t[3] | |
331 | lwz @t[3],60($inp) | |
332 | xor @x[12],@x[12],@t[0] | |
333 | stw @x[0],0($out) # store output, aligned or not | |
334 | xor @x[13],@x[13],@t[1] | |
335 | stw @x[1],4($out) | |
336 | xor @x[14],@x[14],@t[2] | |
337 | stw @x[2],8($out) | |
338 | xor @x[15],@x[15],@t[3] | |
339 | stw @x[3],12($out) | |
340 | stw @x[4],16($out) | |
341 | stw @x[5],20($out) | |
342 | stw @x[6],24($out) | |
343 | stw @x[7],28($out) | |
344 | stw @x[8],32($out) | |
345 | stw @x[9],36($out) | |
346 | stw @x[10],40($out) | |
347 | stw @x[11],44($out) | |
348 | stw @x[12],48($out) | |
349 | stw @x[13],52($out) | |
350 | stw @x[14],56($out) | |
351 | addi $inp,$inp,64 | |
352 | stw @x[15],60($out) | |
353 | addi $out,$out,64 | |
354 | ||
355 | bne Loop_outer | |
356 | ||
357 | blr | |
358 | ||
359 | .align 4 | |
360 | Ltail: | |
361 | addi $len,$len,64 # restore tail length | |
362 | subi $inp,$inp,1 # prepare for *++ptr | |
363 | subi $out,$out,1 | |
364 | addi @t[0],$sp,$LOCALS-1 | |
365 | mtctr $len | |
366 | ||
367 | stw @x[0],`$LOCALS+0`($sp) # save whole block to stack | |
368 | stw @x[1],`$LOCALS+4`($sp) | |
369 | stw @x[2],`$LOCALS+8`($sp) | |
370 | stw @x[3],`$LOCALS+12`($sp) | |
371 | stw @x[4],`$LOCALS+16`($sp) | |
372 | stw @x[5],`$LOCALS+20`($sp) | |
373 | stw @x[6],`$LOCALS+24`($sp) | |
374 | stw @x[7],`$LOCALS+28`($sp) | |
375 | stw @x[8],`$LOCALS+32`($sp) | |
376 | stw @x[9],`$LOCALS+36`($sp) | |
377 | stw @x[10],`$LOCALS+40`($sp) | |
378 | stw @x[11],`$LOCALS+44`($sp) | |
379 | stw @x[12],`$LOCALS+48`($sp) | |
380 | stw @x[13],`$LOCALS+52`($sp) | |
381 | stw @x[14],`$LOCALS+56`($sp) | |
382 | stw @x[15],`$LOCALS+60`($sp) | |
383 | ||
384 | Loop_tail: # byte-by-byte loop | |
385 | lbzu @d[0],1($inp) | |
386 | lbzu @x[0],1(@t[0]) | |
387 | xor @d[1],@d[0],@x[0] | |
388 | stbu @d[1],1($out) | |
389 | bdnz Loop_tail | |
390 | ||
391 | stw $sp,`$LOCALS+0`($sp) # wipe block on stack | |
392 | stw $sp,`$LOCALS+4`($sp) | |
393 | stw $sp,`$LOCALS+8`($sp) | |
394 | stw $sp,`$LOCALS+12`($sp) | |
395 | stw $sp,`$LOCALS+16`($sp) | |
396 | stw $sp,`$LOCALS+20`($sp) | |
397 | stw $sp,`$LOCALS+24`($sp) | |
398 | stw $sp,`$LOCALS+28`($sp) | |
399 | stw $sp,`$LOCALS+32`($sp) | |
400 | stw $sp,`$LOCALS+36`($sp) | |
401 | stw $sp,`$LOCALS+40`($sp) | |
402 | stw $sp,`$LOCALS+44`($sp) | |
403 | stw $sp,`$LOCALS+48`($sp) | |
404 | stw $sp,`$LOCALS+52`($sp) | |
405 | stw $sp,`$LOCALS+56`($sp) | |
406 | stw $sp,`$LOCALS+60`($sp) | |
407 | ||
408 | blr | |
409 | .long 0 | |
410 | .byte 0,12,0x14,0,0,0,0,0 | |
411 | ___ | |
412 | ||
413 | {{{ | |
c869c3ad AP |
414 | my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2) |
415 | = map("v$_",(0..11)); | |
416 | my @K = map("v$_",(12..17)); | |
1a467bd1 AP |
417 | my ($FOUR,$sixteen,$twenty4) = map("v$_",(18..19,23)); |
418 | my ($inpperm,$outperm,$outmask) = map("v$_",(24..26)); | |
419 | my @D = map("v$_",(27..31)); | |
c869c3ad | 420 | my ($twelve,$seven,$T0,$T1) = @D; |
9e58d119 | 421 | |
1a467bd1 | 422 | my $FRAME=$LOCALS+64+10*16+18*$SIZE_T; # 10*16 is for v23-v31 offload |
9e58d119 AP |
423 | |
sub VMXROUND {
    # Emit one AltiVec ChaCha round on a whole 4x4 state held in four
    # vector registers ($a,$b,$c,$d), one state row per register.
    # Rotations by 16 and 24 go through vperm with precomputed permute
    # vectors ($sixteen,$twenty4); 12 and 7 use vrlw with splatted shift
    # counts ($twelve,$seven) — all file-level lexicals interpolated at
    # string-construction time.
    #
    # The trailing vrldoi (vsldoi in disguise, see ppc-xlate) rotations
    # realign the rows so that the next round operates on columns or
    # diagonals: $odd selects the diagonal (odd) vs column (even) layout.
    # Returns perlasm strings for the caller to eval, interleaved with
    # the other vector/integer "threads"; do not reorder.
    my $odd = pop;
    my ($a,$b,$c,$d)=@_;

    (
	"&vadduwm ('$a','$a','$b')",		# a += b
	"&vxor ('$d','$d','$a')",		# d ^= a
	"&vperm ('$d','$d','$d','$sixteen')",	# d <<<= 16

	"&vadduwm ('$c','$c','$d')",		# c += d
	"&vxor ('$b','$b','$c')",		# b ^= c
	"&vrlw ('$b','$b','$twelve')",		# b <<<= 12

	"&vadduwm ('$a','$a','$b')",		# a += b
	"&vxor ('$d','$d','$a')",		# d ^= a
	"&vperm ('$d','$d','$d','$twenty4')",	# d <<<= 24

	"&vadduwm ('$c','$c','$d')",		# c += d
	"&vxor ('$b','$b','$c')",		# b ^= c
	"&vrlw ('$b','$b','$seven')",		# b <<<= 7

	"&vrldoi ('$c','$c',8)",		# realign rows for the
	"&vrldoi ('$b','$b',$odd?4:12)",	# next (column/diagonal)
	"&vrldoi ('$d','$d',$odd?12:4)"		# round
    );
}
450 | ||
451 | $code.=<<___; | |
452 | ||
453 | .globl .ChaCha20_ctr32_vmx | |
454 | .align 5 | |
455 | .ChaCha20_ctr32_vmx: | |
456 | ${UCMP}i $len,256 | |
457 | blt __ChaCha20_ctr32_int | |
458 | ||
459 | $STU $sp,-$FRAME($sp) | |
460 | mflr r0 | |
461 | li r10,`15+$LOCALS+64` | |
462 | li r11,`31+$LOCALS+64` | |
463 | mfspr r12,256 | |
1a467bd1 | 464 | stvx v23,r10,$sp |
9e58d119 | 465 | addi r10,r10,32 |
1a467bd1 | 466 | stvx v24,r11,$sp |
9e58d119 | 467 | addi r11,r11,32 |
1a467bd1 | 468 | stvx v25,r10,$sp |
9e58d119 | 469 | addi r10,r10,32 |
1a467bd1 | 470 | stvx v26,r11,$sp |
9e58d119 | 471 | addi r11,r11,32 |
1a467bd1 | 472 | stvx v27,r10,$sp |
9e58d119 | 473 | addi r10,r10,32 |
1a467bd1 | 474 | stvx v28,r11,$sp |
9e58d119 | 475 | addi r11,r11,32 |
1a467bd1 | 476 | stvx v29,r10,$sp |
9e58d119 | 477 | addi r10,r10,32 |
1a467bd1 AP |
478 | stvx v30,r11,$sp |
479 | stvx v31,r10,$sp | |
9e58d119 AP |
480 | stw r12,`$FRAME-$SIZE_T*18-4`($sp) # save vrsave |
481 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | |
482 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | |
483 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | |
484 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | |
485 | $PUSH r18,`$FRAME-$SIZE_T*14`($sp) | |
486 | $PUSH r19,`$FRAME-$SIZE_T*13`($sp) | |
487 | $PUSH r20,`$FRAME-$SIZE_T*12`($sp) | |
488 | $PUSH r21,`$FRAME-$SIZE_T*11`($sp) | |
489 | $PUSH r22,`$FRAME-$SIZE_T*10`($sp) | |
490 | $PUSH r23,`$FRAME-$SIZE_T*9`($sp) | |
491 | $PUSH r24,`$FRAME-$SIZE_T*8`($sp) | |
492 | $PUSH r25,`$FRAME-$SIZE_T*7`($sp) | |
493 | $PUSH r26,`$FRAME-$SIZE_T*6`($sp) | |
494 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) | |
495 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
496 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
497 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
498 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
1a467bd1 | 499 | li r12,-4096+511 |
9e58d119 | 500 | $PUSH r0, `$FRAME+$LRSAVE`($sp) |
c869c3ad | 501 | mtspr 256,r12 # preserve 29 AltiVec registers |
9e58d119 AP |
502 | |
503 | bl Lconsts # returns pointer Lsigma in r12 | |
504 | li @x[0],16 | |
505 | li @x[1],32 | |
506 | li @x[2],48 | |
507 | li @x[3],64 | |
508 | li @x[4],31 # 31 is not a typo | |
509 | li @x[5],15 # nor is 15 | |
510 | ||
511 | lvx @K[1],0,$key # load key | |
512 | ?lvsr $T0,0,$key # prepare unaligned load | |
513 | lvx @K[2],@x[0],$key | |
514 | lvx @D[0],@x[4],$key | |
515 | ||
516 | lvx @K[3],0,$ctr # load counter | |
517 | ?lvsr $T1,0,$ctr # prepare unaligned load | |
518 | lvx @D[1],@x[5],$ctr | |
519 | ||
520 | lvx @K[0],0,r12 # load constants | |
521 | lvx @K[5],@x[0],r12 # one | |
522 | lvx $FOUR,@x[1],r12 | |
523 | lvx $sixteen,@x[2],r12 | |
524 | lvx $twenty4,@x[3],r12 | |
525 | ||
526 | ?vperm @K[1],@K[2],@K[1],$T0 # align key | |
527 | ?vperm @K[2],@D[0],@K[2],$T0 | |
528 | ?vperm @K[3],@D[1],@K[3],$T1 # align counter | |
529 | ||
530 | lwz @d[0],0($ctr) # load counter to GPR | |
531 | lwz @d[1],4($ctr) | |
532 | vadduwm @K[3],@K[3],@K[5] # adjust AltiVec counter | |
533 | lwz @d[2],8($ctr) | |
534 | vadduwm @K[4],@K[3],@K[5] | |
535 | lwz @d[3],12($ctr) | |
536 | vadduwm @K[5],@K[4],@K[5] | |
537 | ||
9e58d119 AP |
538 | vxor $T0,$T0,$T0 # 0x00..00 |
539 | vspltisw $outmask,-1 # 0xff..ff | |
540 | ?lvsr $inpperm,0,$inp # prepare for unaligned load | |
541 | ?lvsl $outperm,0,$out # prepare for unaligned store | |
542 | ?vperm $outmask,$outmask,$T0,$outperm | |
543 | ||
a82a9f71 | 544 | be?lvsl $T0,0,@x[0] # 0x00..0f |
9e58d119 | 545 | be?vspltisb $T1,3 # 0x03..03 |
a82a9f71 | 546 | be?vxor $T0,$T0,$T1 # swap bytes within words |
9e58d119 | 547 | be?vxor $outperm,$outperm,$T1 |
a82a9f71 | 548 | be?vperm $inpperm,$inpperm,$inpperm,$T0 |
9e58d119 | 549 | |
c869c3ad | 550 | li r0,10 # inner loop counter |
9e58d119 AP |
551 | b Loop_outer_vmx |
552 | ||
553 | .align 4 | |
554 | Loop_outer_vmx: | |
555 | lis @x[0],0x6170 # synthesize sigma | |
556 | lis @x[1],0x3320 | |
557 | vmr $A0,@K[0] | |
558 | lis @x[2],0x7962 | |
559 | lis @x[3],0x6b20 | |
560 | vmr $A1,@K[0] | |
561 | ori @x[0],@x[0],0x7865 | |
562 | ori @x[1],@x[1],0x646e | |
563 | vmr $A2,@K[0] | |
564 | ori @x[2],@x[2],0x2d32 | |
565 | ori @x[3],@x[3],0x6574 | |
566 | vmr $B0,@K[1] | |
567 | ||
9e58d119 AP |
568 | lwz @x[4],0($key) # load key to GPR |
569 | vmr $B1,@K[1] | |
570 | lwz @x[5],4($key) | |
571 | vmr $B2,@K[1] | |
572 | lwz @x[6],8($key) | |
573 | vmr $C0,@K[2] | |
574 | lwz @x[7],12($key) | |
575 | vmr $C1,@K[2] | |
576 | lwz @x[8],16($key) | |
577 | vmr $C2,@K[2] | |
578 | mr @x[12],@d[0] # copy GPR counter | |
579 | lwz @x[9],20($key) | |
580 | vmr $D0,@K[3] | |
581 | mr @x[13],@d[1] | |
582 | lwz @x[10],24($key) | |
583 | vmr $D1,@K[4] | |
584 | mr @x[14],@d[2] | |
585 | lwz @x[11],28($key) | |
586 | vmr $D2,@K[5] | |
587 | mr @x[15],@d[3] | |
588 | ||
589 | mr @t[0],@x[4] | |
590 | mr @t[1],@x[5] | |
591 | mr @t[2],@x[6] | |
592 | mr @t[3],@x[7] | |
c869c3ad AP |
593 | |
594 | vspltisw $twelve,12 # synthesize constants | |
9e58d119 AP |
595 | vspltisw $seven,7 |
596 | ||
597 | mtctr r0 | |
598 | nop | |
599 | Loop_vmx: | |
600 | ___ | |
c869c3ad AP |
601 | my @thread0=&VMXROUND($A0,$B0,$C0,$D0,0); |
602 | my @thread1=&VMXROUND($A1,$B1,$C1,$D1,0); | |
603 | my @thread2=&VMXROUND($A2,$B2,$C2,$D2,0); | |
9e58d119 AP |
604 | my @thread3=&ROUND(0,4,8,12); |
605 | ||
606 | foreach (@thread0) { | |
1a467bd1 AP |
607 | eval; |
608 | eval(shift(@thread1)); | |
609 | eval(shift(@thread2)); | |
610 | ||
611 | eval(shift(@thread3)); | |
612 | eval(shift(@thread3)); | |
613 | eval(shift(@thread3)); | |
9e58d119 | 614 | } |
c869c3ad | 615 | foreach (@thread3) { eval; } |
9e58d119 | 616 | |
c869c3ad AP |
617 | @thread0=&VMXROUND($A0,$B0,$C0,$D0,1); |
618 | @thread1=&VMXROUND($A1,$B1,$C1,$D1,1); | |
619 | @thread2=&VMXROUND($A2,$B2,$C2,$D2,1); | |
9e58d119 AP |
620 | @thread3=&ROUND(0,5,10,15); |
621 | ||
622 | foreach (@thread0) { | |
1a467bd1 AP |
623 | eval; |
624 | eval(shift(@thread1)); | |
625 | eval(shift(@thread2)); | |
626 | ||
627 | eval(shift(@thread3)); | |
628 | eval(shift(@thread3)); | |
629 | eval(shift(@thread3)); | |
9e58d119 | 630 | } |
c869c3ad | 631 | foreach (@thread3) { eval; } |
9e58d119 AP |
632 | $code.=<<___; |
633 | bdnz Loop_vmx | |
634 | ||
635 | subi $len,$len,256 # $len-=256 | |
636 | addi @x[0],@x[0],0x7865 # accumulate key block | |
637 | addi @x[1],@x[1],0x646e | |
638 | addi @x[2],@x[2],0x2d32 | |
639 | addi @x[3],@x[3],0x6574 | |
640 | addis @x[0],@x[0],0x6170 | |
641 | addis @x[1],@x[1],0x3320 | |
642 | addis @x[2],@x[2],0x7962 | |
643 | addis @x[3],@x[3],0x6b20 | |
644 | add @x[4],@x[4],@t[0] | |
645 | lwz @t[0],16($key) | |
646 | add @x[5],@x[5],@t[1] | |
647 | lwz @t[1],20($key) | |
648 | add @x[6],@x[6],@t[2] | |
649 | lwz @t[2],24($key) | |
650 | add @x[7],@x[7],@t[3] | |
651 | lwz @t[3],28($key) | |
652 | add @x[8],@x[8],@t[0] | |
653 | add @x[9],@x[9],@t[1] | |
654 | add @x[10],@x[10],@t[2] | |
655 | add @x[11],@x[11],@t[3] | |
656 | add @x[12],@x[12],@d[0] | |
657 | add @x[13],@x[13],@d[1] | |
658 | add @x[14],@x[14],@d[2] | |
659 | add @x[15],@x[15],@d[3] | |
660 | ||
661 | vadduwm $A0,$A0,@K[0] # accumulate key block | |
662 | vadduwm $A1,$A1,@K[0] | |
663 | vadduwm $A2,$A2,@K[0] | |
664 | vadduwm $B0,$B0,@K[1] | |
665 | vadduwm $B1,$B1,@K[1] | |
666 | vadduwm $B2,$B2,@K[1] | |
667 | vadduwm $C0,$C0,@K[2] | |
668 | vadduwm $C1,$C1,@K[2] | |
669 | vadduwm $C2,$C2,@K[2] | |
670 | vadduwm $D0,$D0,@K[3] | |
671 | vadduwm $D1,$D1,@K[4] | |
672 | vadduwm $D2,$D2,@K[5] | |
673 | ||
674 | addi @d[0],@d[0],4 # increment counter | |
675 | vadduwm @K[3],@K[3],$FOUR | |
676 | vadduwm @K[4],@K[4],$FOUR | |
677 | vadduwm @K[5],@K[5],$FOUR | |
678 | ||
679 | ___ | |
680 | if (!$LITTLE_ENDIAN) { for($i=0;$i<16;$i++) { # flip byte order | |
681 | $code.=<<___; | |
682 | mr @t[$i&3],@x[$i] | |
683 | rotlwi @x[$i],@x[$i],8 | |
684 | rlwimi @x[$i],@t[$i&3],24,0,7 | |
685 | rlwimi @x[$i],@t[$i&3],24,16,23 | |
686 | ___ | |
687 | } } | |
688 | $code.=<<___; | |
689 | lwz @t[0],0($inp) # load input, aligned or not | |
690 | lwz @t[1],4($inp) | |
691 | lwz @t[2],8($inp) | |
692 | lwz @t[3],12($inp) | |
693 | xor @x[0],@x[0],@t[0] # xor with input | |
694 | lwz @t[0],16($inp) | |
695 | xor @x[1],@x[1],@t[1] | |
696 | lwz @t[1],20($inp) | |
697 | xor @x[2],@x[2],@t[2] | |
698 | lwz @t[2],24($inp) | |
699 | xor @x[3],@x[3],@t[3] | |
700 | lwz @t[3],28($inp) | |
701 | xor @x[4],@x[4],@t[0] | |
702 | lwz @t[0],32($inp) | |
703 | xor @x[5],@x[5],@t[1] | |
704 | lwz @t[1],36($inp) | |
705 | xor @x[6],@x[6],@t[2] | |
706 | lwz @t[2],40($inp) | |
707 | xor @x[7],@x[7],@t[3] | |
708 | lwz @t[3],44($inp) | |
709 | xor @x[8],@x[8],@t[0] | |
710 | lwz @t[0],48($inp) | |
711 | xor @x[9],@x[9],@t[1] | |
712 | lwz @t[1],52($inp) | |
713 | xor @x[10],@x[10],@t[2] | |
714 | lwz @t[2],56($inp) | |
715 | xor @x[11],@x[11],@t[3] | |
716 | lwz @t[3],60($inp) | |
717 | xor @x[12],@x[12],@t[0] | |
718 | stw @x[0],0($out) # store output, aligned or not | |
719 | xor @x[13],@x[13],@t[1] | |
720 | stw @x[1],4($out) | |
721 | xor @x[14],@x[14],@t[2] | |
722 | stw @x[2],8($out) | |
723 | xor @x[15],@x[15],@t[3] | |
724 | stw @x[3],12($out) | |
725 | addi $inp,$inp,64 | |
726 | stw @x[4],16($out) | |
727 | li @t[0],16 | |
728 | stw @x[5],20($out) | |
729 | li @t[1],32 | |
730 | stw @x[6],24($out) | |
731 | li @t[2],48 | |
732 | stw @x[7],28($out) | |
733 | li @t[3],64 | |
734 | stw @x[8],32($out) | |
735 | stw @x[9],36($out) | |
736 | stw @x[10],40($out) | |
737 | stw @x[11],44($out) | |
738 | stw @x[12],48($out) | |
739 | stw @x[13],52($out) | |
740 | stw @x[14],56($out) | |
741 | stw @x[15],60($out) | |
742 | addi $out,$out,64 | |
743 | ||
744 | lvx @D[0],0,$inp # load input | |
745 | lvx @D[1],@t[0],$inp | |
746 | lvx @D[2],@t[1],$inp | |
747 | lvx @D[3],@t[2],$inp | |
748 | lvx @D[4],@t[3],$inp | |
749 | addi $inp,$inp,64 | |
750 | ||
751 | ?vperm @D[0],@D[1],@D[0],$inpperm # align input | |
752 | ?vperm @D[1],@D[2],@D[1],$inpperm | |
753 | ?vperm @D[2],@D[3],@D[2],$inpperm | |
754 | ?vperm @D[3],@D[4],@D[3],$inpperm | |
755 | vxor $A0,$A0,@D[0] # xor with input | |
756 | vxor $B0,$B0,@D[1] | |
757 | lvx @D[1],@t[0],$inp # keep loading input | |
758 | vxor $C0,$C0,@D[2] | |
759 | lvx @D[2],@t[1],$inp | |
760 | vxor $D0,$D0,@D[3] | |
761 | lvx @D[3],@t[2],$inp | |
762 | lvx @D[0],@t[3],$inp | |
763 | addi $inp,$inp,64 | |
764 | li @t[3],63 # 63 is not a typo | |
765 | vperm $A0,$A0,$A0,$outperm # pre-misalign output | |
766 | vperm $B0,$B0,$B0,$outperm | |
767 | vperm $C0,$C0,$C0,$outperm | |
768 | vperm $D0,$D0,$D0,$outperm | |
769 | ||
770 | ?vperm @D[4],@D[1],@D[4],$inpperm # align input | |
771 | ?vperm @D[1],@D[2],@D[1],$inpperm | |
772 | ?vperm @D[2],@D[3],@D[2],$inpperm | |
773 | ?vperm @D[3],@D[0],@D[3],$inpperm | |
774 | vxor $A1,$A1,@D[4] | |
775 | vxor $B1,$B1,@D[1] | |
776 | lvx @D[1],@t[0],$inp # keep loading input | |
777 | vxor $C1,$C1,@D[2] | |
778 | lvx @D[2],@t[1],$inp | |
779 | vxor $D1,$D1,@D[3] | |
780 | lvx @D[3],@t[2],$inp | |
781 | lvx @D[4],@t[3],$inp # redundant in aligned case | |
782 | addi $inp,$inp,64 | |
783 | vperm $A1,$A1,$A1,$outperm # pre-misalign output | |
784 | vperm $B1,$B1,$B1,$outperm | |
785 | vperm $C1,$C1,$C1,$outperm | |
786 | vperm $D1,$D1,$D1,$outperm | |
787 | ||
788 | ?vperm @D[0],@D[1],@D[0],$inpperm # align input | |
789 | ?vperm @D[1],@D[2],@D[1],$inpperm | |
790 | ?vperm @D[2],@D[3],@D[2],$inpperm | |
791 | ?vperm @D[3],@D[4],@D[3],$inpperm | |
792 | vxor $A2,$A2,@D[0] | |
793 | vxor $B2,$B2,@D[1] | |
794 | vxor $C2,$C2,@D[2] | |
795 | vxor $D2,$D2,@D[3] | |
796 | vperm $A2,$A2,$A2,$outperm # pre-misalign output | |
797 | vperm $B2,$B2,$B2,$outperm | |
798 | vperm $C2,$C2,$C2,$outperm | |
799 | vperm $D2,$D2,$D2,$outperm | |
800 | ||
801 | andi. @x[1],$out,15 # is $out aligned? | |
802 | mr @x[0],$out | |
803 | ||
804 | vsel @D[0],$A0,$B0,$outmask # collect pre-misaligned output | |
805 | vsel @D[1],$B0,$C0,$outmask | |
806 | vsel @D[2],$C0,$D0,$outmask | |
807 | vsel @D[3],$D0,$A1,$outmask | |
808 | vsel $B0,$A1,$B1,$outmask | |
809 | vsel $C0,$B1,$C1,$outmask | |
810 | vsel $D0,$C1,$D1,$outmask | |
811 | vsel $A1,$D1,$A2,$outmask | |
812 | vsel $B1,$A2,$B2,$outmask | |
813 | vsel $C1,$B2,$C2,$outmask | |
814 | vsel $D1,$C2,$D2,$outmask | |
815 | ||
816 | #stvx $A0,0,$out # take it easy on the edges | |
817 | stvx @D[0],@t[0],$out # store output | |
818 | stvx @D[1],@t[1],$out | |
819 | stvx @D[2],@t[2],$out | |
820 | addi $out,$out,64 | |
821 | stvx @D[3],0,$out | |
822 | stvx $B0,@t[0],$out | |
823 | stvx $C0,@t[1],$out | |
824 | stvx $D0,@t[2],$out | |
825 | addi $out,$out,64 | |
826 | stvx $A1,0,$out | |
827 | stvx $B1,@t[0],$out | |
828 | stvx $C1,@t[1],$out | |
829 | stvx $D1,@t[2],$out | |
830 | addi $out,$out,64 | |
831 | ||
832 | beq Laligned_vmx | |
833 | ||
834 | sub @x[2],$out,@x[1] # in misaligned case edges | |
835 | li @x[3],0 # are written byte-by-byte | |
836 | Lunaligned_tail_vmx: | |
837 | stvebx $D2,@x[3],@x[2] | |
838 | addi @x[3],@x[3],1 | |
839 | cmpw @x[3],@x[1] | |
840 | bne Lunaligned_tail_vmx | |
841 | ||
842 | sub @x[2],@x[0],@x[1] | |
843 | Lunaligned_head_vmx: | |
844 | stvebx $A0,@x[1],@x[2] | |
845 | cmpwi @x[1],15 | |
846 | addi @x[1],@x[1],1 | |
847 | bne Lunaligned_head_vmx | |
848 | ||
849 | ${UCMP}i $len,255 # done with 256-byte blocks yet? | |
850 | bgt Loop_outer_vmx | |
851 | ||
852 | b Ldone_vmx | |
853 | ||
854 | .align 4 | |
855 | Laligned_vmx: | |
856 | stvx $A0,0,@x[0] # head hexaword was not stored | |
857 | ||
858 | ${UCMP}i $len,255 # done with 256-byte blocks yet? | |
859 | bgt Loop_outer_vmx | |
860 | nop | |
861 | ||
862 | Ldone_vmx: | |
863 | ${UCMP}i $len,0 # done yet? | |
864 | bnel __ChaCha20_1x | |
865 | ||
866 | lwz r12,`$FRAME-$SIZE_T*18-4`($sp) # pull vrsave | |
867 | li r10,`15+$LOCALS+64` | |
868 | li r11,`31+$LOCALS+64` | |
869 | mtspr 256,r12 # restore vrsave | |
1a467bd1 | 870 | lvx v23,r10,$sp |
9e58d119 | 871 | addi r10,r10,32 |
1a467bd1 | 872 | lvx v24,r11,$sp |
9e58d119 | 873 | addi r11,r11,32 |
1a467bd1 | 874 | lvx v25,r10,$sp |
9e58d119 | 875 | addi r10,r10,32 |
1a467bd1 | 876 | lvx v26,r11,$sp |
9e58d119 | 877 | addi r11,r11,32 |
1a467bd1 | 878 | lvx v27,r10,$sp |
9e58d119 | 879 | addi r10,r10,32 |
1a467bd1 | 880 | lvx v28,r11,$sp |
9e58d119 | 881 | addi r11,r11,32 |
1a467bd1 | 882 | lvx v29,r10,$sp |
9e58d119 | 883 | addi r10,r10,32 |
1a467bd1 AP |
884 | lvx v30,r11,$sp |
885 | lvx v31,r10,$sp | |
9e58d119 AP |
886 | $POP r0, `$FRAME+$LRSAVE`($sp) |
887 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | |
888 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | |
889 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | |
890 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | |
891 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | |
892 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | |
893 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | |
894 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | |
895 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | |
896 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | |
897 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | |
898 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | |
899 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | |
900 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | |
901 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
902 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
903 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
904 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
905 | mtlr r0 | |
906 | addi $sp,$sp,$FRAME | |
907 | blr | |
908 | .long 0 | |
909 | .byte 0,12,0x04,1,0x80,18,5,0 | |
910 | .long 0 | |
911 | .size .ChaCha20_ctr32_vmx,.-.ChaCha20_ctr32_vmx | |
791cc302 AP |
912 | ___ |
913 | }}} | |
914 | {{{ | |
915 | my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
916 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15)); | |
917 | my @K = map("v$_",(16..19)); | |
918 | my $CTR = "v26"; | |
919 | my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30)); | |
920 | my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3); | |
921 | my $beperm = "v31"; | |
922 | ||
923 | my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); | |
924 | ||
925 | my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload | |
926 | ||
# Generate one ChaCha20 "lane round" for the 4x VSX path: the classic
# quarter-round sequence a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
# a+=b; d^=a; d<<<=8; c+=d; b^=c; b<<<=7, applied simultaneously to four
# interleaved lanes so that independent instructions can dual-issue.
#
# Arguments are the state-register indices (0..15) of the first lane's
# a/b/c/d rows; lanes 1..3 are derived by rotating the low two bits of
# each index within its aligned group of four registers.
#
# Returns a list of 48 strings, each a '&vadduwm(...)'-style Perl call
# that the caller evals to append one instruction to $code.
sub VSX_lane_ROUND {
    my ($a, $b, $c, $d) = map([$_], @_);

    # Derive lane 1..3 indices: keep the group (index & ~3), step the
    # position within the group ((index+1) & 3).
    for my $grp ($a, $b, $c, $d) {
        push @$grp, ($grp->[-1] & ~3) + (($grp->[-1] + 1) & 3) for 1 .. 3;
    }

    # Pre-quoted register names, so interpolation yields e.g. "v0"
    # (quotes included) ready for eval as a string argument.
    my @v = map("\"v$_\"", (0 .. 15));
    my @out;

    # Each phase is x += y; z ^= x; z <<<= rot, emitted for all four
    # lanes before moving on, which interleaves the dependency chains.
    for my $phase ([$a, $b, $d, $sixteen],	# a += b; d ^= a; d <<<= 16
                   [$c, $d, $b, $twelve],	# c += d; b ^= c; b <<<= 12
                   [$a, $b, $d, $eight],	# a += b; d ^= a; d <<<=  8
                   [$c, $d, $b, $seven]) {	# c += d; b ^= c; b <<<=  7
        my ($x, $y, $z, $rot) = @$phase;
        push @out, map("&vadduwm ($v[$x->[$_]],$v[$x->[$_]],$v[$y->[$_]])", 0 .. 3);
        push @out, map("&vxor ($v[$z->[$_]],$v[$z->[$_]],$v[$x->[$_]])", 0 .. 3);
        push @out, map("&vrlw ($v[$z->[$_]],$v[$z->[$_]],'$rot')", 0 .. 3);
    }

    return @out;
}
988 | ||
# VSX entry point: prologue, state setup and the top of the outer loop.
$code.=<<___;

.globl	.ChaCha20_ctr32_vsx
.align	5
.ChaCha20_ctr32_vsx:
	$STU	$sp,-$FRAME($sp)		# allocate frame, save back chain
	mflr	r0
	li	r10,`15+$LOCALS+64`		# offsets into v26-v31 offload area
	li	r11,`31+$LOCALS+64`		# (stvx aligns, hence the +15/+31)
	mfspr	r12,256				# current vrsave
	stvx	v26,r10,$sp			# save non-volatile vector regs,
	addi	r10,r10,32			# two interleaved stores per pair
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$FRAME-4`($sp)		# save vrsave
	li	r12,-4096+63			# new vrsave mask
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)	# save link register
	mtspr	256,r12				# preserve 29 AltiVec registers

	bl	Lconsts				# returns pointer Lsigma in r12
	lvx_4w	@K[0],0,r12			# load sigma
	addi	r12,r12,0x50			# advance past endian tables to
						# the smashed-sigma constants
	li	$x10,16
	li	$x20,32
	li	$x30,48
	li	r11,64

	lvx_4w	@K[1],0,$key			# load key
	lvx_4w	@K[2],$x10,$key
	lvx_4w	@K[3],0,$ctr			# load counter

	vxor	$xt0,$xt0,$xt0			# $xt0 = 0
	lvx_4w	$xt1,r11,r12			# load {0,1,2,3} lane increments
	vspltw	$CTR,@K[3],0			# replicate 32-bit counter word
	vsldoi	@K[3],@K[3],$xt0,4
	vsldoi	@K[3],$xt0,@K[3],12		# clear @K[3].word[0]
	vadduwm	$CTR,$CTR,$xt1			# lane counters = ctr+{0,1,2,3}

	be?lvsl	$beperm,0,$x10			# 0x00..0f
	be?vspltisb $xt0,3			# 0x03..03
	be?vxor	$beperm,$beperm,$xt0		# swap bytes within words

	li	r0,10				# inner loop counter
	mtctr	r0
	b	Loop_outer_vsx

.align	5
Loop_outer_vsx:
	lvx	$xa0,$x00,r12			# load [smashed] sigma
	lvx	$xa1,$x10,r12
	lvx	$xa2,$x20,r12
	lvx	$xa3,$x30,r12

	vspltw	$xb0,@K[1],0			# smash the key
	vspltw	$xb1,@K[1],1
	vspltw	$xb2,@K[1],2
	vspltw	$xb3,@K[1],3

	vspltw	$xc0,@K[2],0
	vspltw	$xc1,@K[2],1
	vspltw	$xc2,@K[2],2
	vspltw	$xc3,@K[2],3

	vmr	$xd0,$CTR			# smash the counter
	vspltw	$xd1,@K[3],1
	vspltw	$xd2,@K[3],2
	vspltw	$xd3,@K[3],3

	vspltisw	$sixteen,-16		# synthesize constants; -16 acts
						# as 16 for 5-bit rotate counts
	vspltisw	$twelve,12
	vspltisw	$eight,8
	vspltisw	$seven,7

Loop_vsx:
___
foreach (&VSX_lane_ROUND(0, 4, 8,12)) { eval; }	# column round
foreach (&VSX_lane_ROUND(0, 5,10,15)) { eval; }	# diagonal round
$code.=<<___;
	bdnz	Loop_vsx			# 10 iterations = 20 rounds

	vadduwm	$xd0,$xd0,$CTR			# fold per-lane counters into row d

	# Transpose the lane-interleaved state so that each of the a/b/c/d
	# registers ends up holding 16 contiguous bytes of one keystream
	# block (blocks 0..3 in $x?0..$x?3 respectively).
	vmrgew	$xt0,$xa0,$xa1			# transpose data
	vmrgew	$xt1,$xa2,$xa3
	vmrgow	$xa0,$xa0,$xa1
	vmrgow	$xa2,$xa2,$xa3
	vmrgew	$xt2,$xb0,$xb1
	vmrgew	$xt3,$xb2,$xb3
	vpermdi	$xa1,$xa0,$xa2,0b00
	vpermdi	$xa3,$xa0,$xa2,0b11
	vpermdi	$xa0,$xt0,$xt1,0b00
	vpermdi	$xa2,$xt0,$xt1,0b11

	vmrgow	$xb0,$xb0,$xb1
	vmrgow	$xb2,$xb2,$xb3
	vmrgew	$xt0,$xc0,$xc1
	vmrgew	$xt1,$xc2,$xc3
	vpermdi	$xb1,$xb0,$xb2,0b00
	vpermdi	$xb3,$xb0,$xb2,0b11
	vpermdi	$xb0,$xt2,$xt3,0b00
	vpermdi	$xb2,$xt2,$xt3,0b11

	vmrgow	$xc0,$xc0,$xc1
	vmrgow	$xc2,$xc2,$xc3
	vmrgew	$xt2,$xd0,$xd1
	vmrgew	$xt3,$xd2,$xd3
	vpermdi	$xc1,$xc0,$xc2,0b00
	vpermdi	$xc3,$xc0,$xc2,0b11
	vpermdi	$xc0,$xt0,$xt1,0b00
	vpermdi	$xc2,$xt0,$xt1,0b11

	vmrgow	$xd0,$xd0,$xd1
	vmrgow	$xd2,$xd2,$xd3
	vspltisw	$xt0,4
	vadduwm	$CTR,$CTR,$xt0			# next counter value (+4 blocks)
	vpermdi	$xd1,$xd0,$xd2,0b00
	vpermdi	$xd3,$xd0,$xd2,0b11
	vpermdi	$xd0,$xt2,$xt3,0b00
	vpermdi	$xd2,$xt2,$xt3,0b11

	# Block 0: add input key material, byte-swap on BE, xor with input.
	vadduwm	$xa0,$xa0,@K[0]
	vadduwm	$xb0,$xb0,@K[1]
	vadduwm	$xc0,$xc0,@K[2]
	vadduwm	$xd0,$xd0,@K[3]

	be?vperm	$xa0,$xa0,$xa0,$beperm
	be?vperm	$xb0,$xb0,$xb0,$beperm
	be?vperm	$xc0,$xc0,$xc0,$beperm
	be?vperm	$xd0,$xd0,$xd0,$beperm

	${UCMP}i	$len,0x40
	blt	Ltail_vsx			# < 64 bytes left -> partial block

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx

	# Block 1: same pattern, fed from the second transposed lane.
	vadduwm	$xa0,$xa1,@K[0]
	vadduwm	$xb0,$xb1,@K[1]
	vadduwm	$xc0,$xc1,@K[2]
	vadduwm	$xd0,$xd1,@K[3]

	be?vperm	$xa0,$xa0,$xa0,$beperm
	be?vperm	$xb0,$xb0,$xb0,$beperm
	be?vperm	$xc0,$xc0,$xc0,$beperm
	be?vperm	$xd0,$xd0,$xd0,$beperm

	${UCMP}i	$len,0x40
	blt	Ltail_vsx

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx

	# Block 2: third transposed lane.
	vadduwm	$xa0,$xa2,@K[0]
	vadduwm	$xb0,$xb2,@K[1]
	vadduwm	$xc0,$xc2,@K[2]
	vadduwm	$xd0,$xd2,@K[3]

	be?vperm	$xa0,$xa0,$xa0,$beperm
	be?vperm	$xb0,$xb0,$xb0,$beperm
	be?vperm	$xc0,$xc0,$xc0,$beperm
	be?vperm	$xd0,$xd0,$xd0,$beperm

	${UCMP}i	$len,0x40
	blt	Ltail_vsx

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx

	# Block 3: fourth transposed lane.
	vadduwm	$xa0,$xa3,@K[0]
	vadduwm	$xb0,$xb3,@K[1]
	vadduwm	$xc0,$xc3,@K[2]
	vadduwm	$xd0,$xd3,@K[3]

	be?vperm	$xa0,$xa0,$xa0,$beperm
	be?vperm	$xb0,$xb0,$xb0,$beperm
	be?vperm	$xc0,$xc0,$xc0,$beperm
	be?vperm	$xd0,$xd0,$xd0,$beperm

	${UCMP}i	$len,0x40
	blt	Ltail_vsx

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	mtctr	r0				# reload 10-iteration round count
	bne	Loop_outer_vsx

Ldone_vsx:
	lwz	r12,`$FRAME-4`($sp)		# pull vrsave
	li	r10,`15+$LOCALS+64`
	li	r11,`31+$LOCALS+64`
	$POP	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256,r12				# restore vrsave
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr

.align	4
Ltail_vsx:					# remaining bytes (< 64);
						# NOTE(review): assumes $len!=0
						# here, else bdnz wraps -- confirm
						# caller rejects zero length
	addi	r11,$sp,$LOCALS
	mtctr	$len
	stvx_4w	$xa0,$x00,r11			# offload block to stack
	stvx_4w	$xb0,$x10,r11
	stvx_4w	$xc0,$x20,r11
	stvx_4w	$xd0,$x30,r11
	subi	r12,r11,1			# prepare for *++ptr
	subi	$inp,$inp,1
	subi	$out,$out,1

Loop_tail_vsx:					# byte-at-a-time keystream xor
	lbzu	r6,1(r12)
	lbzu	r7,1($inp)
	xor	r6,r6,r7
	stbu	r6,1($out)
	bdnz	Loop_tail_vsx

	stvx_4w	$K[0],$x00,r11			# wipe copy of the block
	stvx_4w	$K[0],$x10,r11
	stvx_4w	$K[0],$x20,r11
	stvx_4w	$K[0],$x30,r11

	b	Ldone_vsx
	.long	0
	.byte	0,12,0x04,1,0x80,0,5,0
	.long	0
.size	.ChaCha20_ctr32_vsx,.-.ChaCha20_ctr32_vsx
___
}}}
# Constant pool shared by the code paths, addressed PC-relatively via
# the bcl/mflr trick in Lconsts.
$code.=<<___;

.align	5
Lconsts:
	mflr	r0
	bcl	20,31,\$+4		# branch to next insn, PC into LR
	mflr	r12			#vvvvv "distance between . and Lsigma
	addi	r12,r12,`64-8`		# LR points at Lconsts+8; +56 -> Lsigma
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`		# pad so Lsigma sits at Lconsts+64
Lsigma:
	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# "expand 32-byte k"
	.long	1,0,0,0			# counter increment, word[0] only
	.long	4,0,0,0			# counter increment per 4x iteration
___
# Byte-order permutation tables at Lsigma+0x30/0x40.
# NOTE(review): presumably consumed by the lvx_4w/stvx_4w macros defined
# earlier in the file -- confirm against that definition.
$code.=<<___		if ($LITTLE_ENDIAN);
	.long	0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001
	.long	0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300
___
$code.=<<___		if (!$LITTLE_ENDIAN);	# flipped words
	.long	0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d
	.long	0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c
___
# "Smashed" sigma at Lsigma+0x50: each sigma word replicated across a
# full vector, as loaded by Loop_outer_vsx; {0,1,2,3} at Lsigma+0x90
# supplies the per-lane counter offsets.
$code.=<<___;
	.long	0x61707865,0x61707865,0x61707865,0x61707865
	.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
	.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
	.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
	.long	0,1,2,3
.asciz	"ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
9e58d119 AP |
1329 | |
# Post-process the accumulated assembly line by line and print it:
# resolve `...` compile-time arithmetic, then rewrite endian-specific
# instruction markers for the target flavour.  The or-chains apply at
# most one substitution per line (short-circuit on first match).
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;	# evaluate backquoted constant exprs

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour !~ /le$/) {	# big-endian
	    s/be\?//		or	# keep be?-instructions as-is
	    s/le\?/#le#/	or	# comment out le?-instructions
	    s/\?lvsr/lvsl/	or	# ?-marked lvsr/lvsl swap meaning
	    s/\?lvsl/lvsr/	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
	    s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/;
	} else {			# little-endian
	    s/le\?//		or	# keep le?-instructions as-is
	    s/be\?/#be#/	or	# comment out be?-instructions
	    s/\?([a-z]+)/$1/	or	# strip '?' marker, keep instruction
	    s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/;
	}

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";	# flush errors are fatal