1#! /usr/bin/env perl
2# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# October 2015
18#
19# ChaCha20 for PowerPC/AltiVec.
20#
21# June 2018
22#
23# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for
24# processors that can't issue more than one vector instruction per
25# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x
26# interleave would perform better. Incidentally PowerISA 2.07 (first
27# implemented by POWER8) defined new usable instructions, hence 4xVSX
28# code path...
29#
30# Performance in cycles per byte out of large buffer.
31#
32# IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX
33#
34# Freescale e300 13.6/+115% - -
35# PPC74x0/G4e 6.81/+310% 3.81 -
36# PPC970/G5 9.29/+160% ? -
37# POWER7 8.62/+61% 3.35 -
38# POWER8 8.70/+51% 2.91 2.09
39# POWER9 8.80/+29% 4.44(*) 2.45(**)
40#
41# (*) this is a trade-off result; it's possible to improve it, but
42# doing so would negatively affect all the others;
43# (**) POWER9 seems to be "allergic" to mixing vector and integer
44# instructions, which is why the switch to vector-only code pays
45# off that much;
46
47# $output is the last argument if it looks like a file (it has an extension)
48# $flavour is the first argument if it doesn't look like a file
49$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
50$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
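# For reference (an assumed invocation, not taken from this file), the build
# typically runs something like
#   perl chachap10-ppc.pl linux64le chachap10-ppc.s
# so that $flavour becomes "linux64le" and $output "chachap10-ppc.s".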
51
52if ($flavour =~ /64/) {
53 $SIZE_T =8;
54 $LRSAVE =2*$SIZE_T;
55 $STU ="stdu";
56 $POP ="ld";
57 $PUSH ="std";
58 $UCMP ="cmpld";
59} elsif ($flavour =~ /32/) {
60 $SIZE_T =4;
61 $LRSAVE =$SIZE_T;
62 $STU ="stwu";
63 $POP ="lwz";
64 $PUSH ="stw";
65 $UCMP ="cmplw";
66} else { die "nonsense $flavour"; }
67
68$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;
69
70$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
72( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
73die "can't locate ppc-xlate.pl";
74
75open STDOUT,"| $^X $xlate $flavour \"$output\""
76 or die "can't call $xlate: $!";
77
78$LOCALS=6*$SIZE_T;
79$FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables
80
81sub AUTOLOAD() # thunk [simplified] x86-style perlasm
82{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
83 $code .= "\t$opcode\t".join(',',@_)."\n";
84}
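# For example, a Perl-level call such as &vadduwm("v0","v0","v4") is caught by
# AUTOLOAD above and appends "\tvadduwm\tv0,v0,v4\n" to $code; an underscore in
# the sub name would be emitted as a '.' in the mnemonic.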
85
86my $sp = "r1";
87
88my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
89
90
91{{{
92my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
93 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15));
94my @K = map("v$_",(16..19));
95my $CTR = "v26";
96my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30));
97my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3);
98my $beperm = "v31";
99
100my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
101
102my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload
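# (The offload area is one 16-byte slot larger than the six registers strictly
# need, presumably slack for the 16-byte round-up of the save-area base, since
# stvx/lvx ignore the low four bits of the effective address.)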
103
104
105sub VSX_lane_ROUND_4x {
106my ($a0,$b0,$c0,$d0)=@_;
107my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
108my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
109my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
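# Worked example of the index derivation above: the column round is invoked as
# (0,4,8,12) and the map rotates the low two bits within each group of four,
# giving (1,5,9,13), (2,6,10,14) and (3,7,11,15); the diagonal round is invoked
# as (0,5,10,15), giving (1,6,11,12), (2,7,8,13) and (3,4,9,14).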
110my @x=map("\"v$_\"",(0..15));
111
112 (
113 "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1
114 "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2
115 "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3
116 "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4
117 "&vxor (@x[$d0],@x[$d0],@x[$a0])",
118 "&vxor (@x[$d1],@x[$d1],@x[$a1])",
119 "&vxor (@x[$d2],@x[$d2],@x[$a2])",
120 "&vxor (@x[$d3],@x[$d3],@x[$a3])",
121 "&vrlw (@x[$d0],@x[$d0],'$sixteen')",
122 "&vrlw (@x[$d1],@x[$d1],'$sixteen')",
123 "&vrlw (@x[$d2],@x[$d2],'$sixteen')",
124 "&vrlw (@x[$d3],@x[$d3],'$sixteen')",
125
126 "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
127 "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
128 "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
129 "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
130 "&vxor (@x[$b0],@x[$b0],@x[$c0])",
131 "&vxor (@x[$b1],@x[$b1],@x[$c1])",
132 "&vxor (@x[$b2],@x[$b2],@x[$c2])",
133 "&vxor (@x[$b3],@x[$b3],@x[$c3])",
134 "&vrlw (@x[$b0],@x[$b0],'$twelve')",
135 "&vrlw (@x[$b1],@x[$b1],'$twelve')",
136 "&vrlw (@x[$b2],@x[$b2],'$twelve')",
137 "&vrlw (@x[$b3],@x[$b3],'$twelve')",
138
139 "&vadduwm (@x[$a0],@x[$a0],@x[$b0])",
140 "&vadduwm (@x[$a1],@x[$a1],@x[$b1])",
141 "&vadduwm (@x[$a2],@x[$a2],@x[$b2])",
142 "&vadduwm (@x[$a3],@x[$a3],@x[$b3])",
143 "&vxor (@x[$d0],@x[$d0],@x[$a0])",
144 "&vxor (@x[$d1],@x[$d1],@x[$a1])",
145 "&vxor (@x[$d2],@x[$d2],@x[$a2])",
146 "&vxor (@x[$d3],@x[$d3],@x[$a3])",
147 "&vrlw (@x[$d0],@x[$d0],'$eight')",
148 "&vrlw (@x[$d1],@x[$d1],'$eight')",
149 "&vrlw (@x[$d2],@x[$d2],'$eight')",
150 "&vrlw (@x[$d3],@x[$d3],'$eight')",
151
152 "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
153 "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
154 "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
155 "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
156 "&vxor (@x[$b0],@x[$b0],@x[$c0])",
157 "&vxor (@x[$b1],@x[$b1],@x[$c1])",
158 "&vxor (@x[$b2],@x[$b2],@x[$c2])",
159 "&vxor (@x[$b3],@x[$b3],@x[$c3])",
160 "&vrlw (@x[$b0],@x[$b0],'$seven')",
161 "&vrlw (@x[$b1],@x[$b1],'$seven')",
162 "&vrlw (@x[$b2],@x[$b2],'$seven')",
163 "&vrlw (@x[$b3],@x[$b3],'$seven')"
164 );
165}
166
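# The sequence generated above is four interleaved ChaCha quarter-rounds, with
# the rotation amounts 16/12/8/7 taken from vector registers. A minimal scalar
# sketch of a single quarter-round, for reference only (this helper is purely
# illustrative and is not used when generating the assembly):
sub __chacha_quarter_round_ref {
	my ($a,$b,$c,$d) = @_;		# four 32-bit state words
	my $rotl = sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n))) & 0xffffffff };
	$a = ($a+$b) & 0xffffffff;	$d = $rotl->($d^$a,16);
	$c = ($c+$d) & 0xffffffff;	$b = $rotl->($b^$c,12);
	$a = ($a+$b) & 0xffffffff;	$d = $rotl->($d^$a, 8);
	$c = ($c+$d) & 0xffffffff;	$b = $rotl->($b^$c, 7);
	($a,$b,$c,$d);
}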
167$code.=<<___;
168
169.globl .ChaCha20_ctr32_vsx_p10
170.align 5
171.ChaCha20_ctr32_vsx_p10:
172	${UCMP}i $len,255
173	ble	.Not_greater_than_8x
174 b ChaCha20_ctr32_vsx_8x
175.Not_greater_than_8x:
176 $STU $sp,-$FRAME($sp)
177 mflr r0
178 li r10,`15+$LOCALS+64`
179 li r11,`31+$LOCALS+64`
180 mfspr r12,256
181 stvx v26,r10,$sp
182 addi r10,r10,32
183 stvx v27,r11,$sp
184 addi r11,r11,32
185 stvx v28,r10,$sp
186 addi r10,r10,32
187 stvx v29,r11,$sp
188 addi r11,r11,32
189 stvx v30,r10,$sp
190 stvx v31,r11,$sp
191 stw r12,`$FRAME-4`($sp) # save vrsave
192 li r12,-4096+63
193 $PUSH r0, `$FRAME+$LRSAVE`($sp)
194 mtspr 256,r12 # preserve 29 AltiVec registers
195
196 bl Lconsts # returns pointer Lsigma in r12
197 lvx_4w @K[0],0,r12 # load sigma
198 addi r12,r12,0x70
199 li $x10,16
200 li $x20,32
201 li $x30,48
202 li r11,64
203
204 lvx_4w @K[1],0,$key # load key
205 lvx_4w @K[2],$x10,$key
206 lvx_4w @K[3],0,$ctr # load counter
207
208 vxor $xt0,$xt0,$xt0
209 lvx_4w $xt1,r11,r12
210 vspltw $CTR,@K[3],0
211 vsldoi @K[3],@K[3],$xt0,4
212 vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0]
213 vadduwm $CTR,$CTR,$xt1
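	# CTR (v26) now holds counter+{0,1,2,3}, one block counter per lane;
	# word 0 of K[3] was cleared above because the per-lane counters are
	# carried in CTR instead.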
214
215 be?lvsl $beperm,0,$x10 # 0x00..0f
216 be?vspltisb $xt0,3 # 0x03..03
217 be?vxor $beperm,$beperm,$xt0 # swap bytes within words
218
219 li r0,10 # inner loop counter
220 mtctr r0
221 b Loop_outer_vsx
222
223.align 5
224Loop_outer_vsx:
225 lvx $xa0,$x00,r12 # load [smashed] sigma
226 lvx $xa1,$x10,r12
227 lvx $xa2,$x20,r12
228 lvx $xa3,$x30,r12
229
230 vspltw $xb0,@K[1],0 # smash the key
231 vspltw $xb1,@K[1],1
232 vspltw $xb2,@K[1],2
233 vspltw $xb3,@K[1],3
234
235 vspltw $xc0,@K[2],0
236 vspltw $xc1,@K[2],1
237 vspltw $xc2,@K[2],2
238 vspltw $xc3,@K[2],3
239
240 vmr $xd0,$CTR # smash the counter
241 vspltw $xd1,@K[3],1
242 vspltw $xd2,@K[3],2
243 vspltw $xd3,@K[3],3
244
245 vspltisw $sixteen,-16 # synthesize constants
246 vspltisw $twelve,12
247 vspltisw $eight,8
248 vspltisw $seven,7
249
250Loop_vsx_4x:
251___
252 foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; }
253 foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; }
254$code.=<<___;
255
256 bdnz Loop_vsx_4x
257
258 vadduwm $xd0,$xd0,$CTR
259
260 vmrgew $xt0,$xa0,$xa1 # transpose data
261 vmrgew $xt1,$xa2,$xa3
262 vmrgow $xa0,$xa0,$xa1
263 vmrgow $xa2,$xa2,$xa3
264 vmrgew $xt2,$xb0,$xb1
265 vmrgew $xt3,$xb2,$xb3
266 vpermdi $xa1,$xa0,$xa2,0b00
267 vpermdi $xa3,$xa0,$xa2,0b11
268 vpermdi $xa0,$xt0,$xt1,0b00
269 vpermdi $xa2,$xt0,$xt1,0b11
270
271 vmrgow $xb0,$xb0,$xb1
272 vmrgow $xb2,$xb2,$xb3
273 vmrgew $xt0,$xc0,$xc1
274 vmrgew $xt1,$xc2,$xc3
275 vpermdi $xb1,$xb0,$xb2,0b00
276 vpermdi $xb3,$xb0,$xb2,0b11
277 vpermdi $xb0,$xt2,$xt3,0b00
278 vpermdi $xb2,$xt2,$xt3,0b11
279
280 vmrgow $xc0,$xc0,$xc1
281 vmrgow $xc2,$xc2,$xc3
282 vmrgew $xt2,$xd0,$xd1
283 vmrgew $xt3,$xd2,$xd3
284 vpermdi $xc1,$xc0,$xc2,0b00
285 vpermdi $xc3,$xc0,$xc2,0b11
286 vpermdi $xc0,$xt0,$xt1,0b00
287 vpermdi $xc2,$xt0,$xt1,0b11
288
289 vmrgow $xd0,$xd0,$xd1
290 vmrgow $xd2,$xd2,$xd3
291 vspltisw $xt0,4
292 vadduwm $CTR,$CTR,$xt0 # next counter value
293 vpermdi $xd1,$xd0,$xd2,0b00
294 vpermdi $xd3,$xd0,$xd2,0b11
295 vpermdi $xd0,$xt2,$xt3,0b00
296 vpermdi $xd2,$xt2,$xt3,0b11
297
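	# Each quadruple (xa_i,xb_i,xc_i,xd_i) now holds the 16 words of one
	# keystream block; the feed-forward addition of the input words and the
	# XOR with the data are done one block at a time below.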
298 vadduwm $xa0,$xa0,@K[0]
299 vadduwm $xb0,$xb0,@K[1]
300 vadduwm $xc0,$xc0,@K[2]
301 vadduwm $xd0,$xd0,@K[3]
302
303 be?vperm $xa0,$xa0,$xa0,$beperm
304 be?vperm $xb0,$xb0,$xb0,$beperm
305 be?vperm $xc0,$xc0,$xc0,$beperm
306 be?vperm $xd0,$xd0,$xd0,$beperm
307
308 ${UCMP}i $len,0x40
309 blt Ltail_vsx
310
311 lvx_4w $xt0,$x00,$inp
312 lvx_4w $xt1,$x10,$inp
313 lvx_4w $xt2,$x20,$inp
314 lvx_4w $xt3,$x30,$inp
315
316 vxor $xt0,$xt0,$xa0
317 vxor $xt1,$xt1,$xb0
318 vxor $xt2,$xt2,$xc0
319 vxor $xt3,$xt3,$xd0
320
321 stvx_4w $xt0,$x00,$out
322 stvx_4w $xt1,$x10,$out
323 addi $inp,$inp,0x40
324 stvx_4w $xt2,$x20,$out
325 subi $len,$len,0x40
326 stvx_4w $xt3,$x30,$out
327 addi $out,$out,0x40
328 beq Ldone_vsx
329
330 vadduwm $xa0,$xa1,@K[0]
331 vadduwm $xb0,$xb1,@K[1]
332 vadduwm $xc0,$xc1,@K[2]
333 vadduwm $xd0,$xd1,@K[3]
334
335 be?vperm $xa0,$xa0,$xa0,$beperm
336 be?vperm $xb0,$xb0,$xb0,$beperm
337 be?vperm $xc0,$xc0,$xc0,$beperm
338 be?vperm $xd0,$xd0,$xd0,$beperm
339
340 ${UCMP}i $len,0x40
341 blt Ltail_vsx
342
343 lvx_4w $xt0,$x00,$inp
344 lvx_4w $xt1,$x10,$inp
345 lvx_4w $xt2,$x20,$inp
346 lvx_4w $xt3,$x30,$inp
347
348 vxor $xt0,$xt0,$xa0
349 vxor $xt1,$xt1,$xb0
350 vxor $xt2,$xt2,$xc0
351 vxor $xt3,$xt3,$xd0
352
353 stvx_4w $xt0,$x00,$out
354 stvx_4w $xt1,$x10,$out
355 addi $inp,$inp,0x40
356 stvx_4w $xt2,$x20,$out
357 subi $len,$len,0x40
358 stvx_4w $xt3,$x30,$out
359 addi $out,$out,0x40
360 beq Ldone_vsx
361
362 vadduwm $xa0,$xa2,@K[0]
363 vadduwm $xb0,$xb2,@K[1]
364 vadduwm $xc0,$xc2,@K[2]
365 vadduwm $xd0,$xd2,@K[3]
366
367 be?vperm $xa0,$xa0,$xa0,$beperm
368 be?vperm $xb0,$xb0,$xb0,$beperm
369 be?vperm $xc0,$xc0,$xc0,$beperm
370 be?vperm $xd0,$xd0,$xd0,$beperm
371
372 ${UCMP}i $len,0x40
373 blt Ltail_vsx
374
375 lvx_4w $xt0,$x00,$inp
376 lvx_4w $xt1,$x10,$inp
377 lvx_4w $xt2,$x20,$inp
378 lvx_4w $xt3,$x30,$inp
379
380 vxor $xt0,$xt0,$xa0
381 vxor $xt1,$xt1,$xb0
382 vxor $xt2,$xt2,$xc0
383 vxor $xt3,$xt3,$xd0
384
385 stvx_4w $xt0,$x00,$out
386 stvx_4w $xt1,$x10,$out
387 addi $inp,$inp,0x40
388 stvx_4w $xt2,$x20,$out
389 subi $len,$len,0x40
390 stvx_4w $xt3,$x30,$out
391 addi $out,$out,0x40
392 beq Ldone_vsx
393
394 vadduwm $xa0,$xa3,@K[0]
395 vadduwm $xb0,$xb3,@K[1]
396 vadduwm $xc0,$xc3,@K[2]
397 vadduwm $xd0,$xd3,@K[3]
398
399 be?vperm $xa0,$xa0,$xa0,$beperm
400 be?vperm $xb0,$xb0,$xb0,$beperm
401 be?vperm $xc0,$xc0,$xc0,$beperm
402 be?vperm $xd0,$xd0,$xd0,$beperm
403
404 ${UCMP}i $len,0x40
405 blt Ltail_vsx
406
407 lvx_4w $xt0,$x00,$inp
408 lvx_4w $xt1,$x10,$inp
409 lvx_4w $xt2,$x20,$inp
410 lvx_4w $xt3,$x30,$inp
411
412 vxor $xt0,$xt0,$xa0
413 vxor $xt1,$xt1,$xb0
414 vxor $xt2,$xt2,$xc0
415 vxor $xt3,$xt3,$xd0
416
417 stvx_4w $xt0,$x00,$out
418 stvx_4w $xt1,$x10,$out
419 addi $inp,$inp,0x40
420 stvx_4w $xt2,$x20,$out
421 subi $len,$len,0x40
422 stvx_4w $xt3,$x30,$out
423 addi $out,$out,0x40
424 mtctr r0
425 bne Loop_outer_vsx
426
427Ldone_vsx:
428 lwz r12,`$FRAME-4`($sp) # pull vrsave
429 li r10,`15+$LOCALS+64`
430 li r11,`31+$LOCALS+64`
431 $POP r0, `$FRAME+$LRSAVE`($sp)
432 mtspr 256,r12 # restore vrsave
433 lvx v26,r10,$sp
434 addi r10,r10,32
435 lvx v27,r11,$sp
436 addi r11,r11,32
437 lvx v28,r10,$sp
438 addi r10,r10,32
439 lvx v29,r11,$sp
440 addi r11,r11,32
441 lvx v30,r10,$sp
442 lvx v31,r11,$sp
443 mtlr r0
444 addi $sp,$sp,$FRAME
445 blr
446
447.align 4
448Ltail_vsx:
449 addi r11,$sp,$LOCALS
450 mtctr $len
451 stvx_4w $xa0,$x00,r11 # offload block to stack
452 stvx_4w $xb0,$x10,r11
453 stvx_4w $xc0,$x20,r11
454 stvx_4w $xd0,$x30,r11
455 subi r12,r11,1 # prepare for *++ptr
456 subi $inp,$inp,1
457 subi $out,$out,1
458
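	# Handle the final partial block one byte at a time: the keystream block
	# was spilled to the stack above, and the three pointers were pre-decremented
	# so the update-form lbzu/stbu can step through it.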
459Loop_tail_vsx:
460 lbzu r6,1(r12)
461 lbzu r7,1($inp)
462 xor r6,r6,r7
463 stbu r6,1($out)
464 bdnz Loop_tail_vsx
465
466 stvx_4w $K[0],$x00,r11 # wipe copy of the block
467 stvx_4w $K[0],$x10,r11
468 stvx_4w $K[0],$x20,r11
469 stvx_4w $K[0],$x30,r11
470
471 b Ldone_vsx
472 .long 0
473 .byte 0,12,0x04,1,0x80,0,5,0
474 .long 0
475.size .ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10
476___
477}}}
478
479## This is an 8-block parallel implementation. The heart of the ChaCha round uses vector
480# instructions that can only access VSR[32+X], and holding the state of eight blocks takes
481# all 32 of those vector registers. A few register values are therefore parked on the side in
482# VSR[0]-VSR[31], freeing VSR[32+X] for the vector instructions used in the round operation.
483#
484{{{
485#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
486my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
487 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3,
488 $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7,
489 $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31));
490my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15));
491my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3));
492my @K = map("v$_",27,(24..26));
493my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31));
494my $xr0 = "v4";
495my $CTR0 = "v22";
496my $CTR1 = "v5";
497my $beperm = "v31";
498my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
499my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7));
500my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17));
501my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21));
502my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26));
503
504my $FRAME=$LOCALS+64+9*16; # 8*16 is for v24-v31 offload
505
506sub VSX_lane_ROUND_8x {
507my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_;
508my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
509my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
510my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
511my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4));
512my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5));
513my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6));
514my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17));
515my @x=map("\"v$_\"",(0..31));
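# The first quadruple of arguments indexes v0-v15 (blocks 0-3) and the second
# indexes v16-v31 (blocks 4-7); both follow the same column/diagonal rotation
# pattern as in VSX_lane_ROUND_4x above.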
516
517 (
518 "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", #copy v30 to v13
519 "&vxxlorc (@x[$c7], $xv9,$xv9)",
520
521 "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1
522 "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2
523 "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3
524 "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4
525 "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", # Q1
526 "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", # Q2
527 "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", # Q3
528 "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", # Q4
529
530 "&vxor (@x[$d0],@x[$d0],@x[$a0])",
531 "&vxor (@x[$d1],@x[$d1],@x[$a1])",
532 "&vxor (@x[$d2],@x[$d2],@x[$a2])",
533 "&vxor (@x[$d3],@x[$d3],@x[$a3])",
534 "&vxor (@x[$d4],@x[$d4],@x[$a4])",
535 "&vxor (@x[$d5],@x[$d5],@x[$a5])",
536 "&vxor (@x[$d6],@x[$d6],@x[$a6])",
537 "&vxor (@x[$d7],@x[$d7],@x[$a7])",
538
539 "&vrlw (@x[$d0],@x[$d0],@x[$c7])",
540 "&vrlw (@x[$d1],@x[$d1],@x[$c7])",
541 "&vrlw (@x[$d2],@x[$d2],@x[$c7])",
542 "&vrlw (@x[$d3],@x[$d3],@x[$c7])",
543 "&vrlw (@x[$d4],@x[$d4],@x[$c7])",
544 "&vrlw (@x[$d5],@x[$d5],@x[$c7])",
545 "&vrlw (@x[$d6],@x[$d6],@x[$c7])",
546 "&vrlw (@x[$d7],@x[$d7],@x[$c7])",
547
548 "&vxxlor ($xv13 ,@x[$a7],@x[$a7])",
549 "&vxxlorc (@x[$c7], $xv15,$xv15)",
550 "&vxxlorc (@x[$a7], $xv10,$xv10)",
551
552 "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
553 "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
554 "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
555 "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
556 "&vadduwm (@x[$c4],@x[$c4],@x[$d4])",
557 "&vadduwm (@x[$c5],@x[$c5],@x[$d5])",
558 "&vadduwm (@x[$c6],@x[$c6],@x[$d6])",
559 "&vadduwm (@x[$c7],@x[$c7],@x[$d7])",
560
561 "&vxor (@x[$b0],@x[$b0],@x[$c0])",
562 "&vxor (@x[$b1],@x[$b1],@x[$c1])",
563 "&vxor (@x[$b2],@x[$b2],@x[$c2])",
564 "&vxor (@x[$b3],@x[$b3],@x[$c3])",
565 "&vxor (@x[$b4],@x[$b4],@x[$c4])",
566 "&vxor (@x[$b5],@x[$b5],@x[$c5])",
567 "&vxor (@x[$b6],@x[$b6],@x[$c6])",
568 "&vxor (@x[$b7],@x[$b7],@x[$c7])",
569
570 "&vrlw (@x[$b0],@x[$b0],@x[$a7])",
571 "&vrlw (@x[$b1],@x[$b1],@x[$a7])",
572 "&vrlw (@x[$b2],@x[$b2],@x[$a7])",
573 "&vrlw (@x[$b3],@x[$b3],@x[$a7])",
574 "&vrlw (@x[$b4],@x[$b4],@x[$a7])",
575 "&vrlw (@x[$b5],@x[$b5],@x[$a7])",
576 "&vrlw (@x[$b6],@x[$b6],@x[$a7])",
577 "&vrlw (@x[$b7],@x[$b7],@x[$a7])",
578
579 "&vxxlorc (@x[$a7], $xv13,$xv13)",
580 "&vxxlor ($xv15 ,@x[$c7],@x[$c7])",
581 "&vxxlorc (@x[$c7], $xv11,$xv11)",
582
583
584 "&vadduwm (@x[$a0],@x[$a0],@x[$b0])",
585 "&vadduwm (@x[$a1],@x[$a1],@x[$b1])",
586 "&vadduwm (@x[$a2],@x[$a2],@x[$b2])",
587 "&vadduwm (@x[$a3],@x[$a3],@x[$b3])",
588 "&vadduwm (@x[$a4],@x[$a4],@x[$b4])",
589 "&vadduwm (@x[$a5],@x[$a5],@x[$b5])",
590 "&vadduwm (@x[$a6],@x[$a6],@x[$b6])",
591 "&vadduwm (@x[$a7],@x[$a7],@x[$b7])",
592
593 "&vxor (@x[$d0],@x[$d0],@x[$a0])",
594 "&vxor (@x[$d1],@x[$d1],@x[$a1])",
595 "&vxor (@x[$d2],@x[$d2],@x[$a2])",
596 "&vxor (@x[$d3],@x[$d3],@x[$a3])",
597 "&vxor (@x[$d4],@x[$d4],@x[$a4])",
598 "&vxor (@x[$d5],@x[$d5],@x[$a5])",
599 "&vxor (@x[$d6],@x[$d6],@x[$a6])",
600 "&vxor (@x[$d7],@x[$d7],@x[$a7])",
601
602 "&vrlw (@x[$d0],@x[$d0],@x[$c7])",
603 "&vrlw (@x[$d1],@x[$d1],@x[$c7])",
604 "&vrlw (@x[$d2],@x[$d2],@x[$c7])",
605 "&vrlw (@x[$d3],@x[$d3],@x[$c7])",
606 "&vrlw (@x[$d4],@x[$d4],@x[$c7])",
607 "&vrlw (@x[$d5],@x[$d5],@x[$c7])",
608 "&vrlw (@x[$d6],@x[$d6],@x[$c7])",
609 "&vrlw (@x[$d7],@x[$d7],@x[$c7])",
610
611 "&vxxlorc (@x[$c7], $xv15,$xv15)",
612 "&vxxlor ($xv13 ,@x[$a7],@x[$a7])",
613 "&vxxlorc (@x[$a7], $xv12,$xv12)",
614
615 "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
616 "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
617 "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
618 "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
619 "&vadduwm (@x[$c4],@x[$c4],@x[$d4])",
620 "&vadduwm (@x[$c5],@x[$c5],@x[$d5])",
621 "&vadduwm (@x[$c6],@x[$c6],@x[$d6])",
622 "&vadduwm (@x[$c7],@x[$c7],@x[$d7])",
623 "&vxor (@x[$b0],@x[$b0],@x[$c0])",
624 "&vxor (@x[$b1],@x[$b1],@x[$c1])",
625 "&vxor (@x[$b2],@x[$b2],@x[$c2])",
626 "&vxor (@x[$b3],@x[$b3],@x[$c3])",
627 "&vxor (@x[$b4],@x[$b4],@x[$c4])",
628 "&vxor (@x[$b5],@x[$b5],@x[$c5])",
629 "&vxor (@x[$b6],@x[$b6],@x[$c6])",
630 "&vxor (@x[$b7],@x[$b7],@x[$c7])",
631 "&vrlw (@x[$b0],@x[$b0],@x[$a7])",
632 "&vrlw (@x[$b1],@x[$b1],@x[$a7])",
633 "&vrlw (@x[$b2],@x[$b2],@x[$a7])",
634 "&vrlw (@x[$b3],@x[$b3],@x[$a7])",
635 "&vrlw (@x[$b4],@x[$b4],@x[$a7])",
636 "&vrlw (@x[$b5],@x[$b5],@x[$a7])",
637 "&vrlw (@x[$b6],@x[$b6],@x[$a7])",
638 "&vrlw (@x[$b7],@x[$b7],@x[$a7])",
639
640 "&vxxlorc (@x[$a7], $xv13,$xv13)",
641 );
642}
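# With all 32 vector registers holding block state, the rotate amounts cannot
# stay resident in VRs. vxxlor parks a VR's value in the scratch bank described
# in the block comment above, and vxxlorc copies a value back into a VR; the
# round repeatedly parks x[c7] or x[a7], borrows that register for one of the
# rotate constants saved earlier (16, 12, 8 and 7), and restores it before the
# original value is needed again.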
643
644$code.=<<___;
645
646.globl .ChaCha20_ctr32_vsx_8x
647.align 5
648.ChaCha20_ctr32_vsx_8x:
649 $STU $sp,-$FRAME($sp)
650 mflr r0
651 li r10,`15+$LOCALS+64`
652 li r11,`31+$LOCALS+64`
653 mfspr r12,256
654 stvx v24,r10,$sp
655 addi r10,r10,32
656 stvx v25,r11,$sp
657 addi r11,r11,32
658 stvx v26,r10,$sp
659 addi r10,r10,32
660 stvx v27,r11,$sp
661 addi r11,r11,32
662 stvx v28,r10,$sp
663 addi r10,r10,32
664 stvx v29,r11,$sp
665 addi r11,r11,32
666 stvx v30,r10,$sp
667 stvx v31,r11,$sp
668 stw r12,`$FRAME-4`($sp) # save vrsave
669 li r12,-4096+63
670 $PUSH r0, `$FRAME+$LRSAVE`($sp)
671 mtspr 256,r12 # preserve 29 AltiVec registers
672
673 bl Lconsts # returns pointer Lsigma in r12
674
675 lvx_4w @K[0],0,r12 # load sigma
676 addi r12,r12,0x70
677 li $x10,16
678 li $x20,32
679 li $x30,48
680 li r11,64
681
682 vspltisw $xa4,-16 # synthesize constants
683 vspltisw $xb4,12 # synthesize constants
684 vspltisw $xc4,8 # synthesize constants
685 vspltisw $xd4,7 # synthesize constants
686
687 lvx $xa0,$x00,r12 # load [smashed] sigma
688 lvx $xa1,$x10,r12
689 lvx $xa2,$x20,r12
690 lvx $xa3,$x30,r12
691
692 vxxlor $xv9 ,$xa4,$xa4 #save shift val in vr9-12
693 vxxlor $xv10 ,$xb4,$xb4
694 vxxlor $xv11 ,$xc4,$xc4
695 vxxlor $xv12 ,$xd4,$xd4
696 vxxlor $xv22 ,$xa0,$xa0 #save sigma in vr22-25
697 vxxlor $xv23 ,$xa1,$xa1
698 vxxlor $xv24 ,$xa2,$xa2
699 vxxlor $xv25 ,$xa3,$xa3
700
701 lvx_4w @K[1],0,$key # load key
702 lvx_4w @K[2],$x10,$key
703 lvx_4w @K[3],0,$ctr # load counter
704 vspltisw $xt3,4
705
706
707 vxor $xt2,$xt2,$xt2
708 lvx_4w $xt1,r11,r12
709	vspltw	$xa2,@K[3],0		# save the original counter after the splat
710 vsldoi @K[3],@K[3],$xt2,4
711 vsldoi @K[3],$xt2,@K[3],12 # clear @K[3].word[0]
712 vadduwm $xt1,$xa2,$xt1
713 vadduwm $xt3,$xt1,$xt3 # next counter value
714	vspltw	$xa0,@K[2],2		# splat K[2] word 2; it is saved to v8 below
715
716 be?lvsl $beperm,0,$x10 # 0x00..0f
717 be?vspltisb $xt0,3 # 0x03..03
718 be?vxor $beperm,$beperm,$xt0 # swap bytes within words
719 be?vxxlor $xv26 ,$beperm,$beperm
720
721 vxxlor $xv0 ,@K[0],@K[0] # K0,k1,k2 to vr0,1,2
722 vxxlor $xv1 ,@K[1],@K[1]
723 vxxlor $xv2 ,@K[2],@K[2]
724 vxxlor $xv3 ,@K[3],@K[3]
725 vxxlor $xv4 ,$xt1,$xt1 #CTR ->4, CTR+4-> 5
726 vxxlor $xv5 ,$xt3,$xt3
727 vxxlor $xv8 ,$xa0,$xa0
728
729 li r0,10 # inner loop counter
730 mtctr r0
731 b Loop_outer_vsx_8x
732
733.align 5
734Loop_outer_vsx_8x:
735 vxxlorc $xa0,$xv22,$xv22 # load [smashed] sigma
736 vxxlorc $xa1,$xv23,$xv23
737 vxxlorc $xa2,$xv24,$xv24
738 vxxlorc $xa3,$xv25,$xv25
739 vxxlorc $xa4,$xv22,$xv22
740 vxxlorc $xa5,$xv23,$xv23
741 vxxlorc $xa6,$xv24,$xv24
742 vxxlorc $xa7,$xv25,$xv25
743
744 vspltw $xb0,@K[1],0 # smash the key
745 vspltw $xb1,@K[1],1
746 vspltw $xb2,@K[1],2
747 vspltw $xb3,@K[1],3
748 vspltw $xb4,@K[1],0 # smash the key
749 vspltw $xb5,@K[1],1
750 vspltw $xb6,@K[1],2
751 vspltw $xb7,@K[1],3
752
753 vspltw $xc0,@K[2],0
754 vspltw $xc1,@K[2],1
755 vspltw $xc2,@K[2],2
756 vspltw $xc3,@K[2],3
757 vspltw $xc4,@K[2],0
758 vspltw $xc7,@K[2],3
759 vspltw $xc5,@K[2],1
760
761 vxxlorc $xd0,$xv4,$xv4 # smash the counter
762 vspltw $xd1,@K[3],1
763 vspltw $xd2,@K[3],2
764 vspltw $xd3,@K[3],3
765 vxxlorc $xd4,$xv5,$xv5 # smash the counter
766 vspltw $xd5,@K[3],1
767 vspltw $xd6,@K[3],2
768 vspltw $xd7,@K[3],3
769	vxxlorc	$xc6,$xv8,$xv8		# the splat of K[2] word 2 was parked in v8; v26 held K[3], so it is copied in only now that K[3] is done
770
771Loop_vsx_8x:
772___
773 foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; }
774 foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; }
775$code.=<<___;
776
777 bdnz Loop_vsx_8x
778 vxxlor $xv13 ,$xd4,$xd4 # save the register vr24-31
779 vxxlor $xv14 ,$xd5,$xd5 #
780 vxxlor $xv15 ,$xd6,$xd6 #
781 vxxlor $xv16 ,$xd7,$xd7 #
782
783 vxxlor $xv18 ,$xc4,$xc4 #
784 vxxlor $xv19 ,$xc5,$xc5 #
785 vxxlor $xv20 ,$xc6,$xc6 #
786 vxxlor $xv21 ,$xc7,$xc7 #
787
788	vxxlor	$xv6 ,$xb6,$xb6		# save v22, so we get 8 regs
789	vxxlor	$xv7 ,$xb7,$xb7		# save v23, so we get 8 regs
790	be?vxxlorc $beperm,$xv26,$xv26	# copy back the beperm.
791
792 vxxlorc @K[0],$xv0,$xv0 #27
793 vxxlorc @K[1],$xv1,$xv1 #24
794 vxxlorc @K[2],$xv2,$xv2 #25
795 vxxlorc @K[3],$xv3,$xv3 #26
796 vxxlorc $CTR0,$xv4,$xv4
797### change to vertical (per-block) layout
798
799 vmrgew $xt0,$xa0,$xa1 # transpose data
800 vmrgew $xt1,$xa2,$xa3
801 vmrgow $xa0,$xa0,$xa1
802 vmrgow $xa2,$xa2,$xa3
803
804 vmrgew $xt2,$xb0,$xb1
805 vmrgew $xt3,$xb2,$xb3
806 vmrgow $xb0,$xb0,$xb1
807 vmrgow $xb2,$xb2,$xb3
808
809 vadduwm $xd0,$xd0,$CTR0
810
811 vpermdi $xa1,$xa0,$xa2,0b00
812 vpermdi $xa3,$xa0,$xa2,0b11
813 vpermdi $xa0,$xt0,$xt1,0b00
814 vpermdi $xa2,$xt0,$xt1,0b11
815 vpermdi $xb1,$xb0,$xb2,0b00
816 vpermdi $xb3,$xb0,$xb2,0b11
817 vpermdi $xb0,$xt2,$xt3,0b00
818 vpermdi $xb2,$xt2,$xt3,0b11
819
820 vmrgew $xt0,$xc0,$xc1
821 vmrgew $xt1,$xc2,$xc3
822 vmrgow $xc0,$xc0,$xc1
823 vmrgow $xc2,$xc2,$xc3
824 vmrgew $xt2,$xd0,$xd1
825 vmrgew $xt3,$xd2,$xd3
826 vmrgow $xd0,$xd0,$xd1
827 vmrgow $xd2,$xd2,$xd3
828
829 vpermdi $xc1,$xc0,$xc2,0b00
830 vpermdi $xc3,$xc0,$xc2,0b11
831 vpermdi $xc0,$xt0,$xt1,0b00
832 vpermdi $xc2,$xt0,$xt1,0b11
833 vpermdi $xd1,$xd0,$xd2,0b00
834 vpermdi $xd3,$xd0,$xd2,0b11
835 vpermdi $xd0,$xt2,$xt3,0b00
836 vpermdi $xd2,$xt2,$xt3,0b11
837
838 vspltisw $xt0,8
839 vadduwm $CTR0,$CTR0,$xt0 # next counter value
840 vxxlor $xv4 ,$CTR0,$CTR0 #CTR+4-> 5
841
842 vadduwm $xa0,$xa0,@K[0]
843 vadduwm $xb0,$xb0,@K[1]
844 vadduwm $xc0,$xc0,@K[2]
845 vadduwm $xd0,$xd0,@K[3]
846
847 be?vperm $xa0,$xa0,$xa0,$beperm
848 be?vperm $xb0,$xb0,$xb0,$beperm
849 be?vperm $xc0,$xc0,$xc0,$beperm
850 be?vperm $xd0,$xd0,$xd0,$beperm
851
852 ${UCMP}i $len,0x40
853 blt Ltail_vsx_8x
854
855 lvx_4w $xt0,$x00,$inp
856 lvx_4w $xt1,$x10,$inp
857 lvx_4w $xt2,$x20,$inp
858 lvx_4w $xt3,$x30,$inp
859
860 vxor $xt0,$xt0,$xa0
861 vxor $xt1,$xt1,$xb0
862 vxor $xt2,$xt2,$xc0
863 vxor $xt3,$xt3,$xd0
864
865 stvx_4w $xt0,$x00,$out
866 stvx_4w $xt1,$x10,$out
867 addi $inp,$inp,0x40
868 stvx_4w $xt2,$x20,$out
869 subi $len,$len,0x40
870 stvx_4w $xt3,$x30,$out
871 addi $out,$out,0x40
872 beq Ldone_vsx_8x
873
874 vadduwm $xa0,$xa1,@K[0]
875 vadduwm $xb0,$xb1,@K[1]
876 vadduwm $xc0,$xc1,@K[2]
877 vadduwm $xd0,$xd1,@K[3]
878
879 be?vperm $xa0,$xa0,$xa0,$beperm
880 be?vperm $xb0,$xb0,$xb0,$beperm
881 be?vperm $xc0,$xc0,$xc0,$beperm
882 be?vperm $xd0,$xd0,$xd0,$beperm
883
884 ${UCMP}i $len,0x40
885 blt Ltail_vsx_8x
886
887 lvx_4w $xt0,$x00,$inp
888 lvx_4w $xt1,$x10,$inp
889 lvx_4w $xt2,$x20,$inp
890 lvx_4w $xt3,$x30,$inp
891
892 vxor $xt0,$xt0,$xa0
893 vxor $xt1,$xt1,$xb0
894 vxor $xt2,$xt2,$xc0
895 vxor $xt3,$xt3,$xd0
896
897 stvx_4w $xt0,$x00,$out
898 stvx_4w $xt1,$x10,$out
899 addi $inp,$inp,0x40
900 stvx_4w $xt2,$x20,$out
901 subi $len,$len,0x40
902 stvx_4w $xt3,$x30,$out
903 addi $out,$out,0x40
904 beq Ldone_vsx_8x
905
906 vadduwm $xa0,$xa2,@K[0]
907 vadduwm $xb0,$xb2,@K[1]
908 vadduwm $xc0,$xc2,@K[2]
909 vadduwm $xd0,$xd2,@K[3]
910
911 be?vperm $xa0,$xa0,$xa0,$beperm
912 be?vperm $xb0,$xb0,$xb0,$beperm
913 be?vperm $xc0,$xc0,$xc0,$beperm
914 be?vperm $xd0,$xd0,$xd0,$beperm
915
916 ${UCMP}i $len,0x40
917 blt Ltail_vsx_8x
918
919 lvx_4w $xt0,$x00,$inp
920 lvx_4w $xt1,$x10,$inp
921 lvx_4w $xt2,$x20,$inp
922 lvx_4w $xt3,$x30,$inp
923
924 vxor $xt0,$xt0,$xa0
925 vxor $xt1,$xt1,$xb0
926 vxor $xt2,$xt2,$xc0
927 vxor $xt3,$xt3,$xd0
928
929 stvx_4w $xt0,$x00,$out
930 stvx_4w $xt1,$x10,$out
931 addi $inp,$inp,0x40
932 stvx_4w $xt2,$x20,$out
933 subi $len,$len,0x40
934 stvx_4w $xt3,$x30,$out
935 addi $out,$out,0x40
936 beq Ldone_vsx_8x
937
938 vadduwm $xa0,$xa3,@K[0]
939 vadduwm $xb0,$xb3,@K[1]
940 vadduwm $xc0,$xc3,@K[2]
941 vadduwm $xd0,$xd3,@K[3]
942
943 be?vperm $xa0,$xa0,$xa0,$beperm
944 be?vperm $xb0,$xb0,$xb0,$beperm
945 be?vperm $xc0,$xc0,$xc0,$beperm
946 be?vperm $xd0,$xd0,$xd0,$beperm
947
948 ${UCMP}i $len,0x40
949 blt Ltail_vsx_8x
950
951 lvx_4w $xt0,$x00,$inp
952 lvx_4w $xt1,$x10,$inp
953 lvx_4w $xt2,$x20,$inp
954 lvx_4w $xt3,$x30,$inp
955
956 vxor $xt0,$xt0,$xa0
957 vxor $xt1,$xt1,$xb0
958 vxor $xt2,$xt2,$xc0
959 vxor $xt3,$xt3,$xd0
960
961 stvx_4w $xt0,$x00,$out
962 stvx_4w $xt1,$x10,$out
963 addi $inp,$inp,0x40
964 stvx_4w $xt2,$x20,$out
965 subi $len,$len,0x40
966 stvx_4w $xt3,$x30,$out
967 addi $out,$out,0x40
968 beq Ldone_vsx_8x
969
970# Blocks 4-7: the same logic as above applies. Registers a4-b7 stay where they are, while c4-d7 are reloaded into positions v8-v15 so that v24-v31 can be reused.
971# v0-v3 are used to load temporary values, and v4 serves as xr0 in place of xt0.
972
973 vxxlorc $CTR1 ,$xv5,$xv5
974
975 vxxlorc $xcn4 ,$xv18,$xv18
976 vxxlorc $xcn5 ,$xv19,$xv19
977 vxxlorc $xcn6 ,$xv20,$xv20
978 vxxlorc $xcn7 ,$xv21,$xv21
979
980 vxxlorc $xdn4 ,$xv13,$xv13
981 vxxlorc $xdn5 ,$xv14,$xv14
982 vxxlorc $xdn6 ,$xv15,$xv15
983 vxxlorc $xdn7 ,$xv16,$xv16
984 vadduwm $xdn4,$xdn4,$CTR1
985
986 vxxlorc $xb6 ,$xv6,$xv6
987 vxxlorc $xb7 ,$xv7,$xv7
988# use xr0 in place of xt0 for blocks 4-7
989
990 vmrgew $xr0,$xa4,$xa5 # transpose data
991 vmrgew $xt1,$xa6,$xa7
992 vmrgow $xa4,$xa4,$xa5
993 vmrgow $xa6,$xa6,$xa7
994 vmrgew $xt2,$xb4,$xb5
995 vmrgew $xt3,$xb6,$xb7
996 vmrgow $xb4,$xb4,$xb5
997 vmrgow $xb6,$xb6,$xb7
998
999 vpermdi $xa5,$xa4,$xa6,0b00
1000 vpermdi $xa7,$xa4,$xa6,0b11
1001 vpermdi $xa4,$xr0,$xt1,0b00
1002 vpermdi $xa6,$xr0,$xt1,0b11
1003 vpermdi $xb5,$xb4,$xb6,0b00
1004 vpermdi $xb7,$xb4,$xb6,0b11
1005 vpermdi $xb4,$xt2,$xt3,0b00
1006 vpermdi $xb6,$xt2,$xt3,0b11
1007
1008 vmrgew $xr0,$xcn4,$xcn5
1009 vmrgew $xt1,$xcn6,$xcn7
1010 vmrgow $xcn4,$xcn4,$xcn5
1011 vmrgow $xcn6,$xcn6,$xcn7
1012 vmrgew $xt2,$xdn4,$xdn5
1013 vmrgew $xt3,$xdn6,$xdn7
1014 vmrgow $xdn4,$xdn4,$xdn5
1015 vmrgow $xdn6,$xdn6,$xdn7
1016
1017 vpermdi $xcn5,$xcn4,$xcn6,0b00
1018 vpermdi $xcn7,$xcn4,$xcn6,0b11
1019 vpermdi $xcn4,$xr0,$xt1,0b00
1020 vpermdi $xcn6,$xr0,$xt1,0b11
1021 vpermdi $xdn5,$xdn4,$xdn6,0b00
1022 vpermdi $xdn7,$xdn4,$xdn6,0b11
1023 vpermdi $xdn4,$xt2,$xt3,0b00
1024 vpermdi $xdn6,$xt2,$xt3,0b11
1025
1026 vspltisw $xr0,8
1027 vadduwm $CTR1,$CTR1,$xr0 # next counter value
1028 vxxlor $xv5 ,$CTR1,$CTR1 #CTR+4-> 5
1029
1030 vadduwm $xan0,$xa4,@K[0]
1031 vadduwm $xbn0,$xb4,@K[1]
1032 vadduwm $xcn0,$xcn4,@K[2]
1033 vadduwm $xdn0,$xdn4,@K[3]
1034
1035 be?vperm $xan0,$xa4,$xa4,$beperm
1036 be?vperm $xbn0,$xb4,$xb4,$beperm
1037 be?vperm $xcn0,$xcn4,$xcn4,$beperm
1038 be?vperm $xdn0,$xdn4,$xdn4,$beperm
1039
1040 ${UCMP}i $len,0x40
1041 blt Ltail_vsx_8x_1
1042
1043 lvx_4w $xr0,$x00,$inp
1044 lvx_4w $xt1,$x10,$inp
1045 lvx_4w $xt2,$x20,$inp
1046 lvx_4w $xt3,$x30,$inp
1047
1048 vxor $xr0,$xr0,$xan0
1049 vxor $xt1,$xt1,$xbn0
1050 vxor $xt2,$xt2,$xcn0
1051 vxor $xt3,$xt3,$xdn0
1052
1053 stvx_4w $xr0,$x00,$out
1054 stvx_4w $xt1,$x10,$out
1055 addi $inp,$inp,0x40
1056 stvx_4w $xt2,$x20,$out
1057 subi $len,$len,0x40
1058 stvx_4w $xt3,$x30,$out
1059 addi $out,$out,0x40
1060 beq Ldone_vsx_8x
1061
1062 vadduwm $xan0,$xa5,@K[0]
1063 vadduwm $xbn0,$xb5,@K[1]
1064 vadduwm $xcn0,$xcn5,@K[2]
1065 vadduwm $xdn0,$xdn5,@K[3]
1066
1067 be?vperm $xan0,$xan0,$xan0,$beperm
1068 be?vperm $xbn0,$xbn0,$xbn0,$beperm
1069 be?vperm $xcn0,$xcn0,$xcn0,$beperm
1070 be?vperm $xdn0,$xdn0,$xdn0,$beperm
1071
1072 ${UCMP}i $len,0x40
1073 blt Ltail_vsx_8x_1
1074
1075 lvx_4w $xr0,$x00,$inp
1076 lvx_4w $xt1,$x10,$inp
1077 lvx_4w $xt2,$x20,$inp
1078 lvx_4w $xt3,$x30,$inp
1079
1080 vxor $xr0,$xr0,$xan0
1081 vxor $xt1,$xt1,$xbn0
1082 vxor $xt2,$xt2,$xcn0
1083 vxor $xt3,$xt3,$xdn0
1084
1085 stvx_4w $xr0,$x00,$out
1086 stvx_4w $xt1,$x10,$out
1087 addi $inp,$inp,0x40
1088 stvx_4w $xt2,$x20,$out
1089 subi $len,$len,0x40
1090 stvx_4w $xt3,$x30,$out
1091 addi $out,$out,0x40
1092 beq Ldone_vsx_8x
1093
1094 vadduwm $xan0,$xa6,@K[0]
1095 vadduwm $xbn0,$xb6,@K[1]
1096 vadduwm $xcn0,$xcn6,@K[2]
1097 vadduwm $xdn0,$xdn6,@K[3]
1098
1099 be?vperm $xan0,$xan0,$xan0,$beperm
1100 be?vperm $xbn0,$xbn0,$xbn0,$beperm
1101 be?vperm $xcn0,$xcn0,$xcn0,$beperm
1102 be?vperm $xdn0,$xdn0,$xdn0,$beperm
1103
1104 ${UCMP}i $len,0x40
1105 blt Ltail_vsx_8x_1
1106
1107 lvx_4w $xr0,$x00,$inp
1108 lvx_4w $xt1,$x10,$inp
1109 lvx_4w $xt2,$x20,$inp
1110 lvx_4w $xt3,$x30,$inp
1111
1112 vxor $xr0,$xr0,$xan0
1113 vxor $xt1,$xt1,$xbn0
1114 vxor $xt2,$xt2,$xcn0
1115 vxor $xt3,$xt3,$xdn0
1116
1117 stvx_4w $xr0,$x00,$out
1118 stvx_4w $xt1,$x10,$out
1119 addi $inp,$inp,0x40
1120 stvx_4w $xt2,$x20,$out
1121 subi $len,$len,0x40
1122 stvx_4w $xt3,$x30,$out
1123 addi $out,$out,0x40
1124 beq Ldone_vsx_8x
1125
1126 vadduwm $xan0,$xa7,@K[0]
1127 vadduwm $xbn0,$xb7,@K[1]
1128 vadduwm $xcn0,$xcn7,@K[2]
1129 vadduwm $xdn0,$xdn7,@K[3]
1130
1131 be?vperm $xan0,$xan0,$xan0,$beperm
1132 be?vperm $xbn0,$xbn0,$xbn0,$beperm
1133 be?vperm $xcn0,$xcn0,$xcn0,$beperm
1134 be?vperm $xdn0,$xdn0,$xdn0,$beperm
1135
1136 ${UCMP}i $len,0x40
1137 blt Ltail_vsx_8x_1
1138
1139 lvx_4w $xr0,$x00,$inp
1140 lvx_4w $xt1,$x10,$inp
1141 lvx_4w $xt2,$x20,$inp
1142 lvx_4w $xt3,$x30,$inp
1143
1144 vxor $xr0,$xr0,$xan0
1145 vxor $xt1,$xt1,$xbn0
1146 vxor $xt2,$xt2,$xcn0
1147 vxor $xt3,$xt3,$xdn0
1148
1149 stvx_4w $xr0,$x00,$out
1150 stvx_4w $xt1,$x10,$out
1151 addi $inp,$inp,0x40
1152 stvx_4w $xt2,$x20,$out
1153 subi $len,$len,0x40
1154 stvx_4w $xt3,$x30,$out
1155 addi $out,$out,0x40
1156 beq Ldone_vsx_8x
1157
1158 mtctr r0
1159 bne Loop_outer_vsx_8x
1160
1161Ldone_vsx_8x:
1162 lwz r12,`$FRAME-4`($sp) # pull vrsave
1163 li r10,`15+$LOCALS+64`
1164 li r11,`31+$LOCALS+64`
1165 $POP r0, `$FRAME+$LRSAVE`($sp)
1166 mtspr 256,r12 # restore vrsave
1167 lvx v24,r10,$sp
1168 addi r10,r10,32
1169 lvx v25,r11,$sp
1170 addi r11,r11,32
1171 lvx v26,r10,$sp
1172 addi r10,r10,32
1173 lvx v27,r11,$sp
1174 addi r11,r11,32
1175 lvx v28,r10,$sp
1176 addi r10,r10,32
1177 lvx v29,r11,$sp
1178 addi r11,r11,32
1179 lvx v30,r10,$sp
1180 lvx v31,r11,$sp
1181 mtlr r0
1182 addi $sp,$sp,$FRAME
1183 blr
1184
1185.align 4
1186Ltail_vsx_8x:
1187 addi r11,$sp,$LOCALS
1188 mtctr $len
1189 stvx_4w $xa0,$x00,r11 # offload block to stack
1190 stvx_4w $xb0,$x10,r11
1191 stvx_4w $xc0,$x20,r11
1192 stvx_4w $xd0,$x30,r11
1193 subi r12,r11,1 # prepare for *++ptr
1194 subi $inp,$inp,1
1195 subi $out,$out,1
1196 bl Loop_tail_vsx_8x
1197Ltail_vsx_8x_1:
1198 addi r11,$sp,$LOCALS
1199 mtctr $len
1200 stvx_4w $xan0,$x00,r11 # offload block to stack
1201 stvx_4w $xbn0,$x10,r11
1202 stvx_4w $xcn0,$x20,r11
1203 stvx_4w $xdn0,$x30,r11
1204 subi r12,r11,1 # prepare for *++ptr
1205 subi $inp,$inp,1
1206 subi $out,$out,1
1207 bl Loop_tail_vsx_8x
1208
1209Loop_tail_vsx_8x:
1210 lbzu r6,1(r12)
1211 lbzu r7,1($inp)
1212 xor r6,r6,r7
1213 stbu r6,1($out)
1214 bdnz Loop_tail_vsx_8x
1215
1216 stvx_4w $K[0],$x00,r11 # wipe copy of the block
1217 stvx_4w $K[0],$x10,r11
1218 stvx_4w $K[0],$x20,r11
1219 stvx_4w $K[0],$x30,r11
1220
1221 b Ldone_vsx_8x
1222 .long 0
1223 .byte 0,12,0x04,1,0x80,0,5,0
1224 .long 0
1225.size .ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x
1226___
1227}}}
1228
1229
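# Lconsts computes the address of the Lsigma table at run time: bcl 20,31,$+4
# sets the link register to the address of the following instruction, mflr r12
# reads it back, and the addi adjusts it so that r12 points at Lsigma, which is
# padded to sit exactly 64 bytes past Lconsts.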
1230$code.=<<___;
1231.align 5
1232Lconsts:
1233 mflr r0
1234 bcl 20,31,\$+4
1235 mflr r12 #vvvvv "distance between . and Lsigma
1236 addi r12,r12,`64-8`
1237 mtlr r0
1238 blr
1239 .long 0
1240 .byte 0,12,0x14,0,0,0,0,0
1241 .space `64-9*4`
1242Lsigma:
1243 .long 0x61707865,0x3320646e,0x79622d32,0x6b206574
1244 .long 1,0,0,0
1245 .long 2,0,0,0
1246 .long 3,0,0,0
1247 .long 4,0,0,0
1248___
1249$code.=<<___ if ($LITTLE_ENDIAN);
1250 .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001
1251 .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300
1252___
1253$code.=<<___ if (!$LITTLE_ENDIAN); # flipped words
1254 .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d
1255 .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c
1256___
1257$code.=<<___;
1258 .long 0x61707865,0x61707865,0x61707865,0x61707865
1259 .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
1260 .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
1261 .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
1262 .long 0,1,2,3
1263 .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c
1264.asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>"
1265.align 2
1266___
1267
1268foreach (split("\n",$code)) {
1269 s/\`([^\`]*)\`/eval $1/ge;
1270
1271 # instructions prefixed with '?' are endian-specific and need
1272 # to be adjusted accordingly...
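	# e.g. on a big-endian flavour a "be?" prefix is simply dropped and "le?"
	# lines are commented out as "#le#", while "?lvsl"/"?lvsr" are swapped and
	# "?vperm" has its middle operands exchanged; on little-endian flavours
	# the be?/le? roles are reversed and a bare "?" prefix is just stripped.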
1273 if ($flavour !~ /le$/) { # big-endian
1274 s/be\?// or
1275 s/le\?/#le#/ or
1276 s/\?lvsr/lvsl/ or
1277 s/\?lvsl/lvsr/ or
1278 s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
1279 s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/;
1280 } else { # little-endian
1281 s/le\?// or
1282 s/be\?/#be#/ or
1283 s/\?([a-z]+)/$1/ or
1284 s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/;
1285 }
1286
1287 print $_,"\n";
1288}
1289
1290close STDOUT or die "error closing STDOUT: $!";