1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # November 2014
18 #
19 # ChaCha20 for x86_64.
20 #
21 # December 2016
22 #
23 # Add AVX512F code path.
24 #
25 # Performance in cycles per byte out of large buffer.
26 #
27 #                IALU/gcc 4.8(i)  1xSSSE3/SSE2   4xSSSE3     8xAVX2
28 #
29 # P4             9.48/+99%        -/22.7(ii)     -
30 # Core2          7.83/+55%        7.90/8.08      4.35
31 # Westmere       7.19/+50%        5.60/6.70      3.00
32 # Sandy Bridge   8.31/+42%        5.45/6.76      2.72
33 # Ivy Bridge     6.71/+46%        5.40/6.49      2.41
34 # Haswell        5.92/+43%        5.20/6.45      2.42        1.23
35 # Silvermont     12.0/+33%        7.75/7.40      7.03(iii)
36 # Goldmont       10.6/+17%        5.10/-         3.28
37 # Sledgehammer   7.28/+52%        -/14.2(ii)     -
38 # Bulldozer      9.66/+28%        9.85/11.1      3.06(iv)
39 # VIA Nano       10.5/+46%        6.72/8.60      6.05
40 #
41 # (i) compared to older gcc 3.x one can observe a >2x improvement on
42 # most platforms;
43 # (ii) as can be seen, SSE2 performance is too low on legacy
44 # processors; NxSSE2 results are naturally better, but not
45 # impressively better than the IALU ones, which is why you won't
46 # find SSE2 code below;
47 # (iii) this is not an optimal result for Atom because of MSROM
48 # limitations; SSE2 can do better, but the gain is considered too
49 # low to justify the [maintenance] effort;
50 # (iv) Bulldozer actually executes the 4xXOP code path, which delivers 2.20;
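#
# For reference, every code path below implements the same ChaCha20
# quarter-round on 32-bit words:
#
#	a += b;  d ^= a;  d <<<= 16;
#	c += d;  b ^= c;  b <<<= 12;
#	a += b;  d ^= a;  d <<<=  8;
#	c += d;  b ^= c;  b <<<=  7;
#
# The paths differ only in how many 64-byte blocks are processed in
# parallel and in which instructions perform the rotations.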
51
52 $flavour = shift;
53 $output = shift;
54 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
55
56 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
57
58 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
60 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
61 die "can't locate x86_64-xlate.pl";
62
63 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
64 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
66 }
67
68 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
69 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
70 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
71 $avx += 1 if ($1==2.11 && $2>=8);
72 }
73
74 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
75 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
76 $avx = ($1>=10) + ($1>=11);
77 }
78
79 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
80 $avx = ($2>=3.0) + ($2>3.0);
81 }
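# $avx set above is a capability level for the assembler: the AVX/XOP
# path is emitted when it is non-zero, the AVX2 path when it is above 1,
# and the AVX512 probing when it is above 2 -- see the if ($avx),
# if ($avx>1) and if ($avx>2) guards further down.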
82
83 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
84 *STDOUT=*OUT;
85
86 # input parameter block
87 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
88
89 $code.=<<___;
90 .text
91
92 .extern OPENSSL_ia32cap_P
93
94 .align 64
95 .Lzero:
96 .long 0,0,0,0
97 .Lone:
98 .long 1,0,0,0
99 .Linc:
100 .long 0,1,2,3
101 .Lfour:
102 .long 4,4,4,4
103 .Lincy:
104 .long 0,2,4,6,1,3,5,7
105 .Leight:
106 .long 8,8,8,8,8,8,8,8
107 .Lrot16:
108 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
109 .Lrot24:
110 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
111 .Lsigma:
112 .asciz "expand 32-byte k"
113 .align 64
114 .Lincz:
115 .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
116 .Lsixteen:
117 .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
118 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
119 ___
120
121 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
122 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
123 my $arg = pop;
124 $arg = "\$$arg" if ($arg*1 eq $arg);
125 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
126 }
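# As an illustration: a call such as &rol("%eax",16) has no explicit
# handler, so it falls through to AUTOLOAD above, which turns the numeric
# last argument into an immediate and reverses the operand order,
# appending "\trol\t\$16,%eax" to $code.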
127
128 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
129 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
130 @t=("%esi","%edi");
131
132 sub ROUND { # critical path is 24 cycles per round
133 my ($a0,$b0,$c0,$d0)=@_;
134 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
135 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
136 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
137 my ($xc,$xc_)=map("\"$_\"",@t);
138 my @x=map("\"$_\"",@x);
139
140 # Consider order in which variables are addressed by their
141 # index:
142 #
143 # a b c d
144 #
145 # 0 4 8 12 < even round
146 # 1 5 9 13
147 # 2 6 10 14
148 # 3 7 11 15
149 # 0 5 10 15 < odd round
150 # 1 6 11 12
151 # 2 7 8 13
152 # 3 4 9 14
153 #
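# As a worked example of the map() arithmetic above: starting from the
# even-round call ROUND(0,4,8,12), each step of ($_&~3)+(($_+1)&3)
# keeps the multiple-of-4 base and bumps the low two bits mod 4,
# giving (1,5,9,13), (2,6,10,14), (3,7,11,15); the odd-round call
# ROUND(0,5,10,15) expands likewise to (1,6,11,12), (2,7,8,13),
# (3,4,9,14), i.e. the diagonals in the table above.
#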
154 # 'a', 'b' and 'd' are permanently allocated in registers,
155 # @x[0..7,12..15], while the 'c's are maintained in memory. If
156 # you observe the 'c' column, you'll notice that a pair of 'c's is
157 # invariant between rounds. This means that we have to reload
158 # them only once per round, in the middle. This is why you'll see
159 # a bunch of 'c' stores and loads in the middle, but none at
160 # the beginning or end.
161
162 # Normally instructions would be interleaved to favour in-order
163 # execution. Generally out-of-order cores manage it gracefully,
164 # but not this time, for some reason. As in-order execution
165 # cores are a dying breed and old Atom is the only one around,
166 # the instructions are left uninterleaved. Besides, Atom is better
167 # off executing the 1xSSSE3 code anyway...
168
169 (
170 "&add (@x[$a0],@x[$b0])", # Q1
171 "&xor (@x[$d0],@x[$a0])",
172 "&rol (@x[$d0],16)",
173 "&add (@x[$a1],@x[$b1])", # Q2
174 "&xor (@x[$d1],@x[$a1])",
175 "&rol (@x[$d1],16)",
176
177 "&add ($xc,@x[$d0])",
178 "&xor (@x[$b0],$xc)",
179 "&rol (@x[$b0],12)",
180 "&add ($xc_,@x[$d1])",
181 "&xor (@x[$b1],$xc_)",
182 "&rol (@x[$b1],12)",
183
184 "&add (@x[$a0],@x[$b0])",
185 "&xor (@x[$d0],@x[$a0])",
186 "&rol (@x[$d0],8)",
187 "&add (@x[$a1],@x[$b1])",
188 "&xor (@x[$d1],@x[$a1])",
189 "&rol (@x[$d1],8)",
190
191 "&add ($xc,@x[$d0])",
192 "&xor (@x[$b0],$xc)",
193 "&rol (@x[$b0],7)",
194 "&add ($xc_,@x[$d1])",
195 "&xor (@x[$b1],$xc_)",
196 "&rol (@x[$b1],7)",
197
198 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
199 "&mov (\"4*$c1(%rsp)\",$xc_)",
200 "&mov ($xc,\"4*$c2(%rsp)\")",
201 "&mov ($xc_,\"4*$c3(%rsp)\")",
202
203 "&add (@x[$a2],@x[$b2])", # Q3
204 "&xor (@x[$d2],@x[$a2])",
205 "&rol (@x[$d2],16)",
206 "&add (@x[$a3],@x[$b3])", # Q4
207 "&xor (@x[$d3],@x[$a3])",
208 "&rol (@x[$d3],16)",
209
210 "&add ($xc,@x[$d2])",
211 "&xor (@x[$b2],$xc)",
212 "&rol (@x[$b2],12)",
213 "&add ($xc_,@x[$d3])",
214 "&xor (@x[$b3],$xc_)",
215 "&rol (@x[$b3],12)",
216
217 "&add (@x[$a2],@x[$b2])",
218 "&xor (@x[$d2],@x[$a2])",
219 "&rol (@x[$d2],8)",
220 "&add (@x[$a3],@x[$b3])",
221 "&xor (@x[$d3],@x[$a3])",
222 "&rol (@x[$d3],8)",
223
224 "&add ($xc,@x[$d2])",
225 "&xor (@x[$b2],$xc)",
226 "&rol (@x[$b2],7)",
227 "&add ($xc_,@x[$d3])",
228 "&xor (@x[$b3],$xc_)",
229 "&rol (@x[$b3],7)"
230 );
231 }
232
233 ########################################################################
234 # Generic code path that handles all lengths on pre-SSSE3 processors.
235 $code.=<<___;
236 .globl ChaCha20_ctr32
237 .type ChaCha20_ctr32,\@function,5
238 .align 64
239 ChaCha20_ctr32:
240 cmp \$0,$len
241 je .Lno_data
242 mov OPENSSL_ia32cap_P+4(%rip),%r10
243 test \$`1<<(41-32)`,%r10d
244 jnz .LChaCha20_ssse3
245
246 push %rbx
247 push %rbp
248 push %r12
249 push %r13
250 push %r14
251 push %r15
252 sub \$64+24,%rsp
253
254 #movdqa .Lsigma(%rip),%xmm0
255 movdqu ($key),%xmm1
256 movdqu 16($key),%xmm2
257 movdqu ($counter),%xmm3
258 movdqa .Lone(%rip),%xmm4
259
260 #movdqa %xmm0,4*0(%rsp) # key[0]
261 movdqa %xmm1,4*4(%rsp) # key[1]
262 movdqa %xmm2,4*8(%rsp) # key[2]
263 movdqa %xmm3,4*12(%rsp) # key[3]
264 mov $len,%rbp # reassign $len
265 jmp .Loop_outer
266
267 .align 32
268 .Loop_outer:
269 mov \$0x61707865,@x[0] # 'expa'
270 mov \$0x3320646e,@x[1] # 'nd 3'
271 mov \$0x79622d32,@x[2] # '2-by'
272 mov \$0x6b206574,@x[3] # 'te k'
273 mov 4*4(%rsp),@x[4]
274 mov 4*5(%rsp),@x[5]
275 mov 4*6(%rsp),@x[6]
276 mov 4*7(%rsp),@x[7]
277 movd %xmm3,@x[12]
278 mov 4*13(%rsp),@x[13]
279 mov 4*14(%rsp),@x[14]
280 mov 4*15(%rsp),@x[15]
281
282 mov %rbp,64+0(%rsp) # save len
283 mov \$10,%ebp
284 mov $inp,64+8(%rsp) # save inp
285 movq %xmm2,%rsi # "@x[8]"
286 mov $out,64+16(%rsp) # save out
287 mov %rsi,%rdi
288 shr \$32,%rdi # "@x[9]"
289 jmp .Loop
290
291 .align 32
292 .Loop:
293 ___
294 foreach (&ROUND (0, 4, 8,12)) { eval; }
295 foreach (&ROUND (0, 5,10,15)) { eval; }
296 &dec ("%ebp");
297 &jnz (".Loop");
298
299 $code.=<<___;
300 mov @t[1],4*9(%rsp) # modulo-scheduled
301 mov @t[0],4*8(%rsp)
302 mov 64(%rsp),%rbp # load len
303 movdqa %xmm2,%xmm1
304 mov 64+8(%rsp),$inp # load inp
305 paddd %xmm4,%xmm3 # increment counter
306 mov 64+16(%rsp),$out # load out
307
308 add \$0x61707865,@x[0] # 'expa'
309 add \$0x3320646e,@x[1] # 'nd 3'
310 add \$0x79622d32,@x[2] # '2-by'
311 add \$0x6b206574,@x[3] # 'te k'
312 add 4*4(%rsp),@x[4]
313 add 4*5(%rsp),@x[5]
314 add 4*6(%rsp),@x[6]
315 add 4*7(%rsp),@x[7]
316 add 4*12(%rsp),@x[12]
317 add 4*13(%rsp),@x[13]
318 add 4*14(%rsp),@x[14]
319 add 4*15(%rsp),@x[15]
320 paddd 4*8(%rsp),%xmm1
321
322 cmp \$64,%rbp
323 jb .Ltail
324
325 xor 4*0($inp),@x[0] # xor with input
326 xor 4*1($inp),@x[1]
327 xor 4*2($inp),@x[2]
328 xor 4*3($inp),@x[3]
329 xor 4*4($inp),@x[4]
330 xor 4*5($inp),@x[5]
331 xor 4*6($inp),@x[6]
332 xor 4*7($inp),@x[7]
333 movdqu 4*8($inp),%xmm0
334 xor 4*12($inp),@x[12]
335 xor 4*13($inp),@x[13]
336 xor 4*14($inp),@x[14]
337 xor 4*15($inp),@x[15]
338 lea 4*16($inp),$inp # inp+=64
339 pxor %xmm1,%xmm0
340
341 movdqa %xmm2,4*8(%rsp)
342 movd %xmm3,4*12(%rsp)
343
344 mov @x[0],4*0($out) # write output
345 mov @x[1],4*1($out)
346 mov @x[2],4*2($out)
347 mov @x[3],4*3($out)
348 mov @x[4],4*4($out)
349 mov @x[5],4*5($out)
350 mov @x[6],4*6($out)
351 mov @x[7],4*7($out)
352 movdqu %xmm0,4*8($out)
353 mov @x[12],4*12($out)
354 mov @x[13],4*13($out)
355 mov @x[14],4*14($out)
356 mov @x[15],4*15($out)
357 lea 4*16($out),$out # out+=64
358
359 sub \$64,%rbp
360 jnz .Loop_outer
361
362 jmp .Ldone
363
364 .align 16
365 .Ltail:
366 mov @x[0],4*0(%rsp)
367 mov @x[1],4*1(%rsp)
368 xor %rbx,%rbx
369 mov @x[2],4*2(%rsp)
370 mov @x[3],4*3(%rsp)
371 mov @x[4],4*4(%rsp)
372 mov @x[5],4*5(%rsp)
373 mov @x[6],4*6(%rsp)
374 mov @x[7],4*7(%rsp)
375 movdqa %xmm1,4*8(%rsp)
376 mov @x[12],4*12(%rsp)
377 mov @x[13],4*13(%rsp)
378 mov @x[14],4*14(%rsp)
379 mov @x[15],4*15(%rsp)
380
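# The keystream words for the final partial block were stored to the
# stack above; xor them into the remaining input one byte at a time,
# with %rbx as the byte index and %rbp as the remaining length.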
381 .Loop_tail:
382 movzb ($inp,%rbx),%eax
383 movzb (%rsp,%rbx),%edx
384 lea 1(%rbx),%rbx
385 xor %edx,%eax
386 mov %al,-1($out,%rbx)
387 dec %rbp
388 jnz .Loop_tail
389
390 .Ldone:
391 add \$64+24,%rsp
392 pop %r15
393 pop %r14
394 pop %r13
395 pop %r12
396 pop %rbp
397 pop %rbx
398 .Lno_data:
399 ret
400 .size ChaCha20_ctr32,.-ChaCha20_ctr32
401 ___
402
403 ########################################################################
404 # SSSE3 code path that handles shorter lengths
405 {
406 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
407
408 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
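	# The 16- and 8-bit rotations are done as pshufb byte shuffles
	# through $rot16/$rot24; the 12- and 7-bit rotations have to use
	# the usual pslld/psrld/por sequence, since SSE/SSSE3 has no
	# packed rotate instruction.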
409 &paddd ($a,$b);
410 &pxor ($d,$a);
411 &pshufb ($d,$rot16);
412
413 &paddd ($c,$d);
414 &pxor ($b,$c);
415 &movdqa ($t,$b);
416 &psrld ($b,20);
417 &pslld ($t,12);
418 &por ($b,$t);
419
420 &paddd ($a,$b);
421 &pxor ($d,$a);
422 &pshufb ($d,$rot24);
423
424 &paddd ($c,$d);
425 &pxor ($b,$c);
426 &movdqa ($t,$b);
427 &psrld ($b,25);
428 &pslld ($t,7);
429 &por ($b,$t);
430 }
431
432 my $xframe = $win64 ? 32+32+8 : 24;
433
434 $code.=<<___;
435 .type ChaCha20_ssse3,\@function,5
436 .align 32
437 ChaCha20_ssse3:
438 .LChaCha20_ssse3:
439 ___
440 $code.=<<___ if ($avx);
441 test \$`1<<(43-32)`,%r10d
442 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
443 ___
444 $code.=<<___;
445 cmp \$128,$len # we might throw away some data,
446 ja .LChaCha20_4x # but overall it won't be slower
447
448 .Ldo_sse3_after_all:
449 push %rbx
450 push %rbp
451 push %r12
452 push %r13
453 push %r14
454 push %r15
455
456 sub \$64+$xframe,%rsp
457 ___
458 $code.=<<___ if ($win64);
459 movaps %xmm6,64+32(%rsp)
460 movaps %xmm7,64+48(%rsp)
461 ___
462 $code.=<<___;
463 movdqa .Lsigma(%rip),$a
464 movdqu ($key),$b
465 movdqu 16($key),$c
466 movdqu ($counter),$d
467 movdqa .Lrot16(%rip),$rot16
468 movdqa .Lrot24(%rip),$rot24
469
470 movdqa $a,0x00(%rsp)
471 movdqa $b,0x10(%rsp)
472 movdqa $c,0x20(%rsp)
473 movdqa $d,0x30(%rsp)
474 mov \$10,%ebp
475 jmp .Loop_ssse3
476
477 .align 32
478 .Loop_outer_ssse3:
479 movdqa .Lone(%rip),$d
480 movdqa 0x00(%rsp),$a
481 movdqa 0x10(%rsp),$b
482 movdqa 0x20(%rsp),$c
483 paddd 0x30(%rsp),$d
484 mov \$10,%ebp
485 movdqa $d,0x30(%rsp)
486 jmp .Loop_ssse3
487
488 .align 32
489 .Loop_ssse3:
490 ___
491 &SSSE3ROUND();
492 &pshufd ($c,$c,0b01001110);
493 &pshufd ($b,$b,0b00111001);
494 &pshufd ($d,$d,0b10010011);
495 &nop ();
496
497 &SSSE3ROUND();
498 &pshufd ($c,$c,0b01001110);
499 &pshufd ($b,$b,0b10010011);
500 &pshufd ($d,$d,0b00111001);
501
502 &dec ("%ebp");
503 &jnz (".Loop_ssse3");
504
505 $code.=<<___;
506 paddd 0x00(%rsp),$a
507 paddd 0x10(%rsp),$b
508 paddd 0x20(%rsp),$c
509 paddd 0x30(%rsp),$d
510
511 cmp \$64,$len
512 jb .Ltail_ssse3
513
514 movdqu 0x00($inp),$t
515 movdqu 0x10($inp),$t1
516 pxor $t,$a # xor with input
517 movdqu 0x20($inp),$t
518 pxor $t1,$b
519 movdqu 0x30($inp),$t1
520 lea 0x40($inp),$inp # inp+=64
521 pxor $t,$c
522 pxor $t1,$d
523
524 movdqu $a,0x00($out) # write output
525 movdqu $b,0x10($out)
526 movdqu $c,0x20($out)
527 movdqu $d,0x30($out)
528 lea 0x40($out),$out # out+=64
529
530 sub \$64,$len
531 jnz .Loop_outer_ssse3
532
533 jmp .Ldone_ssse3
534
535 .align 16
536 .Ltail_ssse3:
537 movdqa $a,0x00(%rsp)
538 movdqa $b,0x10(%rsp)
539 movdqa $c,0x20(%rsp)
540 movdqa $d,0x30(%rsp)
541 xor %rbx,%rbx
542
543 .Loop_tail_ssse3:
544 movzb ($inp,%rbx),%eax
545 movzb (%rsp,%rbx),%ecx
546 lea 1(%rbx),%rbx
547 xor %ecx,%eax
548 mov %al,-1($out,%rbx)
549 dec $len
550 jnz .Loop_tail_ssse3
551
552 .Ldone_ssse3:
553 ___
554 $code.=<<___ if ($win64);
555 movaps 64+32(%rsp),%xmm6
556 movaps 64+48(%rsp),%xmm7
557 ___
558 $code.=<<___;
559 add \$64+$xframe,%rsp
560 pop %r15
561 pop %r14
562 pop %r13
563 pop %r12
564 pop %rbp
565 pop %rbx
566 ret
567 .size ChaCha20_ssse3,.-ChaCha20_ssse3
568 ___
569 }
570
571 ########################################################################
572 # SSSE3 code path that handles longer messages.
573 {
574 # assign variables to favor Atom front-end
575 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
576 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
577 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
578 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
579
580 sub SSSE3_lane_ROUND {
581 my ($a0,$b0,$c0,$d0)=@_;
582 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
583 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
584 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
585 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
586 my @x=map("\"$_\"",@xx);
587
588 # Consider order in which variables are addressed by their
589 # index:
590 #
591 # a b c d
592 #
593 # 0 4 8 12 < even round
594 # 1 5 9 13
595 # 2 6 10 14
596 # 3 7 11 15
597 # 0 5 10 15 < odd round
598 # 1 6 11 12
599 # 2 7 8 13
600 # 3 4 9 14
601 #
602 # 'a', 'b' and 'd' are permanently allocated in registers,
603 # @x[0..7,12..15], while the 'c's are maintained in memory. If
604 # you observe the 'c' column, you'll notice that a pair of 'c's is
605 # invariant between rounds. This means that we have to reload
606 # them only once per round, in the middle. This is why you'll see
607 # a bunch of 'c' stores and loads in the middle, but none at
608 # the beginning or end.
609
610 (
611 "&paddd (@x[$a0],@x[$b0])", # Q1
612 "&paddd (@x[$a1],@x[$b1])", # Q2
613 "&pxor (@x[$d0],@x[$a0])",
614 "&pxor (@x[$d1],@x[$a1])",
615 "&pshufb (@x[$d0],$t1)",
616 "&pshufb (@x[$d1],$t1)",
617
618 "&paddd ($xc,@x[$d0])",
619 "&paddd ($xc_,@x[$d1])",
620 "&pxor (@x[$b0],$xc)",
621 "&pxor (@x[$b1],$xc_)",
622 "&movdqa ($t0,@x[$b0])",
623 "&pslld (@x[$b0],12)",
624 "&psrld ($t0,20)",
625 "&movdqa ($t1,@x[$b1])",
626 "&pslld (@x[$b1],12)",
627 "&por (@x[$b0],$t0)",
628 "&psrld ($t1,20)",
629 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
630 "&por (@x[$b1],$t1)",
631
632 "&paddd (@x[$a0],@x[$b0])",
633 "&paddd (@x[$a1],@x[$b1])",
634 "&pxor (@x[$d0],@x[$a0])",
635 "&pxor (@x[$d1],@x[$a1])",
636 "&pshufb (@x[$d0],$t0)",
637 "&pshufb (@x[$d1],$t0)",
638
639 "&paddd ($xc,@x[$d0])",
640 "&paddd ($xc_,@x[$d1])",
641 "&pxor (@x[$b0],$xc)",
642 "&pxor (@x[$b1],$xc_)",
643 "&movdqa ($t1,@x[$b0])",
644 "&pslld (@x[$b0],7)",
645 "&psrld ($t1,25)",
646 "&movdqa ($t0,@x[$b1])",
647 "&pslld (@x[$b1],7)",
648 "&por (@x[$b0],$t1)",
649 "&psrld ($t0,25)",
650 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
651 "&por (@x[$b1],$t0)",
652
653 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
654 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
655 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
656 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
657
658 "&paddd (@x[$a2],@x[$b2])", # Q3
659 "&paddd (@x[$a3],@x[$b3])", # Q4
660 "&pxor (@x[$d2],@x[$a2])",
661 "&pxor (@x[$d3],@x[$a3])",
662 "&pshufb (@x[$d2],$t1)",
663 "&pshufb (@x[$d3],$t1)",
664
665 "&paddd ($xc,@x[$d2])",
666 "&paddd ($xc_,@x[$d3])",
667 "&pxor (@x[$b2],$xc)",
668 "&pxor (@x[$b3],$xc_)",
669 "&movdqa ($t0,@x[$b2])",
670 "&pslld (@x[$b2],12)",
671 "&psrld ($t0,20)",
672 "&movdqa ($t1,@x[$b3])",
673 "&pslld (@x[$b3],12)",
674 "&por (@x[$b2],$t0)",
675 "&psrld ($t1,20)",
676 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
677 "&por (@x[$b3],$t1)",
678
679 "&paddd (@x[$a2],@x[$b2])",
680 "&paddd (@x[$a3],@x[$b3])",
681 "&pxor (@x[$d2],@x[$a2])",
682 "&pxor (@x[$d3],@x[$a3])",
683 "&pshufb (@x[$d2],$t0)",
684 "&pshufb (@x[$d3],$t0)",
685
686 "&paddd ($xc,@x[$d2])",
687 "&paddd ($xc_,@x[$d3])",
688 "&pxor (@x[$b2],$xc)",
689 "&pxor (@x[$b3],$xc_)",
690 "&movdqa ($t1,@x[$b2])",
691 "&pslld (@x[$b2],7)",
692 "&psrld ($t1,25)",
693 "&movdqa ($t0,@x[$b3])",
694 "&pslld (@x[$b3],7)",
695 "&por (@x[$b2],$t1)",
696 "&psrld ($t0,25)",
697 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
698 "&por (@x[$b3],$t0)"
699 );
700 }
701
702 my $xframe = $win64 ? 0xa0 : 0;
703
704 $code.=<<___;
705 .type ChaCha20_4x,\@function,5
706 .align 32
707 ChaCha20_4x:
708 .LChaCha20_4x:
709 mov %r10,%r11
710 ___
711 $code.=<<___ if ($avx>1);
712 shr \$32,%r10 # OPENSSL_ia32cap_P+8
713 test \$`1<<5`,%r10 # test AVX2
714 jnz .LChaCha20_8x
715 ___
716 $code.=<<___;
717 cmp \$192,$len
718 ja .Lproceed4x
719
720 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
721 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
722 je .Ldo_sse3_after_all # to detect Atom
723
724 .Lproceed4x:
725 lea -0x78(%rsp),%r11
726 sub \$0x148+$xframe,%rsp
727 ___
728 ################ stack layout
729 # +0x00 SIMD equivalent of @x[8-12]
730 # ...
731 # +0x40 constant copy of key[0-2] smashed by lanes
732 # ...
733 # +0x100 SIMD counters (with nonce smashed by lanes)
734 # ...
735 # +0x140
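# (%rcx is pointed at %rsp+0x100 below so that all of the 0x80..0x130
# offsets can be encoded as short 8-bit displacements off %rcx, which is
# what the "size optimization" comments refer to.)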
736 $code.=<<___ if ($win64);
737 movaps %xmm6,-0x30(%r11)
738 movaps %xmm7,-0x20(%r11)
739 movaps %xmm8,-0x10(%r11)
740 movaps %xmm9,0x00(%r11)
741 movaps %xmm10,0x10(%r11)
742 movaps %xmm11,0x20(%r11)
743 movaps %xmm12,0x30(%r11)
744 movaps %xmm13,0x40(%r11)
745 movaps %xmm14,0x50(%r11)
746 movaps %xmm15,0x60(%r11)
747 ___
748 $code.=<<___;
749 movdqa .Lsigma(%rip),$xa3 # key[0]
750 movdqu ($key),$xb3 # key[1]
751 movdqu 16($key),$xt3 # key[2]
752 movdqu ($counter),$xd3 # key[3]
753 lea 0x100(%rsp),%rcx # size optimization
754 lea .Lrot16(%rip),%r10
755 lea .Lrot24(%rip),%r11
756
757 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
758 pshufd \$0x55,$xa3,$xa1
759 movdqa $xa0,0x40(%rsp) # ... and offload
760 pshufd \$0xaa,$xa3,$xa2
761 movdqa $xa1,0x50(%rsp)
762 pshufd \$0xff,$xa3,$xa3
763 movdqa $xa2,0x60(%rsp)
764 movdqa $xa3,0x70(%rsp)
765
766 pshufd \$0x00,$xb3,$xb0
767 pshufd \$0x55,$xb3,$xb1
768 movdqa $xb0,0x80-0x100(%rcx)
769 pshufd \$0xaa,$xb3,$xb2
770 movdqa $xb1,0x90-0x100(%rcx)
771 pshufd \$0xff,$xb3,$xb3
772 movdqa $xb2,0xa0-0x100(%rcx)
773 movdqa $xb3,0xb0-0x100(%rcx)
774
775 pshufd \$0x00,$xt3,$xt0 # "$xc0"
776 pshufd \$0x55,$xt3,$xt1 # "$xc1"
777 movdqa $xt0,0xc0-0x100(%rcx)
778 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
779 movdqa $xt1,0xd0-0x100(%rcx)
780 pshufd \$0xff,$xt3,$xt3 # "$xc3"
781 movdqa $xt2,0xe0-0x100(%rcx)
782 movdqa $xt3,0xf0-0x100(%rcx)
783
784 pshufd \$0x00,$xd3,$xd0
785 pshufd \$0x55,$xd3,$xd1
786 paddd .Linc(%rip),$xd0 # don't save counters yet
787 pshufd \$0xaa,$xd3,$xd2
788 movdqa $xd1,0x110-0x100(%rcx)
789 pshufd \$0xff,$xd3,$xd3
790 movdqa $xd2,0x120-0x100(%rcx)
791 movdqa $xd3,0x130-0x100(%rcx)
792
793 jmp .Loop_enter4x
794
795 .align 32
796 .Loop_outer4x:
797 movdqa 0x40(%rsp),$xa0 # re-load smashed key
798 movdqa 0x50(%rsp),$xa1
799 movdqa 0x60(%rsp),$xa2
800 movdqa 0x70(%rsp),$xa3
801 movdqa 0x80-0x100(%rcx),$xb0
802 movdqa 0x90-0x100(%rcx),$xb1
803 movdqa 0xa0-0x100(%rcx),$xb2
804 movdqa 0xb0-0x100(%rcx),$xb3
805 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
806 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
807 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
808 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
809 movdqa 0x100-0x100(%rcx),$xd0
810 movdqa 0x110-0x100(%rcx),$xd1
811 movdqa 0x120-0x100(%rcx),$xd2
812 movdqa 0x130-0x100(%rcx),$xd3
813 paddd .Lfour(%rip),$xd0 # next SIMD counters
814
815 .Loop_enter4x:
816 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
817 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
818 movdqa (%r10),$xt3 # .Lrot16(%rip)
819 mov \$10,%eax
820 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
821 jmp .Loop4x
822
823 .align 32
824 .Loop4x:
825 ___
826 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
827 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
828 $code.=<<___;
829 dec %eax
830 jnz .Loop4x
831
832 paddd 0x40(%rsp),$xa0 # accumulate key material
833 paddd 0x50(%rsp),$xa1
834 paddd 0x60(%rsp),$xa2
835 paddd 0x70(%rsp),$xa3
836
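	# $xa0..$xa3 currently hold one state word each across the four lanes;
	# the punpck*dq/punpck*qdq network below transposes them so that
	# "a0".."a3" each end up holding the first 16 keystream bytes of one block.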
837 movdqa $xa0,$xt2 # "de-interlace" data
838 punpckldq $xa1,$xa0
839 movdqa $xa2,$xt3
840 punpckldq $xa3,$xa2
841 punpckhdq $xa1,$xt2
842 punpckhdq $xa3,$xt3
843 movdqa $xa0,$xa1
844 punpcklqdq $xa2,$xa0 # "a0"
845 movdqa $xt2,$xa3
846 punpcklqdq $xt3,$xt2 # "a2"
847 punpckhqdq $xa2,$xa1 # "a1"
848 punpckhqdq $xt3,$xa3 # "a3"
849 ___
850 ($xa2,$xt2)=($xt2,$xa2);
851 $code.=<<___;
852 paddd 0x80-0x100(%rcx),$xb0
853 paddd 0x90-0x100(%rcx),$xb1
854 paddd 0xa0-0x100(%rcx),$xb2
855 paddd 0xb0-0x100(%rcx),$xb3
856
857 movdqa $xa0,0x00(%rsp) # offload $xaN
858 movdqa $xa1,0x10(%rsp)
859 movdqa 0x20(%rsp),$xa0 # "xc2"
860 movdqa 0x30(%rsp),$xa1 # "xc3"
861
862 movdqa $xb0,$xt2
863 punpckldq $xb1,$xb0
864 movdqa $xb2,$xt3
865 punpckldq $xb3,$xb2
866 punpckhdq $xb1,$xt2
867 punpckhdq $xb3,$xt3
868 movdqa $xb0,$xb1
869 punpcklqdq $xb2,$xb0 # "b0"
870 movdqa $xt2,$xb3
871 punpcklqdq $xt3,$xt2 # "b2"
872 punpckhqdq $xb2,$xb1 # "b1"
873 punpckhqdq $xt3,$xb3 # "b3"
874 ___
875 ($xb2,$xt2)=($xt2,$xb2);
876 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
877 $code.=<<___;
878 paddd 0xc0-0x100(%rcx),$xc0
879 paddd 0xd0-0x100(%rcx),$xc1
880 paddd 0xe0-0x100(%rcx),$xc2
881 paddd 0xf0-0x100(%rcx),$xc3
882
883 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
884 movdqa $xa3,0x30(%rsp)
885
886 movdqa $xc0,$xt2
887 punpckldq $xc1,$xc0
888 movdqa $xc2,$xt3
889 punpckldq $xc3,$xc2
890 punpckhdq $xc1,$xt2
891 punpckhdq $xc3,$xt3
892 movdqa $xc0,$xc1
893 punpcklqdq $xc2,$xc0 # "c0"
894 movdqa $xt2,$xc3
895 punpcklqdq $xt3,$xt2 # "c2"
896 punpckhqdq $xc2,$xc1 # "c1"
897 punpckhqdq $xt3,$xc3 # "c3"
898 ___
899 ($xc2,$xt2)=($xt2,$xc2);
900 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
901 $code.=<<___;
902 paddd 0x100-0x100(%rcx),$xd0
903 paddd 0x110-0x100(%rcx),$xd1
904 paddd 0x120-0x100(%rcx),$xd2
905 paddd 0x130-0x100(%rcx),$xd3
906
907 movdqa $xd0,$xt2
908 punpckldq $xd1,$xd0
909 movdqa $xd2,$xt3
910 punpckldq $xd3,$xd2
911 punpckhdq $xd1,$xt2
912 punpckhdq $xd3,$xt3
913 movdqa $xd0,$xd1
914 punpcklqdq $xd2,$xd0 # "d0"
915 movdqa $xt2,$xd3
916 punpcklqdq $xt3,$xt2 # "d2"
917 punpckhqdq $xd2,$xd1 # "d1"
918 punpckhqdq $xt3,$xd3 # "d3"
919 ___
920 ($xd2,$xt2)=($xt2,$xd2);
921 $code.=<<___;
922 cmp \$64*4,$len
923 jb .Ltail4x
924
925 movdqu 0x00($inp),$xt0 # xor with input
926 movdqu 0x10($inp),$xt1
927 movdqu 0x20($inp),$xt2
928 movdqu 0x30($inp),$xt3
929 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
930 pxor $xb0,$xt1
931 pxor $xc0,$xt2
932 pxor $xd0,$xt3
933
934 movdqu $xt0,0x00($out)
935 movdqu 0x40($inp),$xt0
936 movdqu $xt1,0x10($out)
937 movdqu 0x50($inp),$xt1
938 movdqu $xt2,0x20($out)
939 movdqu 0x60($inp),$xt2
940 movdqu $xt3,0x30($out)
941 movdqu 0x70($inp),$xt3
942 lea 0x80($inp),$inp # size optimization
943 pxor 0x10(%rsp),$xt0
944 pxor $xb1,$xt1
945 pxor $xc1,$xt2
946 pxor $xd1,$xt3
947
948 movdqu $xt0,0x40($out)
949 movdqu 0x00($inp),$xt0
950 movdqu $xt1,0x50($out)
951 movdqu 0x10($inp),$xt1
952 movdqu $xt2,0x60($out)
953 movdqu 0x20($inp),$xt2
954 movdqu $xt3,0x70($out)
955 lea 0x80($out),$out # size optimization
956 movdqu 0x30($inp),$xt3
957 pxor 0x20(%rsp),$xt0
958 pxor $xb2,$xt1
959 pxor $xc2,$xt2
960 pxor $xd2,$xt3
961
962 movdqu $xt0,0x00($out)
963 movdqu 0x40($inp),$xt0
964 movdqu $xt1,0x10($out)
965 movdqu 0x50($inp),$xt1
966 movdqu $xt2,0x20($out)
967 movdqu 0x60($inp),$xt2
968 movdqu $xt3,0x30($out)
969 movdqu 0x70($inp),$xt3
970 lea 0x80($inp),$inp # inp+=64*4
971 pxor 0x30(%rsp),$xt0
972 pxor $xb3,$xt1
973 pxor $xc3,$xt2
974 pxor $xd3,$xt3
975 movdqu $xt0,0x40($out)
976 movdqu $xt1,0x50($out)
977 movdqu $xt2,0x60($out)
978 movdqu $xt3,0x70($out)
979 lea 0x80($out),$out # out+=64*4
980
981 sub \$64*4,$len
982 jnz .Loop_outer4x
983
984 jmp .Ldone4x
985
986 .Ltail4x:
987 cmp \$192,$len
988 jae .L192_or_more4x
989 cmp \$128,$len
990 jae .L128_or_more4x
991 cmp \$64,$len
992 jae .L64_or_more4x
993
994 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
995 xor %r10,%r10
996 #movdqa $xt0,0x00(%rsp)
997 movdqa $xb0,0x10(%rsp)
998 movdqa $xc0,0x20(%rsp)
999 movdqa $xd0,0x30(%rsp)
1000 jmp .Loop_tail4x
1001
1002 .align 32
1003 .L64_or_more4x:
1004 movdqu 0x00($inp),$xt0 # xor with input
1005 movdqu 0x10($inp),$xt1
1006 movdqu 0x20($inp),$xt2
1007 movdqu 0x30($inp),$xt3
1008 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1009 pxor $xb0,$xt1
1010 pxor $xc0,$xt2
1011 pxor $xd0,$xt3
1012 movdqu $xt0,0x00($out)
1013 movdqu $xt1,0x10($out)
1014 movdqu $xt2,0x20($out)
1015 movdqu $xt3,0x30($out)
1016 je .Ldone4x
1017
1018 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1019 lea 0x40($inp),$inp # inp+=64*1
1020 xor %r10,%r10
1021 movdqa $xt0,0x00(%rsp)
1022 movdqa $xb1,0x10(%rsp)
1023 lea 0x40($out),$out # out+=64*1
1024 movdqa $xc1,0x20(%rsp)
1025 sub \$64,$len # len-=64*1
1026 movdqa $xd1,0x30(%rsp)
1027 jmp .Loop_tail4x
1028
1029 .align 32
1030 .L128_or_more4x:
1031 movdqu 0x00($inp),$xt0 # xor with input
1032 movdqu 0x10($inp),$xt1
1033 movdqu 0x20($inp),$xt2
1034 movdqu 0x30($inp),$xt3
1035 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1036 pxor $xb0,$xt1
1037 pxor $xc0,$xt2
1038 pxor $xd0,$xt3
1039
1040 movdqu $xt0,0x00($out)
1041 movdqu 0x40($inp),$xt0
1042 movdqu $xt1,0x10($out)
1043 movdqu 0x50($inp),$xt1
1044 movdqu $xt2,0x20($out)
1045 movdqu 0x60($inp),$xt2
1046 movdqu $xt3,0x30($out)
1047 movdqu 0x70($inp),$xt3
1048 pxor 0x10(%rsp),$xt0
1049 pxor $xb1,$xt1
1050 pxor $xc1,$xt2
1051 pxor $xd1,$xt3
1052 movdqu $xt0,0x40($out)
1053 movdqu $xt1,0x50($out)
1054 movdqu $xt2,0x60($out)
1055 movdqu $xt3,0x70($out)
1056 je .Ldone4x
1057
1058 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1059 lea 0x80($inp),$inp # inp+=64*2
1060 xor %r10,%r10
1061 movdqa $xt0,0x00(%rsp)
1062 movdqa $xb2,0x10(%rsp)
1063 lea 0x80($out),$out # out+=64*2
1064 movdqa $xc2,0x20(%rsp)
1065 sub \$128,$len # len-=64*2
1066 movdqa $xd2,0x30(%rsp)
1067 jmp .Loop_tail4x
1068
1069 .align 32
1070 .L192_or_more4x:
1071 movdqu 0x00($inp),$xt0 # xor with input
1072 movdqu 0x10($inp),$xt1
1073 movdqu 0x20($inp),$xt2
1074 movdqu 0x30($inp),$xt3
1075 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1076 pxor $xb0,$xt1
1077 pxor $xc0,$xt2
1078 pxor $xd0,$xt3
1079
1080 movdqu $xt0,0x00($out)
1081 movdqu 0x40($inp),$xt0
1082 movdqu $xt1,0x10($out)
1083 movdqu 0x50($inp),$xt1
1084 movdqu $xt2,0x20($out)
1085 movdqu 0x60($inp),$xt2
1086 movdqu $xt3,0x30($out)
1087 movdqu 0x70($inp),$xt3
1088 lea 0x80($inp),$inp # size optimization
1089 pxor 0x10(%rsp),$xt0
1090 pxor $xb1,$xt1
1091 pxor $xc1,$xt2
1092 pxor $xd1,$xt3
1093
1094 movdqu $xt0,0x40($out)
1095 movdqu 0x00($inp),$xt0
1096 movdqu $xt1,0x50($out)
1097 movdqu 0x10($inp),$xt1
1098 movdqu $xt2,0x60($out)
1099 movdqu 0x20($inp),$xt2
1100 movdqu $xt3,0x70($out)
1101 lea 0x80($out),$out # size optimization
1102 movdqu 0x30($inp),$xt3
1103 pxor 0x20(%rsp),$xt0
1104 pxor $xb2,$xt1
1105 pxor $xc2,$xt2
1106 pxor $xd2,$xt3
1107 movdqu $xt0,0x00($out)
1108 movdqu $xt1,0x10($out)
1109 movdqu $xt2,0x20($out)
1110 movdqu $xt3,0x30($out)
1111 je .Ldone4x
1112
1113 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1114 lea 0x40($inp),$inp # inp+=64*3
1115 xor %r10,%r10
1116 movdqa $xt0,0x00(%rsp)
1117 movdqa $xb3,0x10(%rsp)
1118 lea 0x40($out),$out # out+=64*3
1119 movdqa $xc3,0x20(%rsp)
1120 sub \$192,$len # len-=64*3
1121 movdqa $xd3,0x30(%rsp)
1122
1123 .Loop_tail4x:
1124 movzb ($inp,%r10),%eax
1125 movzb (%rsp,%r10),%ecx
1126 lea 1(%r10),%r10
1127 xor %ecx,%eax
1128 mov %al,-1($out,%r10)
1129 dec $len
1130 jnz .Loop_tail4x
1131
1132 .Ldone4x:
1133 ___
1134 $code.=<<___ if ($win64);
1135 lea 0x140+0x30(%rsp),%r11
1136 movaps -0x30(%r11),%xmm6
1137 movaps -0x20(%r11),%xmm7
1138 movaps -0x10(%r11),%xmm8
1139 movaps 0x00(%r11),%xmm9
1140 movaps 0x10(%r11),%xmm10
1141 movaps 0x20(%r11),%xmm11
1142 movaps 0x30(%r11),%xmm12
1143 movaps 0x40(%r11),%xmm13
1144 movaps 0x50(%r11),%xmm14
1145 movaps 0x60(%r11),%xmm15
1146 ___
1147 $code.=<<___;
1148 add \$0x148+$xframe,%rsp
1149 ret
1150 .size ChaCha20_4x,.-ChaCha20_4x
1151 ___
1152 }
1153
1154 ########################################################################
1155 # XOP code path that handles all lengths.
1156 if ($avx) {
1157 # There is some "anomaly" observed depending on instruction size or
1158 # alignment. If you look closely at the code below, you'll notice that
1159 # the argument order sometimes varies. The order affects instruction
1160 # encoding by making it larger, and such fiddling gives a 5% performance
1161 # improvement. This is on FX-4100...
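# (The XOP path also wins because vprotd performs each rotation in a
# single instruction, whereas the SSSE3 path above needs a
# movdqa/pslld/psrld/por sequence for the 12- and 7-bit rotations.)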
1162
1163 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1164 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1165 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1166 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1167
1168 sub XOP_lane_ROUND {
1169 my ($a0,$b0,$c0,$d0)=@_;
1170 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1171 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1172 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1173 my @x=map("\"$_\"",@xx);
1174
1175 (
1176 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1177 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1178 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1179 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1180 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1181 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1182 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1183 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1184 "&vprotd (@x[$d0],@x[$d0],16)",
1185 "&vprotd (@x[$d1],@x[$d1],16)",
1186 "&vprotd (@x[$d2],@x[$d2],16)",
1187 "&vprotd (@x[$d3],@x[$d3],16)",
1188
1189 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1190 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1191 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1192 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1193 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1194 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1195 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1196 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1197 "&vprotd (@x[$b0],@x[$b0],12)",
1198 "&vprotd (@x[$b1],@x[$b1],12)",
1199 "&vprotd (@x[$b2],@x[$b2],12)",
1200 "&vprotd (@x[$b3],@x[$b3],12)",
1201
1202 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1203 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1204 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1205 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1206 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1207 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1208 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1209 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1210 "&vprotd (@x[$d0],@x[$d0],8)",
1211 "&vprotd (@x[$d1],@x[$d1],8)",
1212 "&vprotd (@x[$d2],@x[$d2],8)",
1213 "&vprotd (@x[$d3],@x[$d3],8)",
1214
1215 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1216 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1217 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1218 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1219 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1220 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1221 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1222 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1223 "&vprotd (@x[$b0],@x[$b0],7)",
1224 "&vprotd (@x[$b1],@x[$b1],7)",
1225 "&vprotd (@x[$b2],@x[$b2],7)",
1226 "&vprotd (@x[$b3],@x[$b3],7)"
1227 );
1228 }
1229
1230 my $xframe = $win64 ? 0xa0 : 0;
1231
1232 $code.=<<___;
1233 .type ChaCha20_4xop,\@function,5
1234 .align 32
1235 ChaCha20_4xop:
1236 .LChaCha20_4xop:
1237 lea -0x78(%rsp),%r11
1238 sub \$0x148+$xframe,%rsp
1239 ___
1240 ################ stack layout
1241 # +0x00 SIMD equivalent of @x[8-12]
1242 # ...
1243 # +0x40 constant copy of key[0-2] smashed by lanes
1244 # ...
1245 # +0x100 SIMD counters (with nonce smashed by lanes)
1246 # ...
1247 # +0x140
1248 $code.=<<___ if ($win64);
1249 movaps %xmm6,-0x30(%r11)
1250 movaps %xmm7,-0x20(%r11)
1251 movaps %xmm8,-0x10(%r11)
1252 movaps %xmm9,0x00(%r11)
1253 movaps %xmm10,0x10(%r11)
1254 movaps %xmm11,0x20(%r11)
1255 movaps %xmm12,0x30(%r11)
1256 movaps %xmm13,0x40(%r11)
1257 movaps %xmm14,0x50(%r11)
1258 movaps %xmm15,0x60(%r11)
1259 ___
1260 $code.=<<___;
1261 vzeroupper
1262
1263 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1264 vmovdqu ($key),$xb3 # key[1]
1265 vmovdqu 16($key),$xt3 # key[2]
1266 vmovdqu ($counter),$xd3 # key[3]
1267 lea 0x100(%rsp),%rcx # size optimization
1268
1269 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1270 vpshufd \$0x55,$xa3,$xa1
1271 vmovdqa $xa0,0x40(%rsp) # ... and offload
1272 vpshufd \$0xaa,$xa3,$xa2
1273 vmovdqa $xa1,0x50(%rsp)
1274 vpshufd \$0xff,$xa3,$xa3
1275 vmovdqa $xa2,0x60(%rsp)
1276 vmovdqa $xa3,0x70(%rsp)
1277
1278 vpshufd \$0x00,$xb3,$xb0
1279 vpshufd \$0x55,$xb3,$xb1
1280 vmovdqa $xb0,0x80-0x100(%rcx)
1281 vpshufd \$0xaa,$xb3,$xb2
1282 vmovdqa $xb1,0x90-0x100(%rcx)
1283 vpshufd \$0xff,$xb3,$xb3
1284 vmovdqa $xb2,0xa0-0x100(%rcx)
1285 vmovdqa $xb3,0xb0-0x100(%rcx)
1286
1287 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1288 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1289 vmovdqa $xt0,0xc0-0x100(%rcx)
1290 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1291 vmovdqa $xt1,0xd0-0x100(%rcx)
1292 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1293 vmovdqa $xt2,0xe0-0x100(%rcx)
1294 vmovdqa $xt3,0xf0-0x100(%rcx)
1295
1296 vpshufd \$0x00,$xd3,$xd0
1297 vpshufd \$0x55,$xd3,$xd1
1298 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1299 vpshufd \$0xaa,$xd3,$xd2
1300 vmovdqa $xd1,0x110-0x100(%rcx)
1301 vpshufd \$0xff,$xd3,$xd3
1302 vmovdqa $xd2,0x120-0x100(%rcx)
1303 vmovdqa $xd3,0x130-0x100(%rcx)
1304
1305 jmp .Loop_enter4xop
1306
1307 .align 32
1308 .Loop_outer4xop:
1309 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1310 vmovdqa 0x50(%rsp),$xa1
1311 vmovdqa 0x60(%rsp),$xa2
1312 vmovdqa 0x70(%rsp),$xa3
1313 vmovdqa 0x80-0x100(%rcx),$xb0
1314 vmovdqa 0x90-0x100(%rcx),$xb1
1315 vmovdqa 0xa0-0x100(%rcx),$xb2
1316 vmovdqa 0xb0-0x100(%rcx),$xb3
1317 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1318 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1319 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1320 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1321 vmovdqa 0x100-0x100(%rcx),$xd0
1322 vmovdqa 0x110-0x100(%rcx),$xd1
1323 vmovdqa 0x120-0x100(%rcx),$xd2
1324 vmovdqa 0x130-0x100(%rcx),$xd3
1325 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1326
1327 .Loop_enter4xop:
1328 mov \$10,%eax
1329 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1330 jmp .Loop4xop
1331
1332 .align 32
1333 .Loop4xop:
1334 ___
1335 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1336 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1337 $code.=<<___;
1338 dec %eax
1339 jnz .Loop4xop
1340
1341 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1342 vpaddd 0x50(%rsp),$xa1,$xa1
1343 vpaddd 0x60(%rsp),$xa2,$xa2
1344 vpaddd 0x70(%rsp),$xa3,$xa3
1345
1346 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1347 vmovdqa $xt3,0x30(%rsp)
1348
1349 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1350 vpunpckldq $xa3,$xa2,$xt3
1351 vpunpckhdq $xa1,$xa0,$xa0
1352 vpunpckhdq $xa3,$xa2,$xa2
1353 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1354 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1355 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1356 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1357 ___
1358 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1359 $code.=<<___;
1360 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1361 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1362 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1363 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1364
1365 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1366 vmovdqa $xa1,0x10(%rsp)
1367 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1368 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1369
1370 vpunpckldq $xb1,$xb0,$xt2
1371 vpunpckldq $xb3,$xb2,$xt3
1372 vpunpckhdq $xb1,$xb0,$xb0
1373 vpunpckhdq $xb3,$xb2,$xb2
1374 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1375 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1376 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1377 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1378 ___
1379 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1380 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1381 $code.=<<___;
1382 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1383 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1384 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1385 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1386
1387 vpunpckldq $xc1,$xc0,$xt2
1388 vpunpckldq $xc3,$xc2,$xt3
1389 vpunpckhdq $xc1,$xc0,$xc0
1390 vpunpckhdq $xc3,$xc2,$xc2
1391 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1392 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1393 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1394 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1395 ___
1396 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1397 $code.=<<___;
1398 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1399 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1400 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1401 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1402
1403 vpunpckldq $xd1,$xd0,$xt2
1404 vpunpckldq $xd3,$xd2,$xt3
1405 vpunpckhdq $xd1,$xd0,$xd0
1406 vpunpckhdq $xd3,$xd2,$xd2
1407 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1408 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1409 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1410 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1411 ___
1412 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1413 ($xa0,$xa1)=($xt2,$xt3);
1414 $code.=<<___;
1415 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1416 vmovdqa 0x10(%rsp),$xa1
1417
1418 cmp \$64*4,$len
1419 jb .Ltail4xop
1420
1421 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1422 vpxor 0x10($inp),$xb0,$xb0
1423 vpxor 0x20($inp),$xc0,$xc0
1424 vpxor 0x30($inp),$xd0,$xd0
1425 vpxor 0x40($inp),$xa1,$xa1
1426 vpxor 0x50($inp),$xb1,$xb1
1427 vpxor 0x60($inp),$xc1,$xc1
1428 vpxor 0x70($inp),$xd1,$xd1
1429 lea 0x80($inp),$inp # size optimization
1430 vpxor 0x00($inp),$xa2,$xa2
1431 vpxor 0x10($inp),$xb2,$xb2
1432 vpxor 0x20($inp),$xc2,$xc2
1433 vpxor 0x30($inp),$xd2,$xd2
1434 vpxor 0x40($inp),$xa3,$xa3
1435 vpxor 0x50($inp),$xb3,$xb3
1436 vpxor 0x60($inp),$xc3,$xc3
1437 vpxor 0x70($inp),$xd3,$xd3
1438 lea 0x80($inp),$inp # inp+=64*4
1439
1440 vmovdqu $xa0,0x00($out)
1441 vmovdqu $xb0,0x10($out)
1442 vmovdqu $xc0,0x20($out)
1443 vmovdqu $xd0,0x30($out)
1444 vmovdqu $xa1,0x40($out)
1445 vmovdqu $xb1,0x50($out)
1446 vmovdqu $xc1,0x60($out)
1447 vmovdqu $xd1,0x70($out)
1448 lea 0x80($out),$out # size optimization
1449 vmovdqu $xa2,0x00($out)
1450 vmovdqu $xb2,0x10($out)
1451 vmovdqu $xc2,0x20($out)
1452 vmovdqu $xd2,0x30($out)
1453 vmovdqu $xa3,0x40($out)
1454 vmovdqu $xb3,0x50($out)
1455 vmovdqu $xc3,0x60($out)
1456 vmovdqu $xd3,0x70($out)
1457 lea 0x80($out),$out # out+=64*4
1458
1459 sub \$64*4,$len
1460 jnz .Loop_outer4xop
1461
1462 jmp .Ldone4xop
1463
1464 .align 32
1465 .Ltail4xop:
1466 cmp \$192,$len
1467 jae .L192_or_more4xop
1468 cmp \$128,$len
1469 jae .L128_or_more4xop
1470 cmp \$64,$len
1471 jae .L64_or_more4xop
1472
1473 xor %r10,%r10
1474 vmovdqa $xa0,0x00(%rsp)
1475 vmovdqa $xb0,0x10(%rsp)
1476 vmovdqa $xc0,0x20(%rsp)
1477 vmovdqa $xd0,0x30(%rsp)
1478 jmp .Loop_tail4xop
1479
1480 .align 32
1481 .L64_or_more4xop:
1482 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1483 vpxor 0x10($inp),$xb0,$xb0
1484 vpxor 0x20($inp),$xc0,$xc0
1485 vpxor 0x30($inp),$xd0,$xd0
1486 vmovdqu $xa0,0x00($out)
1487 vmovdqu $xb0,0x10($out)
1488 vmovdqu $xc0,0x20($out)
1489 vmovdqu $xd0,0x30($out)
1490 je .Ldone4xop
1491
1492 lea 0x40($inp),$inp # inp+=64*1
1493 vmovdqa $xa1,0x00(%rsp)
1494 xor %r10,%r10
1495 vmovdqa $xb1,0x10(%rsp)
1496 lea 0x40($out),$out # out+=64*1
1497 vmovdqa $xc1,0x20(%rsp)
1498 sub \$64,$len # len-=64*1
1499 vmovdqa $xd1,0x30(%rsp)
1500 jmp .Loop_tail4xop
1501
1502 .align 32
1503 .L128_or_more4xop:
1504 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1505 vpxor 0x10($inp),$xb0,$xb0
1506 vpxor 0x20($inp),$xc0,$xc0
1507 vpxor 0x30($inp),$xd0,$xd0
1508 vpxor 0x40($inp),$xa1,$xa1
1509 vpxor 0x50($inp),$xb1,$xb1
1510 vpxor 0x60($inp),$xc1,$xc1
1511 vpxor 0x70($inp),$xd1,$xd1
1512
1513 vmovdqu $xa0,0x00($out)
1514 vmovdqu $xb0,0x10($out)
1515 vmovdqu $xc0,0x20($out)
1516 vmovdqu $xd0,0x30($out)
1517 vmovdqu $xa1,0x40($out)
1518 vmovdqu $xb1,0x50($out)
1519 vmovdqu $xc1,0x60($out)
1520 vmovdqu $xd1,0x70($out)
1521 je .Ldone4xop
1522
1523 lea 0x80($inp),$inp # inp+=64*2
1524 vmovdqa $xa2,0x00(%rsp)
1525 xor %r10,%r10
1526 vmovdqa $xb2,0x10(%rsp)
1527 lea 0x80($out),$out # out+=64*2
1528 vmovdqa $xc2,0x20(%rsp)
1529 sub \$128,$len # len-=64*2
1530 vmovdqa $xd2,0x30(%rsp)
1531 jmp .Loop_tail4xop
1532
1533 .align 32
1534 .L192_or_more4xop:
1535 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1536 vpxor 0x10($inp),$xb0,$xb0
1537 vpxor 0x20($inp),$xc0,$xc0
1538 vpxor 0x30($inp),$xd0,$xd0
1539 vpxor 0x40($inp),$xa1,$xa1
1540 vpxor 0x50($inp),$xb1,$xb1
1541 vpxor 0x60($inp),$xc1,$xc1
1542 vpxor 0x70($inp),$xd1,$xd1
1543 lea 0x80($inp),$inp # size optimization
1544 vpxor 0x00($inp),$xa2,$xa2
1545 vpxor 0x10($inp),$xb2,$xb2
1546 vpxor 0x20($inp),$xc2,$xc2
1547 vpxor 0x30($inp),$xd2,$xd2
1548
1549 vmovdqu $xa0,0x00($out)
1550 vmovdqu $xb0,0x10($out)
1551 vmovdqu $xc0,0x20($out)
1552 vmovdqu $xd0,0x30($out)
1553 vmovdqu $xa1,0x40($out)
1554 vmovdqu $xb1,0x50($out)
1555 vmovdqu $xc1,0x60($out)
1556 vmovdqu $xd1,0x70($out)
1557 lea 0x80($out),$out # size optimization
1558 vmovdqu $xa2,0x00($out)
1559 vmovdqu $xb2,0x10($out)
1560 vmovdqu $xc2,0x20($out)
1561 vmovdqu $xd2,0x30($out)
1562 je .Ldone4xop
1563
1564 lea 0x40($inp),$inp # inp+=64*3
1565 vmovdqa $xa3,0x00(%rsp)
1566 xor %r10,%r10
1567 vmovdqa $xb3,0x10(%rsp)
1568 lea 0x40($out),$out # out+=64*3
1569 vmovdqa $xc3,0x20(%rsp)
1570 sub \$192,$len # len-=64*3
1571 vmovdqa $xd3,0x30(%rsp)
1572
1573 .Loop_tail4xop:
1574 movzb ($inp,%r10),%eax
1575 movzb (%rsp,%r10),%ecx
1576 lea 1(%r10),%r10
1577 xor %ecx,%eax
1578 mov %al,-1($out,%r10)
1579 dec $len
1580 jnz .Loop_tail4xop
1581
1582 .Ldone4xop:
1583 vzeroupper
1584 ___
1585 $code.=<<___ if ($win64);
1586 lea 0x140+0x30(%rsp),%r11
1587 movaps -0x30(%r11),%xmm6
1588 movaps -0x20(%r11),%xmm7
1589 movaps -0x10(%r11),%xmm8
1590 movaps 0x00(%r11),%xmm9
1591 movaps 0x10(%r11),%xmm10
1592 movaps 0x20(%r11),%xmm11
1593 movaps 0x30(%r11),%xmm12
1594 movaps 0x40(%r11),%xmm13
1595 movaps 0x50(%r11),%xmm14
1596 movaps 0x60(%r11),%xmm15
1597 ___
1598 $code.=<<___;
1599 add \$0x148+$xframe,%rsp
1600 ret
1601 .size ChaCha20_4xop,.-ChaCha20_4xop
1602 ___
1603 }
1604
1605 ########################################################################
1606 # AVX2 code path
1607 if ($avx>1) {
1608 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1609 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1610 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1611 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1612
1613 sub AVX2_lane_ROUND {
1614 my ($a0,$b0,$c0,$d0)=@_;
1615 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1616 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1617 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1618 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1619 my @x=map("\"$_\"",@xx);
1620
1621 # Consider order in which variables are addressed by their
1622 # index:
1623 #
1624 # a b c d
1625 #
1626 # 0 4 8 12 < even round
1627 # 1 5 9 13
1628 # 2 6 10 14
1629 # 3 7 11 15
1630 # 0 5 10 15 < odd round
1631 # 1 6 11 12
1632 # 2 7 8 13
1633 # 3 4 9 14
1634 #
1635 # 'a', 'b' and 'd' are permanently allocated in registers,
1636 # @x[0..7,12..15], while the 'c's are maintained in memory. If
1637 # you observe the 'c' column, you'll notice that a pair of 'c's is
1638 # invariant between rounds. This means that we have to reload
1639 # them only once per round, in the middle. This is why you'll see
1640 # a bunch of 'c' stores and loads in the middle, but none at
1641 # the beginning or end.
1642
1643 (
1644 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1645 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1646 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1647 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1648 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1649 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1650
1651 "&vpaddd ($xc,$xc,@x[$d0])",
1652 "&vpxor (@x[$b0],$xc,@x[$b0])",
1653 "&vpslld ($t0,@x[$b0],12)",
1654 "&vpsrld (@x[$b0],@x[$b0],20)",
1655 "&vpor (@x[$b0],$t0,@x[$b0])",
1656 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1657 "&vpaddd ($xc_,$xc_,@x[$d1])",
1658 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1659 "&vpslld ($t1,@x[$b1],12)",
1660 "&vpsrld (@x[$b1],@x[$b1],20)",
1661 "&vpor (@x[$b1],$t1,@x[$b1])",
1662
1663 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1664 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1665 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1666 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1667 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1668 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1669
1670 "&vpaddd ($xc,$xc,@x[$d0])",
1671 "&vpxor (@x[$b0],$xc,@x[$b0])",
1672 "&vpslld ($t1,@x[$b0],7)",
1673 "&vpsrld (@x[$b0],@x[$b0],25)",
1674 "&vpor (@x[$b0],$t1,@x[$b0])",
1675 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1676 "&vpaddd ($xc_,$xc_,@x[$d1])",
1677 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1678 "&vpslld ($t0,@x[$b1],7)",
1679 "&vpsrld (@x[$b1],@x[$b1],25)",
1680 "&vpor (@x[$b1],$t0,@x[$b1])",
1681
1682 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1683 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1684 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1685 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1686
1687 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1688 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1689 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1690 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1691 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1692 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1693
1694 "&vpaddd ($xc,$xc,@x[$d2])",
1695 "&vpxor (@x[$b2],$xc,@x[$b2])",
1696 "&vpslld ($t0,@x[$b2],12)",
1697 "&vpsrld (@x[$b2],@x[$b2],20)",
1698 "&vpor (@x[$b2],$t0,@x[$b2])",
1699 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1700 "&vpaddd ($xc_,$xc_,@x[$d3])",
1701 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1702 "&vpslld ($t1,@x[$b3],12)",
1703 "&vpsrld (@x[$b3],@x[$b3],20)",
1704 "&vpor (@x[$b3],$t1,@x[$b3])",
1705
1706 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1707 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1708 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1709 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1710 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1711 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1712
1713 "&vpaddd ($xc,$xc,@x[$d2])",
1714 "&vpxor (@x[$b2],$xc,@x[$b2])",
1715 "&vpslld ($t1,@x[$b2],7)",
1716 "&vpsrld (@x[$b2],@x[$b2],25)",
1717 "&vpor (@x[$b2],$t1,@x[$b2])",
1718 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1719 "&vpaddd ($xc_,$xc_,@x[$d3])",
1720 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1721 "&vpslld ($t0,@x[$b3],7)",
1722 "&vpsrld (@x[$b3],@x[$b3],25)",
1723 "&vpor (@x[$b3],$t0,@x[$b3])"
1724 );
1725 }
1726
1727 my $xframe = $win64 ? 0xb0 : 8;
1728
1729 $code.=<<___;
1730 .type ChaCha20_8x,\@function,5
1731 .align 32
1732 ChaCha20_8x:
1733 .LChaCha20_8x:
1734 ___
1735 $code.=<<___ if ($avx>2);
1736 test \$`1<<16`,%r10d # check for AVX512F
1737 jnz .LChaCha20_16x
1738 ___
1739 $code.=<<___;
1740 mov %rsp,%r10
1741 sub \$0x280+$xframe,%rsp
1742 and \$-32,%rsp
1743 ___
1744 $code.=<<___ if ($win64);
1745 lea 0x290+0x30(%rsp),%r11
1746 movaps %xmm6,-0x30(%r11)
1747 movaps %xmm7,-0x20(%r11)
1748 movaps %xmm8,-0x10(%r11)
1749 movaps %xmm9,0x00(%r11)
1750 movaps %xmm10,0x10(%r11)
1751 movaps %xmm11,0x20(%r11)
1752 movaps %xmm12,0x30(%r11)
1753 movaps %xmm13,0x40(%r11)
1754 movaps %xmm14,0x50(%r11)
1755 movaps %xmm15,0x60(%r11)
1756 ___
1757 $code.=<<___;
1758 vzeroupper
1759 mov %r10,0x280(%rsp)
1760
1761 ################ stack layout
1762 # +0x00 SIMD equivalent of @x[8-12]
1763 # ...
1764 # +0x80 constant copy of key[0-2] smashed by lanes
1765 # ...
1766 # +0x200 SIMD counters (with nonce smashed by lanes)
1767 # ...
1768 # +0x280 saved %rsp
1769
1770 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1771 vbroadcasti128 ($key),$xb3 # key[1]
1772 vbroadcasti128 16($key),$xt3 # key[2]
1773 vbroadcasti128 ($counter),$xd3 # key[3]
1774 lea 0x100(%rsp),%rcx # size optimization
1775 lea 0x200(%rsp),%rax # size optimization
1776 lea .Lrot16(%rip),%r10
1777 lea .Lrot24(%rip),%r11
1778
1779 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1780 vpshufd \$0x55,$xa3,$xa1
1781 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1782 vpshufd \$0xaa,$xa3,$xa2
1783 vmovdqa $xa1,0xa0-0x100(%rcx)
1784 vpshufd \$0xff,$xa3,$xa3
1785 vmovdqa $xa2,0xc0-0x100(%rcx)
1786 vmovdqa $xa3,0xe0-0x100(%rcx)
1787
1788 vpshufd \$0x00,$xb3,$xb0
1789 vpshufd \$0x55,$xb3,$xb1
1790 vmovdqa $xb0,0x100-0x100(%rcx)
1791 vpshufd \$0xaa,$xb3,$xb2
1792 vmovdqa $xb1,0x120-0x100(%rcx)
1793 vpshufd \$0xff,$xb3,$xb3
1794 vmovdqa $xb2,0x140-0x100(%rcx)
1795 vmovdqa $xb3,0x160-0x100(%rcx)
1796
1797 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1798 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1799 vmovdqa $xt0,0x180-0x200(%rax)
1800 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1801 vmovdqa $xt1,0x1a0-0x200(%rax)
1802 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1803 vmovdqa $xt2,0x1c0-0x200(%rax)
1804 vmovdqa $xt3,0x1e0-0x200(%rax)
1805
1806 vpshufd \$0x00,$xd3,$xd0
1807 vpshufd \$0x55,$xd3,$xd1
1808 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1809 vpshufd \$0xaa,$xd3,$xd2
1810 vmovdqa $xd1,0x220-0x200(%rax)
1811 vpshufd \$0xff,$xd3,$xd3
1812 vmovdqa $xd2,0x240-0x200(%rax)
1813 vmovdqa $xd3,0x260-0x200(%rax)
1814
1815 jmp .Loop_enter8x
1816
1817 .align 32
1818 .Loop_outer8x:
1819 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1820 vmovdqa 0xa0-0x100(%rcx),$xa1
1821 vmovdqa 0xc0-0x100(%rcx),$xa2
1822 vmovdqa 0xe0-0x100(%rcx),$xa3
1823 vmovdqa 0x100-0x100(%rcx),$xb0
1824 vmovdqa 0x120-0x100(%rcx),$xb1
1825 vmovdqa 0x140-0x100(%rcx),$xb2
1826 vmovdqa 0x160-0x100(%rcx),$xb3
1827 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1828 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1829 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1830 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1831 vmovdqa 0x200-0x200(%rax),$xd0
1832 vmovdqa 0x220-0x200(%rax),$xd1
1833 vmovdqa 0x240-0x200(%rax),$xd2
1834 vmovdqa 0x260-0x200(%rax),$xd3
1835 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1836
1837 .Loop_enter8x:
1838 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1839 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1840 vbroadcasti128 (%r10),$xt3
1841 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
1842 mov \$10,%eax
1843 jmp .Loop8x
1844
1845 .align 32
1846 .Loop8x:
1847 ___
1848 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1849 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1850 $code.=<<___;
1851 dec %eax
1852 jnz .Loop8x
1853
1854 lea 0x200(%rsp),%rax # size optimization
1855 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1856 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1857 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1858 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1859
1860 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1861 vpunpckldq $xa3,$xa2,$xt3
1862 vpunpckhdq $xa1,$xa0,$xa0
1863 vpunpckhdq $xa3,$xa2,$xa2
1864 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1865 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1866 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1867 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1868 ___
1869 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1870 $code.=<<___;
1871 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1872 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1873 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1874 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1875
1876 vpunpckldq $xb1,$xb0,$xt2
1877 vpunpckldq $xb3,$xb2,$xt3
1878 vpunpckhdq $xb1,$xb0,$xb0
1879 vpunpckhdq $xb3,$xb2,$xb2
1880 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1881 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1882 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1883 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1884 ___
1885 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1886 $code.=<<___;
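	# The dword-level transpose above only sorts data within each 128-bit
	# half; vperm2i128 with the 0x20/0x31 selectors then pairs the matching
	# low and high halves so each register holds 32 contiguous keystream bytes.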
1887 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1888 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1889 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1890 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1891 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1892 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1893 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1894 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1895 ___
1896 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
1897 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1898 $code.=<<___;
1899 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1900 vmovdqa $xa1,0x20(%rsp)
1901 vmovdqa 0x40(%rsp),$xc2 # $xa0
1902 vmovdqa 0x60(%rsp),$xc3 # $xa1
1903
1904 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1905 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1906 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1907 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1908
1909 vpunpckldq $xc1,$xc0,$xt2
1910 vpunpckldq $xc3,$xc2,$xt3
1911 vpunpckhdq $xc1,$xc0,$xc0
1912 vpunpckhdq $xc3,$xc2,$xc2
1913 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1914 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1915 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1916 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1917 ___
1918 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1919 $code.=<<___;
1920 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1921 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1922 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1923 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1924
1925 vpunpckldq $xd1,$xd0,$xt2
1926 vpunpckldq $xd3,$xd2,$xt3
1927 vpunpckhdq $xd1,$xd0,$xd0
1928 vpunpckhdq $xd3,$xd2,$xd2
1929 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1930 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1931 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1932 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1933 ___
1934 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1935 $code.=<<___;
1936 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1937 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1938 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1939 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1940 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1941 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1942 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1943 vperm2i128 \$0x31,$xd3,$xc3,$xd3
1944 ___
1945 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1946 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1947 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1948 ($xa0,$xa1)=($xt2,$xt3);
1949 $code.=<<___;
1950 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1951 vmovdqa 0x20(%rsp),$xa1
1952
1953 cmp \$64*8,$len
1954 jb .Ltail8x
1955
1956 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1957 vpxor 0x20($inp),$xb0,$xb0
1958 vpxor 0x40($inp),$xc0,$xc0
1959 vpxor 0x60($inp),$xd0,$xd0
1960 lea 0x80($inp),$inp # size optimization
1961 vmovdqu $xa0,0x00($out)
1962 vmovdqu $xb0,0x20($out)
1963 vmovdqu $xc0,0x40($out)
1964 vmovdqu $xd0,0x60($out)
1965 lea 0x80($out),$out # size optimization
1966
1967 vpxor 0x00($inp),$xa1,$xa1
1968 vpxor 0x20($inp),$xb1,$xb1
1969 vpxor 0x40($inp),$xc1,$xc1
1970 vpxor 0x60($inp),$xd1,$xd1
1971 lea 0x80($inp),$inp # size optimization
1972 vmovdqu $xa1,0x00($out)
1973 vmovdqu $xb1,0x20($out)
1974 vmovdqu $xc1,0x40($out)
1975 vmovdqu $xd1,0x60($out)
1976 lea 0x80($out),$out # size optimization
1977
1978 vpxor 0x00($inp),$xa2,$xa2
1979 vpxor 0x20($inp),$xb2,$xb2
1980 vpxor 0x40($inp),$xc2,$xc2
1981 vpxor 0x60($inp),$xd2,$xd2
1982 lea 0x80($inp),$inp # size optimization
1983 vmovdqu $xa2,0x00($out)
1984 vmovdqu $xb2,0x20($out)
1985 vmovdqu $xc2,0x40($out)
1986 vmovdqu $xd2,0x60($out)
1987 lea 0x80($out),$out # size optimization
1988
1989 vpxor 0x00($inp),$xa3,$xa3
1990 vpxor 0x20($inp),$xb3,$xb3
1991 vpxor 0x40($inp),$xc3,$xc3
1992 vpxor 0x60($inp),$xd3,$xd3
1993 lea 0x80($inp),$inp # size optimization
1994 vmovdqu $xa3,0x00($out)
1995 vmovdqu $xb3,0x20($out)
1996 vmovdqu $xc3,0x40($out)
1997 vmovdqu $xd3,0x60($out)
1998 lea 0x80($out),$out # size optimization
1999
2000 sub \$64*8,$len
2001 jnz .Loop_outer8x
2002
2003 jmp .Ldone8x
2004
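	# Tail handling: fewer than 8*64 bytes remain.  Dispatch on how many
	# whole 64-byte blocks are left, store those directly, then spill the
	# next two ymm registers of keystream to the stack so the byte-wise
	# loop at .Loop_tail8x can finish a partial block.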
2005 .Ltail8x:
2006 cmp \$448,$len
2007 jae .L448_or_more8x
2008 cmp \$384,$len
2009 jae .L384_or_more8x
2010 cmp \$320,$len
2011 jae .L320_or_more8x
2012 cmp \$256,$len
2013 jae .L256_or_more8x
2014 cmp \$192,$len
2015 jae .L192_or_more8x
2016 cmp \$128,$len
2017 jae .L128_or_more8x
2018 cmp \$64,$len
2019 jae .L64_or_more8x
2020
2021 xor %r10,%r10
2022 vmovdqa $xa0,0x00(%rsp)
2023 vmovdqa $xb0,0x20(%rsp)
2024 jmp .Loop_tail8x
2025
2026 .align 32
2027 .L64_or_more8x:
2028 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2029 vpxor 0x20($inp),$xb0,$xb0
2030 vmovdqu $xa0,0x00($out)
2031 vmovdqu $xb0,0x20($out)
2032 je .Ldone8x
2033
2034 lea 0x40($inp),$inp # inp+=64*1
2035 xor %r10,%r10
2036 vmovdqa $xc0,0x00(%rsp)
2037 lea 0x40($out),$out # out+=64*1
2038 sub \$64,$len # len-=64*1
2039 vmovdqa $xd0,0x20(%rsp)
2040 jmp .Loop_tail8x
2041
2042 .align 32
2043 .L128_or_more8x:
2044 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2045 vpxor 0x20($inp),$xb0,$xb0
2046 vpxor 0x40($inp),$xc0,$xc0
2047 vpxor 0x60($inp),$xd0,$xd0
2048 vmovdqu $xa0,0x00($out)
2049 vmovdqu $xb0,0x20($out)
2050 vmovdqu $xc0,0x40($out)
2051 vmovdqu $xd0,0x60($out)
2052 je .Ldone8x
2053
2054 lea 0x80($inp),$inp # inp+=64*2
2055 xor %r10,%r10
2056 vmovdqa $xa1,0x00(%rsp)
2057 lea 0x80($out),$out # out+=64*2
2058 sub \$128,$len # len-=64*2
2059 vmovdqa $xb1,0x20(%rsp)
2060 jmp .Loop_tail8x
2061
2062 .align 32
2063 .L192_or_more8x:
2064 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2065 vpxor 0x20($inp),$xb0,$xb0
2066 vpxor 0x40($inp),$xc0,$xc0
2067 vpxor 0x60($inp),$xd0,$xd0
2068 vpxor 0x80($inp),$xa1,$xa1
2069 vpxor 0xa0($inp),$xb1,$xb1
2070 vmovdqu $xa0,0x00($out)
2071 vmovdqu $xb0,0x20($out)
2072 vmovdqu $xc0,0x40($out)
2073 vmovdqu $xd0,0x60($out)
2074 vmovdqu $xa1,0x80($out)
2075 vmovdqu $xb1,0xa0($out)
2076 je .Ldone8x
2077
2078 lea 0xc0($inp),$inp # inp+=64*3
2079 xor %r10,%r10
2080 vmovdqa $xc1,0x00(%rsp)
2081 lea 0xc0($out),$out # out+=64*3
2082 sub \$192,$len # len-=64*3
2083 vmovdqa $xd1,0x20(%rsp)
2084 jmp .Loop_tail8x
2085
2086 .align 32
2087 .L256_or_more8x:
2088 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2089 vpxor 0x20($inp),$xb0,$xb0
2090 vpxor 0x40($inp),$xc0,$xc0
2091 vpxor 0x60($inp),$xd0,$xd0
2092 vpxor 0x80($inp),$xa1,$xa1
2093 vpxor 0xa0($inp),$xb1,$xb1
2094 vpxor 0xc0($inp),$xc1,$xc1
2095 vpxor 0xe0($inp),$xd1,$xd1
2096 vmovdqu $xa0,0x00($out)
2097 vmovdqu $xb0,0x20($out)
2098 vmovdqu $xc0,0x40($out)
2099 vmovdqu $xd0,0x60($out)
2100 vmovdqu $xa1,0x80($out)
2101 vmovdqu $xb1,0xa0($out)
2102 vmovdqu $xc1,0xc0($out)
2103 vmovdqu $xd1,0xe0($out)
2104 je .Ldone8x
2105
2106 lea 0x100($inp),$inp # inp+=64*4
2107 xor %r10,%r10
2108 vmovdqa $xa2,0x00(%rsp)
2109 lea 0x100($out),$out # out+=64*4
2110 sub \$256,$len # len-=64*4
2111 vmovdqa $xb2,0x20(%rsp)
2112 jmp .Loop_tail8x
2113
2114 .align 32
2115 .L320_or_more8x:
2116 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2117 vpxor 0x20($inp),$xb0,$xb0
2118 vpxor 0x40($inp),$xc0,$xc0
2119 vpxor 0x60($inp),$xd0,$xd0
2120 vpxor 0x80($inp),$xa1,$xa1
2121 vpxor 0xa0($inp),$xb1,$xb1
2122 vpxor 0xc0($inp),$xc1,$xc1
2123 vpxor 0xe0($inp),$xd1,$xd1
2124 vpxor 0x100($inp),$xa2,$xa2
2125 vpxor 0x120($inp),$xb2,$xb2
2126 vmovdqu $xa0,0x00($out)
2127 vmovdqu $xb0,0x20($out)
2128 vmovdqu $xc0,0x40($out)
2129 vmovdqu $xd0,0x60($out)
2130 vmovdqu $xa1,0x80($out)
2131 vmovdqu $xb1,0xa0($out)
2132 vmovdqu $xc1,0xc0($out)
2133 vmovdqu $xd1,0xe0($out)
2134 vmovdqu $xa2,0x100($out)
2135 vmovdqu $xb2,0x120($out)
2136 je .Ldone8x
2137
2138 lea 0x140($inp),$inp # inp+=64*5
2139 xor %r10,%r10
2140 vmovdqa $xc2,0x00(%rsp)
2141 lea 0x140($out),$out # out+=64*5
2142 sub \$320,$len # len-=64*5
2143 vmovdqa $xd2,0x20(%rsp)
2144 jmp .Loop_tail8x
2145
2146 .align 32
2147 .L384_or_more8x:
2148 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2149 vpxor 0x20($inp),$xb0,$xb0
2150 vpxor 0x40($inp),$xc0,$xc0
2151 vpxor 0x60($inp),$xd0,$xd0
2152 vpxor 0x80($inp),$xa1,$xa1
2153 vpxor 0xa0($inp),$xb1,$xb1
2154 vpxor 0xc0($inp),$xc1,$xc1
2155 vpxor 0xe0($inp),$xd1,$xd1
2156 vpxor 0x100($inp),$xa2,$xa2
2157 vpxor 0x120($inp),$xb2,$xb2
2158 vpxor 0x140($inp),$xc2,$xc2
2159 vpxor 0x160($inp),$xd2,$xd2
2160 vmovdqu $xa0,0x00($out)
2161 vmovdqu $xb0,0x20($out)
2162 vmovdqu $xc0,0x40($out)
2163 vmovdqu $xd0,0x60($out)
2164 vmovdqu $xa1,0x80($out)
2165 vmovdqu $xb1,0xa0($out)
2166 vmovdqu $xc1,0xc0($out)
2167 vmovdqu $xd1,0xe0($out)
2168 vmovdqu $xa2,0x100($out)
2169 vmovdqu $xb2,0x120($out)
2170 vmovdqu $xc2,0x140($out)
2171 vmovdqu $xd2,0x160($out)
2172 je .Ldone8x
2173
2174 lea 0x180($inp),$inp # inp+=64*6
2175 xor %r10,%r10
2176 vmovdqa $xa3,0x00(%rsp)
2177 lea 0x180($out),$out # out+=64*6
2178 sub \$384,$len # len-=64*6
2179 vmovdqa $xb3,0x20(%rsp)
2180 jmp .Loop_tail8x
2181
2182 .align 32
2183 .L448_or_more8x:
2184 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2185 vpxor 0x20($inp),$xb0,$xb0
2186 vpxor 0x40($inp),$xc0,$xc0
2187 vpxor 0x60($inp),$xd0,$xd0
2188 vpxor 0x80($inp),$xa1,$xa1
2189 vpxor 0xa0($inp),$xb1,$xb1
2190 vpxor 0xc0($inp),$xc1,$xc1
2191 vpxor 0xe0($inp),$xd1,$xd1
2192 vpxor 0x100($inp),$xa2,$xa2
2193 vpxor 0x120($inp),$xb2,$xb2
2194 vpxor 0x140($inp),$xc2,$xc2
2195 vpxor 0x160($inp),$xd2,$xd2
2196 vpxor 0x180($inp),$xa3,$xa3
2197 vpxor 0x1a0($inp),$xb3,$xb3
2198 vmovdqu $xa0,0x00($out)
2199 vmovdqu $xb0,0x20($out)
2200 vmovdqu $xc0,0x40($out)
2201 vmovdqu $xd0,0x60($out)
2202 vmovdqu $xa1,0x80($out)
2203 vmovdqu $xb1,0xa0($out)
2204 vmovdqu $xc1,0xc0($out)
2205 vmovdqu $xd1,0xe0($out)
2206 vmovdqu $xa2,0x100($out)
2207 vmovdqu $xb2,0x120($out)
2208 vmovdqu $xc2,0x140($out)
2209 vmovdqu $xd2,0x160($out)
2210 vmovdqu $xa3,0x180($out)
2211 vmovdqu $xb3,0x1a0($out)
2212 je .Ldone8x
2213
2214 lea 0x1c0($inp),$inp # inp+=64*7
2215 xor %r10,%r10
2216 vmovdqa $xc3,0x00(%rsp)
2217 lea 0x1c0($out),$out # out+=64*7
2218 sub \$448,$len # len-=64*7
2219 vmovdqa $xd3,0x20(%rsp)
2220
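	# XOR the remaining (at most 63) input bytes with the keystream copy
	# spilled at (%rsp); %r10 is the byte index.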
2221 .Loop_tail8x:
2222 movzb ($inp,%r10),%eax
2223 movzb (%rsp,%r10),%ecx
2224 lea 1(%r10),%r10
2225 xor %ecx,%eax
2226 mov %al,-1($out,%r10)
2227 dec $len
2228 jnz .Loop_tail8x
2229
2230 .Ldone8x:
2231 vzeroupper
2232 ___
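# On WIN64, restore the non-volatile %xmm6-%xmm15 saved by the prologue
# before recovering the caller's stack pointer from its spill slot at
# 0x280(%rsp).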
2233 $code.=<<___ if ($win64);
2234 lea 0x290+0x30(%rsp),%r11
2235 movaps -0x30(%r11),%xmm6
2236 movaps -0x20(%r11),%xmm7
2237 movaps -0x10(%r11),%xmm8
2238 movaps 0x00(%r11),%xmm9
2239 movaps 0x10(%r11),%xmm10
2240 movaps 0x20(%r11),%xmm11
2241 movaps 0x30(%r11),%xmm12
2242 movaps 0x40(%r11),%xmm13
2243 movaps 0x50(%r11),%xmm14
2244 movaps 0x60(%r11),%xmm15
2245 ___
2246 $code.=<<___;
2247 mov 0x280(%rsp),%rsp
2248 ret
2249 .size ChaCha20_8x,.-ChaCha20_8x
2250 ___
2251 }
2252
2253 ########################################################################
2254 # AVX512 code paths
2255 if ($avx>2) {
2256 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2257 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2258 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2259 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2260 my @key=map("%zmm$_",(16..31));
2261 my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
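# Register layout for the 16x path: each of the sixteen ChaCha state words
# lives in one of %zmm0-15, with the 16 dword lanes of a register belonging
# to 16 independent blocks.  %zmm16-31 (@key) hold the per-lane initial
# state for the final additions; @key[0..3] double as scratch ($xt0-$xt3),
# which is safe because the sigma constants they carry are re-broadcast
# from .Lsigma at the start of every outer iteration.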
2262
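# Emits one batch of four interleaved quarter-rounds.  The index arithmetic
# below rotates each argument within its row of the 4x4 state, so a call
# with (0,4,8,12) yields the four column rounds and a call with (0,5,10,15)
# the four diagonal rounds of the standard double round.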
2263 sub AVX512_lane_ROUND {
2264 my ($a0,$b0,$c0,$d0)=@_;
2265 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2266 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2267 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2268 my @x=map("\"$_\"",@xx);
2269
2270 (
2271 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2272 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2273 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2274 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2275 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2276 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2277 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2278 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2279 "&vprold (@x[$d0],@x[$d0],16)",
2280 "&vprold (@x[$d1],@x[$d1],16)",
2281 "&vprold (@x[$d2],@x[$d2],16)",
2282 "&vprold (@x[$d3],@x[$d3],16)",
2283
2284 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2285 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2286 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2287 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2288 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2289 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2290 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2291 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2292 "&vprold (@x[$b0],@x[$b0],12)",
2293 "&vprold (@x[$b1],@x[$b1],12)",
2294 "&vprold (@x[$b2],@x[$b2],12)",
2295 "&vprold (@x[$b3],@x[$b3],12)",
2296
2297 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2298 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2299 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2300 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2301 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2302 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2303 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2304 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2305 "&vprold (@x[$d0],@x[$d0],8)",
2306 "&vprold (@x[$d1],@x[$d1],8)",
2307 "&vprold (@x[$d2],@x[$d2],8)",
2308 "&vprold (@x[$d3],@x[$d3],8)",
2309
2310 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2311 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2312 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2313 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2314 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2315 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2316 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2317 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2318 "&vprold (@x[$b0],@x[$b0],7)",
2319 "&vprold (@x[$b1],@x[$b1],7)",
2320 "&vprold (@x[$b2],@x[$b2],7)",
2321 "&vprold (@x[$b3],@x[$b3],7)"
2322 );
2323 }
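# For reference, each vpaddd/vpxord/vprold group above computes, per 32-bit
# lane, one step of the scalar quarter-round sketched here (illustration
# only, never called by the generator):
#
#	sub quarter_round {
#	    my ($a,$b,$c,$d) = @_;
#	    $a = ($a+$b)&0xffffffff; $d ^= $a; $d = (($d<<16)|($d>>16))&0xffffffff;
#	    $c = ($c+$d)&0xffffffff; $b ^= $c; $b = (($b<<12)|($b>>20))&0xffffffff;
#	    $a = ($a+$b)&0xffffffff; $d ^= $a; $d = (($d<< 8)|($d>>24))&0xffffffff;
#	    $c = ($c+$d)&0xffffffff; $b ^= $c; $b = (($b<< 7)|($b>>25))&0xffffffff;
#	    return ($a,$b,$c,$d);
#	}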
2324
2325 my $xframe = $win64 ? 0xb0 : 8;
2326
2327 $code.=<<___;
2328 .type ChaCha20_16x,\@function,5
2329 .align 32
2330 ChaCha20_16x:
2331 .LChaCha20_16x:
2332 mov %rsp,%r11
2333 sub \$64+$xframe,%rsp
2334 and \$-64,%rsp
2335 ___
2336 $code.=<<___ if ($win64);
	movaps		%xmm6,0x40(%rsp)	# save inside the allocated frame,
	movaps		%xmm7,0x50(%rsp)	# leaving %r11 (caller's %rsp) intact
	movaps		%xmm8,0x60(%rsp)
	movaps		%xmm9,0x70(%rsp)
	movaps		%xmm10,0x80(%rsp)
	movaps		%xmm11,0x90(%rsp)
	movaps		%xmm12,0xa0(%rsp)
	movaps		%xmm13,0xb0(%rsp)
	movaps		%xmm14,0xc0(%rsp)
	movaps		%xmm15,0xd0(%rsp)
2348 ___
2349 $code.=<<___;
2350 vzeroupper
2351
2352 lea .Lsigma(%rip),%r10
2353 vbroadcasti32x4 (%r10),$xa3 # key[0]
2354 vbroadcasti32x4 ($key),$xb3 # key[1]
2355 vbroadcasti32x4 16($key),$xc3 # key[2]
2356 vbroadcasti32x4 ($counter),$xd3 # key[3]
2357
2358 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
2359 vpshufd \$0x55,$xa3,$xa1
2360 vpshufd \$0xaa,$xa3,$xa2
2361 vpshufd \$0xff,$xa3,$xa3
2362 vmovdqa64 $xa0,@key[0]
2363 vmovdqa64 $xa1,@key[1]
2364 vmovdqa64 $xa2,@key[2]
2365 vmovdqa64 $xa3,@key[3]
2366
2367 vpshufd \$0x00,$xb3,$xb0
2368 vpshufd \$0x55,$xb3,$xb1
2369 vpshufd \$0xaa,$xb3,$xb2
2370 vpshufd \$0xff,$xb3,$xb3
2371 vmovdqa64 $xb0,@key[4]
2372 vmovdqa64 $xb1,@key[5]
2373 vmovdqa64 $xb2,@key[6]
2374 vmovdqa64 $xb3,@key[7]
2375
2376 vpshufd \$0x00,$xc3,$xc0
2377 vpshufd \$0x55,$xc3,$xc1
2378 vpshufd \$0xaa,$xc3,$xc2
2379 vpshufd \$0xff,$xc3,$xc3
2380 vmovdqa64 $xc0,@key[8]
2381 vmovdqa64 $xc1,@key[9]
2382 vmovdqa64 $xc2,@key[10]
2383 vmovdqa64 $xc3,@key[11]
2384
2385 vpshufd \$0x00,$xd3,$xd0
2386 vpshufd \$0x55,$xd3,$xd1
2387 vpshufd \$0xaa,$xd3,$xd2
2388 vpshufd \$0xff,$xd3,$xd3
2389 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
2390 vmovdqa64 $xd0,@key[12]
2391 vmovdqa64 $xd1,@key[13]
2392 vmovdqa64 $xd2,@key[14]
2393 vmovdqa64 $xd3,@key[15]
2394
2395 mov \$10,%eax
2396 jmp .Loop16x
2397
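	# Outer loop: reload the constants from .Lsigma, advance the per-lane
	# block counters by 16 (.Lsixteen), refresh the working registers from
	# the saved initial state in %zmm16-31, and run 10 more double rounds.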
2398 .align 32
2399 .Loop_outer16x:
2400 vpbroadcastd 0(%r10),$xa0 # reload key
2401 vpbroadcastd 4(%r10),$xa1
2402 vpbroadcastd 8(%r10),$xa2
2403 vpbroadcastd 12(%r10),$xa3
2404 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
2405 vmovdqa64 @key[4],$xb0
2406 vmovdqa64 @key[5],$xb1
2407 vmovdqa64 @key[6],$xb2
2408 vmovdqa64 @key[7],$xb3
2409 vmovdqa64 @key[8],$xc0
2410 vmovdqa64 @key[9],$xc1
2411 vmovdqa64 @key[10],$xc2
2412 vmovdqa64 @key[11],$xc3
2413 vmovdqa64 @key[12],$xd0
2414 vmovdqa64 @key[13],$xd1
2415 vmovdqa64 @key[14],$xd2
2416 vmovdqa64 @key[15],$xd3
2417
2418 vmovdqa64 $xa0,@key[0]
2419 vmovdqa64 $xa1,@key[1]
2420 vmovdqa64 $xa2,@key[2]
2421 vmovdqa64 $xa3,@key[3]
2422
2423 mov \$10,%eax
2424 jmp .Loop16x
2425
2426 .align 32
2427 .Loop16x:
2428 ___
2429 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
2430 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
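# The two expansions above emit one full double round (column rounds, then
# diagonal rounds); %eax counts ten of them for the 20 rounds of ChaCha20.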
2431 $code.=<<___;
2432 dec %eax
2433 jnz .Loop16x
2434
2435 vpaddd @key[0],$xa0,$xa0 # accumulate key
2436 vpaddd @key[1],$xa1,$xa1
2437 vpaddd @key[2],$xa2,$xa2
2438 vpaddd @key[3],$xa3,$xa3
2439
2440 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2441 vpunpckldq $xa3,$xa2,$xt3
2442 vpunpckhdq $xa1,$xa0,$xa0
2443 vpunpckhdq $xa3,$xa2,$xa2
2444 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2445 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2446 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2447 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2448 ___
2449 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2450 $code.=<<___;
2451 vpaddd @key[4],$xb0,$xb0
2452 vpaddd @key[5],$xb1,$xb1
2453 vpaddd @key[6],$xb2,$xb2
2454 vpaddd @key[7],$xb3,$xb3
2455
2456 vpunpckldq $xb1,$xb0,$xt2
2457 vpunpckldq $xb3,$xb2,$xt3
2458 vpunpckhdq $xb1,$xb0,$xb0
2459 vpunpckhdq $xb3,$xb2,$xb2
2460 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2461 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2462 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2463 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2464 ___
2465 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2466 $code.=<<___;
2467 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
2468 vshufi32x4 \$0xee,$xb0,$xa0,$xb0
2469 vshufi32x4 \$0x44,$xb1,$xa1,$xa0
2470 vshufi32x4 \$0xee,$xb1,$xa1,$xb1
2471 vshufi32x4 \$0x44,$xb2,$xa2,$xa1
2472 vshufi32x4 \$0xee,$xb2,$xa2,$xb2
2473 vshufi32x4 \$0x44,$xb3,$xa3,$xa2
2474 vshufi32x4 \$0xee,$xb3,$xa3,$xb3
2475 ___
2476 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2477 $code.=<<___;
2478 vpaddd @key[8],$xc0,$xc0
2479 vpaddd @key[9],$xc1,$xc1
2480 vpaddd @key[10],$xc2,$xc2
2481 vpaddd @key[11],$xc3,$xc3
2482
2483 vpunpckldq $xc1,$xc0,$xt2
2484 vpunpckldq $xc3,$xc2,$xt3
2485 vpunpckhdq $xc1,$xc0,$xc0
2486 vpunpckhdq $xc3,$xc2,$xc2
2487 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2488 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2489 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2490 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2491 ___
2492 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2493 $code.=<<___;
2494 vpaddd @key[12],$xd0,$xd0
2495 vpaddd @key[13],$xd1,$xd1
2496 vpaddd @key[14],$xd2,$xd2
2497 vpaddd @key[15],$xd3,$xd3
2498
2499 vpunpckldq $xd1,$xd0,$xt2
2500 vpunpckldq $xd3,$xd2,$xt3
2501 vpunpckhdq $xd1,$xd0,$xd0
2502 vpunpckhdq $xd3,$xd2,$xd2
2503 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2504 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2505 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2506 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2507 ___
2508 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2509 $code.=<<___;
2510 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
2511 vshufi32x4 \$0xee,$xd0,$xc0,$xd0
2512 vshufi32x4 \$0x44,$xd1,$xc1,$xc0
2513 vshufi32x4 \$0xee,$xd1,$xc1,$xd1
2514 vshufi32x4 \$0x44,$xd2,$xc2,$xc1
2515 vshufi32x4 \$0xee,$xd2,$xc2,$xd2
2516 vshufi32x4 \$0x44,$xd3,$xc3,$xc2
2517 vshufi32x4 \$0xee,$xd3,$xc3,$xd3
2518 ___
2519 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2520 $code.=<<___;
2521 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
2522 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
2523 vshufi32x4 \$0x88,$xd0,$xb0,$xc0
2524 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
2525 vshufi32x4 \$0x88,$xc1,$xa1,$xt1
2526 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
2527 vshufi32x4 \$0x88,$xd1,$xb1,$xc1
2528 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
2529 vshufi32x4 \$0x88,$xc2,$xa2,$xt2
2530 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
2531 vshufi32x4 \$0x88,$xd2,$xb2,$xc2
2532 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
2533 vshufi32x4 \$0x88,$xc3,$xa3,$xt3
2534 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
2535 vshufi32x4 \$0x88,$xd3,$xb3,$xc3
2536 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
2537 ___
2538 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2539 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2540
2541 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2542 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2543 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2544 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
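# After the dword/qword interleaves and the two 128-bit-lane shuffle passes,
# every zmm register holds 64 contiguous keystream bytes, i.e. one whole
# block; the renaming above orders them to match the store sequence below
# ($xa0 -> bytes 0-63, $xb0 -> 64-127, and so on).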
2545 $code.=<<___;
2546 cmp \$64*16,$len
2547 jb .Ltail16x
2548
2549 vpxord 0x00($inp),$xa0,$xa0 # xor with input
2550 vpxord 0x40($inp),$xb0,$xb0
2551 vpxord 0x80($inp),$xc0,$xc0
2552 vpxord 0xc0($inp),$xd0,$xd0
2553 vmovdqu32 $xa0,0x00($out)
2554 vmovdqu32 $xb0,0x40($out)
2555 vmovdqu32 $xc0,0x80($out)
2556 vmovdqu32 $xd0,0xc0($out)
2557
2558 vpxord 0x100($inp),$xa1,$xa1
2559 vpxord 0x140($inp),$xb1,$xb1
2560 vpxord 0x180($inp),$xc1,$xc1
2561 vpxord 0x1c0($inp),$xd1,$xd1
2562 vmovdqu32 $xa1,0x100($out)
2563 vmovdqu32 $xb1,0x140($out)
2564 vmovdqu32 $xc1,0x180($out)
2565 vmovdqu32 $xd1,0x1c0($out)
2566
2567 vpxord 0x200($inp),$xa2,$xa2
2568 vpxord 0x240($inp),$xb2,$xb2
2569 vpxord 0x280($inp),$xc2,$xc2
2570 vpxord 0x2c0($inp),$xd2,$xd2
2571 vmovdqu32 $xa2,0x200($out)
2572 vmovdqu32 $xb2,0x240($out)
2573 vmovdqu32 $xc2,0x280($out)
2574 vmovdqu32 $xd2,0x2c0($out)
2575
2576 vpxord 0x300($inp),$xa3,$xa3
2577 vpxord 0x340($inp),$xb3,$xb3
2578 vpxord 0x380($inp),$xc3,$xc3
2579 vpxord 0x3c0($inp),$xd3,$xd3
2580 lea 0x400($inp),$inp
2581 vmovdqu32 $xa3,0x300($out)
2582 vmovdqu32 $xb3,0x340($out)
2583 vmovdqu32 $xc3,0x380($out)
2584 vmovdqu32 $xd3,0x3c0($out)
2585 lea 0x400($out),$out
2586
2587 sub \$64*16,$len
2588 jnz .Loop_outer16x
2589
2590 jmp .Ldone16x
2591
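	# Tail: up to 15 whole blocks plus a partial one remain.  The output
	# pointer is converted to an offset from the input pointer so advancing
	# one register covers both; whole blocks are written one zmm at a time,
	# and the register holding the final partial block is copied to %zmm0
	# and spilled below for the byte-wise loop.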
2592 .align 32
2593 .Ltail16x:
2594 xor %r10,%r10
2595 sub $inp,$out
2596 cmp \$64*1,$len
2597 jb .Less_than_64_16x
2598 vpxord ($inp),$xa0,$xa0 # xor with input
2599 vmovdqu32 $xa0,($out,$inp)
2600 je .Ldone16x
2601 vmovdqa32 $xb0,$xa0
2602 lea 64($inp),$inp
2603
2604 cmp \$64*2,$len
2605 jb .Less_than_64_16x
2606 vpxord ($inp),$xb0,$xb0
2607 vmovdqu32 $xb0,($out,$inp)
2608 je .Ldone16x
2609 vmovdqa32 $xc0,$xa0
2610 lea 64($inp),$inp
2611
2612 cmp \$64*3,$len
2613 jb .Less_than_64_16x
2614 vpxord ($inp),$xc0,$xc0
2615 vmovdqu32 $xc0,($out,$inp)
2616 je .Ldone16x
2617 vmovdqa32 $xd0,$xa0
2618 lea 64($inp),$inp
2619
2620 cmp \$64*4,$len
2621 jb .Less_than_64_16x
2622 vpxord ($inp),$xd0,$xd0
2623 vmovdqu32 $xd0,($out,$inp)
2624 je .Ldone16x
2625 vmovdqa32 $xa1,$xa0
2626 lea 64($inp),$inp
2627
2628 cmp \$64*5,$len
2629 jb .Less_than_64_16x
2630 vpxord ($inp),$xa1,$xa1
2631 vmovdqu32 $xa1,($out,$inp)
2632 je .Ldone16x
2633 vmovdqa32 $xb1,$xa0
2634 lea 64($inp),$inp
2635
2636 cmp \$64*6,$len
2637 jb .Less_than_64_16x
2638 vpxord ($inp),$xb1,$xb1
2639 vmovdqu32 $xb1,($out,$inp)
2640 je .Ldone16x
2641 vmovdqa32 $xc1,$xa0
2642 lea 64($inp),$inp
2643
2644 cmp \$64*7,$len
2645 jb .Less_than_64_16x
2646 vpxord ($inp),$xc1,$xc1
2647 vmovdqu32 $xc1,($out,$inp)
2648 je .Ldone16x
2649 vmovdqa32 $xd1,$xa0
2650 lea 64($inp),$inp
2651
2652 cmp \$64*8,$len
2653 jb .Less_than_64_16x
2654 vpxord ($inp),$xd1,$xd1
2655 vmovdqu32 $xd1,($out,$inp)
2656 je .Ldone16x
2657 vmovdqa32 $xa2,$xa0
2658 lea 64($inp),$inp
2659
2660 cmp \$64*9,$len
2661 jb .Less_than_64_16x
2662 vpxord ($inp),$xa2,$xa2
2663 vmovdqu32 $xa2,($out,$inp)
2664 je .Ldone16x
2665 vmovdqa32 $xb2,$xa0
2666 lea 64($inp),$inp
2667
2668 cmp \$64*10,$len
2669 jb .Less_than_64_16x
2670 vpxord ($inp),$xb2,$xb2
2671 vmovdqu32 $xb2,($out,$inp)
2672 je .Ldone16x
2673 vmovdqa32 $xc2,$xa0
2674 lea 64($inp),$inp
2675
2676 cmp \$64*11,$len
2677 jb .Less_than_64_16x
2678 vpxord ($inp),$xc2,$xc2
2679 vmovdqu32 $xc2,($out,$inp)
2680 je .Ldone16x
2681 vmovdqa32 $xd2,$xa0
2682 lea 64($inp),$inp
2683
2684 cmp \$64*12,$len
2685 jb .Less_than_64_16x
2686 vpxord ($inp),$xd2,$xd2
2687 vmovdqu32 $xd2,($out,$inp)
2688 je .Ldone16x
2689 vmovdqa32 $xa3,$xa0
2690 lea 64($inp),$inp
2691
2692 cmp \$64*13,$len
2693 jb .Less_than_64_16x
2694 vpxord ($inp),$xa3,$xa3
2695 vmovdqu32 $xa3,($out,$inp)
2696 je .Ldone16x
2697 vmovdqa32 $xb3,$xa0
2698 lea 64($inp),$inp
2699
2700 cmp \$64*14,$len
2701 jb .Less_than_64_16x
2702 vpxord ($inp),$xb3,$xb3
2703 vmovdqu32 $xb3,($out,$inp)
2704 je .Ldone16x
2705 vmovdqa32 $xc3,$xa0
2706 lea 64($inp),$inp
2707
2708 cmp \$64*15,$len
2709 jb .Less_than_64_16x
2710 vpxord ($inp),$xc3,$xc3
2711 vmovdqu32 $xc3,($out,$inp)
2712 je .Ldone16x
2713 vmovdqa32 $xd3,$xa0
2714 lea 64($inp),$inp
2715
2716 .Less_than_64_16x:
2717 vmovdqa32 $xa0,0x00(%rsp)
2718 lea ($out,$inp),$out
2719 and \$63,$len
2720
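	# Same byte-wise cleanup as the 8x path: XOR the remaining (at most 63)
	# input bytes with the keystream block spilled at (%rsp).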
2721 .Loop_tail16x:
2722 movzb ($inp,%r10),%eax
2723 movzb (%rsp,%r10),%ecx
2724 lea 1(%r10),%r10
2725 xor %ecx,%eax
2726 mov %al,-1($out,%r10)
2727 dec $len
2728 jnz .Loop_tail16x
2729
2730 .Ldone16x:
2731 vzeroupper
2732 ___
2733 $code.=<<___ if ($win64);
	movaps		0x40(%rsp),%xmm6	# restore from the same in-frame
	movaps		0x50(%rsp),%xmm7	# slots used by the prologue
	movaps		0x60(%rsp),%xmm8
	movaps		0x70(%rsp),%xmm9
	movaps		0x80(%rsp),%xmm10
	movaps		0x90(%rsp),%xmm11
	movaps		0xa0(%rsp),%xmm12
	movaps		0xb0(%rsp),%xmm13
	movaps		0xc0(%rsp),%xmm14
	movaps		0xd0(%rsp),%xmm15
2745 ___
2746 $code.=<<___;
2747 mov %r11,%rsp
2748 ret
2749 .size ChaCha20_16x,.-ChaCha20_16x
2750 ___
2751 }
2752
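# Post-process the accumulated $code: evaluate any backticked Perl
# expressions, rewrite registers spelled "%x#%ymmN" as their "%xmmN" form,
# and print the result line by line.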
2753 foreach (split("\n",$code)) {
2754 s/\`([^\`]*)\`/eval $1/geo;
2755
2756 s/%x#%y/%x/go;
2757
2758 print $_,"\n";
2759 }
2760
2761 close STDOUT;