# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# ChaCha20 for x86_64.
#
# Add AVX512F code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    8xAVX2
#
# P4		9.48/+99%	-/22.7(ii)	-
# Core2		7.83/+55%	7.90/8.08	4.35
# Westmere	7.19/+50%	5.60/6.70	3.00
# Sandy Bridge	8.31/+42%	5.45/6.76	2.72
# Ivy Bridge	6.71/+46%	5.40/6.49	2.41
# Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
# Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
# Goldmont	10.6/+17%	5.10/-		3.28
# Sledgehammer	7.28/+52%	-/14.2(ii)	-
# Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
# VIA Nano	10.5/+46%	6.72/8.60	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	as it can be seen, SSE2 performance is too low on legacy
#	processors; NxSSE2 results are naturally better, but not
#	impressively better than IALU ones, which is why you won't
#	find SSE2 code below;
# (iii)	this is not optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

.extern	OPENSSL_ia32cap_P

.long	0,2,4,6,1,3,5,7
.long	8,8,8,8,8,8,8,8
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.asciz	"expand 32-byte k"
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
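# For illustration (a hypothetical call, not part of the generated
# code): eval'ing the string "&add ('%eax','%ebx')" finds no add() sub,
# falls through to AUTOLOAD above and appends "\tadd\t%ebx,%eax\n" to
# $code.  Arguments are emitted in reverse, so the destination-first
# perlasm order becomes AT&T source-first order, and a purely numeric
# argument is prefixed so it comes out as an immediate.  The ROUND
# generators below rely on this when their instruction strings are
# eval'ed.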
@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));

sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);
	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.
	#
	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are dying breed, old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...
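	#
	# For reference, each Qn group below is one ChaCha quarter-round;
	# a plain-Perl sketch of the operation (illustrative only, never
	# called, and the name quarter_round is made up for this comment):
	#
	#   sub quarter_round {
	#	my ($a,$b,$c,$d) = @_;			# 32-bit words
	#	$a = ($a+$b)&0xffffffff; $d ^= $a; $d = ($d<<16|$d>>16)&0xffffffff;
	#	$c = ($c+$d)&0xffffffff; $b ^= $c; $b = ($b<<12|$b>>20)&0xffffffff;
	#	$a = ($a+$b)&0xffffffff; $d ^= $a; $d = ($d<< 8|$d>>24)&0xffffffff;
	#	$c = ($c+$d)&0xffffffff; $b ^= $c; $b = ($b<< 7|$b>>25)&0xffffffff;
	#	return ($a,$b,$c,$d);
	#   }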
170 "&add (@x[$a0],@x[$b0])", # Q1
171 "&xor (@x[$d0],@x[$a0])",
173 "&add (@x[$a1],@x[$b1])", # Q2
174 "&xor (@x[$d1],@x[$a1])",
177 "&add ($xc,@x[$d0])",
178 "&xor (@x[$b0],$xc)",
180 "&add ($xc_,@x[$d1])",
181 "&xor (@x[$b1],$xc_)",
184 "&add (@x[$a0],@x[$b0])",
185 "&xor (@x[$d0],@x[$a0])",
187 "&add (@x[$a1],@x[$b1])",
188 "&xor (@x[$d1],@x[$a1])",
191 "&add ($xc,@x[$d0])",
192 "&xor (@x[$b0],$xc)",
194 "&add ($xc_,@x[$d1])",
195 "&xor (@x[$b1],$xc_)",
198 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
199 "&mov (\"4*$c1(%rsp)\",$xc_)",
200 "&mov ($xc,\"4*$c2(%rsp)\")",
201 "&mov ($xc_,\"4*$c3(%rsp)\")",
203 "&add (@x[$a2],@x[$b2])", # Q3
204 "&xor (@x[$d2],@x[$a2])",
206 "&add (@x[$a3],@x[$b3])", # Q4
207 "&xor (@x[$d3],@x[$a3])",
210 "&add ($xc,@x[$d2])",
211 "&xor (@x[$b2],$xc)",
213 "&add ($xc_,@x[$d3])",
214 "&xor (@x[$b3],$xc_)",
217 "&add (@x[$a2],@x[$b2])",
218 "&xor (@x[$d2],@x[$a2])",
220 "&add (@x[$a3],@x[$b3])",
221 "&xor (@x[$d3],@x[$a3])",
224 "&add ($xc,@x[$d2])",
225 "&xor (@x[$b2],$xc)",
227 "&add ($xc_,@x[$d3])",
228 "&xor (@x[$b3],$xc_)",
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
.globl ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
	test	\$`1<<(41-32)`,%r10d
	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len

	mov	\$0x61707865,@x[0]	# 'expa'
	mov	\$0x3320646e,@x[1]	# 'nd 3'
	mov	\$0x79622d32,@x[2]	# '2-by'
	mov	\$0x6b206574,@x[3]	# 'te k'
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	shr	\$32,%rdi		# "@x[9]"

	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
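	# A full ChaCha20 block is 20 rounds: the loop around these two
	# calls runs ten times, alternating the even (column) round
	# (0,4,8,12) with the odd (diagonal) round (0,5,10,15).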
300 mov
@t[1],4*9(%rsp) # modulo-scheduled
302 mov
64(%rsp),%rbp # load len
304 mov
64+8(%rsp),$inp # load inp
305 paddd
%xmm4,%xmm3 # increment counter
306 mov
64+16(%rsp),$out # load out
308 add \
$0x61707865,@x[0] # 'expa'
309 add \
$0x3320646e,@x[1] # 'nd 3'
310 add \
$0x79622d32,@x[2] # '2-by'
311 add \
$0x6b206574,@x[3] # 'te k'
316 add
4*12(%rsp),@x[12]
317 add
4*13(%rsp),@x[13]
318 add
4*14(%rsp),@x[14]
319 add
4*15(%rsp),@x[15]
320 paddd
4*8(%rsp),%xmm1
325 xor 4*0($inp),@x[0] # xor with input
333 movdqu
4*8($inp),%xmm0
334 xor 4*12($inp),@x[12]
335 xor 4*13($inp),@x[13]
336 xor 4*14($inp),@x[14]
337 xor 4*15($inp),@x[15]
338 lea
4*16($inp),$inp # inp+=64
341 movdqa
%xmm2,4*8(%rsp)
342 movd
%xmm3,4*12(%rsp)
344 mov
@x[0],4*0($out) # write output
352 movdqu
%xmm0,4*8($out)
353 mov
@x[12],4*12($out)
354 mov
@x[13],4*13($out)
355 mov
@x[14],4*14($out)
356 mov
@x[15],4*15($out)
357 lea
4*16($out),$out # out+=64
375 movdqa
%xmm1,4*8(%rsp)
376 mov
@x[12],4*12(%rsp)
377 mov
@x[13],4*13(%rsp)
378 mov
@x[14],4*14(%rsp)
379 mov
@x[15],4*15(%rsp)
382 movzb
($inp,%rbx),%eax
383 movzb
(%rsp,%rbx),%edx
386 mov
%al,-1($out,%rbx)
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
########################################################################
# SSSE3 code path that handles shorter lengths
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round

my $xframe = $win64 ? 32+32+8 : 24;
.type	ChaCha20_ssse3,\@function,5
$code.=<<___	if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
	cmp	\$128,$len		# we might throw away some data,
	ja	.LChaCha20_4x		# but overall it won't be slower
	sub	\$64+$xframe,%rsp
$code.=<<___	if ($win64);
459 movaps
%xmm6,64+32(%rsp)
460 movaps
%xmm7,64+48(%rsp)
463 movdqa
.Lsigma
(%rip),$a
467 movdqa
.Lrot16
(%rip),$rot16
468 movdqa
.Lrot24
(%rip),$rot24
479 movdqa
.Lone
(%rip),$d
492 &pshufd
($c,$c,0b01001110
);
493 &pshufd
($b,$b,0b00111001
);
494 &pshufd
($d,$d,0b10010011
);
498 &pshufd
($c,$c,0b01001110
);
499 &pshufd
($b,$b,0b10010011
);
500 &pshufd
($d,$d,0b00111001
);
503 &jnz
(".Loop_ssse3");
515 movdqu
0x10($inp),$t1
516 pxor
$t,$a # xor with input
519 movdqu
0x30($inp),$t1
520 lea
0x40($inp),$inp # inp+=64
524 movdqu
$a,0x00($out) # write output
528 lea
0x40($out),$out # out+=64
531 jnz
.Loop_outer_ssse3
544 movzb
($inp,%rbx),%eax
545 movzb
(%rsp,%rbx),%ecx
548 mov
%al,-1($out,%rbx)
554 $code.=<<___
if ($win64);
555 movaps
64+32(%rsp),%xmm6
556 movaps
64+48(%rsp),%xmm7
559 add \
$64+$xframe,%rsp
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
########################################################################
# SSSE3 code path that handles longer messages.

# assign variables to favor Atom front-end
my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);
	# Consider order in which variables are addressed by their
	# index:
	#
	#	0   4   8  12 < even round
	#	0   5  10  15 < odd round
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.
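	#
	# In this path every xmm register holds the same state word taken
	# from four independent 64-byte blocks, so one pass over the loop
	# produces 256 bytes of key stream.  The 16- and 8-bit rotations
	# are done with pshufb through the masks pointed at by %r10 and
	# %r11 (.Lrot16/.Lrot24), while the 12- and 7-bit ones fall back
	# to a shift-and-or pair.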
611 "&paddd (@x[$a0],@x[$b0])", # Q1
612 "&paddd (@x[$a1],@x[$b1])", # Q2
613 "&pxor (@x[$d0],@x[$a0])",
614 "&pxor (@x[$d1],@x[$a1])",
615 "&pshufb (@x[$d0],$t1)",
616 "&pshufb (@x[$d1],$t1)",
618 "&paddd ($xc,@x[$d0])",
619 "&paddd ($xc_,@x[$d1])",
620 "&pxor (@x[$b0],$xc)",
621 "&pxor (@x[$b1],$xc_)",
622 "&movdqa ($t0,@x[$b0])",
623 "&pslld (@x[$b0],12)",
625 "&movdqa ($t1,@x[$b1])",
626 "&pslld (@x[$b1],12)",
627 "&por (@x[$b0],$t0)",
629 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
630 "&por (@x[$b1],$t1)",
632 "&paddd (@x[$a0],@x[$b0])",
633 "&paddd (@x[$a1],@x[$b1])",
634 "&pxor (@x[$d0],@x[$a0])",
635 "&pxor (@x[$d1],@x[$a1])",
636 "&pshufb (@x[$d0],$t0)",
637 "&pshufb (@x[$d1],$t0)",
639 "&paddd ($xc,@x[$d0])",
640 "&paddd ($xc_,@x[$d1])",
641 "&pxor (@x[$b0],$xc)",
642 "&pxor (@x[$b1],$xc_)",
643 "&movdqa ($t1,@x[$b0])",
644 "&pslld (@x[$b0],7)",
646 "&movdqa ($t0,@x[$b1])",
647 "&pslld (@x[$b1],7)",
648 "&por (@x[$b0],$t1)",
650 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
651 "&por (@x[$b1],$t0)",
653 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
654 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
655 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
656 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
658 "&paddd (@x[$a2],@x[$b2])", # Q3
659 "&paddd (@x[$a3],@x[$b3])", # Q4
660 "&pxor (@x[$d2],@x[$a2])",
661 "&pxor (@x[$d3],@x[$a3])",
662 "&pshufb (@x[$d2],$t1)",
663 "&pshufb (@x[$d3],$t1)",
665 "&paddd ($xc,@x[$d2])",
666 "&paddd ($xc_,@x[$d3])",
667 "&pxor (@x[$b2],$xc)",
668 "&pxor (@x[$b3],$xc_)",
669 "&movdqa ($t0,@x[$b2])",
670 "&pslld (@x[$b2],12)",
672 "&movdqa ($t1,@x[$b3])",
673 "&pslld (@x[$b3],12)",
674 "&por (@x[$b2],$t0)",
676 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
677 "&por (@x[$b3],$t1)",
679 "&paddd (@x[$a2],@x[$b2])",
680 "&paddd (@x[$a3],@x[$b3])",
681 "&pxor (@x[$d2],@x[$a2])",
682 "&pxor (@x[$d3],@x[$a3])",
683 "&pshufb (@x[$d2],$t0)",
684 "&pshufb (@x[$d3],$t0)",
686 "&paddd ($xc,@x[$d2])",
687 "&paddd ($xc_,@x[$d3])",
688 "&pxor (@x[$b2],$xc)",
689 "&pxor (@x[$b3],$xc_)",
690 "&movdqa ($t1,@x[$b2])",
691 "&pslld (@x[$b2],7)",
693 "&movdqa ($t0,@x[$b3])",
694 "&pslld (@x[$b3],7)",
695 "&por (@x[$b2],$t1)",
697 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
my $xframe = $win64 ? 0xa0 : 0;

.type	ChaCha20_4x,\@function,5
$code.=<<___	if ($avx>1);
	shr	\$32,%r10		# OPENSSL_ia32cap_P+8
	test	\$`1<<5`,%r10		# test AVX2
	and	\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp	\$`1<<22`,%r11		# check for MOVBE without XSAVE
	je	.Ldo_sse3_after_all	# to detect Atom
	sub	\$0x148+$xframe,%rsp
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# +0x40		constant copy of key[0-2] smashed by lanes
	# +0x100	SIMD counters (with nonce smashed by lanes)
736 $code.=<<___
if ($win64);
737 movaps
%xmm6,-0x30(%r11)
738 movaps
%xmm7,-0x20(%r11)
739 movaps
%xmm8,-0x10(%r11)
740 movaps
%xmm9,0x00(%r11)
741 movaps
%xmm10,0x10(%r11)
742 movaps
%xmm11,0x20(%r11)
743 movaps
%xmm12,0x30(%r11)
744 movaps
%xmm13,0x40(%r11)
745 movaps
%xmm14,0x50(%r11)
746 movaps
%xmm15,0x60(%r11)
749 movdqa
.Lsigma
(%rip),$xa3 # key[0]
750 movdqu
($key),$xb3 # key[1]
751 movdqu
16($key),$xt3 # key[2]
752 movdqu
($counter),$xd3 # key[3]
753 lea
0x100(%rsp),%rcx # size optimization
754 lea
.Lrot16
(%rip),%r10
755 lea
.Lrot24
(%rip),%r11
757 pshufd \
$0x00,$xa3,$xa0 # smash key by lanes...
758 pshufd \
$0x55,$xa3,$xa1
759 movdqa
$xa0,0x40(%rsp) # ... and offload
760 pshufd \
$0xaa,$xa3,$xa2
761 movdqa
$xa1,0x50(%rsp)
762 pshufd \
$0xff,$xa3,$xa3
763 movdqa
$xa2,0x60(%rsp)
764 movdqa
$xa3,0x70(%rsp)
766 pshufd \
$0x00,$xb3,$xb0
767 pshufd \
$0x55,$xb3,$xb1
768 movdqa
$xb0,0x80-0x100(%rcx)
769 pshufd \
$0xaa,$xb3,$xb2
770 movdqa
$xb1,0x90-0x100(%rcx)
771 pshufd \
$0xff,$xb3,$xb3
772 movdqa
$xb2,0xa0-0x100(%rcx)
773 movdqa
$xb3,0xb0-0x100(%rcx)
775 pshufd \
$0x00,$xt3,$xt0 # "$xc0"
776 pshufd \
$0x55,$xt3,$xt1 # "$xc1"
777 movdqa
$xt0,0xc0-0x100(%rcx)
778 pshufd \
$0xaa,$xt3,$xt2 # "$xc2"
779 movdqa
$xt1,0xd0-0x100(%rcx)
780 pshufd \
$0xff,$xt3,$xt3 # "$xc3"
781 movdqa
$xt2,0xe0-0x100(%rcx)
782 movdqa
$xt3,0xf0-0x100(%rcx)
784 pshufd \
$0x00,$xd3,$xd0
785 pshufd \
$0x55,$xd3,$xd1
786 paddd
.Linc
(%rip),$xd0 # don't save counters yet
787 pshufd \
$0xaa,$xd3,$xd2
788 movdqa
$xd1,0x110-0x100(%rcx)
789 pshufd \
$0xff,$xd3,$xd3
790 movdqa
$xd2,0x120-0x100(%rcx)
791 movdqa
$xd3,0x130-0x100(%rcx)
797 movdqa
0x40(%rsp),$xa0 # re-load smashed key
798 movdqa
0x50(%rsp),$xa1
799 movdqa
0x60(%rsp),$xa2
800 movdqa
0x70(%rsp),$xa3
801 movdqa
0x80-0x100(%rcx),$xb0
802 movdqa
0x90-0x100(%rcx),$xb1
803 movdqa
0xa0-0x100(%rcx),$xb2
804 movdqa
0xb0-0x100(%rcx),$xb3
805 movdqa
0xc0-0x100(%rcx),$xt0 # "$xc0"
806 movdqa
0xd0-0x100(%rcx),$xt1 # "$xc1"
807 movdqa
0xe0-0x100(%rcx),$xt2 # "$xc2"
808 movdqa
0xf0-0x100(%rcx),$xt3 # "$xc3"
809 movdqa
0x100-0x100(%rcx),$xd0
810 movdqa
0x110-0x100(%rcx),$xd1
811 movdqa
0x120-0x100(%rcx),$xd2
812 movdqa
0x130-0x100(%rcx),$xd3
813 paddd
.Lfour
(%rip),$xd0 # next SIMD counters
816 movdqa
$xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
817 movdqa
$xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
818 movdqa
(%r10),$xt3 # .Lrot16(%rip)
820 movdqa
$xd0,0x100-0x100(%rcx) # save SIMD counters
826 foreach (&SSSE3_lane_ROUND
(0, 4, 8,12)) { eval; }
827 foreach (&SSSE3_lane_ROUND
(0, 5,10,15)) { eval; }
832 paddd
0x40(%rsp),$xa0 # accumulate key material
833 paddd
0x50(%rsp),$xa1
834 paddd
0x60(%rsp),$xa2
835 paddd
0x70(%rsp),$xa3
837 movdqa
$xa0,$xt2 # "de-interlace" data
844 punpcklqdq
$xa2,$xa0 # "a0"
846 punpcklqdq
$xt3,$xt2 # "a2"
847 punpckhqdq
$xa2,$xa1 # "a1"
848 punpckhqdq
$xt3,$xa3 # "a3"
850 ($xa2,$xt2)=($xt2,$xa2);
852 paddd
0x80-0x100(%rcx),$xb0
853 paddd
0x90-0x100(%rcx),$xb1
854 paddd
0xa0-0x100(%rcx),$xb2
855 paddd
0xb0-0x100(%rcx),$xb3
857 movdqa
$xa0,0x00(%rsp) # offload $xaN
858 movdqa
$xa1,0x10(%rsp)
859 movdqa
0x20(%rsp),$xa0 # "xc2"
860 movdqa
0x30(%rsp),$xa1 # "xc3"
869 punpcklqdq
$xb2,$xb0 # "b0"
871 punpcklqdq
$xt3,$xt2 # "b2"
872 punpckhqdq
$xb2,$xb1 # "b1"
873 punpckhqdq
$xt3,$xb3 # "b3"
875 ($xb2,$xt2)=($xt2,$xb2);
876 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
878 paddd
0xc0-0x100(%rcx),$xc0
879 paddd
0xd0-0x100(%rcx),$xc1
880 paddd
0xe0-0x100(%rcx),$xc2
881 paddd
0xf0-0x100(%rcx),$xc3
883 movdqa
$xa2,0x20(%rsp) # keep offloading $xaN
884 movdqa
$xa3,0x30(%rsp)
893 punpcklqdq
$xc2,$xc0 # "c0"
895 punpcklqdq
$xt3,$xt2 # "c2"
896 punpckhqdq
$xc2,$xc1 # "c1"
897 punpckhqdq
$xt3,$xc3 # "c3"
899 ($xc2,$xt2)=($xt2,$xc2);
900 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
902 paddd
0x100-0x100(%rcx),$xd0
903 paddd
0x110-0x100(%rcx),$xd1
904 paddd
0x120-0x100(%rcx),$xd2
905 paddd
0x130-0x100(%rcx),$xd3
914 punpcklqdq
$xd2,$xd0 # "d0"
916 punpcklqdq
$xt3,$xt2 # "d2"
917 punpckhqdq
$xd2,$xd1 # "d1"
918 punpckhqdq
$xt3,$xd3 # "d3"
920 ($xd2,$xt2)=($xt2,$xd2);
925 movdqu
0x00($inp),$xt0 # xor with input
926 movdqu
0x10($inp),$xt1
927 movdqu
0x20($inp),$xt2
928 movdqu
0x30($inp),$xt3
929 pxor
0x00(%rsp),$xt0 # $xaN is offloaded, remember?
934 movdqu
$xt0,0x00($out)
935 movdqu
0x40($inp),$xt0
936 movdqu
$xt1,0x10($out)
937 movdqu
0x50($inp),$xt1
938 movdqu
$xt2,0x20($out)
939 movdqu
0x60($inp),$xt2
940 movdqu
$xt3,0x30($out)
941 movdqu
0x70($inp),$xt3
942 lea
0x80($inp),$inp # size optimization
948 movdqu
$xt0,0x40($out)
949 movdqu
0x00($inp),$xt0
950 movdqu
$xt1,0x50($out)
951 movdqu
0x10($inp),$xt1
952 movdqu
$xt2,0x60($out)
953 movdqu
0x20($inp),$xt2
954 movdqu
$xt3,0x70($out)
955 lea
0x80($out),$out # size optimization
956 movdqu
0x30($inp),$xt3
962 movdqu
$xt0,0x00($out)
963 movdqu
0x40($inp),$xt0
964 movdqu
$xt1,0x10($out)
965 movdqu
0x50($inp),$xt1
966 movdqu
$xt2,0x20($out)
967 movdqu
0x60($inp),$xt2
968 movdqu
$xt3,0x30($out)
969 movdqu
0x70($inp),$xt3
970 lea
0x80($inp),$inp # inp+=64*4
975 movdqu
$xt0,0x40($out)
976 movdqu
$xt1,0x50($out)
977 movdqu
$xt2,0x60($out)
978 movdqu
$xt3,0x70($out)
979 lea
0x80($out),$out # out+=64*4
994 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
996 #movdqa $xt0,0x00(%rsp)
997 movdqa
$xb0,0x10(%rsp)
998 movdqa
$xc0,0x20(%rsp)
999 movdqa
$xd0,0x30(%rsp)
1004 movdqu
0x00($inp),$xt0 # xor with input
1005 movdqu
0x10($inp),$xt1
1006 movdqu
0x20($inp),$xt2
1007 movdqu
0x30($inp),$xt3
1008 pxor
0x00(%rsp),$xt0 # $xaxN is offloaded, remember?
1012 movdqu
$xt0,0x00($out)
1013 movdqu
$xt1,0x10($out)
1014 movdqu
$xt2,0x20($out)
1015 movdqu
$xt3,0x30($out)
1018 movdqa
0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1019 lea
0x40($inp),$inp # inp+=64*1
1021 movdqa
$xt0,0x00(%rsp)
1022 movdqa
$xb1,0x10(%rsp)
1023 lea
0x40($out),$out # out+=64*1
1024 movdqa
$xc1,0x20(%rsp)
1025 sub \
$64,$len # len-=64*1
1026 movdqa
$xd1,0x30(%rsp)
1031 movdqu
0x00($inp),$xt0 # xor with input
1032 movdqu
0x10($inp),$xt1
1033 movdqu
0x20($inp),$xt2
1034 movdqu
0x30($inp),$xt3
1035 pxor
0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1040 movdqu
$xt0,0x00($out)
1041 movdqu
0x40($inp),$xt0
1042 movdqu
$xt1,0x10($out)
1043 movdqu
0x50($inp),$xt1
1044 movdqu
$xt2,0x20($out)
1045 movdqu
0x60($inp),$xt2
1046 movdqu
$xt3,0x30($out)
1047 movdqu
0x70($inp),$xt3
1048 pxor
0x10(%rsp),$xt0
1052 movdqu
$xt0,0x40($out)
1053 movdqu
$xt1,0x50($out)
1054 movdqu
$xt2,0x60($out)
1055 movdqu
$xt3,0x70($out)
1058 movdqa
0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1059 lea
0x80($inp),$inp # inp+=64*2
1061 movdqa
$xt0,0x00(%rsp)
1062 movdqa
$xb2,0x10(%rsp)
1063 lea
0x80($out),$out # out+=64*2
1064 movdqa
$xc2,0x20(%rsp)
1065 sub \
$128,$len # len-=64*2
1066 movdqa
$xd2,0x30(%rsp)
1071 movdqu
0x00($inp),$xt0 # xor with input
1072 movdqu
0x10($inp),$xt1
1073 movdqu
0x20($inp),$xt2
1074 movdqu
0x30($inp),$xt3
1075 pxor
0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1080 movdqu
$xt0,0x00($out)
1081 movdqu
0x40($inp),$xt0
1082 movdqu
$xt1,0x10($out)
1083 movdqu
0x50($inp),$xt1
1084 movdqu
$xt2,0x20($out)
1085 movdqu
0x60($inp),$xt2
1086 movdqu
$xt3,0x30($out)
1087 movdqu
0x70($inp),$xt3
1088 lea
0x80($inp),$inp # size optimization
1089 pxor
0x10(%rsp),$xt0
1094 movdqu
$xt0,0x40($out)
1095 movdqu
0x00($inp),$xt0
1096 movdqu
$xt1,0x50($out)
1097 movdqu
0x10($inp),$xt1
1098 movdqu
$xt2,0x60($out)
1099 movdqu
0x20($inp),$xt2
1100 movdqu
$xt3,0x70($out)
1101 lea
0x80($out),$out # size optimization
1102 movdqu
0x30($inp),$xt3
1103 pxor
0x20(%rsp),$xt0
1107 movdqu
$xt0,0x00($out)
1108 movdqu
$xt1,0x10($out)
1109 movdqu
$xt2,0x20($out)
1110 movdqu
$xt3,0x30($out)
1113 movdqa
0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1114 lea
0x40($inp),$inp # inp+=64*3
1116 movdqa
$xt0,0x00(%rsp)
1117 movdqa
$xb3,0x10(%rsp)
1118 lea
0x40($out),$out # out+=64*3
1119 movdqa
$xc3,0x20(%rsp)
1120 sub \
$192,$len # len-=64*3
1121 movdqa
$xd3,0x30(%rsp)
1124 movzb
($inp,%r10),%eax
1125 movzb
(%rsp,%r10),%ecx
1128 mov
%al,-1($out,%r10)
1134 $code.=<<___
if ($win64);
1135 lea
0x140+0x30(%rsp),%r11
1136 movaps
-0x30(%r11),%xmm6
1137 movaps
-0x20(%r11),%xmm7
1138 movaps
-0x10(%r11),%xmm8
1139 movaps
0x00(%r11),%xmm9
1140 movaps
0x10(%r11),%xmm10
1141 movaps
0x20(%r11),%xmm11
1142 movaps
0x30(%r11),%xmm12
1143 movaps
0x40(%r11),%xmm13
1144 movaps
0x50(%r11),%xmm14
1145 movaps
0x60(%r11),%xmm15
1148 add \
$0x148+$xframe,%rsp
.size	ChaCha20_4x,.-ChaCha20_4x
########################################################################
# XOP code path that handles all lengths.
#
# There is some "anomaly" observed depending on instructions' size or
# alignment. If you look closely at below code you'll notice that
# sometimes argument order varies. The order affects instruction
# encoding by making it larger, and such fiddling gives 5% performance
# improvement. This is on FX-4100...

my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
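# XOP's vprotd rotates each dword directly, so the 16/12/8/7-bit
# rotations below need neither the pshufb masks nor the shift-and-or
# pairs of the SSSE3 path.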
sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);
1176 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1177 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1178 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1179 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1180 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1181 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1182 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1183 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1184 "&vprotd (@x[$d0],@x[$d0],16)",
1185 "&vprotd (@x[$d1],@x[$d1],16)",
1186 "&vprotd (@x[$d2],@x[$d2],16)",
1187 "&vprotd (@x[$d3],@x[$d3],16)",
1189 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1190 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1191 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1192 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1193 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1194 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1195 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1196 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1197 "&vprotd (@x[$b0],@x[$b0],12)",
1198 "&vprotd (@x[$b1],@x[$b1],12)",
1199 "&vprotd (@x[$b2],@x[$b2],12)",
1200 "&vprotd (@x[$b3],@x[$b3],12)",
1202 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1203 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1204 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1205 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1206 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1207 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1208 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1209 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1210 "&vprotd (@x[$d0],@x[$d0],8)",
1211 "&vprotd (@x[$d1],@x[$d1],8)",
1212 "&vprotd (@x[$d2],@x[$d2],8)",
1213 "&vprotd (@x[$d3],@x[$d3],8)",
1215 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1216 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1217 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1218 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1219 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1220 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1221 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1222 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1223 "&vprotd (@x[$b0],@x[$b0],7)",
1224 "&vprotd (@x[$b1],@x[$b1],7)",
1225 "&vprotd (@x[$b2],@x[$b2],7)",
1226 "&vprotd (@x[$b3],@x[$b3],7)"
my $xframe = $win64 ? 0xa0 : 0;

.type	ChaCha20_4xop,\@function,5
	lea	-0x78(%rsp),%r11
	sub	\$0x148+$xframe,%rsp
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# +0x40		constant copy of key[0-2] smashed by lanes
	# +0x100	SIMD counters (with nonce smashed by lanes)
1248 $code.=<<___
if ($win64);
1249 movaps
%xmm6,-0x30(%r11)
1250 movaps
%xmm7,-0x20(%r11)
1251 movaps
%xmm8,-0x10(%r11)
1252 movaps
%xmm9,0x00(%r11)
1253 movaps
%xmm10,0x10(%r11)
1254 movaps
%xmm11,0x20(%r11)
1255 movaps
%xmm12,0x30(%r11)
1256 movaps
%xmm13,0x40(%r11)
1257 movaps
%xmm14,0x50(%r11)
1258 movaps
%xmm15,0x60(%r11)
1263 vmovdqa
.Lsigma
(%rip),$xa3 # key[0]
1264 vmovdqu
($key),$xb3 # key[1]
1265 vmovdqu
16($key),$xt3 # key[2]
1266 vmovdqu
($counter),$xd3 # key[3]
1267 lea
0x100(%rsp),%rcx # size optimization
1269 vpshufd \
$0x00,$xa3,$xa0 # smash key by lanes...
1270 vpshufd \
$0x55,$xa3,$xa1
1271 vmovdqa
$xa0,0x40(%rsp) # ... and offload
1272 vpshufd \
$0xaa,$xa3,$xa2
1273 vmovdqa
$xa1,0x50(%rsp)
1274 vpshufd \
$0xff,$xa3,$xa3
1275 vmovdqa
$xa2,0x60(%rsp)
1276 vmovdqa
$xa3,0x70(%rsp)
1278 vpshufd \
$0x00,$xb3,$xb0
1279 vpshufd \
$0x55,$xb3,$xb1
1280 vmovdqa
$xb0,0x80-0x100(%rcx)
1281 vpshufd \
$0xaa,$xb3,$xb2
1282 vmovdqa
$xb1,0x90-0x100(%rcx)
1283 vpshufd \
$0xff,$xb3,$xb3
1284 vmovdqa
$xb2,0xa0-0x100(%rcx)
1285 vmovdqa
$xb3,0xb0-0x100(%rcx)
1287 vpshufd \
$0x00,$xt3,$xt0 # "$xc0"
1288 vpshufd \
$0x55,$xt3,$xt1 # "$xc1"
1289 vmovdqa
$xt0,0xc0-0x100(%rcx)
1290 vpshufd \
$0xaa,$xt3,$xt2 # "$xc2"
1291 vmovdqa
$xt1,0xd0-0x100(%rcx)
1292 vpshufd \
$0xff,$xt3,$xt3 # "$xc3"
1293 vmovdqa
$xt2,0xe0-0x100(%rcx)
1294 vmovdqa
$xt3,0xf0-0x100(%rcx)
1296 vpshufd \
$0x00,$xd3,$xd0
1297 vpshufd \
$0x55,$xd3,$xd1
1298 vpaddd
.Linc
(%rip),$xd0,$xd0 # don't save counters yet
1299 vpshufd \
$0xaa,$xd3,$xd2
1300 vmovdqa
$xd1,0x110-0x100(%rcx)
1301 vpshufd \
$0xff,$xd3,$xd3
1302 vmovdqa
$xd2,0x120-0x100(%rcx)
1303 vmovdqa
$xd3,0x130-0x100(%rcx)
1309 vmovdqa
0x40(%rsp),$xa0 # re-load smashed key
1310 vmovdqa
0x50(%rsp),$xa1
1311 vmovdqa
0x60(%rsp),$xa2
1312 vmovdqa
0x70(%rsp),$xa3
1313 vmovdqa
0x80-0x100(%rcx),$xb0
1314 vmovdqa
0x90-0x100(%rcx),$xb1
1315 vmovdqa
0xa0-0x100(%rcx),$xb2
1316 vmovdqa
0xb0-0x100(%rcx),$xb3
1317 vmovdqa
0xc0-0x100(%rcx),$xt0 # "$xc0"
1318 vmovdqa
0xd0-0x100(%rcx),$xt1 # "$xc1"
1319 vmovdqa
0xe0-0x100(%rcx),$xt2 # "$xc2"
1320 vmovdqa
0xf0-0x100(%rcx),$xt3 # "$xc3"
1321 vmovdqa
0x100-0x100(%rcx),$xd0
1322 vmovdqa
0x110-0x100(%rcx),$xd1
1323 vmovdqa
0x120-0x100(%rcx),$xd2
1324 vmovdqa
0x130-0x100(%rcx),$xd3
1325 vpaddd
.Lfour
(%rip),$xd0,$xd0 # next SIMD counters
1329 vmovdqa
$xd0,0x100-0x100(%rcx) # save SIMD counters
1335 foreach (&XOP_lane_ROUND
(0, 4, 8,12)) { eval; }
1336 foreach (&XOP_lane_ROUND
(0, 5,10,15)) { eval; }
1341 vpaddd
0x40(%rsp),$xa0,$xa0 # accumulate key material
1342 vpaddd
0x50(%rsp),$xa1,$xa1
1343 vpaddd
0x60(%rsp),$xa2,$xa2
1344 vpaddd
0x70(%rsp),$xa3,$xa3
1346 vmovdqa
$xt2,0x20(%rsp) # offload $xc2,3
1347 vmovdqa
$xt3,0x30(%rsp)
1349 vpunpckldq
$xa1,$xa0,$xt2 # "de-interlace" data
1350 vpunpckldq
$xa3,$xa2,$xt3
1351 vpunpckhdq
$xa1,$xa0,$xa0
1352 vpunpckhdq
$xa3,$xa2,$xa2
1353 vpunpcklqdq
$xt3,$xt2,$xa1 # "a0"
1354 vpunpckhqdq
$xt3,$xt2,$xt2 # "a1"
1355 vpunpcklqdq
$xa2,$xa0,$xa3 # "a2"
1356 vpunpckhqdq
$xa2,$xa0,$xa0 # "a3"
1358 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1360 vpaddd
0x80-0x100(%rcx),$xb0,$xb0
1361 vpaddd
0x90-0x100(%rcx),$xb1,$xb1
1362 vpaddd
0xa0-0x100(%rcx),$xb2,$xb2
1363 vpaddd
0xb0-0x100(%rcx),$xb3,$xb3
1365 vmovdqa
$xa0,0x00(%rsp) # offload $xa0,1
1366 vmovdqa
$xa1,0x10(%rsp)
1367 vmovdqa
0x20(%rsp),$xa0 # "xc2"
1368 vmovdqa
0x30(%rsp),$xa1 # "xc3"
1370 vpunpckldq
$xb1,$xb0,$xt2
1371 vpunpckldq
$xb3,$xb2,$xt3
1372 vpunpckhdq
$xb1,$xb0,$xb0
1373 vpunpckhdq
$xb3,$xb2,$xb2
1374 vpunpcklqdq
$xt3,$xt2,$xb1 # "b0"
1375 vpunpckhqdq
$xt3,$xt2,$xt2 # "b1"
1376 vpunpcklqdq
$xb2,$xb0,$xb3 # "b2"
1377 vpunpckhqdq
$xb2,$xb0,$xb0 # "b3"
1379 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1380 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1382 vpaddd
0xc0-0x100(%rcx),$xc0,$xc0
1383 vpaddd
0xd0-0x100(%rcx),$xc1,$xc1
1384 vpaddd
0xe0-0x100(%rcx),$xc2,$xc2
1385 vpaddd
0xf0-0x100(%rcx),$xc3,$xc3
1387 vpunpckldq
$xc1,$xc0,$xt2
1388 vpunpckldq
$xc3,$xc2,$xt3
1389 vpunpckhdq
$xc1,$xc0,$xc0
1390 vpunpckhdq
$xc3,$xc2,$xc2
1391 vpunpcklqdq
$xt3,$xt2,$xc1 # "c0"
1392 vpunpckhqdq
$xt3,$xt2,$xt2 # "c1"
1393 vpunpcklqdq
$xc2,$xc0,$xc3 # "c2"
1394 vpunpckhqdq
$xc2,$xc0,$xc0 # "c3"
1396 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1398 vpaddd
0x100-0x100(%rcx),$xd0,$xd0
1399 vpaddd
0x110-0x100(%rcx),$xd1,$xd1
1400 vpaddd
0x120-0x100(%rcx),$xd2,$xd2
1401 vpaddd
0x130-0x100(%rcx),$xd3,$xd3
1403 vpunpckldq
$xd1,$xd0,$xt2
1404 vpunpckldq
$xd3,$xd2,$xt3
1405 vpunpckhdq
$xd1,$xd0,$xd0
1406 vpunpckhdq
$xd3,$xd2,$xd2
1407 vpunpcklqdq
$xt3,$xt2,$xd1 # "d0"
1408 vpunpckhqdq
$xt3,$xt2,$xt2 # "d1"
1409 vpunpcklqdq
$xd2,$xd0,$xd3 # "d2"
1410 vpunpckhqdq
$xd2,$xd0,$xd0 # "d3"
1412 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1413 ($xa0,$xa1)=($xt2,$xt3);
1415 vmovdqa
0x00(%rsp),$xa0 # restore $xa0,1
1416 vmovdqa
0x10(%rsp),$xa1
1421 vpxor
0x00($inp),$xa0,$xa0 # xor with input
1422 vpxor
0x10($inp),$xb0,$xb0
1423 vpxor
0x20($inp),$xc0,$xc0
1424 vpxor
0x30($inp),$xd0,$xd0
1425 vpxor
0x40($inp),$xa1,$xa1
1426 vpxor
0x50($inp),$xb1,$xb1
1427 vpxor
0x60($inp),$xc1,$xc1
1428 vpxor
0x70($inp),$xd1,$xd1
1429 lea
0x80($inp),$inp # size optimization
1430 vpxor
0x00($inp),$xa2,$xa2
1431 vpxor
0x10($inp),$xb2,$xb2
1432 vpxor
0x20($inp),$xc2,$xc2
1433 vpxor
0x30($inp),$xd2,$xd2
1434 vpxor
0x40($inp),$xa3,$xa3
1435 vpxor
0x50($inp),$xb3,$xb3
1436 vpxor
0x60($inp),$xc3,$xc3
1437 vpxor
0x70($inp),$xd3,$xd3
1438 lea
0x80($inp),$inp # inp+=64*4
1440 vmovdqu
$xa0,0x00($out)
1441 vmovdqu
$xb0,0x10($out)
1442 vmovdqu
$xc0,0x20($out)
1443 vmovdqu
$xd0,0x30($out)
1444 vmovdqu
$xa1,0x40($out)
1445 vmovdqu
$xb1,0x50($out)
1446 vmovdqu
$xc1,0x60($out)
1447 vmovdqu
$xd1,0x70($out)
1448 lea
0x80($out),$out # size optimization
1449 vmovdqu
$xa2,0x00($out)
1450 vmovdqu
$xb2,0x10($out)
1451 vmovdqu
$xc2,0x20($out)
1452 vmovdqu
$xd2,0x30($out)
1453 vmovdqu
$xa3,0x40($out)
1454 vmovdqu
$xb3,0x50($out)
1455 vmovdqu
$xc3,0x60($out)
1456 vmovdqu
$xd3,0x70($out)
1457 lea
0x80($out),$out # out+=64*4
1467 jae
.L192_or_more4xop
1469 jae
.L128_or_more4xop
1471 jae
.L64_or_more4xop
1474 vmovdqa
$xa0,0x00(%rsp)
1475 vmovdqa
$xb0,0x10(%rsp)
1476 vmovdqa
$xc0,0x20(%rsp)
1477 vmovdqa
$xd0,0x30(%rsp)
1482 vpxor
0x00($inp),$xa0,$xa0 # xor with input
1483 vpxor
0x10($inp),$xb0,$xb0
1484 vpxor
0x20($inp),$xc0,$xc0
1485 vpxor
0x30($inp),$xd0,$xd0
1486 vmovdqu
$xa0,0x00($out)
1487 vmovdqu
$xb0,0x10($out)
1488 vmovdqu
$xc0,0x20($out)
1489 vmovdqu
$xd0,0x30($out)
1492 lea
0x40($inp),$inp # inp+=64*1
1493 vmovdqa
$xa1,0x00(%rsp)
1495 vmovdqa
$xb1,0x10(%rsp)
1496 lea
0x40($out),$out # out+=64*1
1497 vmovdqa
$xc1,0x20(%rsp)
1498 sub \
$64,$len # len-=64*1
1499 vmovdqa
$xd1,0x30(%rsp)
1504 vpxor
0x00($inp),$xa0,$xa0 # xor with input
1505 vpxor
0x10($inp),$xb0,$xb0
1506 vpxor
0x20($inp),$xc0,$xc0
1507 vpxor
0x30($inp),$xd0,$xd0
1508 vpxor
0x40($inp),$xa1,$xa1
1509 vpxor
0x50($inp),$xb1,$xb1
1510 vpxor
0x60($inp),$xc1,$xc1
1511 vpxor
0x70($inp),$xd1,$xd1
1513 vmovdqu
$xa0,0x00($out)
1514 vmovdqu
$xb0,0x10($out)
1515 vmovdqu
$xc0,0x20($out)
1516 vmovdqu
$xd0,0x30($out)
1517 vmovdqu
$xa1,0x40($out)
1518 vmovdqu
$xb1,0x50($out)
1519 vmovdqu
$xc1,0x60($out)
1520 vmovdqu
$xd1,0x70($out)
1523 lea
0x80($inp),$inp # inp+=64*2
1524 vmovdqa
$xa2,0x00(%rsp)
1526 vmovdqa
$xb2,0x10(%rsp)
1527 lea
0x80($out),$out # out+=64*2
1528 vmovdqa
$xc2,0x20(%rsp)
1529 sub \
$128,$len # len-=64*2
1530 vmovdqa
$xd2,0x30(%rsp)
1535 vpxor
0x00($inp),$xa0,$xa0 # xor with input
1536 vpxor
0x10($inp),$xb0,$xb0
1537 vpxor
0x20($inp),$xc0,$xc0
1538 vpxor
0x30($inp),$xd0,$xd0
1539 vpxor
0x40($inp),$xa1,$xa1
1540 vpxor
0x50($inp),$xb1,$xb1
1541 vpxor
0x60($inp),$xc1,$xc1
1542 vpxor
0x70($inp),$xd1,$xd1
1543 lea
0x80($inp),$inp # size optimization
1544 vpxor
0x00($inp),$xa2,$xa2
1545 vpxor
0x10($inp),$xb2,$xb2
1546 vpxor
0x20($inp),$xc2,$xc2
1547 vpxor
0x30($inp),$xd2,$xd2
1549 vmovdqu
$xa0,0x00($out)
1550 vmovdqu
$xb0,0x10($out)
1551 vmovdqu
$xc0,0x20($out)
1552 vmovdqu
$xd0,0x30($out)
1553 vmovdqu
$xa1,0x40($out)
1554 vmovdqu
$xb1,0x50($out)
1555 vmovdqu
$xc1,0x60($out)
1556 vmovdqu
$xd1,0x70($out)
1557 lea
0x80($out),$out # size optimization
1558 vmovdqu
$xa2,0x00($out)
1559 vmovdqu
$xb2,0x10($out)
1560 vmovdqu
$xc2,0x20($out)
1561 vmovdqu
$xd2,0x30($out)
1564 lea
0x40($inp),$inp # inp+=64*3
1565 vmovdqa
$xa3,0x00(%rsp)
1567 vmovdqa
$xb3,0x10(%rsp)
1568 lea
0x40($out),$out # out+=64*3
1569 vmovdqa
$xc3,0x20(%rsp)
1570 sub \
$192,$len # len-=64*3
1571 vmovdqa
$xd3,0x30(%rsp)
1574 movzb
($inp,%r10),%eax
1575 movzb
(%rsp,%r10),%ecx
1578 mov
%al,-1($out,%r10)
1585 $code.=<<___
if ($win64);
1586 lea
0x140+0x30(%rsp),%r11
1587 movaps
-0x30(%r11),%xmm6
1588 movaps
-0x20(%r11),%xmm7
1589 movaps
-0x10(%r11),%xmm8
1590 movaps
0x00(%r11),%xmm9
1591 movaps
0x10(%r11),%xmm10
1592 movaps
0x20(%r11),%xmm11
1593 movaps
0x30(%r11),%xmm12
1594 movaps
0x40(%r11),%xmm13
1595 movaps
0x50(%r11),%xmm14
1596 movaps
0x60(%r11),%xmm15
1599 add \
$0x148+$xframe,%rsp
.size	ChaCha20_4xop,.-ChaCha20_4xop
########################################################################
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	0   4   8  12 < even round
	#	0   5  10  15 < odd round
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.
1644 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1645 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1646 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1647 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1648 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1649 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1651 "&vpaddd ($xc,$xc,@x[$d0])",
1652 "&vpxor (@x[$b0],$xc,@x[$b0])",
1653 "&vpslld ($t0,@x[$b0],12)",
1654 "&vpsrld (@x[$b0],@x[$b0],20)",
1655 "&vpor (@x[$b0],$t0,@x[$b0])",
1656 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1657 "&vpaddd ($xc_,$xc_,@x[$d1])",
1658 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1659 "&vpslld ($t1,@x[$b1],12)",
1660 "&vpsrld (@x[$b1],@x[$b1],20)",
1661 "&vpor (@x[$b1],$t1,@x[$b1])",
1663 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1664 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1665 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1666 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1667 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1668 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1670 "&vpaddd ($xc,$xc,@x[$d0])",
1671 "&vpxor (@x[$b0],$xc,@x[$b0])",
1672 "&vpslld ($t1,@x[$b0],7)",
1673 "&vpsrld (@x[$b0],@x[$b0],25)",
1674 "&vpor (@x[$b0],$t1,@x[$b0])",
1675 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1676 "&vpaddd ($xc_,$xc_,@x[$d1])",
1677 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1678 "&vpslld ($t0,@x[$b1],7)",
1679 "&vpsrld (@x[$b1],@x[$b1],25)",
1680 "&vpor (@x[$b1],$t0,@x[$b1])",
1682 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1683 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1684 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1685 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1687 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1688 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1689 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1690 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1691 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1692 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1694 "&vpaddd ($xc,$xc,@x[$d2])",
1695 "&vpxor (@x[$b2],$xc,@x[$b2])",
1696 "&vpslld ($t0,@x[$b2],12)",
1697 "&vpsrld (@x[$b2],@x[$b2],20)",
1698 "&vpor (@x[$b2],$t0,@x[$b2])",
1699 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1700 "&vpaddd ($xc_,$xc_,@x[$d3])",
1701 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1702 "&vpslld ($t1,@x[$b3],12)",
1703 "&vpsrld (@x[$b3],@x[$b3],20)",
1704 "&vpor (@x[$b3],$t1,@x[$b3])",
1706 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1707 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1708 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1709 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1710 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1711 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1713 "&vpaddd ($xc,$xc,@x[$d2])",
1714 "&vpxor (@x[$b2],$xc,@x[$b2])",
1715 "&vpslld ($t1,@x[$b2],7)",
1716 "&vpsrld (@x[$b2],@x[$b2],25)",
1717 "&vpor (@x[$b2],$t1,@x[$b2])",
1718 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1719 "&vpaddd ($xc_,$xc_,@x[$d3])",
1720 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1721 "&vpslld ($t0,@x[$b3],7)",
1722 "&vpsrld (@x[$b3],@x[$b3],25)",
1723 "&vpor (@x[$b3],$t0,@x[$b3])"
my $xframe = $win64 ? 0xb0 : 8;

.type	ChaCha20_8x,\@function,5
$code.=<<___	if ($avx>2);
	test	\$`1<<16`,%r10d		# check for AVX512F
1741 sub \
$0x280+$xframe,%rsp
1744 $code.=<<___
if ($win64);
1745 lea
0x290+0x30(%rsp),%r11
1746 movaps
%xmm6,-0x30(%r11)
1747 movaps
%xmm7,-0x20(%r11)
1748 movaps
%xmm8,-0x10(%r11)
1749 movaps
%xmm9,0x00(%r11)
1750 movaps
%xmm10,0x10(%r11)
1751 movaps
%xmm11,0x20(%r11)
1752 movaps
%xmm12,0x30(%r11)
1753 movaps
%xmm13,0x40(%r11)
1754 movaps
%xmm14,0x50(%r11)
1755 movaps
%xmm15,0x60(%r11)
1759 mov
%r10,0x280(%rsp)
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# +0x80		constant copy of key[0-2] smashed by lanes
	# +0x200	SIMD counters (with nonce smashed by lanes)
1770 vbroadcasti128
.Lsigma
(%rip),$xa3 # key[0]
1771 vbroadcasti128
($key),$xb3 # key[1]
1772 vbroadcasti128
16($key),$xt3 # key[2]
1773 vbroadcasti128
($counter),$xd3 # key[3]
1774 lea
0x100(%rsp),%rcx # size optimization
1775 lea
0x200(%rsp),%rax # size optimization
1776 lea
.Lrot16
(%rip),%r10
1777 lea
.Lrot24
(%rip),%r11
1779 vpshufd \
$0x00,$xa3,$xa0 # smash key by lanes...
1780 vpshufd \
$0x55,$xa3,$xa1
1781 vmovdqa
$xa0,0x80-0x100(%rcx) # ... and offload
1782 vpshufd \
$0xaa,$xa3,$xa2
1783 vmovdqa
$xa1,0xa0-0x100(%rcx)
1784 vpshufd \
$0xff,$xa3,$xa3
1785 vmovdqa
$xa2,0xc0-0x100(%rcx)
1786 vmovdqa
$xa3,0xe0-0x100(%rcx)
1788 vpshufd \
$0x00,$xb3,$xb0
1789 vpshufd \
$0x55,$xb3,$xb1
1790 vmovdqa
$xb0,0x100-0x100(%rcx)
1791 vpshufd \
$0xaa,$xb3,$xb2
1792 vmovdqa
$xb1,0x120-0x100(%rcx)
1793 vpshufd \
$0xff,$xb3,$xb3
1794 vmovdqa
$xb2,0x140-0x100(%rcx)
1795 vmovdqa
$xb3,0x160-0x100(%rcx)
1797 vpshufd \
$0x00,$xt3,$xt0 # "xc0"
1798 vpshufd \
$0x55,$xt3,$xt1 # "xc1"
1799 vmovdqa
$xt0,0x180-0x200(%rax)
1800 vpshufd \
$0xaa,$xt3,$xt2 # "xc2"
1801 vmovdqa
$xt1,0x1a0-0x200(%rax)
1802 vpshufd \
$0xff,$xt3,$xt3 # "xc3"
1803 vmovdqa
$xt2,0x1c0-0x200(%rax)
1804 vmovdqa
$xt3,0x1e0-0x200(%rax)
1806 vpshufd \
$0x00,$xd3,$xd0
1807 vpshufd \
$0x55,$xd3,$xd1
1808 vpaddd
.Lincy
(%rip),$xd0,$xd0 # don't save counters yet
1809 vpshufd \
$0xaa,$xd3,$xd2
1810 vmovdqa
$xd1,0x220-0x200(%rax)
1811 vpshufd \
$0xff,$xd3,$xd3
1812 vmovdqa
$xd2,0x240-0x200(%rax)
1813 vmovdqa
$xd3,0x260-0x200(%rax)
1819 vmovdqa
0x80-0x100(%rcx),$xa0 # re-load smashed key
1820 vmovdqa
0xa0-0x100(%rcx),$xa1
1821 vmovdqa
0xc0-0x100(%rcx),$xa2
1822 vmovdqa
0xe0-0x100(%rcx),$xa3
1823 vmovdqa
0x100-0x100(%rcx),$xb0
1824 vmovdqa
0x120-0x100(%rcx),$xb1
1825 vmovdqa
0x140-0x100(%rcx),$xb2
1826 vmovdqa
0x160-0x100(%rcx),$xb3
1827 vmovdqa
0x180-0x200(%rax),$xt0 # "xc0"
1828 vmovdqa
0x1a0-0x200(%rax),$xt1 # "xc1"
1829 vmovdqa
0x1c0-0x200(%rax),$xt2 # "xc2"
1830 vmovdqa
0x1e0-0x200(%rax),$xt3 # "xc3"
1831 vmovdqa
0x200-0x200(%rax),$xd0
1832 vmovdqa
0x220-0x200(%rax),$xd1
1833 vmovdqa
0x240-0x200(%rax),$xd2
1834 vmovdqa
0x260-0x200(%rax),$xd3
1835 vpaddd
.Leight
(%rip),$xd0,$xd0 # next SIMD counters
1838 vmovdqa
$xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1839 vmovdqa
$xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1840 vbroadcasti128
(%r10),$xt3
1841 vmovdqa
$xd0,0x200-0x200(%rax) # save SIMD counters
1848 foreach (&AVX2_lane_ROUND
(0, 4, 8,12)) { eval; }
1849 foreach (&AVX2_lane_ROUND
(0, 5,10,15)) { eval; }
1854 lea
0x200(%rsp),%rax # size optimization
1855 vpaddd
0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1856 vpaddd
0xa0-0x100(%rcx),$xa1,$xa1
1857 vpaddd
0xc0-0x100(%rcx),$xa2,$xa2
1858 vpaddd
0xe0-0x100(%rcx),$xa3,$xa3
1860 vpunpckldq
$xa1,$xa0,$xt2 # "de-interlace" data
1861 vpunpckldq
$xa3,$xa2,$xt3
1862 vpunpckhdq
$xa1,$xa0,$xa0
1863 vpunpckhdq
$xa3,$xa2,$xa2
1864 vpunpcklqdq
$xt3,$xt2,$xa1 # "a0"
1865 vpunpckhqdq
$xt3,$xt2,$xt2 # "a1"
1866 vpunpcklqdq
$xa2,$xa0,$xa3 # "a2"
1867 vpunpckhqdq
$xa2,$xa0,$xa0 # "a3"
1869 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1871 vpaddd
0x100-0x100(%rcx),$xb0,$xb0
1872 vpaddd
0x120-0x100(%rcx),$xb1,$xb1
1873 vpaddd
0x140-0x100(%rcx),$xb2,$xb2
1874 vpaddd
0x160-0x100(%rcx),$xb3,$xb3
1876 vpunpckldq
$xb1,$xb0,$xt2
1877 vpunpckldq
$xb3,$xb2,$xt3
1878 vpunpckhdq
$xb1,$xb0,$xb0
1879 vpunpckhdq
$xb3,$xb2,$xb2
1880 vpunpcklqdq
$xt3,$xt2,$xb1 # "b0"
1881 vpunpckhqdq
$xt3,$xt2,$xt2 # "b1"
1882 vpunpcklqdq
$xb2,$xb0,$xb3 # "b2"
1883 vpunpckhqdq
$xb2,$xb0,$xb0 # "b3"
1885 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1887 vperm2i128 \
$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1888 vperm2i128 \
$0x31,$xb0,$xa0,$xb0
1889 vperm2i128 \
$0x20,$xb1,$xa1,$xa0
1890 vperm2i128 \
$0x31,$xb1,$xa1,$xb1
1891 vperm2i128 \
$0x20,$xb2,$xa2,$xa1
1892 vperm2i128 \
$0x31,$xb2,$xa2,$xb2
1893 vperm2i128 \
$0x20,$xb3,$xa3,$xa2
1894 vperm2i128 \
$0x31,$xb3,$xa3,$xb3
1896 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
1897 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1899 vmovdqa
$xa0,0x00(%rsp) # offload $xaN
1900 vmovdqa
$xa1,0x20(%rsp)
1901 vmovdqa
0x40(%rsp),$xc2 # $xa0
1902 vmovdqa
0x60(%rsp),$xc3 # $xa1
1904 vpaddd
0x180-0x200(%rax),$xc0,$xc0
1905 vpaddd
0x1a0-0x200(%rax),$xc1,$xc1
1906 vpaddd
0x1c0-0x200(%rax),$xc2,$xc2
1907 vpaddd
0x1e0-0x200(%rax),$xc3,$xc3
1909 vpunpckldq
$xc1,$xc0,$xt2
1910 vpunpckldq
$xc3,$xc2,$xt3
1911 vpunpckhdq
$xc1,$xc0,$xc0
1912 vpunpckhdq
$xc3,$xc2,$xc2
1913 vpunpcklqdq
$xt3,$xt2,$xc1 # "c0"
1914 vpunpckhqdq
$xt3,$xt2,$xt2 # "c1"
1915 vpunpcklqdq
$xc2,$xc0,$xc3 # "c2"
1916 vpunpckhqdq
$xc2,$xc0,$xc0 # "c3"
1918 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1920 vpaddd
0x200-0x200(%rax),$xd0,$xd0
1921 vpaddd
0x220-0x200(%rax),$xd1,$xd1
1922 vpaddd
0x240-0x200(%rax),$xd2,$xd2
1923 vpaddd
0x260-0x200(%rax),$xd3,$xd3
1925 vpunpckldq
$xd1,$xd0,$xt2
1926 vpunpckldq
$xd3,$xd2,$xt3
1927 vpunpckhdq
$xd1,$xd0,$xd0
1928 vpunpckhdq
$xd3,$xd2,$xd2
1929 vpunpcklqdq
$xt3,$xt2,$xd1 # "d0"
1930 vpunpckhqdq
$xt3,$xt2,$xt2 # "d1"
1931 vpunpcklqdq
$xd2,$xd0,$xd3 # "d2"
1932 vpunpckhqdq
$xd2,$xd0,$xd0 # "d3"
1934 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1936 vperm2i128 \
$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1937 vperm2i128 \
$0x31,$xd0,$xc0,$xd0
1938 vperm2i128 \
$0x20,$xd1,$xc1,$xc0
1939 vperm2i128 \
$0x31,$xd1,$xc1,$xd1
1940 vperm2i128 \
$0x20,$xd2,$xc2,$xc1
1941 vperm2i128 \
$0x31,$xd2,$xc2,$xd2
1942 vperm2i128 \
$0x20,$xd3,$xc3,$xc2
1943 vperm2i128 \
$0x31,$xd3,$xc3,$xd3
1945 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1946 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1947 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1948 ($xa0,$xa1)=($xt2,$xt3);
1950 vmovdqa
0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1951 vmovdqa
0x20(%rsp),$xa1
1956 vpxor
0x00($inp),$xa0,$xa0 # xor with input
1957 vpxor
0x20($inp),$xb0,$xb0
1958 vpxor
0x40($inp),$xc0,$xc0
1959 vpxor
0x60($inp),$xd0,$xd0
1960 lea
0x80($inp),$inp # size optimization
1961 vmovdqu
$xa0,0x00($out)
1962 vmovdqu
$xb0,0x20($out)
1963 vmovdqu
$xc0,0x40($out)
1964 vmovdqu
$xd0,0x60($out)
1965 lea
0x80($out),$out # size optimization
1967 vpxor
0x00($inp),$xa1,$xa1
1968 vpxor
0x20($inp),$xb1,$xb1
1969 vpxor
0x40($inp),$xc1,$xc1
1970 vpxor
0x60($inp),$xd1,$xd1
1971 lea
0x80($inp),$inp # size optimization
1972 vmovdqu
$xa1,0x00($out)
1973 vmovdqu
$xb1,0x20($out)
1974 vmovdqu
$xc1,0x40($out)
1975 vmovdqu
$xd1,0x60($out)
1976 lea
0x80($out),$out # size optimization
1978 vpxor
0x00($inp),$xa2,$xa2
1979 vpxor
0x20($inp),$xb2,$xb2
1980 vpxor
0x40($inp),$xc2,$xc2
1981 vpxor
0x60($inp),$xd2,$xd2
1982 lea
0x80($inp),$inp # size optimization
1983 vmovdqu
$xa2,0x00($out)
1984 vmovdqu
$xb2,0x20($out)
1985 vmovdqu
$xc2,0x40($out)
1986 vmovdqu
$xd2,0x60($out)
1987 lea
0x80($out),$out # size optimization
1989 vpxor
0x00($inp),$xa3,$xa3
1990 vpxor
0x20($inp),$xb3,$xb3
1991 vpxor
0x40($inp),$xc3,$xc3
1992 vpxor
0x60($inp),$xd3,$xd3
1993 lea
0x80($inp),$inp # size optimization
1994 vmovdqu
$xa3,0x00($out)
1995 vmovdqu
$xb3,0x20($out)
1996 vmovdqu
$xc3,0x40($out)
1997 vmovdqu
$xd3,0x60($out)
1998 lea
0x80($out),$out # size optimization
2022 vmovdqa
$xa0,0x00(%rsp)
2023 vmovdqa
$xb0,0x20(%rsp)
2028 vpxor
0x00($inp),$xa0,$xa0 # xor with input
2029 vpxor
0x20($inp),$xb0,$xb0
2030 vmovdqu
$xa0,0x00($out)
2031 vmovdqu
$xb0,0x20($out)
2034 lea
0x40($inp),$inp # inp+=64*1
2036 vmovdqa
$xc0,0x00(%rsp)
2037 lea
0x40($out),$out # out+=64*1
2038 sub \
$64,$len # len-=64*1
2039 vmovdqa
$xd0,0x20(%rsp)
2044 vpxor
0x00($inp),$xa0,$xa0 # xor with input
2045 vpxor
0x20($inp),$xb0,$xb0
2046 vpxor
0x40($inp),$xc0,$xc0
2047 vpxor
0x60($inp),$xd0,$xd0
2048 vmovdqu
$xa0,0x00($out)
2049 vmovdqu
$xb0,0x20($out)
2050 vmovdqu
$xc0,0x40($out)
2051 vmovdqu
$xd0,0x60($out)
2054 lea
0x80($inp),$inp # inp+=64*2
2056 vmovdqa
$xa1,0x00(%rsp)
2057 lea
0x80($out),$out # out+=64*2
2058 sub \
$128,$len # len-=64*2
2059 vmovdqa
$xb1,0x20(%rsp)
2064 vpxor
0x00($inp),$xa0,$xa0 # xor with input
2065 vpxor
0x20($inp),$xb0,$xb0
2066 vpxor
0x40($inp),$xc0,$xc0
2067 vpxor
0x60($inp),$xd0,$xd0
2068 vpxor
0x80($inp),$xa1,$xa1
2069 vpxor
0xa0($inp),$xb1,$xb1
2070 vmovdqu
$xa0,0x00($out)
2071 vmovdqu
$xb0,0x20($out)
2072 vmovdqu
$xc0,0x40($out)
2073 vmovdqu
$xd0,0x60($out)
2074 vmovdqu
$xa1,0x80($out)
2075 vmovdqu
$xb1,0xa0($out)
2078 lea
0xc0($inp),$inp # inp+=64*3
2080 vmovdqa
$xc1,0x00(%rsp)
2081 lea
0xc0($out),$out # out+=64*3
2082 sub \
$192,$len # len-=64*3
2083 vmovdqa
$xd1,0x20(%rsp)
2088 vpxor
0x00($inp),$xa0,$xa0 # xor with input
2089 vpxor
0x20($inp),$xb0,$xb0
2090 vpxor
0x40($inp),$xc0,$xc0
2091 vpxor
0x60($inp),$xd0,$xd0
2092 vpxor
0x80($inp),$xa1,$xa1
2093 vpxor
0xa0($inp),$xb1,$xb1
2094 vpxor
0xc0($inp),$xc1,$xc1
2095 vpxor
0xe0($inp),$xd1,$xd1
2096 vmovdqu
$xa0,0x00($out)
2097 vmovdqu
$xb0,0x20($out)
2098 vmovdqu
$xc0,0x40($out)
2099 vmovdqu
$xd0,0x60($out)
2100 vmovdqu
$xa1,0x80($out)
2101 vmovdqu
$xb1,0xa0($out)
2102 vmovdqu
$xc1,0xc0($out)
2103 vmovdqu
$xd1,0xe0($out)
2106 lea
0x100($inp),$inp # inp+=64*4
2108 vmovdqa
$xa2,0x00(%rsp)
2109 lea
0x100($out),$out # out+=64*4
2110 sub \
$256,$len # len-=64*4
2111 vmovdqa
$xb2,0x20(%rsp)
2116 vpxor
0x00($inp),$xa0,$xa0 # xor with input
2117 vpxor
0x20($inp),$xb0,$xb0
2118 vpxor
0x40($inp),$xc0,$xc0
2119 vpxor
0x60($inp),$xd0,$xd0
2120 vpxor
0x80($inp),$xa1,$xa1
2121 vpxor
0xa0($inp),$xb1,$xb1
2122 vpxor
0xc0($inp),$xc1,$xc1
2123 vpxor
0xe0($inp),$xd1,$xd1
2124 vpxor
0x100($inp),$xa2,$xa2
2125 vpxor
0x120($inp),$xb2,$xb2
2126 vmovdqu
$xa0,0x00($out)
2127 vmovdqu
$xb0,0x20($out)
2128 vmovdqu
$xc0,0x40($out)
2129 vmovdqu
$xd0,0x60($out)
2130 vmovdqu
$xa1,0x80($out)
2131 vmovdqu
$xb1,0xa0($out)
2132 vmovdqu
$xc1,0xc0($out)
2133 vmovdqu
$xd1,0xe0($out)
2134 vmovdqu
$xa2,0x100($out)
2135 vmovdqu
$xb2,0x120($out)
2138 lea
0x140($inp),$inp # inp+=64*5
2140 vmovdqa
$xc2,0x00(%rsp)
2141 lea
0x140($out),$out # out+=64*5
2142 sub \
$320,$len # len-=64*5
2143 vmovdqa
$xd2,0x20(%rsp)
2148 vpxor
0x00($inp),$xa0,$xa0 # xor with input
2149 vpxor
0x20($inp),$xb0,$xb0
2150 vpxor
0x40($inp),$xc0,$xc0
2151 vpxor
0x60($inp),$xd0,$xd0
2152 vpxor
0x80($inp),$xa1,$xa1
2153 vpxor
0xa0($inp),$xb1,$xb1
2154 vpxor
0xc0($inp),$xc1,$xc1
2155 vpxor
0xe0($inp),$xd1,$xd1
2156 vpxor
0x100($inp),$xa2,$xa2
2157 vpxor
0x120($inp),$xb2,$xb2
2158 vpxor
0x140($inp),$xc2,$xc2
2159 vpxor
0x160($inp),$xd2,$xd2
2160 vmovdqu
$xa0,0x00($out)
2161 vmovdqu
$xb0,0x20($out)
2162 vmovdqu
$xc0,0x40($out)
2163 vmovdqu
$xd0,0x60($out)
2164 vmovdqu
$xa1,0x80($out)
2165 vmovdqu
$xb1,0xa0($out)
2166 vmovdqu
$xc1,0xc0($out)
2167 vmovdqu
$xd1,0xe0($out)
2168 vmovdqu
$xa2,0x100($out)
2169 vmovdqu
$xb2,0x120($out)
2170 vmovdqu
$xc2,0x140($out)
2171 vmovdqu
$xd2,0x160($out)
2174 lea
0x180($inp),$inp # inp+=64*6
2176 vmovdqa
$xa3,0x00(%rsp)
2177 lea
0x180($out),$out # out+=64*6
2178 sub \
$384,$len # len-=64*6
2179 vmovdqa
$xb3,0x20(%rsp)
2184 vpxor
0x00($inp),$xa0,$xa0 # xor with input
2185 vpxor
0x20($inp),$xb0,$xb0
2186 vpxor
0x40($inp),$xc0,$xc0
2187 vpxor
0x60($inp),$xd0,$xd0
2188 vpxor
0x80($inp),$xa1,$xa1
2189 vpxor
0xa0($inp),$xb1,$xb1
2190 vpxor
0xc0($inp),$xc1,$xc1
2191 vpxor
0xe0($inp),$xd1,$xd1
2192 vpxor
0x100($inp),$xa2,$xa2
2193 vpxor
0x120($inp),$xb2,$xb2
2194 vpxor
0x140($inp),$xc2,$xc2
2195 vpxor
0x160($inp),$xd2,$xd2
2196 vpxor
0x180($inp),$xa3,$xa3
2197 vpxor
0x1a0($inp),$xb3,$xb3
2198 vmovdqu
$xa0,0x00($out)
2199 vmovdqu
$xb0,0x20($out)
2200 vmovdqu
$xc0,0x40($out)
2201 vmovdqu
$xd0,0x60($out)
2202 vmovdqu
$xa1,0x80($out)
2203 vmovdqu
$xb1,0xa0($out)
2204 vmovdqu
$xc1,0xc0($out)
2205 vmovdqu
$xd1,0xe0($out)
2206 vmovdqu
$xa2,0x100($out)
2207 vmovdqu
$xb2,0x120($out)
2208 vmovdqu
$xc2,0x140($out)
2209 vmovdqu
$xd2,0x160($out)
2210 vmovdqu
$xa3,0x180($out)
2211 vmovdqu
$xb3,0x1a0($out)
2214 lea
0x1c0($inp),$inp # inp+=64*7
2216 vmovdqa
$xc3,0x00(%rsp)
2217 lea
0x1c0($out),$out # out+=64*7
2218 sub \
$448,$len # len-=64*7
2219 vmovdqa
$xd3,0x20(%rsp)
2222 movzb
($inp,%r10),%eax
2223 movzb
(%rsp,%r10),%ecx
2226 mov
%al,-1($out,%r10)
2233 $code.=<<___
if ($win64);
2234 lea
0x290+0x30(%rsp),%r11
2235 movaps
-0x30(%r11),%xmm6
2236 movaps
-0x20(%r11),%xmm7
2237 movaps
-0x10(%r11),%xmm8
2238 movaps
0x00(%r11),%xmm9
2239 movaps
0x10(%r11),%xmm10
2240 movaps
0x20(%r11),%xmm11
2241 movaps
0x30(%r11),%xmm12
2242 movaps
0x40(%r11),%xmm13
2243 movaps
0x50(%r11),%xmm14
2244 movaps
0x60(%r11),%xmm15
2247 mov
0x280(%rsp),%rsp
.size	ChaCha20_8x,.-ChaCha20_8x
########################################################################
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
my @key=map("%zmm$_",(16..31));
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
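# With AVX512F all sixteen state words of sixteen parallel blocks live
# in %zmm0-15, the smashed key material stays in %zmm16-31 (@key), and
# vprold/vpxord provide native dword rotates and EVEX xors, so this
# path needs no rotate masks and no spills of the 'c' rows.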
sub AVX512_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);
2271 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2272 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2273 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2274 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2275 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2276 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2277 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2278 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2279 "&vprold (@x[$d0],@x[$d0],16)",
2280 "&vprold (@x[$d1],@x[$d1],16)",
2281 "&vprold (@x[$d2],@x[$d2],16)",
2282 "&vprold (@x[$d3],@x[$d3],16)",
2284 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2285 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2286 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2287 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2288 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2289 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2290 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2291 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2292 "&vprold (@x[$b0],@x[$b0],12)",
2293 "&vprold (@x[$b1],@x[$b1],12)",
2294 "&vprold (@x[$b2],@x[$b2],12)",
2295 "&vprold (@x[$b3],@x[$b3],12)",
2297 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2298 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2299 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2300 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2301 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2302 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2303 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2304 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2305 "&vprold (@x[$d0],@x[$d0],8)",
2306 "&vprold (@x[$d1],@x[$d1],8)",
2307 "&vprold (@x[$d2],@x[$d2],8)",
2308 "&vprold (@x[$d3],@x[$d3],8)",
2310 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2311 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2312 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2313 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2314 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2315 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2316 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2317 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2318 "&vprold (@x[$b0],@x[$b0],7)",
2319 "&vprold (@x[$b1],@x[$b1],7)",
2320 "&vprold (@x[$b2],@x[$b2],7)",
2321 "&vprold (@x[$b3],@x[$b3],7)"
my $xframe = $win64 ? 0xb0 : 8;

.type	ChaCha20_16x,\@function,5
	sub	\$64+$xframe,%rsp
2336 $code.=<<___
if ($win64);
2337 lea
0x290+0x30(%rsp),%r11
2338 movaps
%xmm6,-0x30(%r11)
2339 movaps
%xmm7,-0x20(%r11)
2340 movaps
%xmm8,-0x10(%r11)
2341 movaps
%xmm9,0x00(%r11)
2342 movaps
%xmm10,0x10(%r11)
2343 movaps
%xmm11,0x20(%r11)
2344 movaps
%xmm12,0x30(%r11)
2345 movaps
%xmm13,0x40(%r11)
2346 movaps
%xmm14,0x50(%r11)
2347 movaps
%xmm15,0x60(%r11)
2352 lea
.Lsigma
(%rip),%r10
2353 vbroadcasti32x4
(%r10),$xa3 # key[0]
2354 vbroadcasti32x4
($key),$xb3 # key[1]
2355 vbroadcasti32x4
16($key),$xc3 # key[2]
2356 vbroadcasti32x4
($counter),$xd3 # key[3]
2358 vpshufd \
$0x00,$xa3,$xa0 # smash key by lanes...
2359 vpshufd \
$0x55,$xa3,$xa1
2360 vpshufd \
$0xaa,$xa3,$xa2
2361 vpshufd \
$0xff,$xa3,$xa3
2362 vmovdqa64
$xa0,@key[0]
2363 vmovdqa64
$xa1,@key[1]
2364 vmovdqa64
$xa2,@key[2]
2365 vmovdqa64
$xa3,@key[3]
2367 vpshufd \
$0x00,$xb3,$xb0
2368 vpshufd \
$0x55,$xb3,$xb1
2369 vpshufd \
$0xaa,$xb3,$xb2
2370 vpshufd \
$0xff,$xb3,$xb3
2371 vmovdqa64
$xb0,@key[4]
2372 vmovdqa64
$xb1,@key[5]
2373 vmovdqa64
$xb2,@key[6]
2374 vmovdqa64
$xb3,@key[7]
2376 vpshufd \
$0x00,$xc3,$xc0
2377 vpshufd \
$0x55,$xc3,$xc1
2378 vpshufd \
$0xaa,$xc3,$xc2
2379 vpshufd \
$0xff,$xc3,$xc3
2380 vmovdqa64
$xc0,@key[8]
2381 vmovdqa64
$xc1,@key[9]
2382 vmovdqa64
$xc2,@key[10]
2383 vmovdqa64
$xc3,@key[11]
2385 vpshufd \
$0x00,$xd3,$xd0
2386 vpshufd \
$0x55,$xd3,$xd1
2387 vpshufd \
$0xaa,$xd3,$xd2
2388 vpshufd \
$0xff,$xd3,$xd3
2389 vpaddd
.Lincz
(%rip),$xd0,$xd0 # don't save counters yet
2390 vmovdqa64
$xd0,@key[12]
2391 vmovdqa64
$xd1,@key[13]
2392 vmovdqa64
$xd2,@key[14]
2393 vmovdqa64
$xd3,@key[15]
2400 vpbroadcastd
0(%r10),$xa0 # reload key
2401 vpbroadcastd
4(%r10),$xa1
2402 vpbroadcastd
8(%r10),$xa2
2403 vpbroadcastd
12(%r10),$xa3
2404 vpaddd
.Lsixteen
(%rip),@key[12],@key[12] # next SIMD counters
2405 vmovdqa64
@key[4],$xb0
2406 vmovdqa64
@key[5],$xb1
2407 vmovdqa64
@key[6],$xb2
2408 vmovdqa64
@key[7],$xb3
2409 vmovdqa64
@key[8],$xc0
2410 vmovdqa64
@key[9],$xc1
2411 vmovdqa64
@key[10],$xc2
2412 vmovdqa64
@key[11],$xc3
2413 vmovdqa64
@key[12],$xd0
2414 vmovdqa64
@key[13],$xd1
2415 vmovdqa64
@key[14],$xd2
2416 vmovdqa64
@key[15],$xd3
2418 vmovdqa64
$xa0,@key[0]
2419 vmovdqa64
$xa1,@key[1]
2420 vmovdqa64
$xa2,@key[2]
2421 vmovdqa64
$xa3,@key[3]
2429 foreach (&AVX512_lane_ROUND
(0, 4, 8,12)) { eval; }
2430 foreach (&AVX512_lane_ROUND
(0, 5,10,15)) { eval; }
2435 vpaddd
@key[0],$xa0,$xa0 # accumulate key
2436 vpaddd
@key[1],$xa1,$xa1
2437 vpaddd
@key[2],$xa2,$xa2
2438 vpaddd
@key[3],$xa3,$xa3
2440 vpunpckldq
$xa1,$xa0,$xt2 # "de-interlace" data
2441 vpunpckldq
$xa3,$xa2,$xt3
2442 vpunpckhdq
$xa1,$xa0,$xa0
2443 vpunpckhdq
$xa3,$xa2,$xa2
2444 vpunpcklqdq
$xt3,$xt2,$xa1 # "a0"
2445 vpunpckhqdq
$xt3,$xt2,$xt2 # "a1"
2446 vpunpcklqdq
$xa2,$xa0,$xa3 # "a2"
2447 vpunpckhqdq
$xa2,$xa0,$xa0 # "a3"
2449 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2451 vpaddd
@key[4],$xb0,$xb0
2452 vpaddd
@key[5],$xb1,$xb1
2453 vpaddd
@key[6],$xb2,$xb2
2454 vpaddd
@key[7],$xb3,$xb3
2456 vpunpckldq
$xb1,$xb0,$xt2
2457 vpunpckldq
$xb3,$xb2,$xt3
2458 vpunpckhdq
$xb1,$xb0,$xb0
2459 vpunpckhdq
$xb3,$xb2,$xb2
2460 vpunpcklqdq
$xt3,$xt2,$xb1 # "b0"
2461 vpunpckhqdq
$xt3,$xt2,$xt2 # "b1"
2462 vpunpcklqdq
$xb2,$xb0,$xb3 # "b2"
2463 vpunpckhqdq
$xb2,$xb0,$xb0 # "b3"
2465 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2467 vshufi32x4 \
$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
2468 vshufi32x4 \
$0xee,$xb0,$xa0,$xb0
2469 vshufi32x4 \
$0x44,$xb1,$xa1,$xa0
2470 vshufi32x4 \
$0xee,$xb1,$xa1,$xb1
2471 vshufi32x4 \
$0x44,$xb2,$xa2,$xa1
2472 vshufi32x4 \
$0xee,$xb2,$xa2,$xb2
2473 vshufi32x4 \
$0x44,$xb3,$xa3,$xa2
2474 vshufi32x4 \
$0xee,$xb3,$xa3,$xb3
2476 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2478 vpaddd
@key[8],$xc0,$xc0
2479 vpaddd
@key[9],$xc1,$xc1
2480 vpaddd
@key[10],$xc2,$xc2
2481 vpaddd
@key[11],$xc3,$xc3
2483 vpunpckldq
$xc1,$xc0,$xt2
2484 vpunpckldq
$xc3,$xc2,$xt3
2485 vpunpckhdq
$xc1,$xc0,$xc0
2486 vpunpckhdq
$xc3,$xc2,$xc2
2487 vpunpcklqdq
$xt3,$xt2,$xc1 # "c0"
2488 vpunpckhqdq
$xt3,$xt2,$xt2 # "c1"
2489 vpunpcklqdq
$xc2,$xc0,$xc3 # "c2"
2490 vpunpckhqdq
$xc2,$xc0,$xc0 # "c3"
2492 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2494 vpaddd
@key[12],$xd0,$xd0
2495 vpaddd
@key[13],$xd1,$xd1
2496 vpaddd
@key[14],$xd2,$xd2
2497 vpaddd
@key[15],$xd3,$xd3
2499 vpunpckldq
$xd1,$xd0,$xt2
2500 vpunpckldq
$xd3,$xd2,$xt3
2501 vpunpckhdq
$xd1,$xd0,$xd0
2502 vpunpckhdq
$xd3,$xd2,$xd2
2503 vpunpcklqdq
$xt3,$xt2,$xd1 # "d0"
2504 vpunpckhqdq
$xt3,$xt2,$xt2 # "d1"
2505 vpunpcklqdq
$xd2,$xd0,$xd3 # "d2"
2506 vpunpckhqdq
$xd2,$xd0,$xd0 # "d3"
2508 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2510 vshufi32x4 \
$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
2511 vshufi32x4 \
$0xee,$xd0,$xc0,$xd0
2512 vshufi32x4 \
$0x44,$xd1,$xc1,$xc0
2513 vshufi32x4 \
$0xee,$xd1,$xc1,$xd1
2514 vshufi32x4 \
$0x44,$xd2,$xc2,$xc1
2515 vshufi32x4 \
$0xee,$xd2,$xc2,$xd2
2516 vshufi32x4 \
$0x44,$xd3,$xc3,$xc2
2517 vshufi32x4 \
$0xee,$xd3,$xc3,$xd3
2519 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2521 vshufi32x4 \
$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
2522 vshufi32x4 \
$0xdd,$xc0,$xa0,$xa0
2523 vshufi32x4 \
$0x88,$xd0,$xb0,$xc0
2524 vshufi32x4 \
$0xdd,$xd0,$xb0,$xd0
2525 vshufi32x4 \
$0x88,$xc1,$xa1,$xt1
2526 vshufi32x4 \
$0xdd,$xc1,$xa1,$xa1
2527 vshufi32x4 \
$0x88,$xd1,$xb1,$xc1
2528 vshufi32x4 \
$0xdd,$xd1,$xb1,$xd1
2529 vshufi32x4 \
$0x88,$xc2,$xa2,$xt2
2530 vshufi32x4 \
$0xdd,$xc2,$xa2,$xa2
2531 vshufi32x4 \
$0x88,$xd2,$xb2,$xc2
2532 vshufi32x4 \
$0xdd,$xd2,$xb2,$xd2
2533 vshufi32x4 \
$0x88,$xc3,$xa3,$xt3
2534 vshufi32x4 \
$0xdd,$xc3,$xa3,$xa3
2535 vshufi32x4 \
$0x88,$xd3,$xb3,$xc3
2536 vshufi32x4 \
$0xdd,$xd3,$xb3,$xd3
2538 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2539 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2541 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2542 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2543 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2544 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2549 vpxord
0x00($inp),$xa0,$xa0 # xor with input
2550 vpxord
0x40($inp),$xb0,$xb0
2551 vpxord
0x80($inp),$xc0,$xc0
2552 vpxord
0xc0($inp),$xd0,$xd0
2553 vmovdqu32
$xa0,0x00($out)
2554 vmovdqu32
$xb0,0x40($out)
2555 vmovdqu32
$xc0,0x80($out)
2556 vmovdqu32
$xd0,0xc0($out)
2558 vpxord
0x100($inp),$xa1,$xa1
2559 vpxord
0x140($inp),$xb1,$xb1
2560 vpxord
0x180($inp),$xc1,$xc1
2561 vpxord
0x1c0($inp),$xd1,$xd1
2562 vmovdqu32
$xa1,0x100($out)
2563 vmovdqu32
$xb1,0x140($out)
2564 vmovdqu32
$xc1,0x180($out)
2565 vmovdqu32
$xd1,0x1c0($out)
2567 vpxord
0x200($inp),$xa2,$xa2
2568 vpxord
0x240($inp),$xb2,$xb2
2569 vpxord
0x280($inp),$xc2,$xc2
2570 vpxord
0x2c0($inp),$xd2,$xd2
2571 vmovdqu32
$xa2,0x200($out)
2572 vmovdqu32
$xb2,0x240($out)
2573 vmovdqu32
$xc2,0x280($out)
2574 vmovdqu32
$xd2,0x2c0($out)
2576 vpxord
0x300($inp),$xa3,$xa3
2577 vpxord
0x340($inp),$xb3,$xb3
2578 vpxord
0x380($inp),$xc3,$xc3
2579 vpxord
0x3c0($inp),$xd3,$xd3
2580 lea
0x400($inp),$inp
2581 vmovdqu32
$xa3,0x300($out)
2582 vmovdqu32
$xb3,0x340($out)
2583 vmovdqu32
$xc3,0x380($out)
2584 vmovdqu32
$xd3,0x3c0($out)
2585 lea
0x400($out),$out
2597 jb
.Less_than_64_16x
2598 vpxord
($inp),$xa0,$xa0 # xor with input
2599 vmovdqu32
$xa0,($out,$inp)
2605 jb
.Less_than_64_16x
2606 vpxord
($inp),$xb0,$xb0
2607 vmovdqu32
$xb0,($out,$inp)
2613 jb
.Less_than_64_16x
2614 vpxord
($inp),$xc0,$xc0
2615 vmovdqu32
$xc0,($out,$inp)
2621 jb
.Less_than_64_16x
2622 vpxord
($inp),$xd0,$xd0
2623 vmovdqu32
$xd0,($out,$inp)
2629 jb
.Less_than_64_16x
2630 vpxord
($inp),$xa1,$xa1
2631 vmovdqu32
$xa1,($out,$inp)
2637 jb
.Less_than_64_16x
2638 vpxord
($inp),$xb1,$xb1
2639 vmovdqu32
$xb1,($out,$inp)
2645 jb
.Less_than_64_16x
2646 vpxord
($inp),$xc1,$xc1
2647 vmovdqu32
$xc1,($out,$inp)
2653 jb
.Less_than_64_16x
2654 vpxord
($inp),$xd1,$xd1
2655 vmovdqu32
$xd1,($out,$inp)
2661 jb
.Less_than_64_16x
2662 vpxord
($inp),$xa2,$xa2
2663 vmovdqu32
$xa2,($out,$inp)
2669 jb
.Less_than_64_16x
2670 vpxord
($inp),$xb2,$xb2
2671 vmovdqu32
$xb2,($out,$inp)
2677 jb
.Less_than_64_16x
2678 vpxord
($inp),$xc2,$xc2
2679 vmovdqu32
$xc2,($out,$inp)
2685 jb
.Less_than_64_16x
2686 vpxord
($inp),$xd2,$xd2
2687 vmovdqu32
$xd2,($out,$inp)
2693 jb
.Less_than_64_16x
2694 vpxord
($inp),$xa3,$xa3
2695 vmovdqu32
$xa3,($out,$inp)
2701 jb
.Less_than_64_16x
2702 vpxord
($inp),$xb3,$xb3
2703 vmovdqu32
$xb3,($out,$inp)
2709 jb
.Less_than_64_16x
2710 vpxord
($inp),$xc3,$xc3
2711 vmovdqu32
$xc3,($out,$inp)
2717 vmovdqa32
$xa0,0x00(%rsp)
2718 lea
($out,$inp),$out
2722 movzb
($inp,%r10),%eax
2723 movzb
(%rsp,%r10),%ecx
2726 mov
%al,-1($out,%r10)
2733 $code.=<<___
if ($win64);
2734 lea
0x290+0x30(%rsp),%r11
2735 movaps
-0x30(%r11),%xmm6
2736 movaps
-0x20(%r11),%xmm7
2737 movaps
-0x10(%r11),%xmm8
2738 movaps
0x00(%r11),%xmm9
2739 movaps
0x10(%r11),%xmm10
2740 movaps
0x20(%r11),%xmm11
2741 movaps
0x30(%r11),%xmm12
2742 movaps
0x40(%r11),%xmm13
2743 movaps
0x50(%r11),%xmm14
2744 movaps
0x60(%r11),%xmm15
.size	ChaCha20_16x,.-ChaCha20_16x
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;