#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# November 2014
#
# ChaCha20 for x86_64.
#
# December 2016
#
# Add AVX512F code path.
#
# December 2017
#
# Add AVX512VL code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3		NxAVX(v)
#
# P4		9.48/+99%	-		-
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42		1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31		1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)	0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40		2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is code path optimized specifically for 128 bytes used
#	by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20
#	and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in single thread, the corresponding capability is suppressed;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
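# $avx ends up as 0..3 depending on what the probed assembler can encode:
# the XOP/AVX code paths below are emitted when $avx>=1, the AVX2 path
# when $avx>1, and the AVX512F/AVX512VL paths when $avx>2.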

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

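# For reference (an assumption noted here, not something this file
# declares): the C-level prototype these arguments correspond to is along
# the lines of
#
#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#	                    size_t len, const unsigned int key[8],
#	                    const unsigned int counter[4]);
#
# i.e. a 256-bit key and a 128-bit block of 32-bit counter plus nonce.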
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
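# .Lrot16 and .Lrot24 below are pshufb byte-shuffle masks; applied to a
# register of 32-bit lanes they rotate each lane left by 16 and by 8 bits
# (equivalently right by 24) respectively, covering two of the four ChaCha
# rotation amounts without a shift/shift/or sequence.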
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
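# For example (illustrative only): with @x[$d0] bound to "%eax", the string
# "&rol	(@x[$d0],16)" in the ROUND table below evaluates through this
# thunk to appending
#
#	rol	$16,%eax
#
# to $code, i.e. the last argument becomes an immediate and the operand
# order is reversed into AT&T convention.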

@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

149sub ROUND { # critical path is 24 cycles per round
150my ($a0,$b0,$c0,$d0)=@_;
151my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
152my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
153my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
154my ($xc,$xc_)=map("\"$_\"",@t);
155my @x=map("\"$_\"",@x);
156
157 # Consider order in which variables are addressed by their
158 # index:
159 #
160 # a b c d
161 #
162 # 0 4 8 12 < even round
163 # 1 5 9 13
164 # 2 6 10 14
165 # 3 7 11 15
166 # 0 5 10 15 < odd round
167 # 1 6 11 12
168 # 2 7 8 13
169 # 3 4 9 14
170 #
171 # 'a', 'b' and 'd's are permanently allocated in registers,
172 # @x[0..7,12..15], while 'c's are maintained in memory. If
173 # you observe 'c' column, you'll notice that pair of 'c's is
174 # invariant between rounds. This means that we have to reload
175 # them once per round, in the middle. This is why you'll see
176 # bunch of 'c' stores and loads in the middle, but none in
177 # the beginning or end.
178
179 # Normally instructions would be interleaved to favour in-order
180 # execution. Generally out-of-order cores manage it gracefully,
181 # but not this time for some reason. As in-order execution
182 # cores are dying breed, old Atom is the only one around,
183 # instructions are left uninterleaved. Besides, Atom is better
184 # off executing 1xSSSE3 code anyway...
185
186 (
187 "&add (@x[$a0],@x[$b0])", # Q1
188 "&xor (@x[$d0],@x[$a0])",
189 "&rol (@x[$d0],16)",
190 "&add (@x[$a1],@x[$b1])", # Q2
191 "&xor (@x[$d1],@x[$a1])",
192 "&rol (@x[$d1],16)",
193
194 "&add ($xc,@x[$d0])",
195 "&xor (@x[$b0],$xc)",
196 "&rol (@x[$b0],12)",
197 "&add ($xc_,@x[$d1])",
198 "&xor (@x[$b1],$xc_)",
199 "&rol (@x[$b1],12)",
200
201 "&add (@x[$a0],@x[$b0])",
202 "&xor (@x[$d0],@x[$a0])",
203 "&rol (@x[$d0],8)",
204 "&add (@x[$a1],@x[$b1])",
205 "&xor (@x[$d1],@x[$a1])",
206 "&rol (@x[$d1],8)",
207
208 "&add ($xc,@x[$d0])",
209 "&xor (@x[$b0],$xc)",
210 "&rol (@x[$b0],7)",
211 "&add ($xc_,@x[$d1])",
212 "&xor (@x[$b1],$xc_)",
213 "&rol (@x[$b1],7)",
214
215 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
216 "&mov (\"4*$c1(%rsp)\",$xc_)",
217 "&mov ($xc,\"4*$c2(%rsp)\")",
218 "&mov ($xc_,\"4*$c3(%rsp)\")",
219
220 "&add (@x[$a2],@x[$b2])", # Q3
221 "&xor (@x[$d2],@x[$a2])",
222 "&rol (@x[$d2],16)",
223 "&add (@x[$a3],@x[$b3])", # Q4
224 "&xor (@x[$d3],@x[$a3])",
225 "&rol (@x[$d3],16)",
226
227 "&add ($xc,@x[$d2])",
228 "&xor (@x[$b2],$xc)",
229 "&rol (@x[$b2],12)",
230 "&add ($xc_,@x[$d3])",
231 "&xor (@x[$b3],$xc_)",
232 "&rol (@x[$b3],12)",
233
234 "&add (@x[$a2],@x[$b2])",
235 "&xor (@x[$d2],@x[$a2])",
236 "&rol (@x[$d2],8)",
237 "&add (@x[$a3],@x[$b3])",
238 "&xor (@x[$d3],@x[$a3])",
239 "&rol (@x[$d3],8)",
240
241 "&add ($xc,@x[$d2])",
242 "&xor (@x[$b2],$xc)",
243 "&rol (@x[$b2],7)",
244 "&add ($xc_,@x[$d3])",
245 "&xor (@x[$b3],$xc_)",
246 "&rol (@x[$b3],7)"
247 );
248}
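# A quick sanity check of the index arithmetic above (a throw-away sketch,
# not used by the generator): the map ($_&~3)+(($_+1)&3) steps through the
# columns, e.g.
#
#	my @q = (0,5,10,15);			# odd-round Q1
#	@q = map(($_&~3)+(($_+1)&3), @q);	# -> (1,6,11,12), Q2
#	@q = map(($_&~3)+(($_+1)&3), @q);	# -> (2,7,8,13),  Q3
#	@q = map(($_&~3)+(($_+1)&3), @q);	# -> (3,4,9,14),  Q4
#
# which reproduces the odd-round rows of the table in the comment above.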
249
250########################################################################
251# Generic code path that handles all lengths on pre-SSSE3 processors.
252$code.=<<___;
253.globl ChaCha20_ctr32
254.type ChaCha20_ctr32,\@function,5
255.align 64
256ChaCha20_ctr32:
f17652e5 257.cfi_startproc
258 cmp \$0,$len
259 je .Lno_data
a98c648e 260 mov OPENSSL_ia32cap_P+4(%rip),%r10
261___
262$code.=<<___ if ($avx>2);
263 bt \$48,%r10 # check for AVX512F
264 jc .LChaCha20_avx512
265 test %r10,%r10 # check for AVX512VL
266 js .LChaCha20_avx512vl
267___
268$code.=<<___;
269 test \$`1<<(41-32)`,%r10d
270 jnz .LChaCha20_ssse3
271
272 push %rbx
f17652e5 273.cfi_push %rbx
a98c648e 274 push %rbp
f17652e5 275.cfi_push %rbp
a98c648e 276 push %r12
f17652e5 277.cfi_push %r12
a98c648e 278 push %r13
f17652e5 279.cfi_push %r13
a98c648e 280 push %r14
f17652e5 281.cfi_push %r14
a98c648e 282 push %r15
f17652e5 283.cfi_push %r15
a98c648e 284 sub \$64+24,%rsp
f17652e5 285.cfi_adjust_cfa_offset 64+24
384e6de4 286.Lctr32_body:
287
288 #movdqa .Lsigma(%rip),%xmm0
289 movdqu ($key),%xmm1
290 movdqu 16($key),%xmm2
291 movdqu ($counter),%xmm3
292 movdqa .Lone(%rip),%xmm4
293
294 #movdqa %xmm0,4*0(%rsp) # key[0]
295 movdqa %xmm1,4*4(%rsp) # key[1]
296 movdqa %xmm2,4*8(%rsp) # key[2]
297 movdqa %xmm3,4*12(%rsp) # key[3]
298 mov $len,%rbp # reassign $len
299 jmp .Loop_outer
300
301.align 32
302.Loop_outer:
303 mov \$0x61707865,@x[0] # 'expa'
304 mov \$0x3320646e,@x[1] # 'nd 3'
305 mov \$0x79622d32,@x[2] # '2-by'
306 mov \$0x6b206574,@x[3] # 'te k'
307 mov 4*4(%rsp),@x[4]
308 mov 4*5(%rsp),@x[5]
309 mov 4*6(%rsp),@x[6]
310 mov 4*7(%rsp),@x[7]
311 movd %xmm3,@x[12]
312 mov 4*13(%rsp),@x[13]
313 mov 4*14(%rsp),@x[14]
314 mov 4*15(%rsp),@x[15]
315
316 mov %rbp,64+0(%rsp) # save len
317 mov \$10,%ebp
318 mov $inp,64+8(%rsp) # save inp
319 movq %xmm2,%rsi # "@x[8]"
320 mov $out,64+16(%rsp) # save out
321 mov %rsi,%rdi
322 shr \$32,%rdi # "@x[9]"
323 jmp .Loop
324
325.align 32
326.Loop:
327___
328 foreach (&ROUND (0, 4, 8,12)) { eval; }
329 foreach (&ROUND (0, 5,10,15)) { eval; }
330 &dec ("%ebp");
331 &jnz (".Loop");
332
333$code.=<<___;
334 mov @t[1],4*9(%rsp) # modulo-scheduled
335 mov @t[0],4*8(%rsp)
336 mov 64(%rsp),%rbp # load len
337 movdqa %xmm2,%xmm1
338 mov 64+8(%rsp),$inp # load inp
339 paddd %xmm4,%xmm3 # increment counter
340 mov 64+16(%rsp),$out # load out
341
342 add \$0x61707865,@x[0] # 'expa'
343 add \$0x3320646e,@x[1] # 'nd 3'
344 add \$0x79622d32,@x[2] # '2-by'
345 add \$0x6b206574,@x[3] # 'te k'
346 add 4*4(%rsp),@x[4]
347 add 4*5(%rsp),@x[5]
348 add 4*6(%rsp),@x[6]
349 add 4*7(%rsp),@x[7]
350 add 4*12(%rsp),@x[12]
351 add 4*13(%rsp),@x[13]
352 add 4*14(%rsp),@x[14]
353 add 4*15(%rsp),@x[15]
354 paddd 4*8(%rsp),%xmm1
355
356 cmp \$64,%rbp
357 jb .Ltail
358
359 xor 4*0($inp),@x[0] # xor with input
360 xor 4*1($inp),@x[1]
361 xor 4*2($inp),@x[2]
362 xor 4*3($inp),@x[3]
363 xor 4*4($inp),@x[4]
364 xor 4*5($inp),@x[5]
365 xor 4*6($inp),@x[6]
366 xor 4*7($inp),@x[7]
367 movdqu 4*8($inp),%xmm0
368 xor 4*12($inp),@x[12]
369 xor 4*13($inp),@x[13]
370 xor 4*14($inp),@x[14]
371 xor 4*15($inp),@x[15]
372 lea 4*16($inp),$inp # inp+=64
373 pxor %xmm1,%xmm0
374
375 movdqa %xmm2,4*8(%rsp)
376 movd %xmm3,4*12(%rsp)
377
378 mov @x[0],4*0($out) # write output
379 mov @x[1],4*1($out)
380 mov @x[2],4*2($out)
381 mov @x[3],4*3($out)
382 mov @x[4],4*4($out)
383 mov @x[5],4*5($out)
384 mov @x[6],4*6($out)
385 mov @x[7],4*7($out)
386 movdqu %xmm0,4*8($out)
387 mov @x[12],4*12($out)
388 mov @x[13],4*13($out)
389 mov @x[14],4*14($out)
390 mov @x[15],4*15($out)
391 lea 4*16($out),$out # out+=64
392
393 sub \$64,%rbp
394 jnz .Loop_outer
395
396 jmp .Ldone
397
398.align 16
399.Ltail:
400 mov @x[0],4*0(%rsp)
a98c648e 401 mov @x[1],4*1(%rsp)
29880e97 402 xor %rbx,%rbx
403 mov @x[2],4*2(%rsp)
404 mov @x[3],4*3(%rsp)
405 mov @x[4],4*4(%rsp)
406 mov @x[5],4*5(%rsp)
407 mov @x[6],4*6(%rsp)
408 mov @x[7],4*7(%rsp)
409 movdqa %xmm1,4*8(%rsp)
410 mov @x[12],4*12(%rsp)
411 mov @x[13],4*13(%rsp)
412 mov @x[14],4*14(%rsp)
413 mov @x[15],4*15(%rsp)
414
415.Loop_tail:
416 movzb ($inp,%rbx),%eax
417 movzb (%rsp,%rbx),%edx
418 lea 1(%rbx),%rbx
419 xor %edx,%eax
420 mov %al,-1($out,%rbx)
421 dec %rbp
422 jnz .Loop_tail
423
424.Ldone:
384e6de4 425 lea 64+24+48(%rsp),%rsi
f17652e5 426.cfi_def_cfa %rsi,8
384e6de4 427 mov -48(%rsi),%r15
f17652e5 428.cfi_restore %r15
384e6de4 429 mov -40(%rsi),%r14
f17652e5 430.cfi_restore %r14
384e6de4 431 mov -32(%rsi),%r13
f17652e5 432.cfi_restore %r13
384e6de4 433 mov -24(%rsi),%r12
f17652e5 434.cfi_restore %r12
384e6de4 435 mov -16(%rsi),%rbp
f17652e5 436.cfi_restore %rbp
384e6de4 437 mov -8(%rsi),%rbx
f17652e5 438.cfi_restore %rbx
384e6de4 439 lea (%rsi),%rsp
f17652e5 440.cfi_def_cfa_register %rsp
622a531c 441.Lno_data:
a98c648e 442 ret
f17652e5 443.cfi_endproc
444.size ChaCha20_ctr32,.-ChaCha20_ctr32
445___
446
447########################################################################
448# SSSE3 code path that handles shorter lengths
449{
450my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
451
452sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
453 &paddd ($a,$b);
454 &pxor ($d,$a);
455 &pshufb ($d,$rot16);
456
457 &paddd ($c,$d);
458 &pxor ($b,$c);
459 &movdqa ($t,$b);
460 &psrld ($b,20);
461 &pslld ($t,12);
462 &por ($b,$t);
463
464 &paddd ($a,$b);
465 &pxor ($d,$a);
466 &pshufb ($d,$rot24);
467
468 &paddd ($c,$d);
469 &pxor ($b,$c);
470 &movdqa ($t,$b);
471 &psrld ($b,25);
472 &pslld ($t,7);
473 &por ($b,$t);
474}
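# Note that, lacking a SIMD rotate instruction, every "rol x,N" of the
# scalar round is emulated above either with a single pshufb (N=16 via
# $rot16, N=8 via $rot24) or with a pair of shifts plus an OR, e.g. for
# N=12: t = x<<12; x = x>>20; x |= t.  The XOP path further down can use
# vprotd, a true vector rotate, instead.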
475
384e6de4 476my $xframe = $win64 ? 32+8 : 8;
477
478$code.=<<___;
479.type ChaCha20_ssse3,\@function,5
480.align 32
481ChaCha20_ssse3:
f17652e5 482.cfi_startproc
a98c648e 483.LChaCha20_ssse3:
384e6de4 484 mov %rsp,%r9 # frame pointer
f17652e5 485.cfi_def_cfa_register %r9
486___
487$code.=<<___ if ($avx);
488 test \$`1<<(43-32)`,%r10d
489 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
490___
491$code.=<<___;
492 cmp \$128,$len # we might throw away some data,
d5487a45 493 je .LChaCha20_128
494 ja .LChaCha20_4x # but overall it won't be slower
495
496.Ldo_sse3_after_all:
497 sub \$64+$xframe,%rsp
498___
499$code.=<<___ if ($win64);
500 movaps %xmm6,-0x28(%r9)
501 movaps %xmm7,-0x18(%r9)
502.Lssse3_body:
503___
504$code.=<<___;
505 movdqa .Lsigma(%rip),$a
506 movdqu ($key),$b
507 movdqu 16($key),$c
508 movdqu ($counter),$d
509 movdqa .Lrot16(%rip),$rot16
510 movdqa .Lrot24(%rip),$rot24
511
512 movdqa $a,0x00(%rsp)
513 movdqa $b,0x10(%rsp)
514 movdqa $c,0x20(%rsp)
515 movdqa $d,0x30(%rsp)
3c274a6e 516 mov \$10,$counter # reuse $counter
517 jmp .Loop_ssse3
518
519.align 32
520.Loop_outer_ssse3:
521 movdqa .Lone(%rip),$d
522 movdqa 0x00(%rsp),$a
523 movdqa 0x10(%rsp),$b
524 movdqa 0x20(%rsp),$c
525 paddd 0x30(%rsp),$d
3c274a6e 526 mov \$10,$counter
527 movdqa $d,0x30(%rsp)
528 jmp .Loop_ssse3
529
530.align 32
531.Loop_ssse3:
532___
533 &SSSE3ROUND();
534 &pshufd ($c,$c,0b01001110);
535 &pshufd ($b,$b,0b00111001);
536 &pshufd ($d,$d,0b10010011);
537 &nop ();
538
539 &SSSE3ROUND();
540 &pshufd ($c,$c,0b01001110);
541 &pshufd ($b,$b,0b10010011);
542 &pshufd ($d,$d,0b00111001);
543
3c274a6e 544 &dec ($counter);
545 &jnz (".Loop_ssse3");
546
547$code.=<<___;
548 paddd 0x00(%rsp),$a
549 paddd 0x10(%rsp),$b
550 paddd 0x20(%rsp),$c
551 paddd 0x30(%rsp),$d
552
553 cmp \$64,$len
554 jb .Ltail_ssse3
555
556 movdqu 0x00($inp),$t
557 movdqu 0x10($inp),$t1
558 pxor $t,$a # xor with input
559 movdqu 0x20($inp),$t
560 pxor $t1,$b
561 movdqu 0x30($inp),$t1
562 lea 0x40($inp),$inp # inp+=64
563 pxor $t,$c
564 pxor $t1,$d
565
566 movdqu $a,0x00($out) # write output
567 movdqu $b,0x10($out)
568 movdqu $c,0x20($out)
569 movdqu $d,0x30($out)
570 lea 0x40($out),$out # out+=64
571
572 sub \$64,$len
573 jnz .Loop_outer_ssse3
574
575 jmp .Ldone_ssse3
576
577.align 16
578.Ltail_ssse3:
579 movdqa $a,0x00(%rsp)
580 movdqa $b,0x10(%rsp)
581 movdqa $c,0x20(%rsp)
582 movdqa $d,0x30(%rsp)
3c274a6e 583 xor $counter,$counter
584
585.Loop_tail_ssse3:
586 movzb ($inp,$counter),%eax
587 movzb (%rsp,$counter),%ecx
588 lea 1($counter),$counter
29880e97 589 xor %ecx,%eax
3c274a6e 590 mov %al,-1($out,$counter)
29880e97 591 dec $len
592 jnz .Loop_tail_ssse3
593
594.Ldone_ssse3:
595___
596$code.=<<___ if ($win64);
597 movaps -0x28(%r9),%xmm6
598 movaps -0x18(%r9),%xmm7
599___
600$code.=<<___;
384e6de4 601 lea (%r9),%rsp
f17652e5 602.cfi_def_cfa_register %rsp
384e6de4 603.Lssse3_epilogue:
a98c648e 604 ret
f17652e5 605.cfi_endproc
606.size ChaCha20_ssse3,.-ChaCha20_ssse3
607___
608}
609
610########################################################################
611# SSSE3 code path that handles 128-byte inputs
612{
613my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
614my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
615
616sub SSSE3ROUND_2x {
617 &paddd ($a,$b);
618 &pxor ($d,$a);
619 &paddd ($a1,$b1);
620 &pxor ($d1,$a1);
621 &pshufb ($d,$rot16);
622 &pshufb($d1,$rot16);
623
624 &paddd ($c,$d);
625 &paddd ($c1,$d1);
626 &pxor ($b,$c);
627 &pxor ($b1,$c1);
628 &movdqa ($t,$b);
629 &psrld ($b,20);
630 &movdqa($t1,$b1);
631 &pslld ($t,12);
632 &psrld ($b1,20);
633 &por ($b,$t);
634 &pslld ($t1,12);
635 &por ($b1,$t1);
636
637 &paddd ($a,$b);
638 &pxor ($d,$a);
639 &paddd ($a1,$b1);
640 &pxor ($d1,$a1);
641 &pshufb ($d,$rot24);
642 &pshufb($d1,$rot24);
643
644 &paddd ($c,$d);
645 &paddd ($c1,$d1);
646 &pxor ($b,$c);
647 &pxor ($b1,$c1);
648 &movdqa ($t,$b);
649 &psrld ($b,25);
650 &movdqa($t1,$b1);
651 &pslld ($t,7);
652 &psrld ($b1,25);
653 &por ($b,$t);
654 &pslld ($t1,7);
655 &por ($b1,$t1);
656}
657
658my $xframe = $win64 ? 0x68 : 8;
659
660$code.=<<___;
661.type ChaCha20_128,\@function,5
662.align 32
663ChaCha20_128:
664.cfi_startproc
665.LChaCha20_128:
666 mov %rsp,%r9 # frame pointer
667.cfi_def_cfa_register %r9
668 sub \$64+$xframe,%rsp
669___
670$code.=<<___ if ($win64);
671 movaps %xmm6,-0x68(%r9)
672 movaps %xmm7,-0x58(%r9)
673 movaps %xmm8,-0x48(%r9)
674 movaps %xmm9,-0x38(%r9)
675 movaps %xmm10,-0x28(%r9)
676 movaps %xmm11,-0x18(%r9)
677.L128_body:
678___
679$code.=<<___;
680 movdqa .Lsigma(%rip),$a
681 movdqu ($key),$b
682 movdqu 16($key),$c
683 movdqu ($counter),$d
684 movdqa .Lone(%rip),$d1
685 movdqa .Lrot16(%rip),$rot16
686 movdqa .Lrot24(%rip),$rot24
687
688 movdqa $a,$a1
689 movdqa $a,0x00(%rsp)
690 movdqa $b,$b1
691 movdqa $b,0x10(%rsp)
692 movdqa $c,$c1
693 movdqa $c,0x20(%rsp)
694 paddd $d,$d1
695 movdqa $d,0x30(%rsp)
696 mov \$10,$counter # reuse $counter
697 jmp .Loop_128
698
699.align 32
700.Loop_128:
701___
702 &SSSE3ROUND_2x();
703 &pshufd ($c,$c,0b01001110);
704 &pshufd ($b,$b,0b00111001);
705 &pshufd ($d,$d,0b10010011);
706 &pshufd ($c1,$c1,0b01001110);
707 &pshufd ($b1,$b1,0b00111001);
708 &pshufd ($d1,$d1,0b10010011);
709
710 &SSSE3ROUND_2x();
711 &pshufd ($c,$c,0b01001110);
712 &pshufd ($b,$b,0b10010011);
713 &pshufd ($d,$d,0b00111001);
714 &pshufd ($c1,$c1,0b01001110);
715 &pshufd ($b1,$b1,0b10010011);
716 &pshufd ($d1,$d1,0b00111001);
717
718 &dec ($counter);
719 &jnz (".Loop_128");
720
721$code.=<<___;
722 paddd 0x00(%rsp),$a
723 paddd 0x10(%rsp),$b
724 paddd 0x20(%rsp),$c
725 paddd 0x30(%rsp),$d
726 paddd .Lone(%rip),$d1
727 paddd 0x00(%rsp),$a1
728 paddd 0x10(%rsp),$b1
729 paddd 0x20(%rsp),$c1
730 paddd 0x30(%rsp),$d1
731
732 movdqu 0x00($inp),$t
733 movdqu 0x10($inp),$t1
734 pxor $t,$a # xor with input
735 movdqu 0x20($inp),$t
736 pxor $t1,$b
737 movdqu 0x30($inp),$t1
738 pxor $t,$c
739 movdqu 0x40($inp),$t
740 pxor $t1,$d
741 movdqu 0x50($inp),$t1
742 pxor $t,$a1
743 movdqu 0x60($inp),$t
744 pxor $t1,$b1
745 movdqu 0x70($inp),$t1
746 pxor $t,$c1
747 pxor $t1,$d1
748
749 movdqu $a,0x00($out) # write output
750 movdqu $b,0x10($out)
751 movdqu $c,0x20($out)
752 movdqu $d,0x30($out)
753 movdqu $a1,0x40($out)
754 movdqu $b1,0x50($out)
755 movdqu $c1,0x60($out)
756 movdqu $d1,0x70($out)
757___
758$code.=<<___ if ($win64);
759 movaps -0x68(%r9),%xmm6
760 movaps -0x58(%r9),%xmm7
761 movaps -0x48(%r9),%xmm8
762 movaps -0x38(%r9),%xmm9
763 movaps -0x28(%r9),%xmm10
764 movaps -0x18(%r9),%xmm11
765___
766$code.=<<___;
767 lea (%r9),%rsp
768.cfi_def_cfa_register %rsp
769.L128_epilogue:
770 ret
771.cfi_endproc
772.size ChaCha20_128,.-ChaCha20_128
773___
774}
775
776########################################################################
777# SSSE3 code path that handles longer messages.
778{
779# assign variables to favor Atom front-end
780my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
781 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
782my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
783 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
784
785sub SSSE3_lane_ROUND {
786my ($a0,$b0,$c0,$d0)=@_;
787my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
788my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
789my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
790my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
791my @x=map("\"$_\"",@xx);
792
793 # Consider order in which variables are addressed by their
794 # index:
795 #
796 # a b c d
797 #
798 # 0 4 8 12 < even round
799 # 1 5 9 13
800 # 2 6 10 14
801 # 3 7 11 15
802 # 0 5 10 15 < odd round
803 # 1 6 11 12
804 # 2 7 8 13
805 # 3 4 9 14
806 #
807 # 'a', 'b' and 'd's are permanently allocated in registers,
808 # @x[0..7,12..15], while 'c's are maintained in memory. If
809 # you observe 'c' column, you'll notice that pair of 'c's is
810 # invariant between rounds. This means that we have to reload
811 # them once per round, in the middle. This is why you'll see
812 # bunch of 'c' stores and loads in the middle, but none in
813 # the beginning or end.
814
815 (
816 "&paddd (@x[$a0],@x[$b0])", # Q1
817 "&paddd (@x[$a1],@x[$b1])", # Q2
818 "&pxor (@x[$d0],@x[$a0])",
819 "&pxor (@x[$d1],@x[$a1])",
820 "&pshufb (@x[$d0],$t1)",
821 "&pshufb (@x[$d1],$t1)",
822
823 "&paddd ($xc,@x[$d0])",
824 "&paddd ($xc_,@x[$d1])",
825 "&pxor (@x[$b0],$xc)",
826 "&pxor (@x[$b1],$xc_)",
827 "&movdqa ($t0,@x[$b0])",
828 "&pslld (@x[$b0],12)",
829 "&psrld ($t0,20)",
830 "&movdqa ($t1,@x[$b1])",
831 "&pslld (@x[$b1],12)",
832 "&por (@x[$b0],$t0)",
833 "&psrld ($t1,20)",
834 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
835 "&por (@x[$b1],$t1)",
836
837 "&paddd (@x[$a0],@x[$b0])",
838 "&paddd (@x[$a1],@x[$b1])",
839 "&pxor (@x[$d0],@x[$a0])",
840 "&pxor (@x[$d1],@x[$a1])",
841 "&pshufb (@x[$d0],$t0)",
842 "&pshufb (@x[$d1],$t0)",
843
844 "&paddd ($xc,@x[$d0])",
845 "&paddd ($xc_,@x[$d1])",
846 "&pxor (@x[$b0],$xc)",
847 "&pxor (@x[$b1],$xc_)",
848 "&movdqa ($t1,@x[$b0])",
849 "&pslld (@x[$b0],7)",
850 "&psrld ($t1,25)",
851 "&movdqa ($t0,@x[$b1])",
852 "&pslld (@x[$b1],7)",
853 "&por (@x[$b0],$t1)",
854 "&psrld ($t0,25)",
855 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
856 "&por (@x[$b1],$t0)",
857
858 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
859 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
860 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
861 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
862
863 "&paddd (@x[$a2],@x[$b2])", # Q3
864 "&paddd (@x[$a3],@x[$b3])", # Q4
865 "&pxor (@x[$d2],@x[$a2])",
866 "&pxor (@x[$d3],@x[$a3])",
867 "&pshufb (@x[$d2],$t1)",
868 "&pshufb (@x[$d3],$t1)",
869
870 "&paddd ($xc,@x[$d2])",
871 "&paddd ($xc_,@x[$d3])",
872 "&pxor (@x[$b2],$xc)",
873 "&pxor (@x[$b3],$xc_)",
874 "&movdqa ($t0,@x[$b2])",
875 "&pslld (@x[$b2],12)",
876 "&psrld ($t0,20)",
877 "&movdqa ($t1,@x[$b3])",
878 "&pslld (@x[$b3],12)",
879 "&por (@x[$b2],$t0)",
880 "&psrld ($t1,20)",
881 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
882 "&por (@x[$b3],$t1)",
883
884 "&paddd (@x[$a2],@x[$b2])",
885 "&paddd (@x[$a3],@x[$b3])",
886 "&pxor (@x[$d2],@x[$a2])",
887 "&pxor (@x[$d3],@x[$a3])",
888 "&pshufb (@x[$d2],$t0)",
889 "&pshufb (@x[$d3],$t0)",
890
891 "&paddd ($xc,@x[$d2])",
892 "&paddd ($xc_,@x[$d3])",
893 "&pxor (@x[$b2],$xc)",
894 "&pxor (@x[$b3],$xc_)",
895 "&movdqa ($t1,@x[$b2])",
896 "&pslld (@x[$b2],7)",
897 "&psrld ($t1,25)",
898 "&movdqa ($t0,@x[$b3])",
899 "&pslld (@x[$b3],7)",
900 "&por (@x[$b2],$t1)",
901 "&psrld ($t0,25)",
902 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
903 "&por (@x[$b3],$t0)"
904 );
905}
906
384e6de4 907my $xframe = $win64 ? 0xa8 : 8;
908
909$code.=<<___;
910.type ChaCha20_4x,\@function,5
911.align 32
912ChaCha20_4x:
f17652e5 913.cfi_startproc
a98c648e 914.LChaCha20_4x:
384e6de4 915 mov %rsp,%r9 # frame pointer
f17652e5 916.cfi_def_cfa_register %r9
917 mov %r10,%r11
918___
919$code.=<<___ if ($avx>1);
920 shr \$32,%r10 # OPENSSL_ia32cap_P+8
921 test \$`1<<5`,%r10 # test AVX2
922 jnz .LChaCha20_8x
923___
924$code.=<<___;
925 cmp \$192,$len
926 ja .Lproceed4x
927
928 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
929 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
930 je .Ldo_sse3_after_all # to detect Atom
931
932.Lproceed4x:
384e6de4 933 sub \$0x140+$xframe,%rsp
934___
935 ################ stack layout
936 # +0x00 SIMD equivalent of @x[8-12]
937 # ...
938 # +0x40 constant copy of key[0-2] smashed by lanes
939 # ...
940 # +0x100 SIMD counters (with nonce smashed by lanes)
941 # ...
942 # +0x140
943$code.=<<___ if ($win64);
944 movaps %xmm6,-0xa8(%r9)
945 movaps %xmm7,-0x98(%r9)
946 movaps %xmm8,-0x88(%r9)
947 movaps %xmm9,-0x78(%r9)
948 movaps %xmm10,-0x68(%r9)
949 movaps %xmm11,-0x58(%r9)
950 movaps %xmm12,-0x48(%r9)
951 movaps %xmm13,-0x38(%r9)
952 movaps %xmm14,-0x28(%r9)
953 movaps %xmm15,-0x18(%r9)
954.L4x_body:
955___
956$code.=<<___;
957 movdqa .Lsigma(%rip),$xa3 # key[0]
958 movdqu ($key),$xb3 # key[1]
959 movdqu 16($key),$xt3 # key[2]
960 movdqu ($counter),$xd3 # key[3]
961 lea 0x100(%rsp),%rcx # size optimization
962 lea .Lrot16(%rip),%r10
963 lea .Lrot24(%rip),%r11
964
965 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
966 pshufd \$0x55,$xa3,$xa1
967 movdqa $xa0,0x40(%rsp) # ... and offload
968 pshufd \$0xaa,$xa3,$xa2
969 movdqa $xa1,0x50(%rsp)
970 pshufd \$0xff,$xa3,$xa3
971 movdqa $xa2,0x60(%rsp)
972 movdqa $xa3,0x70(%rsp)
973
974 pshufd \$0x00,$xb3,$xb0
975 pshufd \$0x55,$xb3,$xb1
976 movdqa $xb0,0x80-0x100(%rcx)
977 pshufd \$0xaa,$xb3,$xb2
978 movdqa $xb1,0x90-0x100(%rcx)
979 pshufd \$0xff,$xb3,$xb3
980 movdqa $xb2,0xa0-0x100(%rcx)
981 movdqa $xb3,0xb0-0x100(%rcx)
982
983 pshufd \$0x00,$xt3,$xt0 # "$xc0"
984 pshufd \$0x55,$xt3,$xt1 # "$xc1"
985 movdqa $xt0,0xc0-0x100(%rcx)
986 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
987 movdqa $xt1,0xd0-0x100(%rcx)
988 pshufd \$0xff,$xt3,$xt3 # "$xc3"
989 movdqa $xt2,0xe0-0x100(%rcx)
990 movdqa $xt3,0xf0-0x100(%rcx)
991
992 pshufd \$0x00,$xd3,$xd0
993 pshufd \$0x55,$xd3,$xd1
994 paddd .Linc(%rip),$xd0 # don't save counters yet
995 pshufd \$0xaa,$xd3,$xd2
996 movdqa $xd1,0x110-0x100(%rcx)
997 pshufd \$0xff,$xd3,$xd3
998 movdqa $xd2,0x120-0x100(%rcx)
999 movdqa $xd3,0x130-0x100(%rcx)
1000
1001 jmp .Loop_enter4x
1002
1003.align 32
1004.Loop_outer4x:
1005 movdqa 0x40(%rsp),$xa0 # re-load smashed key
1006 movdqa 0x50(%rsp),$xa1
1007 movdqa 0x60(%rsp),$xa2
1008 movdqa 0x70(%rsp),$xa3
1009 movdqa 0x80-0x100(%rcx),$xb0
1010 movdqa 0x90-0x100(%rcx),$xb1
1011 movdqa 0xa0-0x100(%rcx),$xb2
1012 movdqa 0xb0-0x100(%rcx),$xb3
1013 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1014 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1015 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1016 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1017 movdqa 0x100-0x100(%rcx),$xd0
1018 movdqa 0x110-0x100(%rcx),$xd1
1019 movdqa 0x120-0x100(%rcx),$xd2
1020 movdqa 0x130-0x100(%rcx),$xd3
1021 paddd .Lfour(%rip),$xd0 # next SIMD counters
1022
1023.Loop_enter4x:
1024 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
1025 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
1026 movdqa (%r10),$xt3 # .Lrot16(%rip)
1027 mov \$10,%eax
1028 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1029 jmp .Loop4x
1030
1031.align 32
1032.Loop4x:
1033___
1034 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
1035 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
1036$code.=<<___;
1037 dec %eax
1038 jnz .Loop4x
1039
1040 paddd 0x40(%rsp),$xa0 # accumulate key material
1041 paddd 0x50(%rsp),$xa1
1042 paddd 0x60(%rsp),$xa2
1043 paddd 0x70(%rsp),$xa3
1044
1045 movdqa $xa0,$xt2 # "de-interlace" data
1046 punpckldq $xa1,$xa0
1047 movdqa $xa2,$xt3
1048 punpckldq $xa3,$xa2
1049 punpckhdq $xa1,$xt2
1050 punpckhdq $xa3,$xt3
1051 movdqa $xa0,$xa1
1052 punpcklqdq $xa2,$xa0 # "a0"
1053 movdqa $xt2,$xa3
1054 punpcklqdq $xt3,$xt2 # "a2"
1055 punpckhqdq $xa2,$xa1 # "a1"
1056 punpckhqdq $xt3,$xa3 # "a3"
1057___
1058 ($xa2,$xt2)=($xt2,$xa2);
1059$code.=<<___;
1060 paddd 0x80-0x100(%rcx),$xb0
1061 paddd 0x90-0x100(%rcx),$xb1
1062 paddd 0xa0-0x100(%rcx),$xb2
1063 paddd 0xb0-0x100(%rcx),$xb3
1064
1065 movdqa $xa0,0x00(%rsp) # offload $xaN
1066 movdqa $xa1,0x10(%rsp)
1067 movdqa 0x20(%rsp),$xa0 # "xc2"
1068 movdqa 0x30(%rsp),$xa1 # "xc3"
1069
1070 movdqa $xb0,$xt2
1071 punpckldq $xb1,$xb0
1072 movdqa $xb2,$xt3
1073 punpckldq $xb3,$xb2
1074 punpckhdq $xb1,$xt2
1075 punpckhdq $xb3,$xt3
1076 movdqa $xb0,$xb1
1077 punpcklqdq $xb2,$xb0 # "b0"
1078 movdqa $xt2,$xb3
1079 punpcklqdq $xt3,$xt2 # "b2"
1080 punpckhqdq $xb2,$xb1 # "b1"
1081 punpckhqdq $xt3,$xb3 # "b3"
1082___
1083 ($xb2,$xt2)=($xt2,$xb2);
1084 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1085$code.=<<___;
1086 paddd 0xc0-0x100(%rcx),$xc0
1087 paddd 0xd0-0x100(%rcx),$xc1
1088 paddd 0xe0-0x100(%rcx),$xc2
1089 paddd 0xf0-0x100(%rcx),$xc3
1090
1091 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
1092 movdqa $xa3,0x30(%rsp)
1093
1094 movdqa $xc0,$xt2
1095 punpckldq $xc1,$xc0
1096 movdqa $xc2,$xt3
1097 punpckldq $xc3,$xc2
1098 punpckhdq $xc1,$xt2
1099 punpckhdq $xc3,$xt3
1100 movdqa $xc0,$xc1
1101 punpcklqdq $xc2,$xc0 # "c0"
1102 movdqa $xt2,$xc3
1103 punpcklqdq $xt3,$xt2 # "c2"
1104 punpckhqdq $xc2,$xc1 # "c1"
1105 punpckhqdq $xt3,$xc3 # "c3"
1106___
1107 ($xc2,$xt2)=($xt2,$xc2);
1108 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
1109$code.=<<___;
1110 paddd 0x100-0x100(%rcx),$xd0
1111 paddd 0x110-0x100(%rcx),$xd1
1112 paddd 0x120-0x100(%rcx),$xd2
1113 paddd 0x130-0x100(%rcx),$xd3
1114
1115 movdqa $xd0,$xt2
1116 punpckldq $xd1,$xd0
1117 movdqa $xd2,$xt3
1118 punpckldq $xd3,$xd2
1119 punpckhdq $xd1,$xt2
1120 punpckhdq $xd3,$xt3
1121 movdqa $xd0,$xd1
1122 punpcklqdq $xd2,$xd0 # "d0"
1123 movdqa $xt2,$xd3
1124 punpcklqdq $xt3,$xt2 # "d2"
1125 punpckhqdq $xd2,$xd1 # "d1"
1126 punpckhqdq $xt3,$xd3 # "d3"
1127___
1128 ($xd2,$xt2)=($xt2,$xd2);
1129$code.=<<___;
1130 cmp \$64*4,$len
1131 jb .Ltail4x
1132
1133 movdqu 0x00($inp),$xt0 # xor with input
1134 movdqu 0x10($inp),$xt1
1135 movdqu 0x20($inp),$xt2
1136 movdqu 0x30($inp),$xt3
1137 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1138 pxor $xb0,$xt1
1139 pxor $xc0,$xt2
1140 pxor $xd0,$xt3
1141
1142 movdqu $xt0,0x00($out)
1143 movdqu 0x40($inp),$xt0
1144 movdqu $xt1,0x10($out)
1145 movdqu 0x50($inp),$xt1
1146 movdqu $xt2,0x20($out)
1147 movdqu 0x60($inp),$xt2
1148 movdqu $xt3,0x30($out)
1149 movdqu 0x70($inp),$xt3
1150 lea 0x80($inp),$inp # size optimization
1151 pxor 0x10(%rsp),$xt0
1152 pxor $xb1,$xt1
1153 pxor $xc1,$xt2
1154 pxor $xd1,$xt3
1155
1156 movdqu $xt0,0x40($out)
1157 movdqu 0x00($inp),$xt0
1158 movdqu $xt1,0x50($out)
1159 movdqu 0x10($inp),$xt1
1160 movdqu $xt2,0x60($out)
1161 movdqu 0x20($inp),$xt2
1162 movdqu $xt3,0x70($out)
1163 lea 0x80($out),$out # size optimization
1164 movdqu 0x30($inp),$xt3
1165 pxor 0x20(%rsp),$xt0
1166 pxor $xb2,$xt1
1167 pxor $xc2,$xt2
1168 pxor $xd2,$xt3
1169
1170 movdqu $xt0,0x00($out)
1171 movdqu 0x40($inp),$xt0
1172 movdqu $xt1,0x10($out)
1173 movdqu 0x50($inp),$xt1
1174 movdqu $xt2,0x20($out)
1175 movdqu 0x60($inp),$xt2
1176 movdqu $xt3,0x30($out)
1177 movdqu 0x70($inp),$xt3
1178 lea 0x80($inp),$inp # inp+=64*4
1179 pxor 0x30(%rsp),$xt0
1180 pxor $xb3,$xt1
1181 pxor $xc3,$xt2
1182 pxor $xd3,$xt3
1183 movdqu $xt0,0x40($out)
1184 movdqu $xt1,0x50($out)
1185 movdqu $xt2,0x60($out)
1186 movdqu $xt3,0x70($out)
1187 lea 0x80($out),$out # out+=64*4
1188
1189 sub \$64*4,$len
1190 jnz .Loop_outer4x
1191
1192 jmp .Ldone4x
1193
1194.Ltail4x:
1195 cmp \$192,$len
1196 jae .L192_or_more4x
1197 cmp \$128,$len
1198 jae .L128_or_more4x
1199 cmp \$64,$len
1200 jae .L64_or_more4x
1201
1202 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1203 xor %r10,%r10
1204 #movdqa $xt0,0x00(%rsp)
1205 movdqa $xb0,0x10(%rsp)
1206 movdqa $xc0,0x20(%rsp)
1207 movdqa $xd0,0x30(%rsp)
1208 jmp .Loop_tail4x
1209
1210.align 32
1211.L64_or_more4x:
1212 movdqu 0x00($inp),$xt0 # xor with input
1213 movdqu 0x10($inp),$xt1
1214 movdqu 0x20($inp),$xt2
1215 movdqu 0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1217 pxor $xb0,$xt1
1218 pxor $xc0,$xt2
1219 pxor $xd0,$xt3
1220 movdqu $xt0,0x00($out)
1221 movdqu $xt1,0x10($out)
1222 movdqu $xt2,0x20($out)
1223 movdqu $xt3,0x30($out)
1224 je .Ldone4x
1225
1226 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1227 lea 0x40($inp),$inp # inp+=64*1
1228 xor %r10,%r10
1229 movdqa $xt0,0x00(%rsp)
1230 movdqa $xb1,0x10(%rsp)
1231 lea 0x40($out),$out # out+=64*1
1232 movdqa $xc1,0x20(%rsp)
1233 sub \$64,$len # len-=64*1
1234 movdqa $xd1,0x30(%rsp)
1235 jmp .Loop_tail4x
1236
1237.align 32
1238.L128_or_more4x:
1239 movdqu 0x00($inp),$xt0 # xor with input
1240 movdqu 0x10($inp),$xt1
1241 movdqu 0x20($inp),$xt2
1242 movdqu 0x30($inp),$xt3
1243 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1244 pxor $xb0,$xt1
1245 pxor $xc0,$xt2
1246 pxor $xd0,$xt3
1247
1248 movdqu $xt0,0x00($out)
1249 movdqu 0x40($inp),$xt0
1250 movdqu $xt1,0x10($out)
1251 movdqu 0x50($inp),$xt1
1252 movdqu $xt2,0x20($out)
1253 movdqu 0x60($inp),$xt2
1254 movdqu $xt3,0x30($out)
1255 movdqu 0x70($inp),$xt3
1256 pxor 0x10(%rsp),$xt0
1257 pxor $xb1,$xt1
1258 pxor $xc1,$xt2
1259 pxor $xd1,$xt3
1260 movdqu $xt0,0x40($out)
1261 movdqu $xt1,0x50($out)
1262 movdqu $xt2,0x60($out)
1263 movdqu $xt3,0x70($out)
1264 je .Ldone4x
1265
1266 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1267 lea 0x80($inp),$inp # inp+=64*2
1268 xor %r10,%r10
1269 movdqa $xt0,0x00(%rsp)
1270 movdqa $xb2,0x10(%rsp)
1271 lea 0x80($out),$out # out+=64*2
1272 movdqa $xc2,0x20(%rsp)
1273 sub \$128,$len # len-=64*2
1274 movdqa $xd2,0x30(%rsp)
1275 jmp .Loop_tail4x
1276
1277.align 32
1278.L192_or_more4x:
1279 movdqu 0x00($inp),$xt0 # xor with input
1280 movdqu 0x10($inp),$xt1
1281 movdqu 0x20($inp),$xt2
1282 movdqu 0x30($inp),$xt3
1283 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1284 pxor $xb0,$xt1
1285 pxor $xc0,$xt2
1286 pxor $xd0,$xt3
1287
1288 movdqu $xt0,0x00($out)
1289 movdqu 0x40($inp),$xt0
1290 movdqu $xt1,0x10($out)
1291 movdqu 0x50($inp),$xt1
1292 movdqu $xt2,0x20($out)
1293 movdqu 0x60($inp),$xt2
1294 movdqu $xt3,0x30($out)
1295 movdqu 0x70($inp),$xt3
1296 lea 0x80($inp),$inp # size optimization
1297 pxor 0x10(%rsp),$xt0
1298 pxor $xb1,$xt1
1299 pxor $xc1,$xt2
1300 pxor $xd1,$xt3
1301
1302 movdqu $xt0,0x40($out)
1303 movdqu 0x00($inp),$xt0
1304 movdqu $xt1,0x50($out)
1305 movdqu 0x10($inp),$xt1
1306 movdqu $xt2,0x60($out)
1307 movdqu 0x20($inp),$xt2
1308 movdqu $xt3,0x70($out)
1309 lea 0x80($out),$out # size optimization
1310 movdqu 0x30($inp),$xt3
1311 pxor 0x20(%rsp),$xt0
1312 pxor $xb2,$xt1
1313 pxor $xc2,$xt2
1314 pxor $xd2,$xt3
1315 movdqu $xt0,0x00($out)
1316 movdqu $xt1,0x10($out)
1317 movdqu $xt2,0x20($out)
1318 movdqu $xt3,0x30($out)
1319 je .Ldone4x
1320
1321 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1322 lea 0x40($inp),$inp # inp+=64*3
1323 xor %r10,%r10
1324 movdqa $xt0,0x00(%rsp)
1325 movdqa $xb3,0x10(%rsp)
1326 lea 0x40($out),$out # out+=64*3
1327 movdqa $xc3,0x20(%rsp)
1328 sub \$192,$len # len-=64*3
1329 movdqa $xd3,0x30(%rsp)
1330
1331.Loop_tail4x:
1332 movzb ($inp,%r10),%eax
1333 movzb (%rsp,%r10),%ecx
1334 lea 1(%r10),%r10
1335 xor %ecx,%eax
1336 mov %al,-1($out,%r10)
1337 dec $len
1338 jnz .Loop_tail4x
1339
1340.Ldone4x:
1341___
1342$code.=<<___ if ($win64);
1343 movaps -0xa8(%r9),%xmm6
1344 movaps -0x98(%r9),%xmm7
1345 movaps -0x88(%r9),%xmm8
1346 movaps -0x78(%r9),%xmm9
1347 movaps -0x68(%r9),%xmm10
1348 movaps -0x58(%r9),%xmm11
1349 movaps -0x48(%r9),%xmm12
1350 movaps -0x38(%r9),%xmm13
1351 movaps -0x28(%r9),%xmm14
1352 movaps -0x18(%r9),%xmm15
1353___
1354$code.=<<___;
384e6de4 1355 lea (%r9),%rsp
f17652e5 1356.cfi_def_cfa_register %rsp
384e6de4 1357.L4x_epilogue:
a98c648e 1358 ret
f17652e5 1359.cfi_endproc
1360.size ChaCha20_4x,.-ChaCha20_4x
1361___
1362}
1363
1364########################################################################
1365# XOP code path that handles all lengths.
1366if ($avx) {
# There is some "anomaly" observed depending on instruction size or
# alignment. If you look closely at the code below you'll notice that
# the argument order sometimes varies. The order affects instruction
# encoding by making it larger, and such fiddling gives a 5% performance
# improvement. This is on FX-4100...
1372
1373my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1374 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1375my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1376 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1377
1378sub XOP_lane_ROUND {
1379my ($a0,$b0,$c0,$d0)=@_;
1380my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1381my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1382my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1383my @x=map("\"$_\"",@xx);
1384
1385 (
1386 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1387 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1388 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1389 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1390 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1391 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1392 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1393 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1394 "&vprotd (@x[$d0],@x[$d0],16)",
1395 "&vprotd (@x[$d1],@x[$d1],16)",
1396 "&vprotd (@x[$d2],@x[$d2],16)",
1397 "&vprotd (@x[$d3],@x[$d3],16)",
1398
1399 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1400 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1401 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1402 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1403 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1404 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1405 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1406 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1407 "&vprotd (@x[$b0],@x[$b0],12)",
1408 "&vprotd (@x[$b1],@x[$b1],12)",
1409 "&vprotd (@x[$b2],@x[$b2],12)",
1410 "&vprotd (@x[$b3],@x[$b3],12)",
1411
1412 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1413 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1414 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1415 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1416 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1417 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1418 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1419 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1420 "&vprotd (@x[$d0],@x[$d0],8)",
1421 "&vprotd (@x[$d1],@x[$d1],8)",
1422 "&vprotd (@x[$d2],@x[$d2],8)",
1423 "&vprotd (@x[$d3],@x[$d3],8)",
1424
1425 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1426 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1427 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1428 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1429 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1430 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1431 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1432 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1433 "&vprotd (@x[$b0],@x[$b0],7)",
1434 "&vprotd (@x[$b1],@x[$b1],7)",
1435 "&vprotd (@x[$b2],@x[$b2],7)",
1436 "&vprotd (@x[$b3],@x[$b3],7)"
1437 );
1438}
1439
384e6de4 1440my $xframe = $win64 ? 0xa8 : 8;
1441
1442$code.=<<___;
1443.type ChaCha20_4xop,\@function,5
1444.align 32
1445ChaCha20_4xop:
f17652e5 1446.cfi_startproc
a98c648e 1447.LChaCha20_4xop:
384e6de4 1448 mov %rsp,%r9 # frame pointer
f17652e5 1449.cfi_def_cfa_register %r9
384e6de4 1450 sub \$0x140+$xframe,%rsp
1451___
1452 ################ stack layout
1453 # +0x00 SIMD equivalent of @x[8-12]
1454 # ...
1455 # +0x40 constant copy of key[0-2] smashed by lanes
1456 # ...
1457 # +0x100 SIMD counters (with nonce smashed by lanes)
1458 # ...
1459 # +0x140
1460$code.=<<___ if ($win64);
1461 movaps %xmm6,-0xa8(%r9)
1462 movaps %xmm7,-0x98(%r9)
1463 movaps %xmm8,-0x88(%r9)
1464 movaps %xmm9,-0x78(%r9)
1465 movaps %xmm10,-0x68(%r9)
1466 movaps %xmm11,-0x58(%r9)
1467 movaps %xmm12,-0x48(%r9)
1468 movaps %xmm13,-0x38(%r9)
1469 movaps %xmm14,-0x28(%r9)
1470 movaps %xmm15,-0x18(%r9)
1471.L4xop_body:
1472___
1473$code.=<<___;
1474 vzeroupper
1475
1476 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1477 vmovdqu ($key),$xb3 # key[1]
1478 vmovdqu 16($key),$xt3 # key[2]
1479 vmovdqu ($counter),$xd3 # key[3]
1480 lea 0x100(%rsp),%rcx # size optimization
1481
1482 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1483 vpshufd \$0x55,$xa3,$xa1
1484 vmovdqa $xa0,0x40(%rsp) # ... and offload
1485 vpshufd \$0xaa,$xa3,$xa2
1486 vmovdqa $xa1,0x50(%rsp)
1487 vpshufd \$0xff,$xa3,$xa3
1488 vmovdqa $xa2,0x60(%rsp)
1489 vmovdqa $xa3,0x70(%rsp)
1490
1491 vpshufd \$0x00,$xb3,$xb0
1492 vpshufd \$0x55,$xb3,$xb1
1493 vmovdqa $xb0,0x80-0x100(%rcx)
1494 vpshufd \$0xaa,$xb3,$xb2
1495 vmovdqa $xb1,0x90-0x100(%rcx)
1496 vpshufd \$0xff,$xb3,$xb3
1497 vmovdqa $xb2,0xa0-0x100(%rcx)
1498 vmovdqa $xb3,0xb0-0x100(%rcx)
1499
1500 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1501 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1502 vmovdqa $xt0,0xc0-0x100(%rcx)
1503 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1504 vmovdqa $xt1,0xd0-0x100(%rcx)
1505 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1506 vmovdqa $xt2,0xe0-0x100(%rcx)
1507 vmovdqa $xt3,0xf0-0x100(%rcx)
1508
1509 vpshufd \$0x00,$xd3,$xd0
1510 vpshufd \$0x55,$xd3,$xd1
1511 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1512 vpshufd \$0xaa,$xd3,$xd2
1513 vmovdqa $xd1,0x110-0x100(%rcx)
1514 vpshufd \$0xff,$xd3,$xd3
1515 vmovdqa $xd2,0x120-0x100(%rcx)
1516 vmovdqa $xd3,0x130-0x100(%rcx)
1517
1518 jmp .Loop_enter4xop
1519
1520.align 32
1521.Loop_outer4xop:
1522 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1523 vmovdqa 0x50(%rsp),$xa1
1524 vmovdqa 0x60(%rsp),$xa2
1525 vmovdqa 0x70(%rsp),$xa3
1526 vmovdqa 0x80-0x100(%rcx),$xb0
1527 vmovdqa 0x90-0x100(%rcx),$xb1
1528 vmovdqa 0xa0-0x100(%rcx),$xb2
1529 vmovdqa 0xb0-0x100(%rcx),$xb3
1530 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1531 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1532 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1533 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1534 vmovdqa 0x100-0x100(%rcx),$xd0
1535 vmovdqa 0x110-0x100(%rcx),$xd1
1536 vmovdqa 0x120-0x100(%rcx),$xd2
1537 vmovdqa 0x130-0x100(%rcx),$xd3
1538 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1539
1540.Loop_enter4xop:
1541 mov \$10,%eax
1542 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1543 jmp .Loop4xop
1544
1545.align 32
1546.Loop4xop:
1547___
1548 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1549 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1550$code.=<<___;
1551 dec %eax
1552 jnz .Loop4xop
1553
1554 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1555 vpaddd 0x50(%rsp),$xa1,$xa1
1556 vpaddd 0x60(%rsp),$xa2,$xa2
1557 vpaddd 0x70(%rsp),$xa3,$xa3
1558
1559 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1560 vmovdqa $xt3,0x30(%rsp)
1561
1562 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1563 vpunpckldq $xa3,$xa2,$xt3
1564 vpunpckhdq $xa1,$xa0,$xa0
1565 vpunpckhdq $xa3,$xa2,$xa2
1566 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1567 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1568 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1569 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1570___
1571 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1572$code.=<<___;
1573 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1574 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1575 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1576 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1577
1578 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1579 vmovdqa $xa1,0x10(%rsp)
1580 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1581 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1582
1583 vpunpckldq $xb1,$xb0,$xt2
1584 vpunpckldq $xb3,$xb2,$xt3
1585 vpunpckhdq $xb1,$xb0,$xb0
1586 vpunpckhdq $xb3,$xb2,$xb2
1587 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1588 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1589 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1590 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1591___
1592 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1593 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1594$code.=<<___;
1595 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1596 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1597 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1598 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1599
1600 vpunpckldq $xc1,$xc0,$xt2
1601 vpunpckldq $xc3,$xc2,$xt3
1602 vpunpckhdq $xc1,$xc0,$xc0
1603 vpunpckhdq $xc3,$xc2,$xc2
1604 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1605 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1606 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1607 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1608___
1609 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1610$code.=<<___;
1611 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1612 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1613 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1614 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1615
1616 vpunpckldq $xd1,$xd0,$xt2
1617 vpunpckldq $xd3,$xd2,$xt3
1618 vpunpckhdq $xd1,$xd0,$xd0
1619 vpunpckhdq $xd3,$xd2,$xd2
1620 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1621 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1622 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1623 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1624___
1625 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1626 ($xa0,$xa1)=($xt2,$xt3);
1627$code.=<<___;
1628 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1629 vmovdqa 0x10(%rsp),$xa1
1630
1631 cmp \$64*4,$len
1632 jb .Ltail4xop
1633
1634 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1635 vpxor 0x10($inp),$xb0,$xb0
1636 vpxor 0x20($inp),$xc0,$xc0
1637 vpxor 0x30($inp),$xd0,$xd0
1638 vpxor 0x40($inp),$xa1,$xa1
1639 vpxor 0x50($inp),$xb1,$xb1
1640 vpxor 0x60($inp),$xc1,$xc1
1641 vpxor 0x70($inp),$xd1,$xd1
1642 lea 0x80($inp),$inp # size optimization
1643 vpxor 0x00($inp),$xa2,$xa2
1644 vpxor 0x10($inp),$xb2,$xb2
1645 vpxor 0x20($inp),$xc2,$xc2
1646 vpxor 0x30($inp),$xd2,$xd2
1647 vpxor 0x40($inp),$xa3,$xa3
1648 vpxor 0x50($inp),$xb3,$xb3
1649 vpxor 0x60($inp),$xc3,$xc3
1650 vpxor 0x70($inp),$xd3,$xd3
1651 lea 0x80($inp),$inp # inp+=64*4
1652
1653 vmovdqu $xa0,0x00($out)
1654 vmovdqu $xb0,0x10($out)
1655 vmovdqu $xc0,0x20($out)
1656 vmovdqu $xd0,0x30($out)
1657 vmovdqu $xa1,0x40($out)
1658 vmovdqu $xb1,0x50($out)
1659 vmovdqu $xc1,0x60($out)
1660 vmovdqu $xd1,0x70($out)
1661 lea 0x80($out),$out # size optimization
1662 vmovdqu $xa2,0x00($out)
1663 vmovdqu $xb2,0x10($out)
1664 vmovdqu $xc2,0x20($out)
1665 vmovdqu $xd2,0x30($out)
1666 vmovdqu $xa3,0x40($out)
1667 vmovdqu $xb3,0x50($out)
1668 vmovdqu $xc3,0x60($out)
1669 vmovdqu $xd3,0x70($out)
1670 lea 0x80($out),$out # out+=64*4
1671
1672 sub \$64*4,$len
1673 jnz .Loop_outer4xop
1674
1675 jmp .Ldone4xop
1676
1677.align 32
1678.Ltail4xop:
1679 cmp \$192,$len
1680 jae .L192_or_more4xop
1681 cmp \$128,$len
1682 jae .L128_or_more4xop
1683 cmp \$64,$len
1684 jae .L64_or_more4xop
1685
1686 xor %r10,%r10
1687 vmovdqa $xa0,0x00(%rsp)
1688 vmovdqa $xb0,0x10(%rsp)
1689 vmovdqa $xc0,0x20(%rsp)
1690 vmovdqa $xd0,0x30(%rsp)
1691 jmp .Loop_tail4xop
1692
1693.align 32
1694.L64_or_more4xop:
1695 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1696 vpxor 0x10($inp),$xb0,$xb0
1697 vpxor 0x20($inp),$xc0,$xc0
1698 vpxor 0x30($inp),$xd0,$xd0
1699 vmovdqu $xa0,0x00($out)
1700 vmovdqu $xb0,0x10($out)
1701 vmovdqu $xc0,0x20($out)
1702 vmovdqu $xd0,0x30($out)
1703 je .Ldone4xop
1704
1705 lea 0x40($inp),$inp # inp+=64*1
1706 vmovdqa $xa1,0x00(%rsp)
1707 xor %r10,%r10
1708 vmovdqa $xb1,0x10(%rsp)
1709 lea 0x40($out),$out # out+=64*1
1710 vmovdqa $xc1,0x20(%rsp)
1711 sub \$64,$len # len-=64*1
1712 vmovdqa $xd1,0x30(%rsp)
1713 jmp .Loop_tail4xop
1714
1715.align 32
1716.L128_or_more4xop:
1717 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1718 vpxor 0x10($inp),$xb0,$xb0
1719 vpxor 0x20($inp),$xc0,$xc0
1720 vpxor 0x30($inp),$xd0,$xd0
1721 vpxor 0x40($inp),$xa1,$xa1
1722 vpxor 0x50($inp),$xb1,$xb1
1723 vpxor 0x60($inp),$xc1,$xc1
1724 vpxor 0x70($inp),$xd1,$xd1
1725
1726 vmovdqu $xa0,0x00($out)
1727 vmovdqu $xb0,0x10($out)
1728 vmovdqu $xc0,0x20($out)
1729 vmovdqu $xd0,0x30($out)
1730 vmovdqu $xa1,0x40($out)
1731 vmovdqu $xb1,0x50($out)
1732 vmovdqu $xc1,0x60($out)
1733 vmovdqu $xd1,0x70($out)
1734 je .Ldone4xop
1735
1736 lea 0x80($inp),$inp # inp+=64*2
1737 vmovdqa $xa2,0x00(%rsp)
1738 xor %r10,%r10
1739 vmovdqa $xb2,0x10(%rsp)
1740 lea 0x80($out),$out # out+=64*2
1741 vmovdqa $xc2,0x20(%rsp)
1742 sub \$128,$len # len-=64*2
1743 vmovdqa $xd2,0x30(%rsp)
1744 jmp .Loop_tail4xop
1745
1746.align 32
1747.L192_or_more4xop:
1748 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1749 vpxor 0x10($inp),$xb0,$xb0
1750 vpxor 0x20($inp),$xc0,$xc0
1751 vpxor 0x30($inp),$xd0,$xd0
1752 vpxor 0x40($inp),$xa1,$xa1
1753 vpxor 0x50($inp),$xb1,$xb1
1754 vpxor 0x60($inp),$xc1,$xc1
1755 vpxor 0x70($inp),$xd1,$xd1
1756 lea 0x80($inp),$inp # size optimization
1757 vpxor 0x00($inp),$xa2,$xa2
1758 vpxor 0x10($inp),$xb2,$xb2
1759 vpxor 0x20($inp),$xc2,$xc2
1760 vpxor 0x30($inp),$xd2,$xd2
1761
1762 vmovdqu $xa0,0x00($out)
1763 vmovdqu $xb0,0x10($out)
1764 vmovdqu $xc0,0x20($out)
1765 vmovdqu $xd0,0x30($out)
1766 vmovdqu $xa1,0x40($out)
1767 vmovdqu $xb1,0x50($out)
1768 vmovdqu $xc1,0x60($out)
1769 vmovdqu $xd1,0x70($out)
1770 lea 0x80($out),$out # size optimization
1771 vmovdqu $xa2,0x00($out)
1772 vmovdqu $xb2,0x10($out)
1773 vmovdqu $xc2,0x20($out)
1774 vmovdqu $xd2,0x30($out)
1775 je .Ldone4xop
1776
1777 lea 0x40($inp),$inp # inp+=64*3
f2188228 1778 vmovdqa $xa3,0x00(%rsp)
a98c648e 1779 xor %r10,%r10
f2188228 1780 vmovdqa $xb3,0x10(%rsp)
a98c648e 1781 lea 0x40($out),$out # out+=64*3
f2188228 1782 vmovdqa $xc3,0x20(%rsp)
a98c648e 1783 sub \$192,$len # len-=64*3
f2188228 1784 vmovdqa $xd3,0x30(%rsp)
1785
1786.Loop_tail4xop:
1787 movzb ($inp,%r10),%eax
1788 movzb (%rsp,%r10),%ecx
1789 lea 1(%r10),%r10
1790 xor %ecx,%eax
1791 mov %al,-1($out,%r10)
1792 dec $len
1793 jnz .Loop_tail4xop
1794
1795.Ldone4xop:
1796 vzeroupper
1797___
1798$code.=<<___ if ($win64);
1799 movaps -0xa8(%r9),%xmm6
1800 movaps -0x98(%r9),%xmm7
1801 movaps -0x88(%r9),%xmm8
1802 movaps -0x78(%r9),%xmm9
1803 movaps -0x68(%r9),%xmm10
1804 movaps -0x58(%r9),%xmm11
1805 movaps -0x48(%r9),%xmm12
1806 movaps -0x38(%r9),%xmm13
1807 movaps -0x28(%r9),%xmm14
1808 movaps -0x18(%r9),%xmm15
1809___
1810$code.=<<___;
384e6de4 1811 lea (%r9),%rsp
f17652e5 1812.cfi_def_cfa_register %rsp
384e6de4 1813.L4xop_epilogue:
a98c648e 1814 ret
f17652e5 1815.cfi_endproc
1816.size ChaCha20_4xop,.-ChaCha20_4xop
1817___
1818}
1819
1820########################################################################
1821# AVX2 code path
1822if ($avx>1) {
1823my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1824 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1825my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1826 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1827
1828sub AVX2_lane_ROUND {
1829my ($a0,$b0,$c0,$d0)=@_;
1830my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1831my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1832my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1833my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1834my @x=map("\"$_\"",@xx);
1835
1836 # Consider order in which variables are addressed by their
1837 # index:
1838 #
1839 # a b c d
1840 #
1841 # 0 4 8 12 < even round
1842 # 1 5 9 13
1843 # 2 6 10 14
1844 # 3 7 11 15
1845 # 0 5 10 15 < odd round
1846 # 1 6 11 12
1847 # 2 7 8 13
1848 # 3 4 9 14
1849 #
1850 # 'a', 'b' and 'd's are permanently allocated in registers,
1851 # @x[0..7,12..15], while 'c's are maintained in memory. If
1852 # you observe 'c' column, you'll notice that pair of 'c's is
1853 # invariant between rounds. This means that we have to reload
1854 # them once per round, in the middle. This is why you'll see
1855 # bunch of 'c' stores and loads in the middle, but none in
1856 # the beginning or end.
1857
1858 (
1859 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1860 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1861 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1862 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1863 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1864 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1865
1866 "&vpaddd ($xc,$xc,@x[$d0])",
1867 "&vpxor (@x[$b0],$xc,@x[$b0])",
1868 "&vpslld ($t0,@x[$b0],12)",
1869 "&vpsrld (@x[$b0],@x[$b0],20)",
1870 "&vpor (@x[$b0],$t0,@x[$b0])",
1871 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1872 "&vpaddd ($xc_,$xc_,@x[$d1])",
1873 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1874 "&vpslld ($t1,@x[$b1],12)",
1875 "&vpsrld (@x[$b1],@x[$b1],20)",
1876 "&vpor (@x[$b1],$t1,@x[$b1])",
1877
1878 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1879 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1880 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1881 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1882 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1883 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1884
1885 "&vpaddd ($xc,$xc,@x[$d0])",
1886 "&vpxor (@x[$b0],$xc,@x[$b0])",
1887 "&vpslld ($t1,@x[$b0],7)",
1888 "&vpsrld (@x[$b0],@x[$b0],25)",
1889 "&vpor (@x[$b0],$t1,@x[$b0])",
1890 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1891 "&vpaddd ($xc_,$xc_,@x[$d1])",
1892 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1893 "&vpslld ($t0,@x[$b1],7)",
1894 "&vpsrld (@x[$b1],@x[$b1],25)",
1895 "&vpor (@x[$b1],$t0,@x[$b1])",
1896
1897 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1898 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1899 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1900 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1901
1902 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1903 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1904 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1905 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1906 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1907 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1908
1909 "&vpaddd ($xc,$xc,@x[$d2])",
1910 "&vpxor (@x[$b2],$xc,@x[$b2])",
1911 "&vpslld ($t0,@x[$b2],12)",
1912 "&vpsrld (@x[$b2],@x[$b2],20)",
1913 "&vpor (@x[$b2],$t0,@x[$b2])",
1914 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1915 "&vpaddd ($xc_,$xc_,@x[$d3])",
1916 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1917 "&vpslld ($t1,@x[$b3],12)",
1918 "&vpsrld (@x[$b3],@x[$b3],20)",
1919 "&vpor (@x[$b3],$t1,@x[$b3])",
1920
1921 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1922 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1923 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1924 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1925 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1926 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1927
1928 "&vpaddd ($xc,$xc,@x[$d2])",
1929 "&vpxor (@x[$b2],$xc,@x[$b2])",
1930 "&vpslld ($t1,@x[$b2],7)",
1931 "&vpsrld (@x[$b2],@x[$b2],25)",
1932 "&vpor (@x[$b2],$t1,@x[$b2])",
1933 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1934 "&vpaddd ($xc_,$xc_,@x[$d3])",
1935 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1936 "&vpslld ($t0,@x[$b3],7)",
1937 "&vpsrld (@x[$b3],@x[$b3],25)",
1938 "&vpor (@x[$b3],$t0,@x[$b3])"
1939 );
1940}
1941
384e6de4 1942my $xframe = $win64 ? 0xa8 : 8;
1943
1944$code.=<<___;
1945.type ChaCha20_8x,\@function,5
1946.align 32
1947ChaCha20_8x:
f17652e5 1948.cfi_startproc
a98c648e 1949.LChaCha20_8x:
384e6de4 1950 mov %rsp,%r9 # frame register
f17652e5 1951.cfi_def_cfa_register %r9
1952 sub \$0x280+$xframe,%rsp
1953 and \$-32,%rsp
1954___
1955$code.=<<___ if ($win64);
1956 movaps %xmm6,-0xa8(%r9)
1957 movaps %xmm7,-0x98(%r9)
1958 movaps %xmm8,-0x88(%r9)
1959 movaps %xmm9,-0x78(%r9)
1960 movaps %xmm10,-0x68(%r9)
1961 movaps %xmm11,-0x58(%r9)
1962 movaps %xmm12,-0x48(%r9)
1963 movaps %xmm13,-0x38(%r9)
1964 movaps %xmm14,-0x28(%r9)
1965 movaps %xmm15,-0x18(%r9)
1966.L8x_body:
1967___
1968$code.=<<___;
1969 vzeroupper
1970
1971 ################ stack layout
1972 # +0x00 SIMD equivalent of @x[8-12]
1973 # ...
1974 # +0x80 constant copy of key[0-2] smashed by lanes
1975 # ...
1976 # +0x200 SIMD counters (with nonce smashed by lanes)
1977 # ...
384e6de4 1978 # +0x280
1979
1980 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1981 vbroadcasti128 ($key),$xb3 # key[1]
1982 vbroadcasti128 16($key),$xt3 # key[2]
1983 vbroadcasti128 ($counter),$xd3 # key[3]
1984 lea 0x100(%rsp),%rcx # size optimization
1985 lea 0x200(%rsp),%rax # size optimization
1986 lea .Lrot16(%rip),%r10
1987 lea .Lrot24(%rip),%r11
1988
1989 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1990 vpshufd \$0x55,$xa3,$xa1
1991 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1992 vpshufd \$0xaa,$xa3,$xa2
1993 vmovdqa $xa1,0xa0-0x100(%rcx)
1994 vpshufd \$0xff,$xa3,$xa3
1995 vmovdqa $xa2,0xc0-0x100(%rcx)
1996 vmovdqa $xa3,0xe0-0x100(%rcx)
1997
1998 vpshufd \$0x00,$xb3,$xb0
1999 vpshufd \$0x55,$xb3,$xb1
2000 vmovdqa $xb0,0x100-0x100(%rcx)
2001 vpshufd \$0xaa,$xb3,$xb2
2002 vmovdqa $xb1,0x120-0x100(%rcx)
2003 vpshufd \$0xff,$xb3,$xb3
2004 vmovdqa $xb2,0x140-0x100(%rcx)
2005 vmovdqa $xb3,0x160-0x100(%rcx)
2006
2007 vpshufd \$0x00,$xt3,$xt0 # "xc0"
2008 vpshufd \$0x55,$xt3,$xt1 # "xc1"
2009 vmovdqa $xt0,0x180-0x200(%rax)
2010 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
2011 vmovdqa $xt1,0x1a0-0x200(%rax)
2012 vpshufd \$0xff,$xt3,$xt3 # "xc3"
2013 vmovdqa $xt2,0x1c0-0x200(%rax)
2014 vmovdqa $xt3,0x1e0-0x200(%rax)
2015
2016 vpshufd \$0x00,$xd3,$xd0
2017 vpshufd \$0x55,$xd3,$xd1
2018 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
2019 vpshufd \$0xaa,$xd3,$xd2
2020 vmovdqa $xd1,0x220-0x200(%rax)
2021 vpshufd \$0xff,$xd3,$xd3
2022 vmovdqa $xd2,0x240-0x200(%rax)
2023 vmovdqa $xd3,0x260-0x200(%rax)
2024
2025 jmp .Loop_enter8x
2026
2027.align 32
2028.Loop_outer8x:
2029 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
2030 vmovdqa 0xa0-0x100(%rcx),$xa1
2031 vmovdqa 0xc0-0x100(%rcx),$xa2
2032 vmovdqa 0xe0-0x100(%rcx),$xa3
2033 vmovdqa 0x100-0x100(%rcx),$xb0
2034 vmovdqa 0x120-0x100(%rcx),$xb1
2035 vmovdqa 0x140-0x100(%rcx),$xb2
2036 vmovdqa 0x160-0x100(%rcx),$xb3
2037 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
2038 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
2039 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
2040 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
2041 vmovdqa 0x200-0x200(%rax),$xd0
2042 vmovdqa 0x220-0x200(%rax),$xd1
2043 vmovdqa 0x240-0x200(%rax),$xd2
2044 vmovdqa 0x260-0x200(%rax),$xd3
2045 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
2046
2047.Loop_enter8x:
2048 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
2049 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
2050 vbroadcasti128 (%r10),$xt3
2051 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
2052 mov \$10,%eax
2053 jmp .Loop8x
2054
2055.align 32
2056.Loop8x:
2057___
2058 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
2059 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
2060$code.=<<___;
2061 dec %eax
2062 jnz .Loop8x
2063
2064 lea 0x200(%rsp),%rax # size optimization
2065 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
2066 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
2067 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
2068 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
2069
2070 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2071 vpunpckldq $xa3,$xa2,$xt3
2072 vpunpckhdq $xa1,$xa0,$xa0
2073 vpunpckhdq $xa3,$xa2,$xa2
2074 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2075 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2076 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2077 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2078___
2079 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2080$code.=<<___;
2081 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
2082 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
2083 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
2084 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
2085
2086 vpunpckldq $xb1,$xb0,$xt2
2087 vpunpckldq $xb3,$xb2,$xt3
2088 vpunpckhdq $xb1,$xb0,$xb0
2089 vpunpckhdq $xb3,$xb2,$xb2
2090 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2091 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2092 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2093 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2094___
2095 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2096$code.=<<___;
2097 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
2098 vperm2i128 \$0x31,$xb0,$xa0,$xb0
2099 vperm2i128 \$0x20,$xb1,$xa1,$xa0
2100 vperm2i128 \$0x31,$xb1,$xa1,$xb1
2101 vperm2i128 \$0x20,$xb2,$xa2,$xa1
2102 vperm2i128 \$0x31,$xb2,$xa2,$xb2
2103 vperm2i128 \$0x20,$xb3,$xa3,$xa2
2104 vperm2i128 \$0x31,$xb3,$xa3,$xb3
2105___
2106 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2107 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
2108$code.=<<___;
2109 vmovdqa $xa0,0x00(%rsp) # offload $xaN
2110 vmovdqa $xa1,0x20(%rsp)
2111 vmovdqa 0x40(%rsp),$xc2 # $xa0
2112 vmovdqa 0x60(%rsp),$xc3 # $xa1
2113
2114 vpaddd 0x180-0x200(%rax),$xc0,$xc0
2115 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
2116 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
2117 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
2118
2119 vpunpckldq $xc1,$xc0,$xt2
2120 vpunpckldq $xc3,$xc2,$xt3
2121 vpunpckhdq $xc1,$xc0,$xc0
2122 vpunpckhdq $xc3,$xc2,$xc2
2123 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2124 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2125 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2126 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2127___
2128 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2129$code.=<<___;
2130 vpaddd 0x200-0x200(%rax),$xd0,$xd0
2131 vpaddd 0x220-0x200(%rax),$xd1,$xd1
2132 vpaddd 0x240-0x200(%rax),$xd2,$xd2
2133 vpaddd 0x260-0x200(%rax),$xd3,$xd3
2134
2135 vpunpckldq $xd1,$xd0,$xt2
2136 vpunpckldq $xd3,$xd2,$xt3
2137 vpunpckhdq $xd1,$xd0,$xd0
2138 vpunpckhdq $xd3,$xd2,$xd2
2139 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2140 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2141 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2142 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2143___
2144 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2145$code.=<<___;
2146 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
2147 vperm2i128 \$0x31,$xd0,$xc0,$xd0
2148 vperm2i128 \$0x20,$xd1,$xc1,$xc0
2149 vperm2i128 \$0x31,$xd1,$xc1,$xd1
2150 vperm2i128 \$0x20,$xd2,$xc2,$xc1
2151 vperm2i128 \$0x31,$xd2,$xc2,$xd2
2152 vperm2i128 \$0x20,$xd3,$xc3,$xc2
2153 vperm2i128 \$0x31,$xd3,$xc3,$xd3
2154___
2155 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2156 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
2157 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
2158 ($xa0,$xa1)=($xt2,$xt3);
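# At this point the vpunpck*/vperm2i128 "de-interlace" steps have transposed
# the lane-sliced state: each ymm register now holds 32 consecutive bytes of
# keystream, so the eight 64-byte blocks can be XORed against the input in
# plain linear order below.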
2159$code.=<<___;
2160 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
2161 vmovdqa 0x20(%rsp),$xa1
2162
2163 cmp \$64*8,$len
2164 jb .Ltail8x
2165
2166 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2167 vpxor 0x20($inp),$xb0,$xb0
2168 vpxor 0x40($inp),$xc0,$xc0
2169 vpxor 0x60($inp),$xd0,$xd0
2170 lea 0x80($inp),$inp # size optimization
2171 vmovdqu $xa0,0x00($out)
2172 vmovdqu $xb0,0x20($out)
2173 vmovdqu $xc0,0x40($out)
2174 vmovdqu $xd0,0x60($out)
2175 lea 0x80($out),$out # size optimization
2176
2177 vpxor 0x00($inp),$xa1,$xa1
2178 vpxor 0x20($inp),$xb1,$xb1
2179 vpxor 0x40($inp),$xc1,$xc1
2180 vpxor 0x60($inp),$xd1,$xd1
2181 lea 0x80($inp),$inp # size optimization
2182 vmovdqu $xa1,0x00($out)
2183 vmovdqu $xb1,0x20($out)
2184 vmovdqu $xc1,0x40($out)
2185 vmovdqu $xd1,0x60($out)
2186 lea 0x80($out),$out # size optimization
2187
2188 vpxor 0x00($inp),$xa2,$xa2
2189 vpxor 0x20($inp),$xb2,$xb2
2190 vpxor 0x40($inp),$xc2,$xc2
2191 vpxor 0x60($inp),$xd2,$xd2
2192 lea 0x80($inp),$inp # size optimization
2193 vmovdqu $xa2,0x00($out)
2194 vmovdqu $xb2,0x20($out)
2195 vmovdqu $xc2,0x40($out)
2196 vmovdqu $xd2,0x60($out)
2197 lea 0x80($out),$out # size optimization
2198
2199 vpxor 0x00($inp),$xa3,$xa3
2200 vpxor 0x20($inp),$xb3,$xb3
2201 vpxor 0x40($inp),$xc3,$xc3
2202 vpxor 0x60($inp),$xd3,$xd3
2203 lea 0x80($inp),$inp # size optimization
2204 vmovdqu $xa3,0x00($out)
2205 vmovdqu $xb3,0x20($out)
2206 vmovdqu $xc3,0x40($out)
2207 vmovdqu $xd3,0x60($out)
2208 lea 0x80($out),$out # size optimization
2209
2210 sub \$64*8,$len
2211 jnz .Loop_outer8x
2212
2213 jmp .Ldone8x
2214
2215.Ltail8x:
2216 cmp \$448,$len
2217 jae .L448_or_more8x
2218 cmp \$384,$len
2219 jae .L384_or_more8x
2220 cmp \$320,$len
2221 jae .L320_or_more8x
2222 cmp \$256,$len
2223 jae .L256_or_more8x
2224 cmp \$192,$len
2225 jae .L192_or_more8x
2226 cmp \$128,$len
2227 jae .L128_or_more8x
2228 cmp \$64,$len
2229 jae .L64_or_more8x
2230
2231 xor %r10,%r10
2232 vmovdqa $xa0,0x00(%rsp)
2233 vmovdqa $xb0,0x20(%rsp)
2234 jmp .Loop_tail8x
2235
2236.align 32
2237.L64_or_more8x:
2238 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2239 vpxor 0x20($inp),$xb0,$xb0
2240 vmovdqu $xa0,0x00($out)
2241 vmovdqu $xb0,0x20($out)
2242 je .Ldone8x
2243
2244 lea 0x40($inp),$inp # inp+=64*1
2245 xor %r10,%r10
2246 vmovdqa $xc0,0x00(%rsp)
2247 lea 0x40($out),$out # out+=64*1
2248 sub \$64,$len # len-=64*1
2249 vmovdqa $xd0,0x20(%rsp)
2250 jmp .Loop_tail8x
2251
2252.align 32
2253.L128_or_more8x:
2254 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2255 vpxor 0x20($inp),$xb0,$xb0
2256 vpxor 0x40($inp),$xc0,$xc0
2257 vpxor 0x60($inp),$xd0,$xd0
2258 vmovdqu $xa0,0x00($out)
2259 vmovdqu $xb0,0x20($out)
2260 vmovdqu $xc0,0x40($out)
2261 vmovdqu $xd0,0x60($out)
2262 je .Ldone8x
2263
2264 lea 0x80($inp),$inp # inp+=64*2
2265 xor %r10,%r10
2266 vmovdqa $xa1,0x00(%rsp)
2267 lea 0x80($out),$out # out+=64*2
2268 sub \$128,$len # len-=64*2
2269 vmovdqa $xb1,0x20(%rsp)
2270 jmp .Loop_tail8x
2271
2272.align 32
2273.L192_or_more8x:
2274 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2275 vpxor 0x20($inp),$xb0,$xb0
2276 vpxor 0x40($inp),$xc0,$xc0
2277 vpxor 0x60($inp),$xd0,$xd0
2278 vpxor 0x80($inp),$xa1,$xa1
2279 vpxor 0xa0($inp),$xb1,$xb1
2280 vmovdqu $xa0,0x00($out)
2281 vmovdqu $xb0,0x20($out)
2282 vmovdqu $xc0,0x40($out)
2283 vmovdqu $xd0,0x60($out)
2284 vmovdqu $xa1,0x80($out)
2285 vmovdqu $xb1,0xa0($out)
2286 je .Ldone8x
2287
2288 lea 0xc0($inp),$inp # inp+=64*3
2289 xor %r10,%r10
2290 vmovdqa $xc1,0x00(%rsp)
2291 lea 0xc0($out),$out # out+=64*3
2292 sub \$192,$len # len-=64*3
2293 vmovdqa $xd1,0x20(%rsp)
2294 jmp .Loop_tail8x
2295
2296.align 32
2297.L256_or_more8x:
2298 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2299 vpxor 0x20($inp),$xb0,$xb0
2300 vpxor 0x40($inp),$xc0,$xc0
2301 vpxor 0x60($inp),$xd0,$xd0
2302 vpxor 0x80($inp),$xa1,$xa1
2303 vpxor 0xa0($inp),$xb1,$xb1
2304 vpxor 0xc0($inp),$xc1,$xc1
2305 vpxor 0xe0($inp),$xd1,$xd1
2306 vmovdqu $xa0,0x00($out)
2307 vmovdqu $xb0,0x20($out)
2308 vmovdqu $xc0,0x40($out)
2309 vmovdqu $xd0,0x60($out)
2310 vmovdqu $xa1,0x80($out)
2311 vmovdqu $xb1,0xa0($out)
2312 vmovdqu $xc1,0xc0($out)
2313 vmovdqu $xd1,0xe0($out)
2314 je .Ldone8x
2315
2316 lea 0x100($inp),$inp # inp+=64*4
2317 xor %r10,%r10
2318 vmovdqa $xa2,0x00(%rsp)
2319 lea 0x100($out),$out # out+=64*4
2320 sub \$256,$len # len-=64*4
2321 vmovdqa $xb2,0x20(%rsp)
2322 jmp .Loop_tail8x
2323
2324.align 32
2325.L320_or_more8x:
2326 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2327 vpxor 0x20($inp),$xb0,$xb0
2328 vpxor 0x40($inp),$xc0,$xc0
2329 vpxor 0x60($inp),$xd0,$xd0
2330 vpxor 0x80($inp),$xa1,$xa1
2331 vpxor 0xa0($inp),$xb1,$xb1
2332 vpxor 0xc0($inp),$xc1,$xc1
2333 vpxor 0xe0($inp),$xd1,$xd1
2334 vpxor 0x100($inp),$xa2,$xa2
2335 vpxor 0x120($inp),$xb2,$xb2
2336 vmovdqu $xa0,0x00($out)
2337 vmovdqu $xb0,0x20($out)
2338 vmovdqu $xc0,0x40($out)
2339 vmovdqu $xd0,0x60($out)
2340 vmovdqu $xa1,0x80($out)
2341 vmovdqu $xb1,0xa0($out)
2342 vmovdqu $xc1,0xc0($out)
2343 vmovdqu $xd1,0xe0($out)
2344 vmovdqu $xa2,0x100($out)
2345 vmovdqu $xb2,0x120($out)
2346 je .Ldone8x
2347
2348 lea 0x140($inp),$inp # inp+=64*5
2349 xor %r10,%r10
2350 vmovdqa $xc2,0x00(%rsp)
2351 lea 0x140($out),$out # out+=64*5
2352 sub \$320,$len # len-=64*5
2353 vmovdqa $xd2,0x20(%rsp)
2354 jmp .Loop_tail8x
2355
2356.align 32
2357.L384_or_more8x:
2358 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2359 vpxor 0x20($inp),$xb0,$xb0
2360 vpxor 0x40($inp),$xc0,$xc0
2361 vpxor 0x60($inp),$xd0,$xd0
2362 vpxor 0x80($inp),$xa1,$xa1
2363 vpxor 0xa0($inp),$xb1,$xb1
2364 vpxor 0xc0($inp),$xc1,$xc1
2365 vpxor 0xe0($inp),$xd1,$xd1
2366 vpxor 0x100($inp),$xa2,$xa2
2367 vpxor 0x120($inp),$xb2,$xb2
2368 vpxor 0x140($inp),$xc2,$xc2
2369 vpxor 0x160($inp),$xd2,$xd2
2370 vmovdqu $xa0,0x00($out)
2371 vmovdqu $xb0,0x20($out)
2372 vmovdqu $xc0,0x40($out)
2373 vmovdqu $xd0,0x60($out)
2374 vmovdqu $xa1,0x80($out)
2375 vmovdqu $xb1,0xa0($out)
2376 vmovdqu $xc1,0xc0($out)
2377 vmovdqu $xd1,0xe0($out)
2378 vmovdqu $xa2,0x100($out)
2379 vmovdqu $xb2,0x120($out)
2380 vmovdqu $xc2,0x140($out)
2381 vmovdqu $xd2,0x160($out)
2382 je .Ldone8x
2383
2384 lea 0x180($inp),$inp # inp+=64*6
2385 xor %r10,%r10
2386 vmovdqa $xa3,0x00(%rsp)
2387 lea 0x180($out),$out # out+=64*6
2388 sub \$384,$len # len-=64*6
2389 vmovdqa $xb3,0x20(%rsp)
2390 jmp .Loop_tail8x
2391
2392.align 32
2393.L448_or_more8x:
2394 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2395 vpxor 0x20($inp),$xb0,$xb0
2396 vpxor 0x40($inp),$xc0,$xc0
2397 vpxor 0x60($inp),$xd0,$xd0
2398 vpxor 0x80($inp),$xa1,$xa1
2399 vpxor 0xa0($inp),$xb1,$xb1
2400 vpxor 0xc0($inp),$xc1,$xc1
2401 vpxor 0xe0($inp),$xd1,$xd1
2402 vpxor 0x100($inp),$xa2,$xa2
2403 vpxor 0x120($inp),$xb2,$xb2
2404 vpxor 0x140($inp),$xc2,$xc2
2405 vpxor 0x160($inp),$xd2,$xd2
2406 vpxor 0x180($inp),$xa3,$xa3
2407 vpxor 0x1a0($inp),$xb3,$xb3
2408 vmovdqu $xa0,0x00($out)
2409 vmovdqu $xb0,0x20($out)
2410 vmovdqu $xc0,0x40($out)
2411 vmovdqu $xd0,0x60($out)
2412 vmovdqu $xa1,0x80($out)
2413 vmovdqu $xb1,0xa0($out)
2414 vmovdqu $xc1,0xc0($out)
2415 vmovdqu $xd1,0xe0($out)
2416 vmovdqu $xa2,0x100($out)
2417 vmovdqu $xb2,0x120($out)
2418 vmovdqu $xc2,0x140($out)
2419 vmovdqu $xd2,0x160($out)
2420 vmovdqu $xa3,0x180($out)
2421 vmovdqu $xb3,0x1a0($out)
2422 je .Ldone8x
2423
2424 lea 0x1c0($inp),$inp # inp+=64*7
2425 xor %r10,%r10
2426 vmovdqa $xc3,0x00(%rsp)
2427 lea 0x1c0($out),$out # out+=64*7
2428 sub \$448,$len # len-=64*7
2429 vmovdqa $xd3,0x20(%rsp)
2430
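	# at most 63 bytes remain here; the matching 64 bytes of keystream
	# were parked at (%rsp) above, so XOR them in one byte at a time
	# with %r10 as the running offset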
2431.Loop_tail8x:
2432 movzb ($inp,%r10),%eax
2433 movzb (%rsp,%r10),%ecx
2434 lea 1(%r10),%r10
2435 xor %ecx,%eax
2436 mov %al,-1($out,%r10)
2437 dec $len
2438 jnz .Loop_tail8x
2439
2440.Ldone8x:
3c274a6e 2441 vzeroall
2442___
2443$code.=<<___ if ($win64);
2444 movaps -0xa8(%r9),%xmm6
2445 movaps -0x98(%r9),%xmm7
2446 movaps -0x88(%r9),%xmm8
2447 movaps -0x78(%r9),%xmm9
2448 movaps -0x68(%r9),%xmm10
2449 movaps -0x58(%r9),%xmm11
2450 movaps -0x48(%r9),%xmm12
2451 movaps -0x38(%r9),%xmm13
2452 movaps -0x28(%r9),%xmm14
2453 movaps -0x18(%r9),%xmm15
2454___
2455$code.=<<___;
384e6de4 2456 lea (%r9),%rsp
f17652e5 2457.cfi_def_cfa_register %rsp
384e6de4 2458.L8x_epilogue:
a98c648e 2459 ret
f17652e5 2460.cfi_endproc
2461.size ChaCha20_8x,.-ChaCha20_8x
2462___
2463}
2464
2465########################################################################
2466# AVX512 code paths
2467if ($avx>2) {
2468# This one handles shorter inputs...
2469
2470my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2471my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2472
2473sub vpxord() # size optimization
2474{ my $opcode = "vpxor"; # adhere to vpxor when possible
2475
2476 foreach (@_) {
2477 if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
2478 $opcode = "vpxord";
2479 last;
2480 }
2481 }
2482
2483 $code .= "\t$opcode\t".join(',',reverse @_)."\n";
2484}
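# The VEX-encoded vpxor cannot address %zmm registers or %xmm16-%xmm31; only
# the EVEX-encoded vpxord/vpxorq can. The wrapper above therefore switches to
# vpxord whenever such an operand appears and keeps the shorter vpxor
# encoding otherwise.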
2485
2486sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
2487 &vpaddd ($a,$a,$b);
2488 &vpxord ($d,$d,$a);
2489 &vprold ($d,$d,16);
2490
2491 &vpaddd ($c,$c,$d);
2492 &vpxord ($b,$b,$c);
2493 &vprold ($b,$b,12);
2494
2495 &vpaddd ($a,$a,$b);
2496 &vpxord ($d,$d,$a);
2497 &vprold ($d,$d,8);
2498
2499 &vpaddd ($c,$c,$d);
2500 &vpxord ($b,$b,$c);
2501 &vprold ($b,$b,7);
2502}
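# Note that each 32-bit rotate above is a single vprold; the AVX2/SSE paths
# have to emulate it with either a vpshufb (byte-aligned rotates by 16 and 8)
# or a shift-left/shift-right/or triple (rotates by 12 and 7).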
2503
384e6de4 2504my $xframe = $win64 ? 32+8 : 8;
2505
2506$code.=<<___;
2507.type ChaCha20_avx512,\@function,5
2508.align 32
2509ChaCha20_avx512:
f17652e5 2510.cfi_startproc
3c274a6e 2511.LChaCha20_avx512:
384e6de4 2512 mov %rsp,%r9 # frame pointer
f17652e5 2513.cfi_def_cfa_register %r9
2514 cmp \$512,$len
2515 ja .LChaCha20_16x
2516
2517 sub \$64+$xframe,%rsp
2518___
2519$code.=<<___ if ($win64);
2520 movaps %xmm6,-0x28(%r9)
2521 movaps %xmm7,-0x18(%r9)
2522.Lavx512_body:
2523___
2524$code.=<<___;
2525 vbroadcasti32x4 .Lsigma(%rip),$a
2526 vbroadcasti32x4 ($key),$b
2527 vbroadcasti32x4 16($key),$c
2528 vbroadcasti32x4 ($counter),$d
2529
2530 vmovdqa32 $a,$a_
2531 vmovdqa32 $b,$b_
2532 vmovdqa32 $c,$c_
2533 vpaddd .Lzeroz(%rip),$d,$d
2534 vmovdqa32 .Lfourz(%rip),$fourz
2535 mov \$10,$counter # reuse $counter
2536 vmovdqa32 $d,$d_
2537 jmp .Loop_avx512
2538
2539.align 16
2540.Loop_outer_avx512:
2541 vmovdqa32 $a_,$a
2542 vmovdqa32 $b_,$b
2543 vmovdqa32 $c_,$c
2544 vpaddd $fourz,$d_,$d
2545 mov \$10,$counter
2546 vmovdqa32 $d,$d_
2547 jmp .Loop_avx512
2548
2549.align 32
2550.Loop_avx512:
2551___
2552 &AVX512ROUND();
2553 &vpshufd ($c,$c,0b01001110);
2554 &vpshufd ($b,$b,0b00111001);
2555 &vpshufd ($d,$d,0b10010011);
2556
2557 &AVX512ROUND();
2558 &vpshufd ($c,$c,0b01001110);
2559 &vpshufd ($b,$b,0b10010011);
2560 &vpshufd ($d,$d,0b00111001);
2561
2562 &dec ($counter);
2563 &jnz (".Loop_avx512");
2564
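# The vpshufd triplets rotate the b, c and d rows within each 128-bit lane,
# switching between the "column" and "diagonal" halves of a double round.
# The .Lzeroz constant gave each 128-bit lane its own counter offset, so the
# lanes of every register carry consecutive blocks processed in lockstep.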
2565$code.=<<___;
2566 vpaddd $a_,$a,$a
2567 vpaddd $b_,$b,$b
2568 vpaddd $c_,$c,$c
2569 vpaddd $d_,$d,$d
2570
2571 sub \$64,$len
2572 jb .Ltail64_avx512
2573
2574 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2575 vpxor 0x10($inp),%x#$b,$t1
2576 vpxor 0x20($inp),%x#$c,$t2
2577 vpxor 0x30($inp),%x#$d,$t3
2578 lea 0x40($inp),$inp # inp+=64
2579
2580 vmovdqu $t0,0x00($out) # write output
2581 vmovdqu $t1,0x10($out)
2582 vmovdqu $t2,0x20($out)
2583 vmovdqu $t3,0x30($out)
2584 lea 0x40($out),$out # out+=64
2585
2586 jz .Ldone_avx512
2587
2588 vextracti32x4 \$1,$a,$t0
2589 vextracti32x4 \$1,$b,$t1
2590 vextracti32x4 \$1,$c,$t2
2591 vextracti32x4 \$1,$d,$t3
2592
2593 sub \$64,$len
2594 jb .Ltail_avx512
2595
2596 vpxor 0x00($inp),$t0,$t0 # xor with input
2597 vpxor 0x10($inp),$t1,$t1
2598 vpxor 0x20($inp),$t2,$t2
2599 vpxor 0x30($inp),$t3,$t3
2600 lea 0x40($inp),$inp # inp+=64
2601
2602 vmovdqu $t0,0x00($out) # write output
2603 vmovdqu $t1,0x10($out)
2604 vmovdqu $t2,0x20($out)
2605 vmovdqu $t3,0x30($out)
2606 lea 0x40($out),$out # out+=64
2607
2608 jz .Ldone_avx512
2609
2610 vextracti32x4 \$2,$a,$t0
2611 vextracti32x4 \$2,$b,$t1
2612 vextracti32x4 \$2,$c,$t2
2613 vextracti32x4 \$2,$d,$t3
2614
2615 sub \$64,$len
2616 jb .Ltail_avx512
2617
2618 vpxor 0x00($inp),$t0,$t0 # xor with input
2619 vpxor 0x10($inp),$t1,$t1
2620 vpxor 0x20($inp),$t2,$t2
2621 vpxor 0x30($inp),$t3,$t3
2622 lea 0x40($inp),$inp # inp+=64
2623
2624 vmovdqu $t0,0x00($out) # write output
2625 vmovdqu $t1,0x10($out)
2626 vmovdqu $t2,0x20($out)
2627 vmovdqu $t3,0x30($out)
2628 lea 0x40($out),$out # out+=64
2629
2630 jz .Ldone_avx512
2631
2632 vextracti32x4 \$3,$a,$t0
2633 vextracti32x4 \$3,$b,$t1
2634 vextracti32x4 \$3,$c,$t2
2635 vextracti32x4 \$3,$d,$t3
2636
2637 sub \$64,$len
2638 jb .Ltail_avx512
2639
2640 vpxor 0x00($inp),$t0,$t0 # xor with input
2641 vpxor 0x10($inp),$t1,$t1
2642 vpxor 0x20($inp),$t2,$t2
2643 vpxor 0x30($inp),$t3,$t3
2644 lea 0x40($inp),$inp # inp+=64
2645
2646 vmovdqu $t0,0x00($out) # write output
2647 vmovdqu $t1,0x10($out)
2648 vmovdqu $t2,0x20($out)
2649 vmovdqu $t3,0x30($out)
2650 lea 0x40($out),$out # out+=64
2651
2652 jnz .Loop_outer_avx512
2653
2654 jmp .Ldone_avx512
2655
2656.align 16
2657.Ltail64_avx512:
2658 vmovdqa %x#$a,0x00(%rsp)
2659 vmovdqa %x#$b,0x10(%rsp)
2660 vmovdqa %x#$c,0x20(%rsp)
2661 vmovdqa %x#$d,0x30(%rsp)
2662 add \$64,$len
2663 jmp .Loop_tail_avx512
2664
2665.align 16
2666.Ltail_avx512:
2667 vmovdqa $t0,0x00(%rsp)
2668 vmovdqa $t1,0x10(%rsp)
2669 vmovdqa $t2,0x20(%rsp)
2670 vmovdqa $t3,0x30(%rsp)
2671 add \$64,$len
2672
2673.Loop_tail_avx512:
2674 movzb ($inp,$counter),%eax
2675 movzb (%rsp,$counter),%ecx
2676 lea 1($counter),$counter
2677 xor %ecx,%eax
2678 mov %al,-1($out,$counter)
2679 dec $len
2680 jnz .Loop_tail_avx512
2681
47c9926a 2682 vmovdqu32 $a_,0x00(%rsp)
2683
2684.Ldone_avx512:
2685 vzeroall
2686___
2687$code.=<<___ if ($win64);
2688 movaps -0x28(%r9),%xmm6
2689 movaps -0x18(%r9),%xmm7
2690___
2691$code.=<<___;
384e6de4 2692 lea (%r9),%rsp
f17652e5 2693.cfi_def_cfa_register %rsp
384e6de4 2694.Lavx512_epilogue:
3c274a6e 2695 ret
f17652e5 2696.cfi_endproc
2697.size ChaCha20_avx512,.-ChaCha20_avx512
2698___
2699
2700map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
2701
2702$code.=<<___;
2703.type ChaCha20_avx512vl,\@function,5
2704.align 32
2705ChaCha20_avx512vl:
2706.cfi_startproc
2707.LChaCha20_avx512vl:
2708 mov %rsp,%r9 # frame pointer
2709.cfi_def_cfa_register %r9
2710 cmp \$128,$len
2711 ja .LChaCha20_8xvl
2712
2713 sub \$64+$xframe,%rsp
2714___
2715$code.=<<___ if ($win64);
2716 movaps %xmm6,-0x28(%r9)
2717 movaps %xmm7,-0x18(%r9)
2718.Lavx512vl_body:
2719___
2720$code.=<<___;
2721 vbroadcasti128 .Lsigma(%rip),$a
2722 vbroadcasti128 ($key),$b
2723 vbroadcasti128 16($key),$c
2724 vbroadcasti128 ($counter),$d
2725
2726 vmovdqa32 $a,$a_
2727 vmovdqa32 $b,$b_
2728 vmovdqa32 $c,$c_
2729 vpaddd .Lzeroz(%rip),$d,$d
2730 vmovdqa32 .Ltwoy(%rip),$fourz
2731 mov \$10,$counter # reuse $counter
2732 vmovdqa32 $d,$d_
2733 jmp .Loop_avx512vl
2734
2735.align 16
2736.Loop_outer_avx512vl:
2737 vmovdqa32 $c_,$c
2738 vpaddd $fourz,$d_,$d
2739 mov \$10,$counter
2740 vmovdqa32 $d,$d_
2741 jmp .Loop_avx512vl
2742
2743.align 32
2744.Loop_avx512vl:
2745___
2746 &AVX512ROUND();
2747 &vpshufd ($c,$c,0b01001110);
2748 &vpshufd ($b,$b,0b00111001);
2749 &vpshufd ($d,$d,0b10010011);
2750
2751 &AVX512ROUND();
2752 &vpshufd ($c,$c,0b01001110);
2753 &vpshufd ($b,$b,0b10010011);
2754 &vpshufd ($d,$d,0b00111001);
2755
2756 &dec ($counter);
2757 &jnz (".Loop_avx512vl");
2758
2759$code.=<<___;
2760 vpaddd $a_,$a,$a
2761 vpaddd $b_,$b,$b
2762 vpaddd $c_,$c,$c
2763 vpaddd $d_,$d,$d
2764
2765 sub \$64,$len
2766 jb .Ltail64_avx512vl
2767
2768 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2769 vpxor 0x10($inp),%x#$b,$t1
2770 vpxor 0x20($inp),%x#$c,$t2
2771 vpxor 0x30($inp),%x#$d,$t3
2772 lea 0x40($inp),$inp # inp+=64
2773
2774 vmovdqu $t0,0x00($out) # write output
2775 vmovdqu $t1,0x10($out)
2776 vmovdqu $t2,0x20($out)
2777 vmovdqu $t3,0x30($out)
2778 lea 0x40($out),$out # out+=64
2779
2780 jz .Ldone_avx512vl
2781
2782 vextracti128 \$1,$a,$t0
2783 vextracti128 \$1,$b,$t1
2784 vextracti128 \$1,$c,$t2
2785 vextracti128 \$1,$d,$t3
2786
2787 sub \$64,$len
2788 jb .Ltail_avx512vl
2789
2790 vpxor 0x00($inp),$t0,$t0 # xor with input
2791 vpxor 0x10($inp),$t1,$t1
2792 vpxor 0x20($inp),$t2,$t2
2793 vpxor 0x30($inp),$t3,$t3
2794 lea 0x40($inp),$inp # inp+=64
2795
2796 vmovdqu $t0,0x00($out) # write output
2797 vmovdqu $t1,0x10($out)
2798 vmovdqu $t2,0x20($out)
2799 vmovdqu $t3,0x30($out)
2800 lea 0x40($out),$out # out+=64
2801
2802 vmovdqa32 $a_,$a
2803 vmovdqa32 $b_,$b
2804 jnz .Loop_outer_avx512vl
2805
2806 jmp .Ldone_avx512vl
2807
2808.align 16
2809.Ltail64_avx512vl:
2810 vmovdqa %x#$a,0x00(%rsp)
2811 vmovdqa %x#$b,0x10(%rsp)
2812 vmovdqa %x#$c,0x20(%rsp)
2813 vmovdqa %x#$d,0x30(%rsp)
2814 add \$64,$len
2815 jmp .Loop_tail_avx512vl
2816
2817.align 16
2818.Ltail_avx512vl:
2819 vmovdqa $t0,0x00(%rsp)
2820 vmovdqa $t1,0x10(%rsp)
2821 vmovdqa $t2,0x20(%rsp)
2822 vmovdqa $t3,0x30(%rsp)
2823 add \$64,$len
2824
2825.Loop_tail_avx512vl:
2826 movzb ($inp,$counter),%eax
2827 movzb (%rsp,$counter),%ecx
2828 lea 1($counter),$counter
2829 xor %ecx,%eax
2830 mov %al,-1($out,$counter)
2831 dec $len
2832 jnz .Loop_tail_avx512vl
2833
2834 vmovdqu32 $a_,0x00(%rsp)
2835 vmovdqu32 $a_,0x20(%rsp)
2836
2837.Ldone_avx512vl:
2838 vzeroall
2839___
2840$code.=<<___ if ($win64);
2841 movaps -0x28(%r9),%xmm6
2842 movaps -0x18(%r9),%xmm7
2843___
2844$code.=<<___;
2845 lea (%r9),%rsp
2846.cfi_def_cfa_register %rsp
2847.Lavx512vl_epilogue:
2848 ret
2849.cfi_endproc
2850.size ChaCha20_avx512vl,.-ChaCha20_avx512vl
2851___
2852}
2853if ($avx>2) {
2854# This one handles longer inputs...
2855
2856my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2857 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2858my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2859 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2860my @key=map("%zmm$_",(16..31));
2861my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2862
2863sub AVX512_lane_ROUND {
2864my ($a0,$b0,$c0,$d0)=@_;
2865my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2866my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2867my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2868my @x=map("\"$_\"",@xx);
2869
2870 (
2871 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2872 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2873 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2874 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2875 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2876 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2877 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2878 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2879 "&vprold (@x[$d0],@x[$d0],16)",
2880 "&vprold (@x[$d1],@x[$d1],16)",
2881 "&vprold (@x[$d2],@x[$d2],16)",
2882 "&vprold (@x[$d3],@x[$d3],16)",
2883
2884 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2885 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2886 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2887 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2888 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2889 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2890 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2891 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2892 "&vprold (@x[$b0],@x[$b0],12)",
2893 "&vprold (@x[$b1],@x[$b1],12)",
2894 "&vprold (@x[$b2],@x[$b2],12)",
2895 "&vprold (@x[$b3],@x[$b3],12)",
2896
2897 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2898 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2899 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2900 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2901 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2902 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2903 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2904 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2905 "&vprold (@x[$d0],@x[$d0],8)",
2906 "&vprold (@x[$d1],@x[$d1],8)",
2907 "&vprold (@x[$d2],@x[$d2],8)",
2908 "&vprold (@x[$d3],@x[$d3],8)",
2909
2910 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2911 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2912 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2913 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2914 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2915 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2916 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2917 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2918 "&vprold (@x[$b0],@x[$b0],7)",
2919 "&vprold (@x[$b1],@x[$b1],7)",
2920 "&vprold (@x[$b2],@x[$b2],7)",
2921 "&vprold (@x[$b3],@x[$b3],7)"
2922 );
2923}
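# Unlike the AVX2 variant above, all four quarter-rounds are emitted without
# any stack spills: the sixteen lane-sliced state rows live in %zmm0-%zmm15
# and the key/counter copies in %zmm16-%zmm31, so nothing is staged through
# memory inside the round loop.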
2924
384e6de4 2925my $xframe = $win64 ? 0xa8 : 8;
2926
2927$code.=<<___;
2928.type ChaCha20_16x,\@function,5
2929.align 32
2930ChaCha20_16x:
f17652e5 2931.cfi_startproc
abb8c44f 2932.LChaCha20_16x:
384e6de4 2933 mov %rsp,%r9 # frame register
f17652e5 2934.cfi_def_cfa_register %r9
2935 sub \$64+$xframe,%rsp
2936 and \$-64,%rsp
2937___
2938$code.=<<___ if ($win64);
2939 movaps %xmm6,-0xa8(%r9)
2940 movaps %xmm7,-0x98(%r9)
2941 movaps %xmm8,-0x88(%r9)
2942 movaps %xmm9,-0x78(%r9)
2943 movaps %xmm10,-0x68(%r9)
2944 movaps %xmm11,-0x58(%r9)
2945 movaps %xmm12,-0x48(%r9)
2946 movaps %xmm13,-0x38(%r9)
2947 movaps %xmm14,-0x28(%r9)
2948 movaps %xmm15,-0x18(%r9)
2949.L16x_body:
2950___
2951$code.=<<___;
2952 vzeroupper
2953
2954 lea .Lsigma(%rip),%r10
2955 vbroadcasti32x4 (%r10),$xa3 # key[0]
2956 vbroadcasti32x4 ($key),$xb3 # key[1]
2957 vbroadcasti32x4 16($key),$xc3 # key[2]
2958 vbroadcasti32x4 ($counter),$xd3 # key[3]
2959
2960 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
2961 vpshufd \$0x55,$xa3,$xa1
2962 vpshufd \$0xaa,$xa3,$xa2
2963 vpshufd \$0xff,$xa3,$xa3
2964 vmovdqa64 $xa0,@key[0]
2965 vmovdqa64 $xa1,@key[1]
2966 vmovdqa64 $xa2,@key[2]
2967 vmovdqa64 $xa3,@key[3]
2968
2969 vpshufd \$0x00,$xb3,$xb0
2970 vpshufd \$0x55,$xb3,$xb1
2971 vpshufd \$0xaa,$xb3,$xb2
2972 vpshufd \$0xff,$xb3,$xb3
2973 vmovdqa64 $xb0,@key[4]
2974 vmovdqa64 $xb1,@key[5]
2975 vmovdqa64 $xb2,@key[6]
2976 vmovdqa64 $xb3,@key[7]
2977
2978 vpshufd \$0x00,$xc3,$xc0
2979 vpshufd \$0x55,$xc3,$xc1
2980 vpshufd \$0xaa,$xc3,$xc2
2981 vpshufd \$0xff,$xc3,$xc3
2982 vmovdqa64 $xc0,@key[8]
2983 vmovdqa64 $xc1,@key[9]
2984 vmovdqa64 $xc2,@key[10]
2985 vmovdqa64 $xc3,@key[11]
2986
2987 vpshufd \$0x00,$xd3,$xd0
2988 vpshufd \$0x55,$xd3,$xd1
2989 vpshufd \$0xaa,$xd3,$xd2
2990 vpshufd \$0xff,$xd3,$xd3
2991 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
2992 vmovdqa64 $xd0,@key[12]
2993 vmovdqa64 $xd1,@key[13]
2994 vmovdqa64 $xd2,@key[14]
2995 vmovdqa64 $xd3,@key[15]
2996
2997 mov \$10,%eax
2998 jmp .Loop16x
2999
3000.align 32
3001.Loop_outer16x:
3002 vpbroadcastd 0(%r10),$xa0 # reload key
3003 vpbroadcastd 4(%r10),$xa1
3004 vpbroadcastd 8(%r10),$xa2
3005 vpbroadcastd 12(%r10),$xa3
3006 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
3007 vmovdqa64 @key[4],$xb0
3008 vmovdqa64 @key[5],$xb1
3009 vmovdqa64 @key[6],$xb2
3010 vmovdqa64 @key[7],$xb3
3011 vmovdqa64 @key[8],$xc0
3012 vmovdqa64 @key[9],$xc1
3013 vmovdqa64 @key[10],$xc2
3014 vmovdqa64 @key[11],$xc3
3015 vmovdqa64 @key[12],$xd0
3016 vmovdqa64 @key[13],$xd1
3017 vmovdqa64 @key[14],$xd2
3018 vmovdqa64 @key[15],$xd3
3019
3020 vmovdqa64 $xa0,@key[0]
3021 vmovdqa64 $xa1,@key[1]
3022 vmovdqa64 $xa2,@key[2]
3023 vmovdqa64 $xa3,@key[3]
3024
3025 mov \$10,%eax
3026 jmp .Loop16x
3027
3028.align 32
3029.Loop16x:
3030___
3031 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3032 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3033$code.=<<___;
3034 dec %eax
3035 jnz .Loop16x
3036
3037 vpaddd @key[0],$xa0,$xa0 # accumulate key
3038 vpaddd @key[1],$xa1,$xa1
3039 vpaddd @key[2],$xa2,$xa2
3040 vpaddd @key[3],$xa3,$xa3
3041
3042 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
3043 vpunpckldq $xa3,$xa2,$xt3
3044 vpunpckhdq $xa1,$xa0,$xa0
3045 vpunpckhdq $xa3,$xa2,$xa2
3046 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
3047 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
3048 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
3049 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
3050___
3051 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3052$code.=<<___;
3053 vpaddd @key[4],$xb0,$xb0
3054 vpaddd @key[5],$xb1,$xb1
3055 vpaddd @key[6],$xb2,$xb2
3056 vpaddd @key[7],$xb3,$xb3
3057
3058 vpunpckldq $xb1,$xb0,$xt2
3059 vpunpckldq $xb3,$xb2,$xt3
3060 vpunpckhdq $xb1,$xb0,$xb0
3061 vpunpckhdq $xb3,$xb2,$xb2
3062 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
3063 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
3064 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
3065 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
3066___
3067 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3068$code.=<<___;
3069 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
3070 vshufi32x4 \$0xee,$xb0,$xa0,$xb0
3071 vshufi32x4 \$0x44,$xb1,$xa1,$xa0
3072 vshufi32x4 \$0xee,$xb1,$xa1,$xb1
3073 vshufi32x4 \$0x44,$xb2,$xa2,$xa1
3074 vshufi32x4 \$0xee,$xb2,$xa2,$xb2
3075 vshufi32x4 \$0x44,$xb3,$xa3,$xa2
3076 vshufi32x4 \$0xee,$xb3,$xa3,$xb3
3077___
3078 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3079$code.=<<___;
3080 vpaddd @key[8],$xc0,$xc0
3081 vpaddd @key[9],$xc1,$xc1
3082 vpaddd @key[10],$xc2,$xc2
3083 vpaddd @key[11],$xc3,$xc3
3084
3085 vpunpckldq $xc1,$xc0,$xt2
3086 vpunpckldq $xc3,$xc2,$xt3
3087 vpunpckhdq $xc1,$xc0,$xc0
3088 vpunpckhdq $xc3,$xc2,$xc2
3089 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
3090 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
3091 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
3092 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
3093___
3094 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3095$code.=<<___;
3096 vpaddd @key[12],$xd0,$xd0
3097 vpaddd @key[13],$xd1,$xd1
3098 vpaddd @key[14],$xd2,$xd2
3099 vpaddd @key[15],$xd3,$xd3
3100
3101 vpunpckldq $xd1,$xd0,$xt2
3102 vpunpckldq $xd3,$xd2,$xt3
3103 vpunpckhdq $xd1,$xd0,$xd0
3104 vpunpckhdq $xd3,$xd2,$xd2
3105 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
3106 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
3107 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
3108 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
3109___
3110 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3111$code.=<<___;
3112 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
3113 vshufi32x4 \$0xee,$xd0,$xc0,$xd0
3114 vshufi32x4 \$0x44,$xd1,$xc1,$xc0
3115 vshufi32x4 \$0xee,$xd1,$xc1,$xd1
3116 vshufi32x4 \$0x44,$xd2,$xc2,$xc1
3117 vshufi32x4 \$0xee,$xd2,$xc2,$xd2
3118 vshufi32x4 \$0x44,$xd3,$xc3,$xc2
3119 vshufi32x4 \$0xee,$xd3,$xc3,$xd3
3120___
3121 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3122$code.=<<___;
3123 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
3124 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
3125 vshufi32x4 \$0x88,$xd0,$xb0,$xc0
3126 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
3127 vshufi32x4 \$0x88,$xc1,$xa1,$xt1
3128 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
3129 vshufi32x4 \$0x88,$xd1,$xb1,$xc1
3130 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
3131 vshufi32x4 \$0x88,$xc2,$xa2,$xt2
3132 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
3133 vshufi32x4 \$0x88,$xd2,$xb2,$xc2
3134 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
3135 vshufi32x4 \$0x88,$xc3,$xa3,$xt3
3136 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
3137 vshufi32x4 \$0x88,$xd3,$xb3,$xc3
3138 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
3139___
3140 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
3141 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
3142
3143 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
3144 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
3145 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3146 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
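# After the vpunpck*/vshufi32x4 transpose each %zmm register holds 64
# consecutive bytes of keystream, so the sixteen 64-byte blocks below are
# XORed against the input at 0x00, 0x40, 0x80, ... in plain linear order.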
3147$code.=<<___;
3148 cmp \$64*16,$len
3149 jb .Ltail16x
3150
3151 vpxord 0x00($inp),$xa0,$xa0 # xor with input
3152 vpxord 0x40($inp),$xb0,$xb0
3153 vpxord 0x80($inp),$xc0,$xc0
3154 vpxord 0xc0($inp),$xd0,$xd0
3155 vmovdqu32 $xa0,0x00($out)
3156 vmovdqu32 $xb0,0x40($out)
3157 vmovdqu32 $xc0,0x80($out)
3158 vmovdqu32 $xd0,0xc0($out)
3159
3160 vpxord 0x100($inp),$xa1,$xa1
3161 vpxord 0x140($inp),$xb1,$xb1
3162 vpxord 0x180($inp),$xc1,$xc1
3163 vpxord 0x1c0($inp),$xd1,$xd1
3164 vmovdqu32 $xa1,0x100($out)
3165 vmovdqu32 $xb1,0x140($out)
3166 vmovdqu32 $xc1,0x180($out)
3167 vmovdqu32 $xd1,0x1c0($out)
3168
3169 vpxord 0x200($inp),$xa2,$xa2
3170 vpxord 0x240($inp),$xb2,$xb2
3171 vpxord 0x280($inp),$xc2,$xc2
3172 vpxord 0x2c0($inp),$xd2,$xd2
3173 vmovdqu32 $xa2,0x200($out)
3174 vmovdqu32 $xb2,0x240($out)
3175 vmovdqu32 $xc2,0x280($out)
3176 vmovdqu32 $xd2,0x2c0($out)
3177
3178 vpxord 0x300($inp),$xa3,$xa3
3179 vpxord 0x340($inp),$xb3,$xb3
3180 vpxord 0x380($inp),$xc3,$xc3
3181 vpxord 0x3c0($inp),$xd3,$xd3
3182 lea 0x400($inp),$inp
3183 vmovdqu32 $xa3,0x300($out)
3184 vmovdqu32 $xb3,0x340($out)
3185 vmovdqu32 $xc3,0x380($out)
3186 vmovdqu32 $xd3,0x3c0($out)
3187 lea 0x400($out),$out
3188
3189 sub \$64*16,$len
3190 jnz .Loop_outer16x
3191
3192 jmp .Ldone16x
3193
3194.align 32
3195.Ltail16x:
3196 xor %r10,%r10
3197 sub $inp,$out
3198 cmp \$64*1,$len
3199 jb .Less_than_64_16x
3200 vpxord ($inp),$xa0,$xa0 # xor with input
3201 vmovdqu32 $xa0,($out,$inp)
3202 je .Ldone16x
3203 vmovdqa32 $xb0,$xa0
3204 lea 64($inp),$inp
3205
3206 cmp \$64*2,$len
3207 jb .Less_than_64_16x
3208 vpxord ($inp),$xb0,$xb0
3209 vmovdqu32 $xb0,($out,$inp)
3210 je .Ldone16x
3211 vmovdqa32 $xc0,$xa0
3212 lea 64($inp),$inp
3213
3214 cmp \$64*3,$len
3215 jb .Less_than_64_16x
3216 vpxord ($inp),$xc0,$xc0
3217 vmovdqu32 $xc0,($out,$inp)
3218 je .Ldone16x
3219 vmovdqa32 $xd0,$xa0
3220 lea 64($inp),$inp
3221
3222 cmp \$64*4,$len
3223 jb .Less_than_64_16x
3224 vpxord ($inp),$xd0,$xd0
3225 vmovdqu32 $xd0,($out,$inp)
3226 je .Ldone16x
3227 vmovdqa32 $xa1,$xa0
3228 lea 64($inp),$inp
3229
3230 cmp \$64*5,$len
3231 jb .Less_than_64_16x
3232 vpxord ($inp),$xa1,$xa1
3233 vmovdqu32 $xa1,($out,$inp)
3234 je .Ldone16x
3235 vmovdqa32 $xb1,$xa0
3236 lea 64($inp),$inp
3237
3238 cmp \$64*6,$len
3239 jb .Less_than_64_16x
3240 vpxord ($inp),$xb1,$xb1
3241 vmovdqu32 $xb1,($out,$inp)
3242 je .Ldone16x
3243 vmovdqa32 $xc1,$xa0
3244 lea 64($inp),$inp
3245
3246 cmp \$64*7,$len
3247 jb .Less_than_64_16x
3248 vpxord ($inp),$xc1,$xc1
3249 vmovdqu32 $xc1,($out,$inp)
3250 je .Ldone16x
3251 vmovdqa32 $xd1,$xa0
3252 lea 64($inp),$inp
3253
3254 cmp \$64*8,$len
3255 jb .Less_than_64_16x
3256 vpxord ($inp),$xd1,$xd1
3257 vmovdqu32 $xd1,($out,$inp)
3258 je .Ldone16x
3259 vmovdqa32 $xa2,$xa0
3260 lea 64($inp),$inp
3261
3262 cmp \$64*9,$len
3263 jb .Less_than_64_16x
3264 vpxord ($inp),$xa2,$xa2
3265 vmovdqu32 $xa2,($out,$inp)
3266 je .Ldone16x
3267 vmovdqa32 $xb2,$xa0
3268 lea 64($inp),$inp
3269
3270 cmp \$64*10,$len
3271 jb .Less_than_64_16x
3272 vpxord ($inp),$xb2,$xb2
3273 vmovdqu32 $xb2,($out,$inp)
3274 je .Ldone16x
3275 vmovdqa32 $xc2,$xa0
3276 lea 64($inp),$inp
3277
3278 cmp \$64*11,$len
3279 jb .Less_than_64_16x
3280 vpxord ($inp),$xc2,$xc2
3281 vmovdqu32 $xc2,($out,$inp)
3282 je .Ldone16x
3283 vmovdqa32 $xd2,$xa0
3284 lea 64($inp),$inp
3285
3286 cmp \$64*12,$len
3287 jb .Less_than_64_16x
3288 vpxord ($inp),$xd2,$xd2
3289 vmovdqu32 $xd2,($out,$inp)
3290 je .Ldone16x
3291 vmovdqa32 $xa3,$xa0
3292 lea 64($inp),$inp
3293
3294 cmp \$64*13,$len
3295 jb .Less_than_64_16x
3296 vpxord ($inp),$xa3,$xa3
3297 vmovdqu32 $xa3,($out,$inp)
3298 je .Ldone16x
3299 vmovdqa32 $xb3,$xa0
3300 lea 64($inp),$inp
3301
3302 cmp \$64*14,$len
3303 jb .Less_than_64_16x
3304 vpxord ($inp),$xb3,$xb3
3305 vmovdqu32 $xb3,($out,$inp)
3306 je .Ldone16x
3307 vmovdqa32 $xc3,$xa0
3308 lea 64($inp),$inp
3309
3310 cmp \$64*15,$len
3311 jb .Less_than_64_16x
3312 vpxord ($inp),$xc3,$xc3
3313 vmovdqu32 $xc3,($out,$inp)
3314 je .Ldone16x
3315 vmovdqa32 $xd3,$xa0
3316 lea 64($inp),$inp
3317
3318.Less_than_64_16x:
3319 vmovdqa32 $xa0,0x00(%rsp)
3320 lea ($out,$inp),$out
3321 and \$63,$len
3322
3323.Loop_tail16x:
3324 movzb ($inp,%r10),%eax
3325 movzb (%rsp,%r10),%ecx
3326 lea 1(%r10),%r10
3327 xor %ecx,%eax
3328 mov %al,-1($out,%r10)
3329 dec $len
3330 jnz .Loop_tail16x
3331
3332 vpxord $xa0,$xa0,$xa0
3333 vmovdqa32 $xa0,0(%rsp)
3334
abb8c44f 3335.Ldone16x:
3c274a6e 3336 vzeroall
3337___
3338$code.=<<___ if ($win64);
3339 movaps -0xa8(%r9),%xmm6
3340 movaps -0x98(%r9),%xmm7
3341 movaps -0x88(%r9),%xmm8
3342 movaps -0x78(%r9),%xmm9
3343 movaps -0x68(%r9),%xmm10
3344 movaps -0x58(%r9),%xmm11
3345 movaps -0x48(%r9),%xmm12
3346 movaps -0x38(%r9),%xmm13
3347 movaps -0x28(%r9),%xmm14
3348 movaps -0x18(%r9),%xmm15
3349___
3350$code.=<<___;
384e6de4 3351 lea (%r9),%rsp
f17652e5 3352.cfi_def_cfa_register %rsp
384e6de4 3353.L16x_epilogue:
abb8c44f 3354 ret
f17652e5 3355.cfi_endproc
3356.size ChaCha20_16x,.-ChaCha20_16x
3357___
3358
3359# switch to %ymm domain
3360($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3361 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
3362@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3363 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
3364@key=map("%ymm$_",(16..31));
3365($xt0,$xt1,$xt2,$xt3)=@key[0..3];
3366
3367$code.=<<___;
3368.type ChaCha20_8xvl,\@function,5
3369.align 32
3370ChaCha20_8xvl:
3371.cfi_startproc
3372.LChaCha20_8xvl:
3373 mov %rsp,%r9 # frame register
3374.cfi_def_cfa_register %r9
3375 sub \$64+$xframe,%rsp
3376 and \$-64,%rsp
3377___
3378$code.=<<___ if ($win64);
3379 movaps %xmm6,-0xa8(%r9)
3380 movaps %xmm7,-0x98(%r9)
3381 movaps %xmm8,-0x88(%r9)
3382 movaps %xmm9,-0x78(%r9)
3383 movaps %xmm10,-0x68(%r9)
3384 movaps %xmm11,-0x58(%r9)
3385 movaps %xmm12,-0x48(%r9)
3386 movaps %xmm13,-0x38(%r9)
3387 movaps %xmm14,-0x28(%r9)
3388 movaps %xmm15,-0x18(%r9)
3389.L8xvl_body:
3390___
3391$code.=<<___;
3392 vzeroupper
3393
3394 lea .Lsigma(%rip),%r10
3395 vbroadcasti128 (%r10),$xa3 # key[0]
3396 vbroadcasti128 ($key),$xb3 # key[1]
3397 vbroadcasti128 16($key),$xc3 # key[2]
3398 vbroadcasti128 ($counter),$xd3 # key[3]
3399
3400 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
3401 vpshufd \$0x55,$xa3,$xa1
3402 vpshufd \$0xaa,$xa3,$xa2
3403 vpshufd \$0xff,$xa3,$xa3
3404 vmovdqa64 $xa0,@key[0]
3405 vmovdqa64 $xa1,@key[1]
3406 vmovdqa64 $xa2,@key[2]
3407 vmovdqa64 $xa3,@key[3]
3408
3409 vpshufd \$0x00,$xb3,$xb0
3410 vpshufd \$0x55,$xb3,$xb1
3411 vpshufd \$0xaa,$xb3,$xb2
3412 vpshufd \$0xff,$xb3,$xb3
3413 vmovdqa64 $xb0,@key[4]
3414 vmovdqa64 $xb1,@key[5]
3415 vmovdqa64 $xb2,@key[6]
3416 vmovdqa64 $xb3,@key[7]
3417
3418 vpshufd \$0x00,$xc3,$xc0
3419 vpshufd \$0x55,$xc3,$xc1
3420 vpshufd \$0xaa,$xc3,$xc2
3421 vpshufd \$0xff,$xc3,$xc3
3422 vmovdqa64 $xc0,@key[8]
3423 vmovdqa64 $xc1,@key[9]
3424 vmovdqa64 $xc2,@key[10]
3425 vmovdqa64 $xc3,@key[11]
3426
3427 vpshufd \$0x00,$xd3,$xd0
3428 vpshufd \$0x55,$xd3,$xd1
3429 vpshufd \$0xaa,$xd3,$xd2
3430 vpshufd \$0xff,$xd3,$xd3
3431 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
3432 vmovdqa64 $xd0,@key[12]
3433 vmovdqa64 $xd1,@key[13]
3434 vmovdqa64 $xd2,@key[14]
3435 vmovdqa64 $xd3,@key[15]
3436
3437 mov \$10,%eax
3438 jmp .Loop8xvl
3439
3440.align 32
3441.Loop_outer8xvl:
3442 #vpbroadcastd 0(%r10),$xa0 # reload key
3443 #vpbroadcastd 4(%r10),$xa1
3444 vpbroadcastd 8(%r10),$xa2
3445 vpbroadcastd 12(%r10),$xa3
3446 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
3447 vmovdqa64 @key[4],$xb0
3448 vmovdqa64 @key[5],$xb1
3449 vmovdqa64 @key[6],$xb2
3450 vmovdqa64 @key[7],$xb3
3451 vmovdqa64 @key[8],$xc0
3452 vmovdqa64 @key[9],$xc1
3453 vmovdqa64 @key[10],$xc2
3454 vmovdqa64 @key[11],$xc3
3455 vmovdqa64 @key[12],$xd0
3456 vmovdqa64 @key[13],$xd1
3457 vmovdqa64 @key[14],$xd2
3458 vmovdqa64 @key[15],$xd3
3459
3460 vmovdqa64 $xa0,@key[0]
3461 vmovdqa64 $xa1,@key[1]
3462 vmovdqa64 $xa2,@key[2]
3463 vmovdqa64 $xa3,@key[3]
3464
3465 mov \$10,%eax
3466 jmp .Loop8xvl
3467
3468.align 32
3469.Loop8xvl:
3470___
3471 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3472 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3473$code.=<<___;
3474 dec %eax
3475 jnz .Loop8xvl
3476
3477 vpaddd @key[0],$xa0,$xa0 # accumulate key
3478 vpaddd @key[1],$xa1,$xa1
3479 vpaddd @key[2],$xa2,$xa2
3480 vpaddd @key[3],$xa3,$xa3
3481
3482 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
3483 vpunpckldq $xa3,$xa2,$xt3
3484 vpunpckhdq $xa1,$xa0,$xa0
3485 vpunpckhdq $xa3,$xa2,$xa2
3486 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
3487 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
3488 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
3489 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
3490___
3491 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3492$code.=<<___;
3493 vpaddd @key[4],$xb0,$xb0
3494 vpaddd @key[5],$xb1,$xb1
3495 vpaddd @key[6],$xb2,$xb2
3496 vpaddd @key[7],$xb3,$xb3
3497
3498 vpunpckldq $xb1,$xb0,$xt2
3499 vpunpckldq $xb3,$xb2,$xt3
3500 vpunpckhdq $xb1,$xb0,$xb0
3501 vpunpckhdq $xb3,$xb2,$xb2
3502 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
3503 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
3504 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
3505 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
3506___
3507 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3508$code.=<<___;
3509 vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further
3510 vshufi32x4 \$3,$xb0,$xa0,$xb0
3511 vshufi32x4 \$0,$xb1,$xa1,$xa0
3512 vshufi32x4 \$3,$xb1,$xa1,$xb1
3513 vshufi32x4 \$0,$xb2,$xa2,$xa1
3514 vshufi32x4 \$3,$xb2,$xa2,$xb2
3515 vshufi32x4 \$0,$xb3,$xa3,$xa2
3516 vshufi32x4 \$3,$xb3,$xa3,$xb3
3517___
3518 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3519$code.=<<___;
3520 vpaddd @key[8],$xc0,$xc0
3521 vpaddd @key[9],$xc1,$xc1
3522 vpaddd @key[10],$xc2,$xc2
3523 vpaddd @key[11],$xc3,$xc3
3524
3525 vpunpckldq $xc1,$xc0,$xt2
3526 vpunpckldq $xc3,$xc2,$xt3
3527 vpunpckhdq $xc1,$xc0,$xc0
3528 vpunpckhdq $xc3,$xc2,$xc2
3529 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
3530 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
3531 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
3532 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
3533___
3534 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3535$code.=<<___;
3536 vpaddd @key[12],$xd0,$xd0
3537 vpaddd @key[13],$xd1,$xd1
3538 vpaddd @key[14],$xd2,$xd2
3539 vpaddd @key[15],$xd3,$xd3
3540
3541 vpunpckldq $xd1,$xd0,$xt2
3542 vpunpckldq $xd3,$xd2,$xt3
3543 vpunpckhdq $xd1,$xd0,$xd0
3544 vpunpckhdq $xd3,$xd2,$xd2
3545 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
3546 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
3547 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
3548 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
3549___
3550 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3551$code.=<<___;
3552 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
3553 vperm2i128 \$0x31,$xd0,$xc0,$xd0
3554 vperm2i128 \$0x20,$xd1,$xc1,$xc0
3555 vperm2i128 \$0x31,$xd1,$xc1,$xd1
3556 vperm2i128 \$0x20,$xd2,$xc2,$xc1
3557 vperm2i128 \$0x31,$xd2,$xc2,$xd2
3558 vperm2i128 \$0x20,$xd3,$xc3,$xc2
3559 vperm2i128 \$0x31,$xd3,$xc3,$xd3
3560___
3561 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3562 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
3563 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
3564$code.=<<___;
3565 cmp \$64*8,$len
3566 jb .Ltail8xvl
3567
3568 mov \$0x80,%eax # size optimization
3569 vpxord 0x00($inp),$xa0,$xa0 # xor with input
3570 vpxor 0x20($inp),$xb0,$xb0
3571 vpxor 0x40($inp),$xc0,$xc0
3572 vpxor 0x60($inp),$xd0,$xd0
3573 lea ($inp,%rax),$inp # size optimization
3574 vmovdqu32 $xa0,0x00($out)
3575 vmovdqu $xb0,0x20($out)
3576 vmovdqu $xc0,0x40($out)
3577 vmovdqu $xd0,0x60($out)
3578 lea ($out,%rax),$out # size optimization
3579
3580 vpxor 0x00($inp),$xa1,$xa1
3581 vpxor 0x20($inp),$xb1,$xb1
3582 vpxor 0x40($inp),$xc1,$xc1
3583 vpxor 0x60($inp),$xd1,$xd1
3584 lea ($inp,%rax),$inp # size optimization
3585 vmovdqu $xa1,0x00($out)
3586 vmovdqu $xb1,0x20($out)
3587 vmovdqu $xc1,0x40($out)
3588 vmovdqu $xd1,0x60($out)
3589 lea ($out,%rax),$out # size optimization
3590
3591 vpxord 0x00($inp),$xa2,$xa2
3592 vpxor 0x20($inp),$xb2,$xb2
3593 vpxor 0x40($inp),$xc2,$xc2
3594 vpxor 0x60($inp),$xd2,$xd2
3595 lea ($inp,%rax),$inp # size optimization
3596 vmovdqu32 $xa2,0x00($out)
3597 vmovdqu $xb2,0x20($out)
3598 vmovdqu $xc2,0x40($out)
3599 vmovdqu $xd2,0x60($out)
3600 lea ($out,%rax),$out # size optimization
3601
3602 vpxor 0x00($inp),$xa3,$xa3
3603 vpxor 0x20($inp),$xb3,$xb3
3604 vpxor 0x40($inp),$xc3,$xc3
3605 vpxor 0x60($inp),$xd3,$xd3
3606 lea ($inp,%rax),$inp # size optimization
3607 vmovdqu $xa3,0x00($out)
3608 vmovdqu $xb3,0x20($out)
3609 vmovdqu $xc3,0x40($out)
3610 vmovdqu $xd3,0x60($out)
3611 lea ($out,%rax),$out # size optimization
3612
3613 vpbroadcastd 0(%r10),%ymm0 # reload key
3614 vpbroadcastd 4(%r10),%ymm1
3615
3616 sub \$64*8,$len
3617 jnz .Loop_outer8xvl
3618
3619 jmp .Ldone8xvl
3620
3621.align 32
3622.Ltail8xvl:
3623 vmovdqa64 $xa0,%ymm8 # size optimization
3624___
3625$xa0 = "%ymm8";
3626$code.=<<___;
3627 xor %r10,%r10
3628 sub $inp,$out
3629 cmp \$64*1,$len
3630 jb .Less_than_64_8xvl
3631 vpxor 0x00($inp),$xa0,$xa0 # xor with input
3632 vpxor 0x20($inp),$xb0,$xb0
3633 vmovdqu $xa0,0x00($out,$inp)
3634 vmovdqu $xb0,0x20($out,$inp)
3635 je .Ldone8xvl
3636 vmovdqa $xc0,$xa0
3637 vmovdqa $xd0,$xb0
3638 lea 64($inp),$inp
3639
3640 cmp \$64*2,$len
3641 jb .Less_than_64_8xvl
3642 vpxor 0x00($inp),$xc0,$xc0
3643 vpxor 0x20($inp),$xd0,$xd0
3644 vmovdqu $xc0,0x00($out,$inp)
3645 vmovdqu $xd0,0x20($out,$inp)
3646 je .Ldone8xvl
3647 vmovdqa $xa1,$xa0
3648 vmovdqa $xb1,$xb0
3649 lea 64($inp),$inp
3650
3651 cmp \$64*3,$len
3652 jb .Less_than_64_8xvl
3653 vpxor 0x00($inp),$xa1,$xa1
3654 vpxor 0x20($inp),$xb1,$xb1
3655 vmovdqu $xa1,0x00($out,$inp)
3656 vmovdqu $xb1,0x20($out,$inp)
3657 je .Ldone8xvl
3658 vmovdqa $xc1,$xa0
3659 vmovdqa $xd1,$xb0
3660 lea 64($inp),$inp
3661
3662 cmp \$64*4,$len
3663 jb .Less_than_64_8xvl
3664 vpxor 0x00($inp),$xc1,$xc1
3665 vpxor 0x20($inp),$xd1,$xd1
3666 vmovdqu $xc1,0x00($out,$inp)
3667 vmovdqu $xd1,0x20($out,$inp)
3668 je .Ldone8xvl
3669 vmovdqa32 $xa2,$xa0
3670 vmovdqa $xb2,$xb0
3671 lea 64($inp),$inp
3672
3673 cmp \$64*5,$len
3674 jb .Less_than_64_8xvl
3675 vpxord 0x00($inp),$xa2,$xa2
3676 vpxor 0x20($inp),$xb2,$xb2
3677 vmovdqu32 $xa2,0x00($out,$inp)
3678 vmovdqu $xb2,0x20($out,$inp)
3679 je .Ldone8xvl
3680 vmovdqa $xc2,$xa0
3681 vmovdqa $xd2,$xb0
3682 lea 64($inp),$inp
3683
3684 cmp \$64*6,$len
3685 jb .Less_than_64_8xvl
3686 vpxor 0x00($inp),$xc2,$xc2
3687 vpxor 0x20($inp),$xd2,$xd2
3688 vmovdqu $xc2,0x00($out,$inp)
3689 vmovdqu $xd2,0x20($out,$inp)
3690 je .Ldone8xvl
3691 vmovdqa $xa3,$xa0
3692 vmovdqa $xb3,$xb0
3693 lea 64($inp),$inp
3694
3695 cmp \$64*7,$len
3696 jb .Less_than_64_8xvl
3697 vpxor 0x00($inp),$xa3,$xa3
3698 vpxor 0x20($inp),$xb3,$xb3
3699 vmovdqu $xa3,0x00($out,$inp)
3700 vmovdqu $xb3,0x20($out,$inp)
3701 je .Ldone8xvl
3702 vmovdqa $xc3,$xa0
3703 vmovdqa $xd3,$xb0
3704 lea 64($inp),$inp
3705
3706.Less_than_64_8xvl:
3707 vmovdqa $xa0,0x00(%rsp)
3708 vmovdqa $xb0,0x20(%rsp)
3709 lea ($out,$inp),$out
3710 and \$63,$len
3711
3712.Loop_tail8xvl:
3713 movzb ($inp,%r10),%eax
3714 movzb (%rsp,%r10),%ecx
3715 lea 1(%r10),%r10
3716 xor %ecx,%eax
3717 mov %al,-1($out,%r10)
3718 dec $len
3719 jnz .Loop_tail8xvl
3720
3721 vpxor $xa0,$xa0,$xa0
3722 vmovdqa $xa0,0x00(%rsp)
3723 vmovdqa $xa0,0x20(%rsp)
3724
3725.Ldone8xvl:
3726 vzeroall
3727___
3728$code.=<<___ if ($win64);
3729 movaps -0xa8(%r9),%xmm6
3730 movaps -0x98(%r9),%xmm7
3731 movaps -0x88(%r9),%xmm8
3732 movaps -0x78(%r9),%xmm9
3733 movaps -0x68(%r9),%xmm10
3734 movaps -0x58(%r9),%xmm11
3735 movaps -0x48(%r9),%xmm12
3736 movaps -0x38(%r9),%xmm13
3737 movaps -0x28(%r9),%xmm14
3738 movaps -0x18(%r9),%xmm15
3739___
3740$code.=<<___;
3741 lea (%r9),%rsp
3742.cfi_def_cfa_register %rsp
3743.L8xvl_epilogue:
3744 ret
3745.cfi_endproc
3746.size ChaCha20_8xvl,.-ChaCha20_8xvl
3747___
3748}
3749
3750# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3751# CONTEXT *context,DISPATCHER_CONTEXT *disp)
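# If an exception is dispatched while one of the routines above still owns
# its scratch frame, these handlers recover the caller's state: se_handler
# covers the integer-only ChaCha20_ctr32 path, while simd_handler restores
# the saved %xmm6-%xmm15 block from just below the frame pointer kept in
# %r9. HandlerData[] holds the prologue and epilogue labels plus the size of
# that save area (0xa0 for the wide paths, 0x20 where only %xmm6/%xmm7 are
# preserved).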
3752if ($win64) {
3753$rec="%rcx";
3754$frame="%rdx";
3755$context="%r8";
3756$disp="%r9";
3757
3758$code.=<<___;
3759.extern __imp_RtlVirtualUnwind
3760.type se_handler,\@abi-omnipotent
3761.align 16
3762se_handler:
3763 push %rsi
3764 push %rdi
3765 push %rbx
3766 push %rbp
3767 push %r12
3768 push %r13
3769 push %r14
3770 push %r15
3771 pushfq
3772 sub \$64,%rsp
3773
3774 mov 120($context),%rax # pull context->Rax
3775 mov 248($context),%rbx # pull context->Rip
3776
3777 mov 8($disp),%rsi # disp->ImageBase
3778 mov 56($disp),%r11 # disp->HandlerData
3779
3780 lea .Lctr32_body(%rip),%r10
3781	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
3782 jb .Lcommon_seh_tail
3783
3784 mov 152($context),%rax # pull context->Rsp
3785
3786 lea .Lno_data(%rip),%r10 # epilogue label
3787	cmp	%r10,%rbx		# context->Rip>=.Lno_data
3788 jae .Lcommon_seh_tail
3789
3790 lea 64+24+48(%rax),%rax
3791
3792 mov -8(%rax),%rbx
3793 mov -16(%rax),%rbp
3794 mov -24(%rax),%r12
3795 mov -32(%rax),%r13
3796 mov -40(%rax),%r14
3797 mov -48(%rax),%r15
3798 mov %rbx,144($context) # restore context->Rbx
3799 mov %rbp,160($context) # restore context->Rbp
3800 mov %r12,216($context) # restore context->R12
3801 mov %r13,224($context) # restore context->R13
3802 mov %r14,232($context) # restore context->R14
3803	mov	%r15,240($context)	# restore context->R15
3804
3805.Lcommon_seh_tail:
3806 mov 8(%rax),%rdi
3807 mov 16(%rax),%rsi
3808 mov %rax,152($context) # restore context->Rsp
3809 mov %rsi,168($context) # restore context->Rsi
3810 mov %rdi,176($context) # restore context->Rdi
3811
3812 mov 40($disp),%rdi # disp->ContextRecord
3813 mov $context,%rsi # context
3814 mov \$154,%ecx # sizeof(CONTEXT)
3815 .long 0xa548f3fc # cld; rep movsq
3816
3817 mov $disp,%rsi
3818 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3819 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3820 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3821 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3822 mov 40(%rsi),%r10 # disp->ContextRecord
3823 lea 56(%rsi),%r11 # &disp->HandlerData
3824 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3825 mov %r10,32(%rsp) # arg5
3826 mov %r11,40(%rsp) # arg6
3827 mov %r12,48(%rsp) # arg7
3828 mov %rcx,56(%rsp) # arg8, (NULL)
3829 call *__imp_RtlVirtualUnwind(%rip)
3830
3831 mov \$1,%eax # ExceptionContinueSearch
3832 add \$64,%rsp
3833 popfq
3834 pop %r15
3835 pop %r14
3836 pop %r13
3837 pop %r12
3838 pop %rbp
3839 pop %rbx
3840 pop %rdi
3841 pop %rsi
3842 ret
3843.size se_handler,.-se_handler
3844
d5487a45 3845.type simd_handler,\@abi-omnipotent
384e6de4 3846.align 16
d5487a45 3847simd_handler:
3848 push %rsi
3849 push %rdi
3850 push %rbx
3851 push %rbp
3852 push %r12
3853 push %r13
3854 push %r14
3855 push %r15
3856 pushfq
3857 sub \$64,%rsp
3858
3859 mov 120($context),%rax # pull context->Rax
3860 mov 248($context),%rbx # pull context->Rip
3861
3862 mov 8($disp),%rsi # disp->ImageBase
3863 mov 56($disp),%r11 # disp->HandlerData
3864
3865 mov 0(%r11),%r10d # HandlerData[0]
3866 lea (%rsi,%r10),%r10 # prologue label
3867 cmp %r10,%rbx # context->Rip<prologue label
3868 jb .Lcommon_seh_tail
3869
3870 mov 192($context),%rax # pull context->R9
3871
3872 mov 4(%r11),%r10d # HandlerData[1]
d5487a45 3873 mov 8(%r11),%ecx # HandlerData[2]
3874 lea (%rsi,%r10),%r10 # epilogue label
3875 cmp %r10,%rbx # context->Rip>=epilogue label
3876 jae .Lcommon_seh_tail
3877
3878 neg %rcx
3879 lea -8(%rax,%rcx),%rsi
384e6de4 3880 lea 512($context),%rdi # &context.Xmm6
3881 neg %ecx
3882 shr \$3,%ecx
3883 .long 0xa548f3fc # cld; rep movsq
3884
3885 jmp .Lcommon_seh_tail
d5487a45 3886.size simd_handler,.-simd_handler
3887
3888.section .pdata
3889.align 4
3890 .rva .LSEH_begin_ChaCha20_ctr32
3891 .rva .LSEH_end_ChaCha20_ctr32
3892 .rva .LSEH_info_ChaCha20_ctr32
3893
3894 .rva .LSEH_begin_ChaCha20_ssse3
3895 .rva .LSEH_end_ChaCha20_ssse3
3896 .rva .LSEH_info_ChaCha20_ssse3
3897
3898 .rva .LSEH_begin_ChaCha20_128
3899 .rva .LSEH_end_ChaCha20_128
3900 .rva .LSEH_info_ChaCha20_128
3901
3902 .rva .LSEH_begin_ChaCha20_4x
3903 .rva .LSEH_end_ChaCha20_4x
3904 .rva .LSEH_info_ChaCha20_4x
3905___
3906$code.=<<___ if ($avx);
3907 .rva .LSEH_begin_ChaCha20_4xop
3908 .rva .LSEH_end_ChaCha20_4xop
3909 .rva .LSEH_info_ChaCha20_4xop
3910___
3911$code.=<<___ if ($avx>1);
3912 .rva .LSEH_begin_ChaCha20_8x
3913 .rva .LSEH_end_ChaCha20_8x
3914 .rva .LSEH_info_ChaCha20_8x
3915___
3916$code.=<<___ if ($avx>2);
3917 .rva .LSEH_begin_ChaCha20_avx512
3918 .rva .LSEH_end_ChaCha20_avx512
3919 .rva .LSEH_info_ChaCha20_avx512
3920
3921 .rva .LSEH_begin_ChaCha20_avx512vl
3922 .rva .LSEH_end_ChaCha20_avx512vl
3923 .rva .LSEH_info_ChaCha20_avx512vl
3924
3925 .rva .LSEH_begin_ChaCha20_16x
3926 .rva .LSEH_end_ChaCha20_16x
3927 .rva .LSEH_info_ChaCha20_16x
3928
3929 .rva .LSEH_begin_ChaCha20_8xvl
3930 .rva .LSEH_end_ChaCha20_8xvl
3931 .rva .LSEH_info_ChaCha20_8xvl
3932___
3933$code.=<<___;
3934.section .xdata
3935.align 8
3936.LSEH_info_ChaCha20_ctr32:
3937 .byte 9,0,0,0
3938 .rva se_handler
3939
3940.LSEH_info_ChaCha20_ssse3:
3941 .byte 9,0,0,0
d5487a45 3942 .rva simd_handler
384e6de4 3943 .rva .Lssse3_body,.Lssse3_epilogue
3944 .long 0x20,0
3945
3946.LSEH_info_ChaCha20_128:
3947 .byte 9,0,0,0
3948 .rva simd_handler
3949 .rva .L128_body,.L128_epilogue
3950 .long 0x60,0
3951
3952.LSEH_info_ChaCha20_4x:
3953 .byte 9,0,0,0
d5487a45 3954 .rva simd_handler
384e6de4 3955 .rva .L4x_body,.L4x_epilogue
d5487a45 3956 .long 0xa0,0
3957___
3958$code.=<<___ if ($avx);
3959.LSEH_info_ChaCha20_4xop:
3960 .byte 9,0,0,0
d5487a45 3961 .rva simd_handler
384e6de4 3962 .rva .L4xop_body,.L4xop_epilogue # HandlerData[]
d5487a45 3963 .long 0xa0,0
3964___
3965$code.=<<___ if ($avx>1);
3966.LSEH_info_ChaCha20_8x:
3967 .byte 9,0,0,0
d5487a45 3968 .rva simd_handler
384e6de4 3969 .rva .L8x_body,.L8x_epilogue # HandlerData[]
d5487a45 3970 .long 0xa0,0
3971___
3972$code.=<<___ if ($avx>2);
3973.LSEH_info_ChaCha20_avx512:
3974 .byte 9,0,0,0
d5487a45 3975 .rva simd_handler
384e6de4 3976 .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
d5487a45 3977 .long 0x20,0
384e6de4 3978
3979.LSEH_info_ChaCha20_avx512vl:
3980 .byte 9,0,0,0
d5487a45 3981 .rva simd_handler
cded9513 3982 .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[]
d5487a45 3983 .long 0x20,0
cded9513 3984
3985.LSEH_info_ChaCha20_16x:
3986 .byte 9,0,0,0
d5487a45 3987 .rva simd_handler
384e6de4 3988 .rva .L16x_body,.L16x_epilogue # HandlerData[]
d5487a45 3989 .long 0xa0,0
3990
3991.LSEH_info_ChaCha20_8xvl:
3992 .byte 9,0,0,0
d5487a45 3993 .rva simd_handler
cded9513 3994 .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[]
d5487a45 3995 .long 0xa0,0
3996___
3997}
3998
a98c648e 3999foreach (split("\n",$code)) {
3c274a6e 4000 s/\`([^\`]*)\`/eval $1/ge;
a98c648e 4001
3c274a6e 4002 s/%x#%[yz]/%x/g; # "down-shift"
4003
4004 print $_,"\n";
4005}
4006
a21314db 4007close STDOUT or die "error closing STDOUT: $!";