1#! /usr/bin/env perl
2# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# November 2014
18#
19# ChaCha20 for x86_64.
20#
21# December 2016
22#
23# Add AVX512F code path.
24#
25# Performance in cycles per byte out of large buffer.
26#
27# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2
28#
29# P4 9.48/+99% -/22.7(ii) -
30# Core2 7.83/+55% 7.90/8.08 4.35
31# Westmere 7.19/+50% 5.60/6.70 3.00
32# Sandy Bridge 8.31/+42% 5.45/6.76 2.72
33# Ivy Bridge 6.71/+46% 5.40/6.49 2.41
34# Haswell 5.92/+43% 5.20/6.45 2.42 1.23
 35# Skylake	5.87/+39%	4.70/-		2.31		1.19
 36# Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
 37# Goldmont	10.6/+17%	5.10/-		3.28
38# Sledgehammer 7.28/+52% -/14.2(ii) -
39# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
40# VIA Nano 10.5/+46% 6.72/8.60 6.05
41#
 42# (i)	compared to older gcc 3.x, one can observe a >2x improvement on
 43#	most platforms;
 44# (ii)	as can be seen, SSE2 performance is too low on legacy
 45#	processors; NxSSE2 results are naturally better, but not
 46#	impressively better than IALU ones, which is why you won't
 47#	find SSE2 code below;
 48# (iii)	this is not an optimal result for Atom because of MSROM
 49#	limitations; SSE2 can do better, but the gain is considered too
 50#	low to justify the [maintenance] effort;
 51# (iv)	Bulldozer actually executes the 4xXOP code path, which delivers 2.20;
52
53$flavour = shift;
54$output = shift;
55if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
56
57$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
58
59$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
61( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
62die "can't locate x86_64-xlate.pl";
63
64if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
 66	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
67}
68
69if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 70	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
71 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
72 $avx += 1 if ($1==2.11 && $2>=8);
73}
74
75if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
76 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
77 $avx = ($1>=10) + ($1>=11);
78}
79
80if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
81 $avx = ($2>=3.0) + ($2>3.0);
82}
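# The $avx level selected above gates which SIMD code paths get emitted
# further down: $avx>=1 enables the 4xXOP path, $avx>=2 the AVX2 (8x)
# path and $avx>2 the AVX512F run-time probe (a summary added for
# orientation; the authoritative conditions are the "if ($avx...)"
# guards below).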
83
 84open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
85*STDOUT=*OUT;
86
87# input parameter block
88($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
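# In C terms the assignment above means the generated entry points take
# their arguments in this order (a note added for orientation; the exact
# prototype is assumed to match the one used by the C glue code):
#
#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#	                    size_t len, const unsigned int key[8],
#	                    const unsigned int counter[4]);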
89
90$code.=<<___;
91.text
92
93.extern OPENSSL_ia32cap_P
94
95.align 64
96.Lzero:
97.long 0,0,0,0
98.Lone:
99.long 1,0,0,0
100.Linc:
101.long 0,1,2,3
102.Lfour:
103.long 4,4,4,4
104.Lincy:
105.long 0,2,4,6,1,3,5,7
106.Leight:
107.long 8,8,8,8,8,8,8,8
108.Lrot16:
109.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
110.Lrot24:
111.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
112.Lsigma:
113.asciz "expand 32-byte k"
114.align	64
115.Lzeroz:
116.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
117.Lfourz:
118.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
119.Lincz:
120.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
121.Lsixteen:
122.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
123.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
124___
125
126sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
127{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
128 my $arg = pop;
129 $arg = "\$$arg" if ($arg*1 eq $arg);
130 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
131}
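# For example, &add("%eax","%ebx") is caught by the thunk above and appends
# a tab-separated "add %ebx,%eax" line to $code (the destination-first call
# order is reversed into AT&T operand order), while a numeric last argument,
# as in &rol("%eax",16), is prefixed with '$' to become an immediate,
# yielding "rol $16,%eax".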
132
133@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
134 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
135@t=("%esi","%edi");
136
137sub ROUND { # critical path is 24 cycles per round
138my ($a0,$b0,$c0,$d0)=@_;
139my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
140my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
141my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
142my ($xc,$xc_)=map("\"$_\"",@t);
143my @x=map("\"$_\"",@x);
144
145 # Consider order in which variables are addressed by their
146 # index:
147 #
148 # a b c d
149 #
150 # 0 4 8 12 < even round
151 # 1 5 9 13
152 # 2 6 10 14
153 # 3 7 11 15
154 # 0 5 10 15 < odd round
155 # 1 6 11 12
156 # 2 7 8 13
157 # 3 4 9 14
158 #
159	# 'a', 'b' and 'd's are permanently allocated in registers,
160	# @x[0..7,12..15], while 'c's are maintained in memory. If
161	# you observe the 'c' column, you'll notice that a pair of 'c's
162	# is invariant between rounds. This means that we have to reload
163	# them only once per round, in the middle. This is why you'll
164	# see a bunch of 'c' stores and loads in the middle, but none
165	# at the beginning or end.
166
167	# Normally instructions would be interleaved to favour in-order
168	# execution. Out-of-order cores generally manage it gracefully,
169	# but not this time for some reason. As in-order execution
170	# cores are a dying breed and old Atom is the only one around,
171	# the instructions are left uninterleaved. Besides, Atom is
172	# better off executing the 1xSSSE3 code anyway...
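	# A worked example of the index step used above (note added for
	# clarity): ($_&~3)+(($_+1)&3) keeps the 4-aligned column base and
	# advances within it, so &ROUND(0,4,8,12) derives the remaining
	# quarter-rounds as (1,5,9,13), (2,6,10,14), (3,7,11,15), while
	# &ROUND(0,5,10,15) derives (1,6,11,12), (2,7,8,13), (3,4,9,14),
	# i.e. exactly the even- and odd-round rows in the table above.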
173
174 (
175 "&add (@x[$a0],@x[$b0])", # Q1
176 "&xor (@x[$d0],@x[$a0])",
177 "&rol (@x[$d0],16)",
178 "&add (@x[$a1],@x[$b1])", # Q2
179 "&xor (@x[$d1],@x[$a1])",
180 "&rol (@x[$d1],16)",
181
182 "&add ($xc,@x[$d0])",
183 "&xor (@x[$b0],$xc)",
184 "&rol (@x[$b0],12)",
185 "&add ($xc_,@x[$d1])",
186 "&xor (@x[$b1],$xc_)",
187 "&rol (@x[$b1],12)",
188
189 "&add (@x[$a0],@x[$b0])",
190 "&xor (@x[$d0],@x[$a0])",
191 "&rol (@x[$d0],8)",
192 "&add (@x[$a1],@x[$b1])",
193 "&xor (@x[$d1],@x[$a1])",
194 "&rol (@x[$d1],8)",
195
196 "&add ($xc,@x[$d0])",
197 "&xor (@x[$b0],$xc)",
198 "&rol (@x[$b0],7)",
199 "&add ($xc_,@x[$d1])",
200 "&xor (@x[$b1],$xc_)",
201 "&rol (@x[$b1],7)",
202
203 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
204 "&mov (\"4*$c1(%rsp)\",$xc_)",
205 "&mov ($xc,\"4*$c2(%rsp)\")",
206 "&mov ($xc_,\"4*$c3(%rsp)\")",
207
208 "&add (@x[$a2],@x[$b2])", # Q3
209 "&xor (@x[$d2],@x[$a2])",
210 "&rol (@x[$d2],16)",
211 "&add (@x[$a3],@x[$b3])", # Q4
212 "&xor (@x[$d3],@x[$a3])",
213 "&rol (@x[$d3],16)",
214
215 "&add ($xc,@x[$d2])",
216 "&xor (@x[$b2],$xc)",
217 "&rol (@x[$b2],12)",
218 "&add ($xc_,@x[$d3])",
219 "&xor (@x[$b3],$xc_)",
220 "&rol (@x[$b3],12)",
221
222 "&add (@x[$a2],@x[$b2])",
223 "&xor (@x[$d2],@x[$a2])",
224 "&rol (@x[$d2],8)",
225 "&add (@x[$a3],@x[$b3])",
226 "&xor (@x[$d3],@x[$a3])",
227 "&rol (@x[$d3],8)",
228
229 "&add ($xc,@x[$d2])",
230 "&xor (@x[$b2],$xc)",
231 "&rol (@x[$b2],7)",
232 "&add ($xc_,@x[$d3])",
233 "&xor (@x[$b3],$xc_)",
234 "&rol (@x[$b3],7)"
235 );
236}
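
# For reference, each add/xor/rol triplet emitted by ROUND above is one
# step of the standard ChaCha quarter-round. A minimal scalar sketch of
# that quarter-round follows (illustration only; the helper is made up
# here and is not used by the generator):
sub __chacha_quarter_round_ref {
my ($a,$b,$c,$d)=@_;
	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<8) |($d>>24))&0xffffffff;
	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<7) |($b>>25))&0xffffffff;
	($a,$b,$c,$d);
}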
237
238########################################################################
239# Generic code path that handles all lengths on pre-SSSE3 processors.
240$code.=<<___;
241.globl ChaCha20_ctr32
242.type ChaCha20_ctr32,\@function,5
243.align 64
244ChaCha20_ctr32:
245 cmp \$0,$len
246 je .Lno_data
247	mov	OPENSSL_ia32cap_P+4(%rip),%r10
248___
249$code.=<<___ if ($avx>2);
250 bt \$48,%r10 # check for AVX512F
251 jc .LChaCha20_avx512
252___
253$code.=<<___;
254 test \$`1<<(41-32)`,%r10d
255 jnz .LChaCha20_ssse3
256
257 push %rbx
258 push %rbp
259 push %r12
260 push %r13
261 push %r14
262 push %r15
263 sub \$64+24,%rsp
264
265 #movdqa .Lsigma(%rip),%xmm0
266 movdqu ($key),%xmm1
267 movdqu 16($key),%xmm2
268 movdqu ($counter),%xmm3
269 movdqa .Lone(%rip),%xmm4
270
271 #movdqa %xmm0,4*0(%rsp) # key[0]
272 movdqa %xmm1,4*4(%rsp) # key[1]
273 movdqa %xmm2,4*8(%rsp) # key[2]
274 movdqa %xmm3,4*12(%rsp) # key[3]
275 mov $len,%rbp # reassign $len
276 jmp .Loop_outer
277
278.align 32
279.Loop_outer:
280 mov \$0x61707865,@x[0] # 'expa'
281 mov \$0x3320646e,@x[1] # 'nd 3'
282 mov \$0x79622d32,@x[2] # '2-by'
283 mov \$0x6b206574,@x[3] # 'te k'
284 mov 4*4(%rsp),@x[4]
285 mov 4*5(%rsp),@x[5]
286 mov 4*6(%rsp),@x[6]
287 mov 4*7(%rsp),@x[7]
288 movd %xmm3,@x[12]
289 mov 4*13(%rsp),@x[13]
290 mov 4*14(%rsp),@x[14]
291 mov 4*15(%rsp),@x[15]
292
293 mov %rbp,64+0(%rsp) # save len
294 mov \$10,%ebp
295 mov $inp,64+8(%rsp) # save inp
296 movq %xmm2,%rsi # "@x[8]"
297 mov $out,64+16(%rsp) # save out
298 mov %rsi,%rdi
299 shr \$32,%rdi # "@x[9]"
300 jmp .Loop
301
302.align 32
303.Loop:
304___
305 foreach (&ROUND (0, 4, 8,12)) { eval; }
306 foreach (&ROUND (0, 5,10,15)) { eval; }
307 &dec ("%ebp");
308 &jnz (".Loop");
309
310$code.=<<___;
311 mov @t[1],4*9(%rsp) # modulo-scheduled
312 mov @t[0],4*8(%rsp)
313 mov 64(%rsp),%rbp # load len
314 movdqa %xmm2,%xmm1
315 mov 64+8(%rsp),$inp # load inp
316 paddd %xmm4,%xmm3 # increment counter
317 mov 64+16(%rsp),$out # load out
318
319 add \$0x61707865,@x[0] # 'expa'
320 add \$0x3320646e,@x[1] # 'nd 3'
321 add \$0x79622d32,@x[2] # '2-by'
322 add \$0x6b206574,@x[3] # 'te k'
323 add 4*4(%rsp),@x[4]
324 add 4*5(%rsp),@x[5]
325 add 4*6(%rsp),@x[6]
326 add 4*7(%rsp),@x[7]
327 add 4*12(%rsp),@x[12]
328 add 4*13(%rsp),@x[13]
329 add 4*14(%rsp),@x[14]
330 add 4*15(%rsp),@x[15]
331 paddd 4*8(%rsp),%xmm1
332
333 cmp \$64,%rbp
334 jb .Ltail
335
336 xor 4*0($inp),@x[0] # xor with input
337 xor 4*1($inp),@x[1]
338 xor 4*2($inp),@x[2]
339 xor 4*3($inp),@x[3]
340 xor 4*4($inp),@x[4]
341 xor 4*5($inp),@x[5]
342 xor 4*6($inp),@x[6]
343 xor 4*7($inp),@x[7]
344 movdqu 4*8($inp),%xmm0
345 xor 4*12($inp),@x[12]
346 xor 4*13($inp),@x[13]
347 xor 4*14($inp),@x[14]
348 xor 4*15($inp),@x[15]
349 lea 4*16($inp),$inp # inp+=64
350 pxor %xmm1,%xmm0
351
352 movdqa %xmm2,4*8(%rsp)
353 movd %xmm3,4*12(%rsp)
354
355 mov @x[0],4*0($out) # write output
356 mov @x[1],4*1($out)
357 mov @x[2],4*2($out)
358 mov @x[3],4*3($out)
359 mov @x[4],4*4($out)
360 mov @x[5],4*5($out)
361 mov @x[6],4*6($out)
362 mov @x[7],4*7($out)
363 movdqu %xmm0,4*8($out)
364 mov @x[12],4*12($out)
365 mov @x[13],4*13($out)
366 mov @x[14],4*14($out)
367 mov @x[15],4*15($out)
368 lea 4*16($out),$out # out+=64
369
370 sub \$64,%rbp
371 jnz .Loop_outer
372
373 jmp .Ldone
374
375.align 16
376.Ltail:
377 mov @x[0],4*0(%rsp)
378	mov	@x[1],4*1(%rsp)
379	xor	%rbx,%rbx
380 mov @x[2],4*2(%rsp)
381 mov @x[3],4*3(%rsp)
382 mov @x[4],4*4(%rsp)
383 mov @x[5],4*5(%rsp)
384 mov @x[6],4*6(%rsp)
385 mov @x[7],4*7(%rsp)
386 movdqa %xmm1,4*8(%rsp)
387 mov @x[12],4*12(%rsp)
388 mov @x[13],4*13(%rsp)
389 mov @x[14],4*14(%rsp)
390 mov @x[15],4*15(%rsp)
391
392.Loop_tail:
393 movzb ($inp,%rbx),%eax
394 movzb (%rsp,%rbx),%edx
395 lea 1(%rbx),%rbx
396 xor %edx,%eax
397 mov %al,-1($out,%rbx)
398 dec %rbp
399 jnz .Loop_tail
400
401.Ldone:
402 add \$64+24,%rsp
403 pop %r15
404 pop %r14
405 pop %r13
406 pop %r12
407 pop %rbp
408 pop %rbx
409.Lno_data:
410 ret
411.size ChaCha20_ctr32,.-ChaCha20_ctr32
412___
413
414########################################################################
415# SSSE3 code path that handles shorter lengths
416{
417my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
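# A note on data layout in this code path (added for orientation): a single
# 64-byte block lives in four xmm registers, with $a holding state words
# 0-3 (the .Lsigma constants), $b words 4-7, $c words 8-11 and $d words
# 12-15. The pshufd shuffles between the two half-rounds below rotate
# $b, $c and $d so that column rounds become diagonal rounds and back.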
418
419sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
420 &paddd ($a,$b);
421 &pxor ($d,$a);
422 &pshufb ($d,$rot16);
423
424 &paddd ($c,$d);
425 &pxor ($b,$c);
426 &movdqa ($t,$b);
427 &psrld ($b,20);
428 &pslld ($t,12);
429 &por ($b,$t);
430
431 &paddd ($a,$b);
432 &pxor ($d,$a);
433 &pshufb ($d,$rot24);
434
435 &paddd ($c,$d);
436 &pxor ($b,$c);
437 &movdqa ($t,$b);
438 &psrld ($b,25);
439 &pslld ($t,7);
440 &por ($b,$t);
441}
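# Note on the rotations above (added for clarity): the 16- and 8-bit
# rotates are byte-granular, so each is a single pshufb with the .Lrot16
# or .Lrot24 mask (the latter is a left-rotate by 8 bits, equivalently a
# right-rotate by 24, hence the name), while the 12- and 7-bit rotates
# have to be composed from pslld/psrld/por.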
442
443my $xframe = $win64 ? 32+32+8 : 24;
444
445$code.=<<___;
446.type ChaCha20_ssse3,\@function,5
447.align 32
448ChaCha20_ssse3:
449.LChaCha20_ssse3:
450___
451$code.=<<___ if ($avx);
452 test \$`1<<(43-32)`,%r10d
453 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
454___
455$code.=<<___;
456 cmp \$128,$len # we might throw away some data,
457 ja .LChaCha20_4x # but overall it won't be slower
458
459.Ldo_sse3_after_all:
460	push	%rbx			# just to share SEH handler, no pops
461 push %rbp
462 push %r12
463 push %r13
464 push %r14
465 push %r15
466
467 sub \$64+$xframe,%rsp
468___
469$code.=<<___ if ($win64);
470 movaps %xmm6,64+32(%rsp)
471 movaps %xmm7,64+48(%rsp)
472___
473$code.=<<___;
474 movdqa .Lsigma(%rip),$a
475 movdqu ($key),$b
476 movdqu 16($key),$c
477 movdqu ($counter),$d
478 movdqa .Lrot16(%rip),$rot16
479 movdqa .Lrot24(%rip),$rot24
480
481 movdqa $a,0x00(%rsp)
482 movdqa $b,0x10(%rsp)
483 movdqa $c,0x20(%rsp)
484 movdqa $d,0x30(%rsp)
485	mov	\$10,$counter		# reuse $counter
486 jmp .Loop_ssse3
487
488.align 32
489.Loop_outer_ssse3:
490 movdqa .Lone(%rip),$d
491 movdqa 0x00(%rsp),$a
492 movdqa 0x10(%rsp),$b
493 movdqa 0x20(%rsp),$c
494 paddd 0x30(%rsp),$d
495	mov	\$10,$counter
496 movdqa $d,0x30(%rsp)
497 jmp .Loop_ssse3
498
499.align 32
500.Loop_ssse3:
501___
502 &SSSE3ROUND();
503 &pshufd ($c,$c,0b01001110);
504 &pshufd ($b,$b,0b00111001);
505 &pshufd ($d,$d,0b10010011);
506 &nop ();
507
508 &SSSE3ROUND();
509 &pshufd ($c,$c,0b01001110);
510 &pshufd ($b,$b,0b10010011);
511 &pshufd ($d,$d,0b00111001);
512
513	&dec	($counter);
514 &jnz (".Loop_ssse3");
515
516$code.=<<___;
517 paddd 0x00(%rsp),$a
518 paddd 0x10(%rsp),$b
519 paddd 0x20(%rsp),$c
520 paddd 0x30(%rsp),$d
521
522 cmp \$64,$len
523 jb .Ltail_ssse3
524
525 movdqu 0x00($inp),$t
526 movdqu 0x10($inp),$t1
527 pxor $t,$a # xor with input
528 movdqu 0x20($inp),$t
529 pxor $t1,$b
530 movdqu 0x30($inp),$t1
531 lea 0x40($inp),$inp # inp+=64
532 pxor $t,$c
533 pxor $t1,$d
534
535 movdqu $a,0x00($out) # write output
536 movdqu $b,0x10($out)
537 movdqu $c,0x20($out)
538 movdqu $d,0x30($out)
539 lea 0x40($out),$out # out+=64
540
541 sub \$64,$len
542 jnz .Loop_outer_ssse3
543
544 jmp .Ldone_ssse3
545
546.align 16
547.Ltail_ssse3:
548 movdqa $a,0x00(%rsp)
549 movdqa $b,0x10(%rsp)
550 movdqa $c,0x20(%rsp)
551 movdqa $d,0x30(%rsp)
552	xor	$counter,$counter
553
554.Loop_tail_ssse3:
555 movzb ($inp,$counter),%eax
556 movzb (%rsp,$counter),%ecx
557 lea 1($counter),$counter
558	xor	%ecx,%eax
559	mov	%al,-1($out,$counter)
560	dec	$len
561 jnz .Loop_tail_ssse3
562
563.Ldone_ssse3:
564___
565$code.=<<___ if ($win64);
566 movaps 64+32(%rsp),%xmm6
567 movaps 64+48(%rsp),%xmm7
568___
569$code.=<<___;
570	add	\$64+$xframe+48,%rsp
571 ret
572.size ChaCha20_ssse3,.-ChaCha20_ssse3
573___
574}
575
576########################################################################
577# SSSE3 code path that handles longer messages.
578{
579# assign variables to favor Atom front-end
580my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
581 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
582my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
583 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
584
585sub SSSE3_lane_ROUND {
586my ($a0,$b0,$c0,$d0)=@_;
587my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
588my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
589my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
590my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
591my @x=map("\"$_\"",@xx);
592
593 # Consider order in which variables are addressed by their
594 # index:
595 #
596 # a b c d
597 #
598 # 0 4 8 12 < even round
599 # 1 5 9 13
600 # 2 6 10 14
601 # 3 7 11 15
602 # 0 5 10 15 < odd round
603 # 1 6 11 12
604 # 2 7 8 13
605 # 3 4 9 14
606 #
607	# 'a', 'b' and 'd's are permanently allocated in registers,
608	# @x[0..7,12..15], while 'c's are maintained in memory. If
609	# you observe the 'c' column, you'll notice that a pair of 'c's
610	# is invariant between rounds. This means that we have to reload
611	# them only once per round, in the middle. This is why you'll
612	# see a bunch of 'c' stores and loads in the middle, but none
613	# at the beginning or end.
614
615 (
616 "&paddd (@x[$a0],@x[$b0])", # Q1
617 "&paddd (@x[$a1],@x[$b1])", # Q2
618 "&pxor (@x[$d0],@x[$a0])",
619 "&pxor (@x[$d1],@x[$a1])",
620 "&pshufb (@x[$d0],$t1)",
621 "&pshufb (@x[$d1],$t1)",
622
623 "&paddd ($xc,@x[$d0])",
624 "&paddd ($xc_,@x[$d1])",
625 "&pxor (@x[$b0],$xc)",
626 "&pxor (@x[$b1],$xc_)",
627 "&movdqa ($t0,@x[$b0])",
628 "&pslld (@x[$b0],12)",
629 "&psrld ($t0,20)",
630 "&movdqa ($t1,@x[$b1])",
631 "&pslld (@x[$b1],12)",
632 "&por (@x[$b0],$t0)",
633 "&psrld ($t1,20)",
634 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
635 "&por (@x[$b1],$t1)",
636
637 "&paddd (@x[$a0],@x[$b0])",
638 "&paddd (@x[$a1],@x[$b1])",
639 "&pxor (@x[$d0],@x[$a0])",
640 "&pxor (@x[$d1],@x[$a1])",
641 "&pshufb (@x[$d0],$t0)",
642 "&pshufb (@x[$d1],$t0)",
643
644 "&paddd ($xc,@x[$d0])",
645 "&paddd ($xc_,@x[$d1])",
646 "&pxor (@x[$b0],$xc)",
647 "&pxor (@x[$b1],$xc_)",
648 "&movdqa ($t1,@x[$b0])",
649 "&pslld (@x[$b0],7)",
650 "&psrld ($t1,25)",
651 "&movdqa ($t0,@x[$b1])",
652 "&pslld (@x[$b1],7)",
653 "&por (@x[$b0],$t1)",
654 "&psrld ($t0,25)",
655 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
656 "&por (@x[$b1],$t0)",
657
658 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
659 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
660 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
661 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
662
663 "&paddd (@x[$a2],@x[$b2])", # Q3
664 "&paddd (@x[$a3],@x[$b3])", # Q4
665 "&pxor (@x[$d2],@x[$a2])",
666 "&pxor (@x[$d3],@x[$a3])",
667 "&pshufb (@x[$d2],$t1)",
668 "&pshufb (@x[$d3],$t1)",
669
670 "&paddd ($xc,@x[$d2])",
671 "&paddd ($xc_,@x[$d3])",
672 "&pxor (@x[$b2],$xc)",
673 "&pxor (@x[$b3],$xc_)",
674 "&movdqa ($t0,@x[$b2])",
675 "&pslld (@x[$b2],12)",
676 "&psrld ($t0,20)",
677 "&movdqa ($t1,@x[$b3])",
678 "&pslld (@x[$b3],12)",
679 "&por (@x[$b2],$t0)",
680 "&psrld ($t1,20)",
681 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
682 "&por (@x[$b3],$t1)",
683
684 "&paddd (@x[$a2],@x[$b2])",
685 "&paddd (@x[$a3],@x[$b3])",
686 "&pxor (@x[$d2],@x[$a2])",
687 "&pxor (@x[$d3],@x[$a3])",
688 "&pshufb (@x[$d2],$t0)",
689 "&pshufb (@x[$d3],$t0)",
690
691 "&paddd ($xc,@x[$d2])",
692 "&paddd ($xc_,@x[$d3])",
693 "&pxor (@x[$b2],$xc)",
694 "&pxor (@x[$b3],$xc_)",
695 "&movdqa ($t1,@x[$b2])",
696 "&pslld (@x[$b2],7)",
697 "&psrld ($t1,25)",
698 "&movdqa ($t0,@x[$b3])",
699 "&pslld (@x[$b3],7)",
700 "&por (@x[$b2],$t1)",
701 "&psrld ($t0,25)",
702 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
703 "&por (@x[$b3],$t0)"
704 );
705}
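# Unlike the 1x path above, here every xmm register carries the same state
# word from four independent 64-byte blocks (the key and counter are
# "smashed by lanes" below), so one pass over the round function advances
# four blocks at once; the results are transposed back with the
# punpck{l,h}dq/punpck{l,h}qdq sequences before being xored with the input
# (a descriptive note added for orientation).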
706
707my $xframe = $win64 ? 0xa0 : 0;
708
709$code.=<<___;
710.type ChaCha20_4x,\@function,5
711.align 32
712ChaCha20_4x:
713.LChaCha20_4x:
714 mov %r10,%r11
715___
716$code.=<<___ if ($avx>1);
717 shr \$32,%r10 # OPENSSL_ia32cap_P+8
718 test \$`1<<5`,%r10 # test AVX2
719 jnz .LChaCha20_8x
720___
721$code.=<<___;
722 cmp \$192,$len
723 ja .Lproceed4x
724
725 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
726 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
727 je .Ldo_sse3_after_all # to detect Atom
728
729.Lproceed4x:
730 lea -0x78(%rsp),%r11
731 sub \$0x148+$xframe,%rsp
732___
733 ################ stack layout
734 # +0x00 SIMD equivalent of @x[8-12]
735 # ...
736 # +0x40 constant copy of key[0-2] smashed by lanes
737 # ...
738 # +0x100 SIMD counters (with nonce smashed by lanes)
739 # ...
740 # +0x140
741$code.=<<___ if ($win64);
742 movaps %xmm6,-0x30(%r11)
743 movaps %xmm7,-0x20(%r11)
744 movaps %xmm8,-0x10(%r11)
745 movaps %xmm9,0x00(%r11)
746 movaps %xmm10,0x10(%r11)
747 movaps %xmm11,0x20(%r11)
748 movaps %xmm12,0x30(%r11)
749 movaps %xmm13,0x40(%r11)
750 movaps %xmm14,0x50(%r11)
751 movaps %xmm15,0x60(%r11)
752___
753$code.=<<___;
754 movdqa .Lsigma(%rip),$xa3 # key[0]
755 movdqu ($key),$xb3 # key[1]
756 movdqu 16($key),$xt3 # key[2]
757 movdqu ($counter),$xd3 # key[3]
758 lea 0x100(%rsp),%rcx # size optimization
759 lea .Lrot16(%rip),%r10
760 lea .Lrot24(%rip),%r11
761
762 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
763 pshufd \$0x55,$xa3,$xa1
764 movdqa $xa0,0x40(%rsp) # ... and offload
765 pshufd \$0xaa,$xa3,$xa2
766 movdqa $xa1,0x50(%rsp)
767 pshufd \$0xff,$xa3,$xa3
768 movdqa $xa2,0x60(%rsp)
769 movdqa $xa3,0x70(%rsp)
770
771 pshufd \$0x00,$xb3,$xb0
772 pshufd \$0x55,$xb3,$xb1
773 movdqa $xb0,0x80-0x100(%rcx)
774 pshufd \$0xaa,$xb3,$xb2
775 movdqa $xb1,0x90-0x100(%rcx)
776 pshufd \$0xff,$xb3,$xb3
777 movdqa $xb2,0xa0-0x100(%rcx)
778 movdqa $xb3,0xb0-0x100(%rcx)
779
780 pshufd \$0x00,$xt3,$xt0 # "$xc0"
781 pshufd \$0x55,$xt3,$xt1 # "$xc1"
782 movdqa $xt0,0xc0-0x100(%rcx)
783 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
784 movdqa $xt1,0xd0-0x100(%rcx)
785 pshufd \$0xff,$xt3,$xt3 # "$xc3"
786 movdqa $xt2,0xe0-0x100(%rcx)
787 movdqa $xt3,0xf0-0x100(%rcx)
788
789 pshufd \$0x00,$xd3,$xd0
790 pshufd \$0x55,$xd3,$xd1
791 paddd .Linc(%rip),$xd0 # don't save counters yet
792 pshufd \$0xaa,$xd3,$xd2
793 movdqa $xd1,0x110-0x100(%rcx)
794 pshufd \$0xff,$xd3,$xd3
795 movdqa $xd2,0x120-0x100(%rcx)
796 movdqa $xd3,0x130-0x100(%rcx)
797
798 jmp .Loop_enter4x
799
800.align 32
801.Loop_outer4x:
802 movdqa 0x40(%rsp),$xa0 # re-load smashed key
803 movdqa 0x50(%rsp),$xa1
804 movdqa 0x60(%rsp),$xa2
805 movdqa 0x70(%rsp),$xa3
806 movdqa 0x80-0x100(%rcx),$xb0
807 movdqa 0x90-0x100(%rcx),$xb1
808 movdqa 0xa0-0x100(%rcx),$xb2
809 movdqa 0xb0-0x100(%rcx),$xb3
810 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
811 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
812 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
813 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
814 movdqa 0x100-0x100(%rcx),$xd0
815 movdqa 0x110-0x100(%rcx),$xd1
816 movdqa 0x120-0x100(%rcx),$xd2
817 movdqa 0x130-0x100(%rcx),$xd3
818 paddd .Lfour(%rip),$xd0 # next SIMD counters
819
820.Loop_enter4x:
821 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
822 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
823 movdqa (%r10),$xt3 # .Lrot16(%rip)
824 mov \$10,%eax
825 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
826 jmp .Loop4x
827
828.align 32
829.Loop4x:
830___
831 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
832 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
833$code.=<<___;
834 dec %eax
835 jnz .Loop4x
836
837 paddd 0x40(%rsp),$xa0 # accumulate key material
838 paddd 0x50(%rsp),$xa1
839 paddd 0x60(%rsp),$xa2
840 paddd 0x70(%rsp),$xa3
841
842 movdqa $xa0,$xt2 # "de-interlace" data
843 punpckldq $xa1,$xa0
844 movdqa $xa2,$xt3
845 punpckldq $xa3,$xa2
846 punpckhdq $xa1,$xt2
847 punpckhdq $xa3,$xt3
848 movdqa $xa0,$xa1
849 punpcklqdq $xa2,$xa0 # "a0"
850 movdqa $xt2,$xa3
851 punpcklqdq $xt3,$xt2 # "a2"
852 punpckhqdq $xa2,$xa1 # "a1"
853 punpckhqdq $xt3,$xa3 # "a3"
854___
855 ($xa2,$xt2)=($xt2,$xa2);
856$code.=<<___;
857 paddd 0x80-0x100(%rcx),$xb0
858 paddd 0x90-0x100(%rcx),$xb1
859 paddd 0xa0-0x100(%rcx),$xb2
860 paddd 0xb0-0x100(%rcx),$xb3
861
862 movdqa $xa0,0x00(%rsp) # offload $xaN
863 movdqa $xa1,0x10(%rsp)
864 movdqa 0x20(%rsp),$xa0 # "xc2"
865 movdqa 0x30(%rsp),$xa1 # "xc3"
866
867 movdqa $xb0,$xt2
868 punpckldq $xb1,$xb0
869 movdqa $xb2,$xt3
870 punpckldq $xb3,$xb2
871 punpckhdq $xb1,$xt2
872 punpckhdq $xb3,$xt3
873 movdqa $xb0,$xb1
874 punpcklqdq $xb2,$xb0 # "b0"
875 movdqa $xt2,$xb3
876 punpcklqdq $xt3,$xt2 # "b2"
877 punpckhqdq $xb2,$xb1 # "b1"
878 punpckhqdq $xt3,$xb3 # "b3"
879___
880 ($xb2,$xt2)=($xt2,$xb2);
881 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
882$code.=<<___;
883 paddd 0xc0-0x100(%rcx),$xc0
884 paddd 0xd0-0x100(%rcx),$xc1
885 paddd 0xe0-0x100(%rcx),$xc2
886 paddd 0xf0-0x100(%rcx),$xc3
887
888 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
889 movdqa $xa3,0x30(%rsp)
890
891 movdqa $xc0,$xt2
892 punpckldq $xc1,$xc0
893 movdqa $xc2,$xt3
894 punpckldq $xc3,$xc2
895 punpckhdq $xc1,$xt2
896 punpckhdq $xc3,$xt3
897 movdqa $xc0,$xc1
898 punpcklqdq $xc2,$xc0 # "c0"
899 movdqa $xt2,$xc3
900 punpcklqdq $xt3,$xt2 # "c2"
901 punpckhqdq $xc2,$xc1 # "c1"
902 punpckhqdq $xt3,$xc3 # "c3"
903___
904 ($xc2,$xt2)=($xt2,$xc2);
905 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
906$code.=<<___;
907 paddd 0x100-0x100(%rcx),$xd0
908 paddd 0x110-0x100(%rcx),$xd1
909 paddd 0x120-0x100(%rcx),$xd2
910 paddd 0x130-0x100(%rcx),$xd3
911
912 movdqa $xd0,$xt2
913 punpckldq $xd1,$xd0
914 movdqa $xd2,$xt3
915 punpckldq $xd3,$xd2
916 punpckhdq $xd1,$xt2
917 punpckhdq $xd3,$xt3
918 movdqa $xd0,$xd1
919 punpcklqdq $xd2,$xd0 # "d0"
920 movdqa $xt2,$xd3
921 punpcklqdq $xt3,$xt2 # "d2"
922 punpckhqdq $xd2,$xd1 # "d1"
923 punpckhqdq $xt3,$xd3 # "d3"
924___
925 ($xd2,$xt2)=($xt2,$xd2);
926$code.=<<___;
927 cmp \$64*4,$len
928 jb .Ltail4x
929
930 movdqu 0x00($inp),$xt0 # xor with input
931 movdqu 0x10($inp),$xt1
932 movdqu 0x20($inp),$xt2
933 movdqu 0x30($inp),$xt3
934 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
935 pxor $xb0,$xt1
936 pxor $xc0,$xt2
937 pxor $xd0,$xt3
938
939 movdqu $xt0,0x00($out)
940 movdqu 0x40($inp),$xt0
941 movdqu $xt1,0x10($out)
942 movdqu 0x50($inp),$xt1
943 movdqu $xt2,0x20($out)
944 movdqu 0x60($inp),$xt2
945 movdqu $xt3,0x30($out)
946 movdqu 0x70($inp),$xt3
947 lea 0x80($inp),$inp # size optimization
948 pxor 0x10(%rsp),$xt0
949 pxor $xb1,$xt1
950 pxor $xc1,$xt2
951 pxor $xd1,$xt3
952
953 movdqu $xt0,0x40($out)
954 movdqu 0x00($inp),$xt0
955 movdqu $xt1,0x50($out)
956 movdqu 0x10($inp),$xt1
957 movdqu $xt2,0x60($out)
958 movdqu 0x20($inp),$xt2
959 movdqu $xt3,0x70($out)
960 lea 0x80($out),$out # size optimization
961 movdqu 0x30($inp),$xt3
962 pxor 0x20(%rsp),$xt0
963 pxor $xb2,$xt1
964 pxor $xc2,$xt2
965 pxor $xd2,$xt3
966
967 movdqu $xt0,0x00($out)
968 movdqu 0x40($inp),$xt0
969 movdqu $xt1,0x10($out)
970 movdqu 0x50($inp),$xt1
971 movdqu $xt2,0x20($out)
972 movdqu 0x60($inp),$xt2
973 movdqu $xt3,0x30($out)
974 movdqu 0x70($inp),$xt3
975 lea 0x80($inp),$inp # inp+=64*4
976 pxor 0x30(%rsp),$xt0
977 pxor $xb3,$xt1
978 pxor $xc3,$xt2
979 pxor $xd3,$xt3
980 movdqu $xt0,0x40($out)
981 movdqu $xt1,0x50($out)
982 movdqu $xt2,0x60($out)
983 movdqu $xt3,0x70($out)
984 lea 0x80($out),$out # out+=64*4
985
986 sub \$64*4,$len
987 jnz .Loop_outer4x
988
989 jmp .Ldone4x
990
991.Ltail4x:
992 cmp \$192,$len
993 jae .L192_or_more4x
994 cmp \$128,$len
995 jae .L128_or_more4x
996 cmp \$64,$len
997 jae .L64_or_more4x
998
999 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1000 xor %r10,%r10
1001 #movdqa $xt0,0x00(%rsp)
1002 movdqa $xb0,0x10(%rsp)
1003 movdqa $xc0,0x20(%rsp)
1004 movdqa $xd0,0x30(%rsp)
1005 jmp .Loop_tail4x
1006
1007.align 32
1008.L64_or_more4x:
1009 movdqu 0x00($inp),$xt0 # xor with input
1010 movdqu 0x10($inp),$xt1
1011 movdqu 0x20($inp),$xt2
1012 movdqu 0x30($inp),$xt3
1013	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1014 pxor $xb0,$xt1
1015 pxor $xc0,$xt2
1016 pxor $xd0,$xt3
1017 movdqu $xt0,0x00($out)
1018 movdqu $xt1,0x10($out)
1019 movdqu $xt2,0x20($out)
1020 movdqu $xt3,0x30($out)
1021 je .Ldone4x
1022
1023 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1024 lea 0x40($inp),$inp # inp+=64*1
1025 xor %r10,%r10
1026 movdqa $xt0,0x00(%rsp)
1027 movdqa $xb1,0x10(%rsp)
1028 lea 0x40($out),$out # out+=64*1
1029 movdqa $xc1,0x20(%rsp)
1030 sub \$64,$len # len-=64*1
1031 movdqa $xd1,0x30(%rsp)
1032 jmp .Loop_tail4x
1033
1034.align 32
1035.L128_or_more4x:
1036 movdqu 0x00($inp),$xt0 # xor with input
1037 movdqu 0x10($inp),$xt1
1038 movdqu 0x20($inp),$xt2
1039 movdqu 0x30($inp),$xt3
1040 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1041 pxor $xb0,$xt1
1042 pxor $xc0,$xt2
1043 pxor $xd0,$xt3
1044
1045 movdqu $xt0,0x00($out)
1046 movdqu 0x40($inp),$xt0
1047 movdqu $xt1,0x10($out)
1048 movdqu 0x50($inp),$xt1
1049 movdqu $xt2,0x20($out)
1050 movdqu 0x60($inp),$xt2
1051 movdqu $xt3,0x30($out)
1052 movdqu 0x70($inp),$xt3
1053 pxor 0x10(%rsp),$xt0
1054 pxor $xb1,$xt1
1055 pxor $xc1,$xt2
1056 pxor $xd1,$xt3
1057 movdqu $xt0,0x40($out)
1058 movdqu $xt1,0x50($out)
1059 movdqu $xt2,0x60($out)
1060 movdqu $xt3,0x70($out)
1061 je .Ldone4x
1062
1063 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1064 lea 0x80($inp),$inp # inp+=64*2
1065 xor %r10,%r10
1066 movdqa $xt0,0x00(%rsp)
1067 movdqa $xb2,0x10(%rsp)
1068 lea 0x80($out),$out # out+=64*2
1069 movdqa $xc2,0x20(%rsp)
1070 sub \$128,$len # len-=64*2
1071 movdqa $xd2,0x30(%rsp)
1072 jmp .Loop_tail4x
1073
1074.align 32
1075.L192_or_more4x:
1076 movdqu 0x00($inp),$xt0 # xor with input
1077 movdqu 0x10($inp),$xt1
1078 movdqu 0x20($inp),$xt2
1079 movdqu 0x30($inp),$xt3
1080 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1081 pxor $xb0,$xt1
1082 pxor $xc0,$xt2
1083 pxor $xd0,$xt3
1084
1085 movdqu $xt0,0x00($out)
1086 movdqu 0x40($inp),$xt0
1087 movdqu $xt1,0x10($out)
1088 movdqu 0x50($inp),$xt1
1089 movdqu $xt2,0x20($out)
1090 movdqu 0x60($inp),$xt2
1091 movdqu $xt3,0x30($out)
1092 movdqu 0x70($inp),$xt3
1093 lea 0x80($inp),$inp # size optimization
1094 pxor 0x10(%rsp),$xt0
1095 pxor $xb1,$xt1
1096 pxor $xc1,$xt2
1097 pxor $xd1,$xt3
1098
1099 movdqu $xt0,0x40($out)
1100 movdqu 0x00($inp),$xt0
1101 movdqu $xt1,0x50($out)
1102 movdqu 0x10($inp),$xt1
1103 movdqu $xt2,0x60($out)
1104 movdqu 0x20($inp),$xt2
1105 movdqu $xt3,0x70($out)
1106 lea 0x80($out),$out # size optimization
1107 movdqu 0x30($inp),$xt3
1108 pxor 0x20(%rsp),$xt0
1109 pxor $xb2,$xt1
1110 pxor $xc2,$xt2
1111 pxor $xd2,$xt3
1112 movdqu $xt0,0x00($out)
1113 movdqu $xt1,0x10($out)
1114 movdqu $xt2,0x20($out)
1115 movdqu $xt3,0x30($out)
1116 je .Ldone4x
1117
1118 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1119 lea 0x40($inp),$inp # inp+=64*3
1120 xor %r10,%r10
1121 movdqa $xt0,0x00(%rsp)
1122 movdqa $xb3,0x10(%rsp)
1123 lea 0x40($out),$out # out+=64*3
1124 movdqa $xc3,0x20(%rsp)
1125 sub \$192,$len # len-=64*3
1126 movdqa $xd3,0x30(%rsp)
1127
1128.Loop_tail4x:
1129 movzb ($inp,%r10),%eax
1130 movzb (%rsp,%r10),%ecx
1131 lea 1(%r10),%r10
1132 xor %ecx,%eax
1133 mov %al,-1($out,%r10)
1134 dec $len
1135 jnz .Loop_tail4x
1136
1137.Ldone4x:
1138___
1139$code.=<<___ if ($win64);
1140 lea 0x140+0x30(%rsp),%r11
1141 movaps -0x30(%r11),%xmm6
1142 movaps -0x20(%r11),%xmm7
1143 movaps -0x10(%r11),%xmm8
1144 movaps 0x00(%r11),%xmm9
1145 movaps 0x10(%r11),%xmm10
1146 movaps 0x20(%r11),%xmm11
1147 movaps 0x30(%r11),%xmm12
1148 movaps 0x40(%r11),%xmm13
1149 movaps 0x50(%r11),%xmm14
1150 movaps 0x60(%r11),%xmm15
1151___
1152$code.=<<___;
1153 add \$0x148+$xframe,%rsp
1154 ret
1155.size ChaCha20_4x,.-ChaCha20_4x
1156___
1157}
1158
1159########################################################################
1160# XOP code path that handles all lengths.
1161if ($avx) {
1162# There is some "anomaly" observed depending on instructions' size or
1163# alignment. If you look closely at the code below you'll notice that
1164# the argument order sometimes varies. The order affects instruction
1165# encoding by making it larger, and such fiddling gives a 5% performance
1166# improvement. This is on an FX-4100...
1167
1168my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1169 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1170my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1171 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1172
1173sub XOP_lane_ROUND {
1174my ($a0,$b0,$c0,$d0)=@_;
1175my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1176my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1177my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1178my @x=map("\"$_\"",@xx);
1179
1180 (
1181 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1182 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1183 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1184 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1185 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1186 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1187 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1188 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1189 "&vprotd (@x[$d0],@x[$d0],16)",
1190 "&vprotd (@x[$d1],@x[$d1],16)",
1191 "&vprotd (@x[$d2],@x[$d2],16)",
1192 "&vprotd (@x[$d3],@x[$d3],16)",
1193
1194 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1195 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1196 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1197 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1198 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1199 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1200 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1201 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1202 "&vprotd (@x[$b0],@x[$b0],12)",
1203 "&vprotd (@x[$b1],@x[$b1],12)",
1204 "&vprotd (@x[$b2],@x[$b2],12)",
1205 "&vprotd (@x[$b3],@x[$b3],12)",
1206
1207 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1208 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1209 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1210 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1211 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1212 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1213 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1214 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1215 "&vprotd (@x[$d0],@x[$d0],8)",
1216 "&vprotd (@x[$d1],@x[$d1],8)",
1217 "&vprotd (@x[$d2],@x[$d2],8)",
1218 "&vprotd (@x[$d3],@x[$d3],8)",
1219
1220 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1221 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1222 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1223 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1224 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1225 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1226 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1227 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1228 "&vprotd (@x[$b0],@x[$b0],7)",
1229 "&vprotd (@x[$b1],@x[$b1],7)",
1230 "&vprotd (@x[$b2],@x[$b2],7)",
1231 "&vprotd (@x[$b3],@x[$b3],7)"
1232 );
1233}
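# A note on why this path is so much shorter (added for orientation):
# XOP's vprotd performs each rotate in a single instruction, so the
# pslld/psrld/por triples and the .Lrot16/.Lrot24 constants in %r10/%r11
# are not needed; this is also why the dispatcher above prefers XOP even
# for short inputs that use only one of the four lanes.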
1234
1235my $xframe = $win64 ? 0xa0 : 0;
1236
1237$code.=<<___;
1238.type ChaCha20_4xop,\@function,5
1239.align 32
1240ChaCha20_4xop:
1241.LChaCha20_4xop:
1242 lea -0x78(%rsp),%r11
1243 sub \$0x148+$xframe,%rsp
1244___
1245 ################ stack layout
1246 # +0x00 SIMD equivalent of @x[8-12]
1247 # ...
1248 # +0x40 constant copy of key[0-2] smashed by lanes
1249 # ...
1250 # +0x100 SIMD counters (with nonce smashed by lanes)
1251 # ...
1252 # +0x140
1253$code.=<<___ if ($win64);
1254 movaps %xmm6,-0x30(%r11)
1255 movaps %xmm7,-0x20(%r11)
1256 movaps %xmm8,-0x10(%r11)
1257 movaps %xmm9,0x00(%r11)
1258 movaps %xmm10,0x10(%r11)
1259 movaps %xmm11,0x20(%r11)
1260 movaps %xmm12,0x30(%r11)
1261 movaps %xmm13,0x40(%r11)
1262 movaps %xmm14,0x50(%r11)
1263 movaps %xmm15,0x60(%r11)
1264___
1265$code.=<<___;
1266 vzeroupper
1267
1268 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1269 vmovdqu ($key),$xb3 # key[1]
1270 vmovdqu 16($key),$xt3 # key[2]
1271 vmovdqu ($counter),$xd3 # key[3]
1272 lea 0x100(%rsp),%rcx # size optimization
1273
1274 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1275 vpshufd \$0x55,$xa3,$xa1
1276 vmovdqa $xa0,0x40(%rsp) # ... and offload
1277 vpshufd \$0xaa,$xa3,$xa2
1278 vmovdqa $xa1,0x50(%rsp)
1279 vpshufd \$0xff,$xa3,$xa3
1280 vmovdqa $xa2,0x60(%rsp)
1281 vmovdqa $xa3,0x70(%rsp)
1282
1283 vpshufd \$0x00,$xb3,$xb0
1284 vpshufd \$0x55,$xb3,$xb1
1285 vmovdqa $xb0,0x80-0x100(%rcx)
1286 vpshufd \$0xaa,$xb3,$xb2
1287 vmovdqa $xb1,0x90-0x100(%rcx)
1288 vpshufd \$0xff,$xb3,$xb3
1289 vmovdqa $xb2,0xa0-0x100(%rcx)
1290 vmovdqa $xb3,0xb0-0x100(%rcx)
1291
1292 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1293 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1294 vmovdqa $xt0,0xc0-0x100(%rcx)
1295 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1296 vmovdqa $xt1,0xd0-0x100(%rcx)
1297 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1298 vmovdqa $xt2,0xe0-0x100(%rcx)
1299 vmovdqa $xt3,0xf0-0x100(%rcx)
1300
1301 vpshufd \$0x00,$xd3,$xd0
1302 vpshufd \$0x55,$xd3,$xd1
1303 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1304 vpshufd \$0xaa,$xd3,$xd2
1305 vmovdqa $xd1,0x110-0x100(%rcx)
1306 vpshufd \$0xff,$xd3,$xd3
1307 vmovdqa $xd2,0x120-0x100(%rcx)
1308 vmovdqa $xd3,0x130-0x100(%rcx)
1309
1310 jmp .Loop_enter4xop
1311
1312.align 32
1313.Loop_outer4xop:
1314 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1315 vmovdqa 0x50(%rsp),$xa1
1316 vmovdqa 0x60(%rsp),$xa2
1317 vmovdqa 0x70(%rsp),$xa3
1318 vmovdqa 0x80-0x100(%rcx),$xb0
1319 vmovdqa 0x90-0x100(%rcx),$xb1
1320 vmovdqa 0xa0-0x100(%rcx),$xb2
1321 vmovdqa 0xb0-0x100(%rcx),$xb3
1322 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1323 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1324 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1325 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1326 vmovdqa 0x100-0x100(%rcx),$xd0
1327 vmovdqa 0x110-0x100(%rcx),$xd1
1328 vmovdqa 0x120-0x100(%rcx),$xd2
1329 vmovdqa 0x130-0x100(%rcx),$xd3
1330 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1331
1332.Loop_enter4xop:
1333 mov \$10,%eax
1334 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1335 jmp .Loop4xop
1336
1337.align 32
1338.Loop4xop:
1339___
1340 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1341 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1342$code.=<<___;
1343 dec %eax
1344 jnz .Loop4xop
1345
1346 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1347 vpaddd 0x50(%rsp),$xa1,$xa1
1348 vpaddd 0x60(%rsp),$xa2,$xa2
1349 vpaddd 0x70(%rsp),$xa3,$xa3
1350
1351 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1352 vmovdqa $xt3,0x30(%rsp)
1353
1354 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1355 vpunpckldq $xa3,$xa2,$xt3
1356 vpunpckhdq $xa1,$xa0,$xa0
1357 vpunpckhdq $xa3,$xa2,$xa2
1358 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1359 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1360 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1361 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1362___
1363 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1364$code.=<<___;
1365 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1366 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1367 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1368 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1369
1370 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1371 vmovdqa $xa1,0x10(%rsp)
1372 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1373 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1374
1375 vpunpckldq $xb1,$xb0,$xt2
1376 vpunpckldq $xb3,$xb2,$xt3
1377 vpunpckhdq $xb1,$xb0,$xb0
1378 vpunpckhdq $xb3,$xb2,$xb2
1379 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1380 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1381 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1382 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1383___
1384 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1385 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1386$code.=<<___;
1387 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1388 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1389 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1390 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1391
1392 vpunpckldq $xc1,$xc0,$xt2
1393 vpunpckldq $xc3,$xc2,$xt3
1394 vpunpckhdq $xc1,$xc0,$xc0
1395 vpunpckhdq $xc3,$xc2,$xc2
1396 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1397 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1398 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1399 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1400___
1401 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1402$code.=<<___;
1403 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1404 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1405 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1406 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1407
1408 vpunpckldq $xd1,$xd0,$xt2
1409 vpunpckldq $xd3,$xd2,$xt3
1410 vpunpckhdq $xd1,$xd0,$xd0
1411 vpunpckhdq $xd3,$xd2,$xd2
1412 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1413 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1414 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1415 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1416___
1417 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1418 ($xa0,$xa1)=($xt2,$xt3);
1419$code.=<<___;
1420 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1421 vmovdqa 0x10(%rsp),$xa1
1422
1423 cmp \$64*4,$len
1424 jb .Ltail4xop
1425
1426 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1427 vpxor 0x10($inp),$xb0,$xb0
1428 vpxor 0x20($inp),$xc0,$xc0
1429 vpxor 0x30($inp),$xd0,$xd0
1430 vpxor 0x40($inp),$xa1,$xa1
1431 vpxor 0x50($inp),$xb1,$xb1
1432 vpxor 0x60($inp),$xc1,$xc1
1433 vpxor 0x70($inp),$xd1,$xd1
1434 lea 0x80($inp),$inp # size optimization
1435 vpxor 0x00($inp),$xa2,$xa2
1436 vpxor 0x10($inp),$xb2,$xb2
1437 vpxor 0x20($inp),$xc2,$xc2
1438 vpxor 0x30($inp),$xd2,$xd2
1439 vpxor 0x40($inp),$xa3,$xa3
1440 vpxor 0x50($inp),$xb3,$xb3
1441 vpxor 0x60($inp),$xc3,$xc3
1442 vpxor 0x70($inp),$xd3,$xd3
1443 lea 0x80($inp),$inp # inp+=64*4
1444
1445 vmovdqu $xa0,0x00($out)
1446 vmovdqu $xb0,0x10($out)
1447 vmovdqu $xc0,0x20($out)
1448 vmovdqu $xd0,0x30($out)
1449 vmovdqu $xa1,0x40($out)
1450 vmovdqu $xb1,0x50($out)
1451 vmovdqu $xc1,0x60($out)
1452 vmovdqu $xd1,0x70($out)
1453 lea 0x80($out),$out # size optimization
1454 vmovdqu $xa2,0x00($out)
1455 vmovdqu $xb2,0x10($out)
1456 vmovdqu $xc2,0x20($out)
1457 vmovdqu $xd2,0x30($out)
1458 vmovdqu $xa3,0x40($out)
1459 vmovdqu $xb3,0x50($out)
1460 vmovdqu $xc3,0x60($out)
1461 vmovdqu $xd3,0x70($out)
1462 lea 0x80($out),$out # out+=64*4
1463
1464 sub \$64*4,$len
1465 jnz .Loop_outer4xop
1466
1467 jmp .Ldone4xop
1468
1469.align 32
1470.Ltail4xop:
1471 cmp \$192,$len
1472 jae .L192_or_more4xop
1473 cmp \$128,$len
1474 jae .L128_or_more4xop
1475 cmp \$64,$len
1476 jae .L64_or_more4xop
1477
1478 xor %r10,%r10
1479 vmovdqa $xa0,0x00(%rsp)
1480 vmovdqa $xb0,0x10(%rsp)
1481 vmovdqa $xc0,0x20(%rsp)
1482 vmovdqa $xd0,0x30(%rsp)
1483 jmp .Loop_tail4xop
1484
1485.align 32
1486.L64_or_more4xop:
1487 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1488 vpxor 0x10($inp),$xb0,$xb0
1489 vpxor 0x20($inp),$xc0,$xc0
1490 vpxor 0x30($inp),$xd0,$xd0
1491 vmovdqu $xa0,0x00($out)
1492 vmovdqu $xb0,0x10($out)
1493 vmovdqu $xc0,0x20($out)
1494 vmovdqu $xd0,0x30($out)
1495 je .Ldone4xop
1496
1497 lea 0x40($inp),$inp # inp+=64*1
1498 vmovdqa $xa1,0x00(%rsp)
1499 xor %r10,%r10
1500 vmovdqa $xb1,0x10(%rsp)
1501 lea 0x40($out),$out # out+=64*1
1502 vmovdqa $xc1,0x20(%rsp)
1503 sub \$64,$len # len-=64*1
1504 vmovdqa $xd1,0x30(%rsp)
1505 jmp .Loop_tail4xop
1506
1507.align 32
1508.L128_or_more4xop:
1509 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1510 vpxor 0x10($inp),$xb0,$xb0
1511 vpxor 0x20($inp),$xc0,$xc0
1512 vpxor 0x30($inp),$xd0,$xd0
1513 vpxor 0x40($inp),$xa1,$xa1
1514 vpxor 0x50($inp),$xb1,$xb1
1515 vpxor 0x60($inp),$xc1,$xc1
1516 vpxor 0x70($inp),$xd1,$xd1
1517
1518 vmovdqu $xa0,0x00($out)
1519 vmovdqu $xb0,0x10($out)
1520 vmovdqu $xc0,0x20($out)
1521 vmovdqu $xd0,0x30($out)
1522 vmovdqu $xa1,0x40($out)
1523 vmovdqu $xb1,0x50($out)
1524 vmovdqu $xc1,0x60($out)
1525 vmovdqu $xd1,0x70($out)
1526 je .Ldone4xop
1527
1528 lea 0x80($inp),$inp # inp+=64*2
1529 vmovdqa $xa2,0x00(%rsp)
1530 xor %r10,%r10
1531 vmovdqa $xb2,0x10(%rsp)
1532 lea 0x80($out),$out # out+=64*2
1533 vmovdqa $xc2,0x20(%rsp)
1534 sub \$128,$len # len-=64*2
1535 vmovdqa $xd2,0x30(%rsp)
1536 jmp .Loop_tail4xop
1537
1538.align 32
1539.L192_or_more4xop:
1540 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1541 vpxor 0x10($inp),$xb0,$xb0
1542 vpxor 0x20($inp),$xc0,$xc0
1543 vpxor 0x30($inp),$xd0,$xd0
1544 vpxor 0x40($inp),$xa1,$xa1
1545 vpxor 0x50($inp),$xb1,$xb1
1546 vpxor 0x60($inp),$xc1,$xc1
1547 vpxor 0x70($inp),$xd1,$xd1
1548 lea 0x80($inp),$inp # size optimization
1549 vpxor 0x00($inp),$xa2,$xa2
1550 vpxor 0x10($inp),$xb2,$xb2
1551 vpxor 0x20($inp),$xc2,$xc2
1552 vpxor 0x30($inp),$xd2,$xd2
1553
1554 vmovdqu $xa0,0x00($out)
1555 vmovdqu $xb0,0x10($out)
1556 vmovdqu $xc0,0x20($out)
1557 vmovdqu $xd0,0x30($out)
1558 vmovdqu $xa1,0x40($out)
1559 vmovdqu $xb1,0x50($out)
1560 vmovdqu $xc1,0x60($out)
1561 vmovdqu $xd1,0x70($out)
1562 lea 0x80($out),$out # size optimization
1563 vmovdqu $xa2,0x00($out)
1564 vmovdqu $xb2,0x10($out)
1565 vmovdqu $xc2,0x20($out)
1566 vmovdqu $xd2,0x30($out)
1567 je .Ldone4xop
1568
1569 lea 0x40($inp),$inp # inp+=64*3
1570	vmovdqa	$xa3,0x00(%rsp)
1571	xor	%r10,%r10
1572	vmovdqa	$xb3,0x10(%rsp)
1573	lea	0x40($out),$out		# out+=64*3
1574	vmovdqa	$xc3,0x20(%rsp)
1575	sub	\$192,$len		# len-=64*3
1576	vmovdqa	$xd3,0x30(%rsp)
1577
1578.Loop_tail4xop:
1579 movzb ($inp,%r10),%eax
1580 movzb (%rsp,%r10),%ecx
1581 lea 1(%r10),%r10
1582 xor %ecx,%eax
1583 mov %al,-1($out,%r10)
1584 dec $len
1585 jnz .Loop_tail4xop
1586
1587.Ldone4xop:
1588 vzeroupper
1589___
1590$code.=<<___ if ($win64);
1591 lea 0x140+0x30(%rsp),%r11
1592 movaps -0x30(%r11),%xmm6
1593 movaps -0x20(%r11),%xmm7
1594 movaps -0x10(%r11),%xmm8
1595 movaps 0x00(%r11),%xmm9
1596 movaps 0x10(%r11),%xmm10
1597 movaps 0x20(%r11),%xmm11
1598 movaps 0x30(%r11),%xmm12
1599 movaps 0x40(%r11),%xmm13
1600 movaps 0x50(%r11),%xmm14
1601 movaps 0x60(%r11),%xmm15
1602___
1603$code.=<<___;
1604 add \$0x148+$xframe,%rsp
1605 ret
1606.size ChaCha20_4xop,.-ChaCha20_4xop
1607___
1608}
1609
1610########################################################################
1611# AVX2 code path
1612if ($avx>1) {
1613my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1614 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1615my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1616 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1617
1618sub AVX2_lane_ROUND {
1619my ($a0,$b0,$c0,$d0)=@_;
1620my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1621my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1622my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1623my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1624my @x=map("\"$_\"",@xx);
1625
1626 # Consider order in which variables are addressed by their
1627 # index:
1628 #
1629 # a b c d
1630 #
1631 # 0 4 8 12 < even round
1632 # 1 5 9 13
1633 # 2 6 10 14
1634 # 3 7 11 15
1635 # 0 5 10 15 < odd round
1636 # 1 6 11 12
1637 # 2 7 8 13
1638 # 3 4 9 14
1639 #
1640	# 'a', 'b' and 'd's are permanently allocated in registers,
1641	# @x[0..7,12..15], while 'c's are maintained in memory. If
1642	# you observe the 'c' column, you'll notice that a pair of 'c's
1643	# is invariant between rounds. This means that we have to reload
1644	# them only once per round, in the middle. This is why you'll
1645	# see a bunch of 'c' stores and loads in the middle, but none
1646	# at the beginning or end.
1647
1648 (
1649 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1650 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1651 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1652 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1653 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1654 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1655
1656 "&vpaddd ($xc,$xc,@x[$d0])",
1657 "&vpxor (@x[$b0],$xc,@x[$b0])",
1658 "&vpslld ($t0,@x[$b0],12)",
1659 "&vpsrld (@x[$b0],@x[$b0],20)",
1660 "&vpor (@x[$b0],$t0,@x[$b0])",
1661 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1662 "&vpaddd ($xc_,$xc_,@x[$d1])",
1663 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1664 "&vpslld ($t1,@x[$b1],12)",
1665 "&vpsrld (@x[$b1],@x[$b1],20)",
1666 "&vpor (@x[$b1],$t1,@x[$b1])",
1667
1668 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1669 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1670 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1671 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1672 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1673 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1674
1675 "&vpaddd ($xc,$xc,@x[$d0])",
1676 "&vpxor (@x[$b0],$xc,@x[$b0])",
1677 "&vpslld ($t1,@x[$b0],7)",
1678 "&vpsrld (@x[$b0],@x[$b0],25)",
1679 "&vpor (@x[$b0],$t1,@x[$b0])",
1680 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1681 "&vpaddd ($xc_,$xc_,@x[$d1])",
1682 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1683 "&vpslld ($t0,@x[$b1],7)",
1684 "&vpsrld (@x[$b1],@x[$b1],25)",
1685 "&vpor (@x[$b1],$t0,@x[$b1])",
1686
1687 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1688 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1689 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1690 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1691
1692 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1693 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1694 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1695 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1696 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1697 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1698
1699 "&vpaddd ($xc,$xc,@x[$d2])",
1700 "&vpxor (@x[$b2],$xc,@x[$b2])",
1701 "&vpslld ($t0,@x[$b2],12)",
1702 "&vpsrld (@x[$b2],@x[$b2],20)",
1703 "&vpor (@x[$b2],$t0,@x[$b2])",
1704 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1705 "&vpaddd ($xc_,$xc_,@x[$d3])",
1706 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1707 "&vpslld ($t1,@x[$b3],12)",
1708 "&vpsrld (@x[$b3],@x[$b3],20)",
1709 "&vpor (@x[$b3],$t1,@x[$b3])",
1710
1711 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1712 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1713 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1714 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1715 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1716 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1717
1718 "&vpaddd ($xc,$xc,@x[$d2])",
1719 "&vpxor (@x[$b2],$xc,@x[$b2])",
1720 "&vpslld ($t1,@x[$b2],7)",
1721 "&vpsrld (@x[$b2],@x[$b2],25)",
1722 "&vpor (@x[$b2],$t1,@x[$b2])",
1723 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1724 "&vpaddd ($xc_,$xc_,@x[$d3])",
1725 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1726 "&vpslld ($t0,@x[$b3],7)",
1727 "&vpsrld (@x[$b3],@x[$b3],25)",
1728 "&vpor (@x[$b3],$t0,@x[$b3])"
1729 );
1730}
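# The AVX2 flow mirrors the 4x SSSE3 one, except that each ymm register
# now carries the same state word from eight blocks, the byte-rotate
# masks are fetched with vbroadcasti128 from .Lrot16/.Lrot24 via %r10/%r11,
# and the final de-interleaving needs an extra vperm2i128 step to split
# the 128-bit halves of every ymm register (a note added for orientation).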
1731
1732my $xframe = $win64 ? 0xb0 : 8;
1733
1734$code.=<<___;
1735.type ChaCha20_8x,\@function,5
1736.align 32
1737ChaCha20_8x:
1738.LChaCha20_8x:
1739 mov %rsp,%r10
1740 sub \$0x280+$xframe,%rsp
1741 and \$-32,%rsp
1742___
1743$code.=<<___ if ($win64);
1744 lea 0x290+0x30(%rsp),%r11
1745 movaps %xmm6,-0x30(%r11)
1746 movaps %xmm7,-0x20(%r11)
1747 movaps %xmm8,-0x10(%r11)
1748 movaps %xmm9,0x00(%r11)
1749 movaps %xmm10,0x10(%r11)
1750 movaps %xmm11,0x20(%r11)
1751 movaps %xmm12,0x30(%r11)
1752 movaps %xmm13,0x40(%r11)
1753 movaps %xmm14,0x50(%r11)
1754 movaps %xmm15,0x60(%r11)
1755___
1756$code.=<<___;
1757 vzeroupper
1758 mov %r10,0x280(%rsp)
1759
1760 ################ stack layout
1761 # +0x00 SIMD equivalent of @x[8-12]
1762 # ...
1763 # +0x80 constant copy of key[0-2] smashed by lanes
1764 # ...
1765 # +0x200 SIMD counters (with nonce smashed by lanes)
1766 # ...
1767 # +0x280 saved %rsp
1768
1769 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1770 vbroadcasti128 ($key),$xb3 # key[1]
1771 vbroadcasti128 16($key),$xt3 # key[2]
1772 vbroadcasti128 ($counter),$xd3 # key[3]
1773 lea 0x100(%rsp),%rcx # size optimization
1774 lea 0x200(%rsp),%rax # size optimization
1775 lea .Lrot16(%rip),%r10
1776 lea .Lrot24(%rip),%r11
1777
1778 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1779 vpshufd \$0x55,$xa3,$xa1
1780 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1781 vpshufd \$0xaa,$xa3,$xa2
1782 vmovdqa $xa1,0xa0-0x100(%rcx)
1783 vpshufd \$0xff,$xa3,$xa3
1784 vmovdqa $xa2,0xc0-0x100(%rcx)
1785 vmovdqa $xa3,0xe0-0x100(%rcx)
1786
1787 vpshufd \$0x00,$xb3,$xb0
1788 vpshufd \$0x55,$xb3,$xb1
1789 vmovdqa $xb0,0x100-0x100(%rcx)
1790 vpshufd \$0xaa,$xb3,$xb2
1791 vmovdqa $xb1,0x120-0x100(%rcx)
1792 vpshufd \$0xff,$xb3,$xb3
1793 vmovdqa $xb2,0x140-0x100(%rcx)
1794 vmovdqa $xb3,0x160-0x100(%rcx)
1795
1796 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1797 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1798 vmovdqa $xt0,0x180-0x200(%rax)
1799 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1800 vmovdqa $xt1,0x1a0-0x200(%rax)
1801 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1802 vmovdqa $xt2,0x1c0-0x200(%rax)
1803 vmovdqa $xt3,0x1e0-0x200(%rax)
1804
1805 vpshufd \$0x00,$xd3,$xd0
1806 vpshufd \$0x55,$xd3,$xd1
1807 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1808 vpshufd \$0xaa,$xd3,$xd2
1809 vmovdqa $xd1,0x220-0x200(%rax)
1810 vpshufd \$0xff,$xd3,$xd3
1811 vmovdqa $xd2,0x240-0x200(%rax)
1812 vmovdqa $xd3,0x260-0x200(%rax)
1813
1814 jmp .Loop_enter8x
1815
1816.align 32
1817.Loop_outer8x:
1818 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1819 vmovdqa 0xa0-0x100(%rcx),$xa1
1820 vmovdqa 0xc0-0x100(%rcx),$xa2
1821 vmovdqa 0xe0-0x100(%rcx),$xa3
1822 vmovdqa 0x100-0x100(%rcx),$xb0
1823 vmovdqa 0x120-0x100(%rcx),$xb1
1824 vmovdqa 0x140-0x100(%rcx),$xb2
1825 vmovdqa 0x160-0x100(%rcx),$xb3
1826 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1827 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1828 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1829 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1830 vmovdqa 0x200-0x200(%rax),$xd0
1831 vmovdqa 0x220-0x200(%rax),$xd1
1832 vmovdqa 0x240-0x200(%rax),$xd2
1833 vmovdqa 0x260-0x200(%rax),$xd3
1834 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1835
1836.Loop_enter8x:
1837 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1838 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1839 vbroadcasti128 (%r10),$xt3
1840 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
1841 mov \$10,%eax
1842 jmp .Loop8x
1843
1844.align 32
1845.Loop8x:
1846___
1847 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1848 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1849$code.=<<___;
1850 dec %eax
1851 jnz .Loop8x
1852
1853 lea 0x200(%rsp),%rax # size optimization
1854 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1855 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1856 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1857 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1858
1859 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1860 vpunpckldq $xa3,$xa2,$xt3
1861 vpunpckhdq $xa1,$xa0,$xa0
1862 vpunpckhdq $xa3,$xa2,$xa2
1863 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1864 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1865 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1866 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1867___
1868 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1869$code.=<<___;
1870 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1871 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1872 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1873 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1874
1875 vpunpckldq $xb1,$xb0,$xt2
1876 vpunpckldq $xb3,$xb2,$xt3
1877 vpunpckhdq $xb1,$xb0,$xb0
1878 vpunpckhdq $xb3,$xb2,$xb2
1879 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1880 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1881 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1882 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1883___
1884 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1885$code.=<<___;
1886 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1887 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1888 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1889 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1890 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1891 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1892 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1893 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1894___
1895 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
1896 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1897$code.=<<___;
1898 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1899 vmovdqa $xa1,0x20(%rsp)
1900 vmovdqa 0x40(%rsp),$xc2 # $xa0
1901 vmovdqa 0x60(%rsp),$xc3 # $xa1
1902
1903 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1904 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1905 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1906 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1907
1908 vpunpckldq $xc1,$xc0,$xt2
1909 vpunpckldq $xc3,$xc2,$xt3
1910 vpunpckhdq $xc1,$xc0,$xc0
1911 vpunpckhdq $xc3,$xc2,$xc2
1912 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1913 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1914 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1915 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1916___
1917 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1918$code.=<<___;
1919 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1920 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1921 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1922 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1923
1924 vpunpckldq $xd1,$xd0,$xt2
1925 vpunpckldq $xd3,$xd2,$xt3
1926 vpunpckhdq $xd1,$xd0,$xd0
1927 vpunpckhdq $xd3,$xd2,$xd2
1928 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1929 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1930 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1931 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1932___
1933 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1934$code.=<<___;
1935 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1936 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1937 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1938 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1939 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1940 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1941 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1942 vperm2i128 \$0x31,$xd3,$xc3,$xd3
1943___
1944 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1945 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1946 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1947 ($xa0,$xa1)=($xt2,$xt3);
1948$code.=<<___;
1949 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1950 vmovdqa 0x20(%rsp),$xa1
1951
1952 cmp \$64*8,$len
1953 jb .Ltail8x
1954
1955 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1956 vpxor 0x20($inp),$xb0,$xb0
1957 vpxor 0x40($inp),$xc0,$xc0
1958 vpxor 0x60($inp),$xd0,$xd0
1959 lea 0x80($inp),$inp # size optimization
1960 vmovdqu $xa0,0x00($out)
1961 vmovdqu $xb0,0x20($out)
1962 vmovdqu $xc0,0x40($out)
1963 vmovdqu $xd0,0x60($out)
1964 lea 0x80($out),$out # size optimization
1965
1966 vpxor 0x00($inp),$xa1,$xa1
1967 vpxor 0x20($inp),$xb1,$xb1
1968 vpxor 0x40($inp),$xc1,$xc1
1969 vpxor 0x60($inp),$xd1,$xd1
1970 lea 0x80($inp),$inp # size optimization
1971 vmovdqu $xa1,0x00($out)
1972 vmovdqu $xb1,0x20($out)
1973 vmovdqu $xc1,0x40($out)
1974 vmovdqu $xd1,0x60($out)
1975 lea 0x80($out),$out # size optimization
1976
1977 vpxor 0x00($inp),$xa2,$xa2
1978 vpxor 0x20($inp),$xb2,$xb2
1979 vpxor 0x40($inp),$xc2,$xc2
1980 vpxor 0x60($inp),$xd2,$xd2
1981 lea 0x80($inp),$inp # size optimization
1982 vmovdqu $xa2,0x00($out)
1983 vmovdqu $xb2,0x20($out)
1984 vmovdqu $xc2,0x40($out)
1985 vmovdqu $xd2,0x60($out)
1986 lea 0x80($out),$out # size optimization
1987
1988 vpxor 0x00($inp),$xa3,$xa3
1989 vpxor 0x20($inp),$xb3,$xb3
1990 vpxor 0x40($inp),$xc3,$xc3
1991 vpxor 0x60($inp),$xd3,$xd3
1992 lea 0x80($inp),$inp # size optimization
1993 vmovdqu $xa3,0x00($out)
1994 vmovdqu $xb3,0x20($out)
1995 vmovdqu $xc3,0x40($out)
1996 vmovdqu $xd3,0x60($out)
1997 lea 0x80($out),$out # size optimization
1998
1999 sub \$64*8,$len
2000 jnz .Loop_outer8x
2001
2002 jmp .Ldone8x
2003
2004.Ltail8x:
2005 cmp \$448,$len
2006 jae .L448_or_more8x
2007 cmp \$384,$len
2008 jae .L384_or_more8x
2009 cmp \$320,$len
2010 jae .L320_or_more8x
2011 cmp \$256,$len
2012 jae .L256_or_more8x
2013 cmp \$192,$len
2014 jae .L192_or_more8x
2015 cmp \$128,$len
2016 jae .L128_or_more8x
2017 cmp \$64,$len
2018 jae .L64_or_more8x
2019
2020 xor %r10,%r10
2021 vmovdqa $xa0,0x00(%rsp)
2022 vmovdqa $xb0,0x20(%rsp)
2023 jmp .Loop_tail8x
2024
2025.align 32
2026.L64_or_more8x:
2027 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2028 vpxor 0x20($inp),$xb0,$xb0
2029 vmovdqu $xa0,0x00($out)
2030 vmovdqu $xb0,0x20($out)
2031 je .Ldone8x
2032
2033 lea 0x40($inp),$inp # inp+=64*1
2034 xor %r10,%r10
2035 vmovdqa $xc0,0x00(%rsp)
2036 lea 0x40($out),$out # out+=64*1
2037 sub \$64,$len # len-=64*1
2038 vmovdqa $xd0,0x20(%rsp)
2039 jmp .Loop_tail8x
2040
2041.align 32
2042.L128_or_more8x:
2043 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2044 vpxor 0x20($inp),$xb0,$xb0
2045 vpxor 0x40($inp),$xc0,$xc0
2046 vpxor 0x60($inp),$xd0,$xd0
2047 vmovdqu $xa0,0x00($out)
2048 vmovdqu $xb0,0x20($out)
2049 vmovdqu $xc0,0x40($out)
2050 vmovdqu $xd0,0x60($out)
2051 je .Ldone8x
2052
2053 lea 0x80($inp),$inp # inp+=64*2
2054 xor %r10,%r10
2055 vmovdqa $xa1,0x00(%rsp)
2056 lea 0x80($out),$out # out+=64*2
2057 sub \$128,$len # len-=64*2
2058 vmovdqa $xb1,0x20(%rsp)
2059 jmp .Loop_tail8x
2060
2061.align 32
2062.L192_or_more8x:
2063 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2064 vpxor 0x20($inp),$xb0,$xb0
2065 vpxor 0x40($inp),$xc0,$xc0
2066 vpxor 0x60($inp),$xd0,$xd0
2067 vpxor 0x80($inp),$xa1,$xa1
2068 vpxor 0xa0($inp),$xb1,$xb1
2069 vmovdqu $xa0,0x00($out)
2070 vmovdqu $xb0,0x20($out)
2071 vmovdqu $xc0,0x40($out)
2072 vmovdqu $xd0,0x60($out)
2073 vmovdqu $xa1,0x80($out)
2074 vmovdqu $xb1,0xa0($out)
2075 je .Ldone8x
2076
2077 lea 0xc0($inp),$inp # inp+=64*3
2078 xor %r10,%r10
2079 vmovdqa $xc1,0x00(%rsp)
2080 lea 0xc0($out),$out # out+=64*3
2081 sub \$192,$len # len-=64*3
2082 vmovdqa $xd1,0x20(%rsp)
2083 jmp .Loop_tail8x
2084
2085.align 32
2086.L256_or_more8x:
2087 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2088 vpxor 0x20($inp),$xb0,$xb0
2089 vpxor 0x40($inp),$xc0,$xc0
2090 vpxor 0x60($inp),$xd0,$xd0
2091 vpxor 0x80($inp),$xa1,$xa1
2092 vpxor 0xa0($inp),$xb1,$xb1
2093 vpxor 0xc0($inp),$xc1,$xc1
2094 vpxor 0xe0($inp),$xd1,$xd1
2095 vmovdqu $xa0,0x00($out)
2096 vmovdqu $xb0,0x20($out)
2097 vmovdqu $xc0,0x40($out)
2098 vmovdqu $xd0,0x60($out)
2099 vmovdqu $xa1,0x80($out)
2100 vmovdqu $xb1,0xa0($out)
2101 vmovdqu $xc1,0xc0($out)
2102 vmovdqu $xd1,0xe0($out)
2103 je .Ldone8x
2104
2105 lea 0x100($inp),$inp # inp+=64*4
2106 xor %r10,%r10
2107 vmovdqa $xa2,0x00(%rsp)
2108 lea 0x100($out),$out # out+=64*4
2109 sub \$256,$len # len-=64*4
2110 vmovdqa $xb2,0x20(%rsp)
2111 jmp .Loop_tail8x
2112
2113.align 32
2114.L320_or_more8x:
2115 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2116 vpxor 0x20($inp),$xb0,$xb0
2117 vpxor 0x40($inp),$xc0,$xc0
2118 vpxor 0x60($inp),$xd0,$xd0
2119 vpxor 0x80($inp),$xa1,$xa1
2120 vpxor 0xa0($inp),$xb1,$xb1
2121 vpxor 0xc0($inp),$xc1,$xc1
2122 vpxor 0xe0($inp),$xd1,$xd1
2123 vpxor 0x100($inp),$xa2,$xa2
2124 vpxor 0x120($inp),$xb2,$xb2
2125 vmovdqu $xa0,0x00($out)
2126 vmovdqu $xb0,0x20($out)
2127 vmovdqu $xc0,0x40($out)
2128 vmovdqu $xd0,0x60($out)
2129 vmovdqu $xa1,0x80($out)
2130 vmovdqu $xb1,0xa0($out)
2131 vmovdqu $xc1,0xc0($out)
2132 vmovdqu $xd1,0xe0($out)
2133 vmovdqu $xa2,0x100($out)
2134 vmovdqu $xb2,0x120($out)
2135 je .Ldone8x
2136
2137 lea 0x140($inp),$inp # inp+=64*5
2138 xor %r10,%r10
2139 vmovdqa $xc2,0x00(%rsp)
2140 lea 0x140($out),$out # out+=64*5
2141 sub \$320,$len # len-=64*5
2142 vmovdqa $xd2,0x20(%rsp)
2143 jmp .Loop_tail8x
2144
2145.align 32
2146.L384_or_more8x:
2147 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2148 vpxor 0x20($inp),$xb0,$xb0
2149 vpxor 0x40($inp),$xc0,$xc0
2150 vpxor 0x60($inp),$xd0,$xd0
2151 vpxor 0x80($inp),$xa1,$xa1
2152 vpxor 0xa0($inp),$xb1,$xb1
2153 vpxor 0xc0($inp),$xc1,$xc1
2154 vpxor 0xe0($inp),$xd1,$xd1
2155 vpxor 0x100($inp),$xa2,$xa2
2156 vpxor 0x120($inp),$xb2,$xb2
2157 vpxor 0x140($inp),$xc2,$xc2
2158 vpxor 0x160($inp),$xd2,$xd2
2159 vmovdqu $xa0,0x00($out)
2160 vmovdqu $xb0,0x20($out)
2161 vmovdqu $xc0,0x40($out)
2162 vmovdqu $xd0,0x60($out)
2163 vmovdqu $xa1,0x80($out)
2164 vmovdqu $xb1,0xa0($out)
2165 vmovdqu $xc1,0xc0($out)
2166 vmovdqu $xd1,0xe0($out)
2167 vmovdqu $xa2,0x100($out)
2168 vmovdqu $xb2,0x120($out)
2169 vmovdqu $xc2,0x140($out)
2170 vmovdqu $xd2,0x160($out)
2171 je .Ldone8x
2172
2173 lea 0x180($inp),$inp # inp+=64*6
2174 xor %r10,%r10
2175 vmovdqa $xa3,0x00(%rsp)
2176 lea 0x180($out),$out # out+=64*6
2177 sub \$384,$len # len-=64*6
2178 vmovdqa $xb3,0x20(%rsp)
2179 jmp .Loop_tail8x
2180
2181.align 32
2182.L448_or_more8x:
2183 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2184 vpxor 0x20($inp),$xb0,$xb0
2185 vpxor 0x40($inp),$xc0,$xc0
2186 vpxor 0x60($inp),$xd0,$xd0
2187 vpxor 0x80($inp),$xa1,$xa1
2188 vpxor 0xa0($inp),$xb1,$xb1
2189 vpxor 0xc0($inp),$xc1,$xc1
2190 vpxor 0xe0($inp),$xd1,$xd1
2191 vpxor 0x100($inp),$xa2,$xa2
2192 vpxor 0x120($inp),$xb2,$xb2
2193 vpxor 0x140($inp),$xc2,$xc2
2194 vpxor 0x160($inp),$xd2,$xd2
2195 vpxor 0x180($inp),$xa3,$xa3
2196 vpxor 0x1a0($inp),$xb3,$xb3
2197 vmovdqu $xa0,0x00($out)
2198 vmovdqu $xb0,0x20($out)
2199 vmovdqu $xc0,0x40($out)
2200 vmovdqu $xd0,0x60($out)
2201 vmovdqu $xa1,0x80($out)
2202 vmovdqu $xb1,0xa0($out)
2203 vmovdqu $xc1,0xc0($out)
2204 vmovdqu $xd1,0xe0($out)
2205 vmovdqu $xa2,0x100($out)
2206 vmovdqu $xb2,0x120($out)
2207 vmovdqu $xc2,0x140($out)
2208 vmovdqu $xd2,0x160($out)
2209 vmovdqu $xa3,0x180($out)
2210 vmovdqu $xb3,0x1a0($out)
2211 je .Ldone8x
2212
2213 lea 0x1c0($inp),$inp # inp+=64*7
2214 xor %r10,%r10
2215 vmovdqa $xc3,0x00(%rsp)
2216 lea 0x1c0($out),$out # out+=64*7
2217 sub \$448,$len # len-=64*7
2218 vmovdqa $xd3,0x20(%rsp)
2219
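#
# Every tail branch above leaves two 32-byte key-stream vectors parked at
# 0x00(%rsp) and 0x20(%rsp) and arrives here with %r10 cleared; the remaining
# 1..63 bytes are then XORed into the output one byte at a time.
#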
2220.Loop_tail8x:
2221 movzb ($inp,%r10),%eax
2222 movzb (%rsp,%r10),%ecx
2223 lea 1(%r10),%r10
2224 xor %ecx,%eax
2225 mov %al,-1($out,%r10)
2226 dec $len
2227 jnz .Loop_tail8x
2228
2229.Ldone8x:
2230	vzeroall
2231___
2232$code.=<<___ if ($win64);
2233 lea 0x290+0x30(%rsp),%r11
2234 movaps -0x30(%r11),%xmm6
2235 movaps -0x20(%r11),%xmm7
2236 movaps -0x10(%r11),%xmm8
2237 movaps 0x00(%r11),%xmm9
2238 movaps 0x10(%r11),%xmm10
2239 movaps 0x20(%r11),%xmm11
2240 movaps 0x30(%r11),%xmm12
2241 movaps 0x40(%r11),%xmm13
2242 movaps 0x50(%r11),%xmm14
2243 movaps 0x60(%r11),%xmm15
2244___
2245$code.=<<___;
2246 mov 0x280(%rsp),%rsp
2247 ret
2248.size ChaCha20_8x,.-ChaCha20_8x
2249___
2250}
2251
2252########################################################################
2253# AVX512 code paths
2254if ($avx>2) {
2255# This one handles shorter inputs...
2256
2257my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2258my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2259
2260sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
2261 &vpaddd ($a,$a,$b);
2262 &vpxord ($d,$d,$a);
2263 &vprold ($d,$d,16);
2264
2265 &vpaddd ($c,$c,$d);
2266 &vpxord ($b,$b,$c);
2267 &vprold ($b,$b,12);
2268
2269 &vpaddd ($a,$a,$b);
2270 &vpxord ($d,$d,$a);
2271 &vprold ($d,$d,8);
2272
2273 &vpaddd ($c,$c,$d);
2274 &vpxord ($b,$b,$c);
2275 &vprold ($b,$b,7);
2276}
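# For reference, one call to AVX512ROUND performs the standard ChaCha
# quarter-round on all four 128-bit lanes of the zmm registers at once:
#
#	a += b; d ^= a; d <<<= 16;
#	c += d; b ^= c; b <<<= 12;
#	a += b; d ^= a; d <<<=  8;
#	c += d; b ^= c; b <<<=  7;
#
# vprold supplies the 32-bit rotates directly, so no shift+or or pshufb
# substitutes are needed on this code path.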
2277
2278my $xframe = $win64 ? 32+32+8 : 24;
2279
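# ChaCha20_avx512 is the short-input path: it keeps one 64-byte block per
# 128-bit lane of each zmm register. vbroadcasti32x4 replicates the four
# state rows across the lanes, .Lzeroz (defined with the other constants in
# this file) gives each lane its own block counter, and .Lfourz advances the
# counters by four per outer iteration, so every pass over .Loop_avx512
# produces 256 bytes of key stream that are emitted lane by lane with
# vextracti32x4.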
2280$code.=<<___;
2281.type ChaCha20_avx512,\@function,5
2282.align 32
2283ChaCha20_avx512:
2284.LChaCha20_avx512:
2285 cmp \$512,$len
2286 ja .LChaCha20_16x
2287
2288 push %rbx # just to share SEH handler, no pops
2289 push %rbp
2290 push %r12
2291 push %r13
2292 push %r14
2293 push %r15
2294
2295 sub \$64+$xframe,%rsp
2296___
2297$code.=<<___ if ($win64);
2298 movaps %xmm6,64+32(%rsp)
2299 movaps %xmm7,64+48(%rsp)
2300___
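# %xmm6 and %xmm7 ($t2/$t3) are the only non-volatile vector registers this
# path touches under the Win64 ABI, hence the two spills above; SysV has no
# callee-saved vector registers, so nothing is saved elsewhere.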
2301$code.=<<___;
2302 vbroadcasti32x4 .Lsigma(%rip),$a
2303 vbroadcasti32x4 ($key),$b
2304 vbroadcasti32x4 16($key),$c
2305 vbroadcasti32x4 ($counter),$d
2306
2307 vmovdqa32 $a,$a_
2308 vmovdqa32 $b,$b_
2309 vmovdqa32 $c,$c_
2310 vpaddd .Lzeroz(%rip),$d,$d
2311 vmovdqa32 .Lfourz(%rip),$fourz
2312 mov \$10,$counter # reuse $counter
2313 vmovdqa32 $d,$d_
2314 jmp .Loop_avx512
2315
2316.align 16
2317.Loop_outer_avx512:
2318 vmovdqa32 $a_,$a
2319 vmovdqa32 $b_,$b
2320 vmovdqa32 $c_,$c
2321 vpaddd $fourz,$d_,$d
2322 mov \$10,$counter
2323 vmovdqa32 $d,$d_
2324 jmp .Loop_avx512
2325
2326.align 32
2327.Loop_avx512:
2328___
2329 &AVX512ROUND();
2330 &vpshufd ($c,$c,0b01001110);
2331 &vpshufd ($b,$b,0b00111001);
2332 &vpshufd ($d,$d,0b10010011);
2333
2334 &AVX512ROUND();
2335 &vpshufd ($c,$c,0b01001110);
2336 &vpshufd ($b,$b,0b10010011);
2337 &vpshufd ($d,$d,0b00111001);
2338
2339 &dec ($counter);
2340 &jnz (".Loop_avx512");
2341
2342$code.=<<___;
2343 vpaddd $a_,$a,$a
2344 vpaddd $b_,$b,$b
2345 vpaddd $c_,$c,$c
2346 vpaddd $d_,$d,$d
2347
2348 sub \$64,$len
2349 jb .Ltail64_avx512
2350
2351 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2352 vpxor 0x10($inp),%x#$b,$t1
2353 vpxor 0x20($inp),%x#$c,$t2
2354 vpxor 0x30($inp),%x#$d,$t3
2355 lea 0x40($inp),$inp # inp+=64
2356
2357 vmovdqu $t0,0x00($out) # write output
2358 vmovdqu $t1,0x10($out)
2359 vmovdqu $t2,0x20($out)
2360 vmovdqu $t3,0x30($out)
2361 lea 0x40($out),$out # out+=64
2362
2363 jz .Ldone_avx512
2364
2365 vextracti32x4 \$1,$a,$t0
2366 vextracti32x4 \$1,$b,$t1
2367 vextracti32x4 \$1,$c,$t2
2368 vextracti32x4 \$1,$d,$t3
2369
2370 sub \$64,$len
2371 jb .Ltail_avx512
2372
2373 vpxor 0x00($inp),$t0,$t0 # xor with input
2374 vpxor 0x10($inp),$t1,$t1
2375 vpxor 0x20($inp),$t2,$t2
2376 vpxor 0x30($inp),$t3,$t3
2377 lea 0x40($inp),$inp # inp+=64
2378
2379 vmovdqu $t0,0x00($out) # write output
2380 vmovdqu $t1,0x10($out)
2381 vmovdqu $t2,0x20($out)
2382 vmovdqu $t3,0x30($out)
2383 lea 0x40($out),$out # out+=64
2384
2385 jz .Ldone_avx512
2386
2387 vextracti32x4 \$2,$a,$t0
2388 vextracti32x4 \$2,$b,$t1
2389 vextracti32x4 \$2,$c,$t2
2390 vextracti32x4 \$2,$d,$t3
2391
2392 sub \$64,$len
2393 jb .Ltail_avx512
2394
2395 vpxor 0x00($inp),$t0,$t0 # xor with input
2396 vpxor 0x10($inp),$t1,$t1
2397 vpxor 0x20($inp),$t2,$t2
2398 vpxor 0x30($inp),$t3,$t3
2399 lea 0x40($inp),$inp # inp+=64
2400
2401 vmovdqu $t0,0x00($out) # write output
2402 vmovdqu $t1,0x10($out)
2403 vmovdqu $t2,0x20($out)
2404 vmovdqu $t3,0x30($out)
2405 lea 0x40($out),$out # out+=64
2406
2407 jz .Ldone_avx512
2408
2409 vextracti32x4 \$3,$a,$t0
2410 vextracti32x4 \$3,$b,$t1
2411 vextracti32x4 \$3,$c,$t2
2412 vextracti32x4 \$3,$d,$t3
2413
2414 sub \$64,$len
2415 jb .Ltail_avx512
2416
2417 vpxor 0x00($inp),$t0,$t0 # xor with input
2418 vpxor 0x10($inp),$t1,$t1
2419 vpxor 0x20($inp),$t2,$t2
2420 vpxor 0x30($inp),$t3,$t3
2421 lea 0x40($inp),$inp # inp+=64
2422
2423 vmovdqu $t0,0x00($out) # write output
2424 vmovdqu $t1,0x10($out)
2425 vmovdqu $t2,0x20($out)
2426 vmovdqu $t3,0x30($out)
2427 lea 0x40($out),$out # out+=64
2428
2429 jnz .Loop_outer_avx512
2430
2431 jmp .Ldone_avx512
2432
2433.align 16
2434.Ltail64_avx512:
2435 vmovdqa %x#$a,0x00(%rsp)
2436 vmovdqa %x#$b,0x10(%rsp)
2437 vmovdqa %x#$c,0x20(%rsp)
2438 vmovdqa %x#$d,0x30(%rsp)
2439 add \$64,$len
2440 jmp .Loop_tail_avx512
2441
2442.align 16
2443.Ltail_avx512:
2444 vmovdqa $t0,0x00(%rsp)
2445 vmovdqa $t1,0x10(%rsp)
2446 vmovdqa $t2,0x20(%rsp)
2447 vmovdqa $t3,0x30(%rsp)
2448 add \$64,$len
2449
2450.Loop_tail_avx512:
2451 movzb ($inp,$counter),%eax
2452 movzb (%rsp,$counter),%ecx
2453 lea 1($counter),$counter
2454 xor %ecx,%eax
2455 mov %al,-1($out,$counter)
2456 dec $len
2457 jnz .Loop_tail_avx512
2458
2459 vmovdqa32 $a_,0x00(%rsp)
2460
2461.Ldone_avx512:
2462 vzeroall
2463___
2464$code.=<<___ if ($win64);
2465 movaps 64+32(%rsp),%xmm6
2466 movaps 64+48(%rsp),%xmm7
2467___
2468$code.=<<___;
2469 add \$64+$xframe+48,%rsp
2470 ret
2471.size ChaCha20_avx512,.-ChaCha20_avx512
2472___
2473}
2474if ($avx>2) {
2475# This one handles longer inputs...
2476
2477my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2478 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2479my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2480 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2481my @key=map("%zmm$_",(16..31));
2482my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2483
2484sub AVX512_lane_ROUND {
2485my ($a0,$b0,$c0,$d0)=@_;
2486my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2487my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2488my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2489my @x=map("\"$_\"",@xx);
2490
2491 (
2492 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2493 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2494 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2495 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2496 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2497 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2498 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2499 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2500 "&vprold (@x[$d0],@x[$d0],16)",
2501 "&vprold (@x[$d1],@x[$d1],16)",
2502 "&vprold (@x[$d2],@x[$d2],16)",
2503 "&vprold (@x[$d3],@x[$d3],16)",
2504
2505 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2506 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2507 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2508 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2509 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2510 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2511 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2512 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2513 "&vprold (@x[$b0],@x[$b0],12)",
2514 "&vprold (@x[$b1],@x[$b1],12)",
2515 "&vprold (@x[$b2],@x[$b2],12)",
2516 "&vprold (@x[$b3],@x[$b3],12)",
2517
2518 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2519 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2520 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2521 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2522 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2523 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2524 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2525 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2526 "&vprold (@x[$d0],@x[$d0],8)",
2527 "&vprold (@x[$d1],@x[$d1],8)",
2528 "&vprold (@x[$d2],@x[$d2],8)",
2529 "&vprold (@x[$d3],@x[$d3],8)",
2530
2531 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2532 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2533 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2534 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2535 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2536 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2537 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2538 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2539 "&vprold (@x[$b0],@x[$b0],7)",
2540 "&vprold (@x[$b1],@x[$b1],7)",
2541 "&vprold (@x[$b2],@x[$b2],7)",
2542 "&vprold (@x[$b3],@x[$b3],7)"
2543 );
2544}
2545
2546my $xframe = $win64 ? 0xb0 : 8;
2547
2548$code.=<<___;
2549.type ChaCha20_16x,\@function,5
2550.align 32
2551ChaCha20_16x:
2552.LChaCha20_16x:
2553 mov %rsp,%r11
2554 sub \$64+$xframe,%rsp
2555 and \$-64,%rsp
2556___
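# %rsp is rounded down to a 64-byte boundary (after saving its incoming value
# in %r11) because the tail code parks a full zmm register at 0x00(%rsp) with
# vmovdqa32, which requires 64-byte alignment.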
2557$code.=<<___ if ($win64);
2558 lea 0x290+0x30(%rsp),%r11
2559 movaps %xmm6,-0x30(%r11)
2560 movaps %xmm7,-0x20(%r11)
2561 movaps %xmm8,-0x10(%r11)
2562 movaps %xmm9,0x00(%r11)
2563 movaps %xmm10,0x10(%r11)
2564 movaps %xmm11,0x20(%r11)
2565 movaps %xmm12,0x30(%r11)
2566 movaps %xmm13,0x40(%r11)
2567 movaps %xmm14,0x50(%r11)
2568 movaps %xmm15,0x60(%r11)
2569___
2570$code.=<<___;
2571 vzeroupper
2572
2573 lea .Lsigma(%rip),%r10
2574 vbroadcasti32x4 (%r10),$xa3 # key[0]
2575 vbroadcasti32x4 ($key),$xb3 # key[1]
2576 vbroadcasti32x4 16($key),$xc3 # key[2]
2577 vbroadcasti32x4 ($counter),$xd3 # key[3]
2578
2579 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
2580 vpshufd \$0x55,$xa3,$xa1
2581 vpshufd \$0xaa,$xa3,$xa2
2582 vpshufd \$0xff,$xa3,$xa3
2583 vmovdqa64 $xa0,@key[0]
2584 vmovdqa64 $xa1,@key[1]
2585 vmovdqa64 $xa2,@key[2]
2586 vmovdqa64 $xa3,@key[3]
2587
2588 vpshufd \$0x00,$xb3,$xb0
2589 vpshufd \$0x55,$xb3,$xb1
2590 vpshufd \$0xaa,$xb3,$xb2
2591 vpshufd \$0xff,$xb3,$xb3
2592 vmovdqa64 $xb0,@key[4]
2593 vmovdqa64 $xb1,@key[5]
2594 vmovdqa64 $xb2,@key[6]
2595 vmovdqa64 $xb3,@key[7]
2596
2597 vpshufd \$0x00,$xc3,$xc0
2598 vpshufd \$0x55,$xc3,$xc1
2599 vpshufd \$0xaa,$xc3,$xc2
2600 vpshufd \$0xff,$xc3,$xc3
2601 vmovdqa64 $xc0,@key[8]
2602 vmovdqa64 $xc1,@key[9]
2603 vmovdqa64 $xc2,@key[10]
2604 vmovdqa64 $xc3,@key[11]
2605
2606 vpshufd \$0x00,$xd3,$xd0
2607 vpshufd \$0x55,$xd3,$xd1
2608 vpshufd \$0xaa,$xd3,$xd2
2609 vpshufd \$0xff,$xd3,$xd3
2610 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
2611 vmovdqa64 $xd0,@key[12]
2612 vmovdqa64 $xd1,@key[13]
2613 vmovdqa64 $xd2,@key[14]
2614 vmovdqa64 $xd3,@key[15]
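# Because every zmm register was filled with vbroadcasti32x4, the vpshufd
# splats above leave each register holding a single state word replicated
# across all 16 lanes, i.e. lane N of the register file carries the complete
# state of block N. .Lincz gives the 16 lanes consecutive block counters and
# .Lsixteen (in .Loop_outer16x below) steps them by 16 per outer iteration.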
2615
2616 mov \$10,%eax
2617 jmp .Loop16x
2618
2619.align 32
2620.Loop_outer16x:
2621 vpbroadcastd 0(%r10),$xa0 # reload key
2622 vpbroadcastd 4(%r10),$xa1
2623 vpbroadcastd 8(%r10),$xa2
2624 vpbroadcastd 12(%r10),$xa3
2625 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
2626 vmovdqa64 @key[4],$xb0
2627 vmovdqa64 @key[5],$xb1
2628 vmovdqa64 @key[6],$xb2
2629 vmovdqa64 @key[7],$xb3
2630 vmovdqa64 @key[8],$xc0
2631 vmovdqa64 @key[9],$xc1
2632 vmovdqa64 @key[10],$xc2
2633 vmovdqa64 @key[11],$xc3
2634 vmovdqa64 @key[12],$xd0
2635 vmovdqa64 @key[13],$xd1
2636 vmovdqa64 @key[14],$xd2
2637 vmovdqa64 @key[15],$xd3
2638
2639 vmovdqa64 $xa0,@key[0]
2640 vmovdqa64 $xa1,@key[1]
2641 vmovdqa64 $xa2,@key[2]
2642 vmovdqa64 $xa3,@key[3]
2643
2644 mov \$10,%eax
2645 jmp .Loop16x
2646
2647.align 32
2648.Loop16x:
2649___
2650 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
2651 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
2652$code.=<<___;
2653 dec %eax
2654 jnz .Loop16x
2655
2656 vpaddd @key[0],$xa0,$xa0 # accumulate key
2657 vpaddd @key[1],$xa1,$xa1
2658 vpaddd @key[2],$xa2,$xa2
2659 vpaddd @key[3],$xa3,$xa3
2660
2661 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2662 vpunpckldq $xa3,$xa2,$xt3
2663 vpunpckhdq $xa1,$xa0,$xa0
2664 vpunpckhdq $xa3,$xa2,$xa2
2665 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2666 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2667 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2668 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2669___
2670 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2671$code.=<<___;
2672 vpaddd @key[4],$xb0,$xb0
2673 vpaddd @key[5],$xb1,$xb1
2674 vpaddd @key[6],$xb2,$xb2
2675 vpaddd @key[7],$xb3,$xb3
2676
2677 vpunpckldq $xb1,$xb0,$xt2
2678 vpunpckldq $xb3,$xb2,$xt3
2679 vpunpckhdq $xb1,$xb0,$xb0
2680 vpunpckhdq $xb3,$xb2,$xb2
2681 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2682 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2683 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2684 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2685___
2686 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2687$code.=<<___;
2688 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
2689 vshufi32x4 \$0xee,$xb0,$xa0,$xb0
2690 vshufi32x4 \$0x44,$xb1,$xa1,$xa0
2691 vshufi32x4 \$0xee,$xb1,$xa1,$xb1
2692 vshufi32x4 \$0x44,$xb2,$xa2,$xa1
2693 vshufi32x4 \$0xee,$xb2,$xa2,$xb2
2694 vshufi32x4 \$0x44,$xb3,$xa3,$xa2
2695 vshufi32x4 \$0xee,$xb3,$xa3,$xb3
2696___
2697 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2698$code.=<<___;
2699 vpaddd @key[8],$xc0,$xc0
2700 vpaddd @key[9],$xc1,$xc1
2701 vpaddd @key[10],$xc2,$xc2
2702 vpaddd @key[11],$xc3,$xc3
2703
2704 vpunpckldq $xc1,$xc0,$xt2
2705 vpunpckldq $xc3,$xc2,$xt3
2706 vpunpckhdq $xc1,$xc0,$xc0
2707 vpunpckhdq $xc3,$xc2,$xc2
2708 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2709 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2710 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2711 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2712___
2713 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2714$code.=<<___;
2715 vpaddd @key[12],$xd0,$xd0
2716 vpaddd @key[13],$xd1,$xd1
2717 vpaddd @key[14],$xd2,$xd2
2718 vpaddd @key[15],$xd3,$xd3
2719
2720 vpunpckldq $xd1,$xd0,$xt2
2721 vpunpckldq $xd3,$xd2,$xt3
2722 vpunpckhdq $xd1,$xd0,$xd0
2723 vpunpckhdq $xd3,$xd2,$xd2
2724 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2725 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2726 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2727 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2728___
2729 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2730$code.=<<___;
2731 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
2732 vshufi32x4 \$0xee,$xd0,$xc0,$xd0
2733 vshufi32x4 \$0x44,$xd1,$xc1,$xc0
2734 vshufi32x4 \$0xee,$xd1,$xc1,$xd1
2735 vshufi32x4 \$0x44,$xd2,$xc2,$xc1
2736 vshufi32x4 \$0xee,$xd2,$xc2,$xd2
2737 vshufi32x4 \$0x44,$xd3,$xc3,$xc2
2738 vshufi32x4 \$0xee,$xd3,$xc3,$xd3
2739___
2740 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2741$code.=<<___;
2742 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
2743 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
2744 vshufi32x4 \$0x88,$xd0,$xb0,$xc0
2745 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
2746 vshufi32x4 \$0x88,$xc1,$xa1,$xt1
2747 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
2748 vshufi32x4 \$0x88,$xd1,$xb1,$xc1
2749 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
2750 vshufi32x4 \$0x88,$xc2,$xa2,$xt2
2751 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
2752 vshufi32x4 \$0x88,$xd2,$xb2,$xc2
2753 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
2754 vshufi32x4 \$0x88,$xc3,$xa3,$xt3
2755 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
2756 vshufi32x4 \$0x88,$xd3,$xb3,$xc3
2757 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
2758___
2759 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2760 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2761
2762 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2763 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2764 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2765 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2766$code.=<<___;
2767 cmp \$64*16,$len
2768 jb .Ltail16x
2769
2770 vpxord 0x00($inp),$xa0,$xa0 # xor with input
2771 vpxord 0x40($inp),$xb0,$xb0
2772 vpxord 0x80($inp),$xc0,$xc0
2773 vpxord 0xc0($inp),$xd0,$xd0
2774 vmovdqu32 $xa0,0x00($out)
2775 vmovdqu32 $xb0,0x40($out)
2776 vmovdqu32 $xc0,0x80($out)
2777 vmovdqu32 $xd0,0xc0($out)
2778
2779 vpxord 0x100($inp),$xa1,$xa1
2780 vpxord 0x140($inp),$xb1,$xb1
2781 vpxord 0x180($inp),$xc1,$xc1
2782 vpxord 0x1c0($inp),$xd1,$xd1
2783 vmovdqu32 $xa1,0x100($out)
2784 vmovdqu32 $xb1,0x140($out)
2785 vmovdqu32 $xc1,0x180($out)
2786 vmovdqu32 $xd1,0x1c0($out)
2787
2788 vpxord 0x200($inp),$xa2,$xa2
2789 vpxord 0x240($inp),$xb2,$xb2
2790 vpxord 0x280($inp),$xc2,$xc2
2791 vpxord 0x2c0($inp),$xd2,$xd2
2792 vmovdqu32 $xa2,0x200($out)
2793 vmovdqu32 $xb2,0x240($out)
2794 vmovdqu32 $xc2,0x280($out)
2795 vmovdqu32 $xd2,0x2c0($out)
2796
2797 vpxord 0x300($inp),$xa3,$xa3
2798 vpxord 0x340($inp),$xb3,$xb3
2799 vpxord 0x380($inp),$xc3,$xc3
2800 vpxord 0x3c0($inp),$xd3,$xd3
2801 lea 0x400($inp),$inp
2802 vmovdqu32 $xa3,0x300($out)
2803 vmovdqu32 $xb3,0x340($out)
2804 vmovdqu32 $xc3,0x380($out)
2805 vmovdqu32 $xd3,0x3c0($out)
2806 lea 0x400($out),$out
2807
2808 sub \$64*16,$len
2809 jnz .Loop_outer16x
2810
2811 jmp .Ldone16x
2812
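#
# For the tail, $out is pre-biased with "sub $inp,$out" so every store below
# can use the ($out,$inp) addressing mode while only $inp advances. Whichever
# register holds the first incomplete block is copied into $xa0 before
# execution reaches .Less_than_64_16x, which finishes the last 1..63 bytes
# off the stack.
#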
2813.align 32
2814.Ltail16x:
2815 xor %r10,%r10
2816 sub $inp,$out
2817 cmp \$64*1,$len
2818 jb .Less_than_64_16x
2819 vpxord ($inp),$xa0,$xa0 # xor with input
2820 vmovdqu32 $xa0,($out,$inp)
2821 je .Ldone16x
2822 vmovdqa32 $xb0,$xa0
2823 lea 64($inp),$inp
2824
2825 cmp \$64*2,$len
2826 jb .Less_than_64_16x
2827 vpxord ($inp),$xb0,$xb0
2828 vmovdqu32 $xb0,($out,$inp)
2829 je .Ldone16x
2830 vmovdqa32 $xc0,$xa0
2831 lea 64($inp),$inp
2832
2833 cmp \$64*3,$len
2834 jb .Less_than_64_16x
2835 vpxord ($inp),$xc0,$xc0
2836 vmovdqu32 $xc0,($out,$inp)
2837 je .Ldone16x
2838 vmovdqa32 $xd0,$xa0
2839 lea 64($inp),$inp
2840
2841 cmp \$64*4,$len
2842 jb .Less_than_64_16x
2843 vpxord ($inp),$xd0,$xd0
2844 vmovdqu32 $xd0,($out,$inp)
2845 je .Ldone16x
2846 vmovdqa32 $xa1,$xa0
2847 lea 64($inp),$inp
2848
2849 cmp \$64*5,$len
2850 jb .Less_than_64_16x
2851 vpxord ($inp),$xa1,$xa1
2852 vmovdqu32 $xa1,($out,$inp)
2853 je .Ldone16x
2854 vmovdqa32 $xb1,$xa0
2855 lea 64($inp),$inp
2856
2857 cmp \$64*6,$len
2858 jb .Less_than_64_16x
2859 vpxord ($inp),$xb1,$xb1
2860 vmovdqu32 $xb1,($out,$inp)
2861 je .Ldone16x
2862 vmovdqa32 $xc1,$xa0
2863 lea 64($inp),$inp
2864
2865 cmp \$64*7,$len
2866 jb .Less_than_64_16x
2867 vpxord ($inp),$xc1,$xc1
2868 vmovdqu32 $xc1,($out,$inp)
2869 je .Ldone16x
2870 vmovdqa32 $xd1,$xa0
2871 lea 64($inp),$inp
2872
2873 cmp \$64*8,$len
2874 jb .Less_than_64_16x
2875 vpxord ($inp),$xd1,$xd1
2876 vmovdqu32 $xd1,($out,$inp)
2877 je .Ldone16x
2878 vmovdqa32 $xa2,$xa0
2879 lea 64($inp),$inp
2880
2881 cmp \$64*9,$len
2882 jb .Less_than_64_16x
2883 vpxord ($inp),$xa2,$xa2
2884 vmovdqu32 $xa2,($out,$inp)
2885 je .Ldone16x
2886 vmovdqa32 $xb2,$xa0
2887 lea 64($inp),$inp
2888
2889 cmp \$64*10,$len
2890 jb .Less_than_64_16x
2891 vpxord ($inp),$xb2,$xb2
2892 vmovdqu32 $xb2,($out,$inp)
2893 je .Ldone16x
2894 vmovdqa32 $xc2,$xa0
2895 lea 64($inp),$inp
2896
2897 cmp \$64*11,$len
2898 jb .Less_than_64_16x
2899 vpxord ($inp),$xc2,$xc2
2900 vmovdqu32 $xc2,($out,$inp)
2901 je .Ldone16x
2902 vmovdqa32 $xd2,$xa0
2903 lea 64($inp),$inp
2904
2905 cmp \$64*12,$len
2906 jb .Less_than_64_16x
2907 vpxord ($inp),$xd2,$xd2
2908 vmovdqu32 $xd2,($out,$inp)
2909 je .Ldone16x
2910 vmovdqa32 $xa3,$xa0
2911 lea 64($inp),$inp
2912
2913 cmp \$64*13,$len
2914 jb .Less_than_64_16x
2915 vpxord ($inp),$xa3,$xa3
2916 vmovdqu32 $xa3,($out,$inp)
2917 je .Ldone16x
2918 vmovdqa32 $xb3,$xa0
2919 lea 64($inp),$inp
2920
2921 cmp \$64*14,$len
2922 jb .Less_than_64_16x
2923 vpxord ($inp),$xb3,$xb3
2924 vmovdqu32 $xb3,($out,$inp)
2925 je .Ldone16x
2926 vmovdqa32 $xc3,$xa0
2927 lea 64($inp),$inp
2928
2929 cmp \$64*15,$len
2930 jb .Less_than_64_16x
2931 vpxord ($inp),$xc3,$xc3
2932 vmovdqu32 $xc3,($out,$inp)
2933 je .Ldone16x
2934 vmovdqa32 $xd3,$xa0
2935 lea 64($inp),$inp
2936
2937.Less_than_64_16x:
2938 vmovdqa32 $xa0,0x00(%rsp)
2939 lea ($out,$inp),$out
2940 and \$63,$len
2941
2942.Loop_tail16x:
2943 movzb ($inp,%r10),%eax
2944 movzb (%rsp,%r10),%ecx
2945 lea 1(%r10),%r10
2946 xor %ecx,%eax
2947 mov %al,-1($out,%r10)
2948 dec $len
2949 jnz .Loop_tail16x
2950
2951 vpxord $xa0,$xa0,$xa0
2952 vmovdqa32 $xa0,0(%rsp)
2953
2954.Ldone16x:
2955	vzeroall
2956___
2957$code.=<<___ if ($win64);
2958 lea 0x290+0x30(%rsp),%r11
2959 movaps -0x30(%r11),%xmm6
2960 movaps -0x20(%r11),%xmm7
2961 movaps -0x10(%r11),%xmm8
2962 movaps 0x00(%r11),%xmm9
2963 movaps 0x10(%r11),%xmm10
2964 movaps 0x20(%r11),%xmm11
2965 movaps 0x30(%r11),%xmm12
2966 movaps 0x40(%r11),%xmm13
2967 movaps 0x50(%r11),%xmm14
2968 movaps 0x60(%r11),%xmm15
2969___
2970$code.=<<___;
2971 mov %r11,%rsp
2972 ret
2973.size ChaCha20_16x,.-ChaCha20_16x
2974___
2975}
2976
2977foreach (split("\n",$code)) {
2978	s/\`([^\`]*)\`/eval $1/ge;
2979
2980	s/%x#%[yz]/%x/g;	# "down-shift"
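	# "%x#%ymmN" (or "%x#%zmmN") in $code is shorthand for "use the xmm
	# alias of this register"; the substitution above rewrites it to the
	# plain "%xmmN" form.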
2981
2982 print $_,"\n";
2983}
2984
2985close STDOUT;